diff --git a/.clang-format b/.clang-format index 6caaa71987..f71507a24b 100644 --- a/.clang-format +++ b/.clang-format @@ -2,6 +2,7 @@ # Licensed under the Apache License, Version 2.0, see LICENSE for details. # SPDX-License-Identifier: Apache-2.0 +--- # Use LLVM's style BasedOnStyle: LLVM -ColumnLimit: 80 +ColumnLimit: 80 diff --git a/.devcontainer.json b/.devcontainer.json index 3cbc1ab13f..40c726bb83 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -1,24 +1,24 @@ { - "image": "ghcr.io/pulp-platform/deeploy:main", - "name": "deeploy_main", - "customizations": { - "vscode": { - "extensions": [ - "ms-vscode.cpptools-extension-pack", - "twxs.cmake", - "josetr.cmake-language-support-vscode", - "ms-vscode.cmake-tools", - "ms-python.python", - "ms-vscode-remote.remote-containers", - "rioj7.command-variable" - ] - } - }, - "mounts": [ - { - "source": "${localWorkspaceFolder}", - "target": "/app/Deeploy", - "type": "bind" - } - ] + "image": "ghcr.io/pulp-platform/deeploy:main", + "name": "deeploy_main", + "customizations": { + "vscode": { + "extensions": [ + "ms-vscode.cpptools-extension-pack", + "twxs.cmake", + "josetr.cmake-language-support-vscode", + "ms-vscode.cmake-tools", + "ms-python.python", + "ms-vscode-remote.remote-containers", + "rioj7.command-variable" + ] + } + }, + "mounts": [ + { + "source": "${localWorkspaceFolder}", + "target": "/app/Deeploy", + "type": "bind" + } + ] } \ No newline at end of file diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index fd088f6161..b367d31545 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,13 +1,13 @@ Describe the intent of your PR here. 
## Added -- +- ## Changed -- +- ## Fixed -- +- ## PR Merge Checklist diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml deleted file mode 100644 index 4b74c66f97..0000000000 --- a/.github/workflows/CI.yml +++ /dev/null @@ -1,1027 +0,0 @@ -name: CI - -on: - push: - branches: - - '**' - tags: - - 'v*.*.*' - pull_request: - workflow_dispatch: - inputs: - docker_image_deeploy: - description: 'Deeploy Image to use' - required: false - default: 'ghcr.io/pulp-platform/deeploy:devel' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - select-docker-image-and-runner: - runs-on: ubuntu-latest - outputs: - image: ${{ steps.set-docker-image.outputs.image }} - runner: ${{ steps.set-runner.outputs.runner }} - steps: - - id: set-docker-image - run: | - if [[ -n "${{ github.event.inputs.docker_image_deeploy }}" ]]; then - IMAGE="${{ github.event.inputs.docker_image_deeploy }}" - elif [[ "${{ github.ref }}" == refs/tags/* ]]; then - TAG_NAME="${GITHUB_REF##refs/tags/}" - IMAGE="ghcr.io/pulp-platform/deeploy:${TAG_NAME}" - elif [[ "${{ github.ref_name }}" == "main" ]]; then - IMAGE="ghcr.io/pulp-platform/deeploy:main" - else - IMAGE="ghcr.io/pulp-platform/deeploy:devel" - fi - echo "Selected image: ${IMAGE}" - echo "image=${IMAGE}" >> $GITHUB_OUTPUT - - - id: set-runner - run: | - if [[ "${{ github.repository }}" == "pulp-platform/Deeploy" ]]; then - echo "Selected self-hosted runner for Deeploy repository" - echo "runner=self-hosted" >> $GITHUB_OUTPUT - else - echo "Selected ubuntu-latest runner for external repository" - echo "runner=ubuntu-latest" >> $GITHUB_OUTPUT - fi - - build-deeploy: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . 
- - - ### Generic Tests ### - generic-kernels: - uses: ./.github/workflows/TestRunnerGeneric.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - Adder - MultIO - test1DConvolution - test2DConvolution - test1DDWConvolution - test2DDWConvolution - test1DPad - test2DPad - testGEMM - testMatMul - testMatMulAdd - testMaxPool - testRQConv - testRQMatMul - testReduceSum - testReduceMean - testSlice - testRequantizedDWConv - test2DRequantizedConv - iSoftmax - testFloatAdder - testFloatGEMM - testFloat2DConvolution - testFloat2DConvolutionBias - testFloat2DConvolutionZeroBias - testFloatLayerNorm - testFloatDiv - testFloat2DDWConvolution - testFloat2DDWConvolutionBias - testFloat2DDWConvolutionZeroBias - testFloatRelu - testFloatMaxPool - testFloatMatmul - testFloatReshapeWithSkipConnection - testFloatSoftmax - testFloatTranspose - testFloatMul - Quant - Dequant - QuantizedLinear - - - generic-models: - uses: ./.github/workflows/TestRunnerGeneric.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - simpleRegression - WaveFormer - simpleCNN - ICCT - ICCT_ITA - ICCT_8 - ICCT_ITA_8 - miniMobileNet - miniMobileNetv2 - CCT/CCT_1_16_16_8 - testFloatDemoTinyViT - - ### SoftHier Tests ### - softhier-kernels: - uses: ./.github/workflows/TestRunnerSoftHier.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - Adder - - ### CortexM Tests ### - cortexm-kernels: - uses: ./.github/workflows/TestRunnerCortexM.yml - needs: select-docker-image-and-runner - with: - runner: ${{ 
needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - Adder - MultIO - test1DPad - test2DPad - testMatMul - testMatMulAdd - testMaxPool - testRQConv - testReduceSum - testReduceMean - testSlice - - cortexm-models: - uses: ./.github/workflows/TestRunnerCortexM.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - simpleRegression - WaveFormer - - - ### Chimera Tests ### - chimera-kernels: - uses: ./.github/workflows/TestRunnerChimera.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - Adder - simulators: | - gvsoc - - - ### Snitch Tests ### - snitch-kernels: - uses: ./.github/workflows/TestRunnerSnitch.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - Adder - iSoftmax - TestiNoNorm - TestAdderLarge - TestiSoftmaxLarge - testMatMul - testRQGEMM - TestRQAdd - testRQGEMMTransB - testFloatSoftmax - num-cores: 9 - simulators: | - gvsoc - - snitch-kernels-tiled-singlebuffer-L2: - uses: ./.github/workflows/TestRunnerTiledSnitchSequential.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - tests-config: | - [ - { - "name": "TestiNoNorm", - "L1": [5000, 10000] - }, - { - "name": "TestAdderLarge", - "L1": [5000, 10000] - }, - { - "name": "TestiSoftmaxLarge", - "L1": [5000, 10000] - }, - { - "name": "testRQGEMM", - "L1": [2000, 5000] - }, - { - "name": 
"testFloatSoftmax", - "L1": [2000, 5000, 10000] - }, - - { - "name": "TestRQAdd", - "L1": [5000, 10000] - }, - - { - "name": "testFloatGEMM", - "L1": [2000, 5000, 10000] - }, - - { - "name": "testFloatGEMMtransB", - "L1": [2000, 5000, 10000] - } - ] - simulators: | - gvsoc - - ### Mempool Tests ### - mempool-kernels: - uses: ./.github/workflows/TestRunnerMempool.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - Adder - MultIO - test1DConvolution - test2DConvolution - test1DDWConvolution - test2DDWConvolution - test1DPad - test2DPad - testGEMM - testMatMul - testMatMulAdd - testMaxPool - testRQConv - testRQGEMM - testRQMatMul - testReduceSum - testReduceMean - testSlice - testRequantizedDWConv - test2DRequantizedConv - - mempool-models: - uses: ./.github/workflows/TestRunnerMempool.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - simpleRegression - simpleCNN - ICCT - ICCT_ITA - ICCT_8 - ICCT_ITA_8 - miniMobileNet - miniMobileNetv2 - - - ### Siracusa Tests ### - siracusa-kernels: - uses: ./.github/workflows/TestRunnerSiracusa.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - Adder - MultIO - test1DPad - test2DPad - testMatMul - testMatMulAdd - testRequantizedDWConv - test2DRequantizedConv - iSoftmax - testConcat - testRMSNorm - trueIntegerDivSandwich - Hardswish - RQHardswish - testBacktracking - testFloatAdder - testFloatGEMM - testFloat2DConvolution - testFloatLayerNorm - testFloatRelu - testFloatMaxPool - testFloatMatmul - testFloatSoftmax - testFloatTranspose - testFloatMul - Quant 
- Dequant - testFloatReduceSum - testFloatSoftmaxGrad - testFloatSoftmaxCrossEntropy - testFloatSoftmaxCrossEntropyGrad - QuantizedLinear - num-cores: 8 - - siracusa-models: - uses: ./.github/workflows/TestRunnerSiracusa.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-names: | - simpleRegression - miniMobileNet - miniMobileNetv2 - Attention - MLPerf/KeywordSpotting - MLPerf/ImageClassification - MLPerf/AnomalyDetection - CCT/CCT_1_16_16_8 - testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8 - num-cores: 8 - - siracusa-kernels-tiled-singlebuffer-L2: - uses: ./.github/workflows/TestRunnerTiledSiracusaSequential.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - tests-config: | - [ - { - "name": "testMatMul", - "L1": [64000, 32000, 16000] - }, - { - "name": "test2DRequantizedConv", - "L1": [8000, 6000, 4000] - }, - { - "name": "test2DRequantizedStriddedPaddedConv", - "L1": [600] - }, - { - "name": "testRequantizedDWConv", - "L1": [2561] - }, - { - "name": "iSoftmax", - "L1": [800, 500, 300] - }, - { - "name": "testConcat", - "L1": [32000, 16000, 8000] - }, - { - "name": "testRMSNorm", - "L1": [2048, 1024, 512] - }, - { - "name": "Hardswish", - "L1": [750] - }, - { - "name": "RQHardswish", - "L1": [750] - }, - { - "name": "testFloatGEMM", - "L1": [8000] - }, - { - "name": "testFloat2DConvolution", - "L1": [8000] - }, - { - "name": "testFloatLayerNorm", - "L1": [2000] - }, - { - "name": "testFloatRelu", - "L1": [2000] - }, - { - "name": "testFloatMaxPool", - "L1": [2000] - }, - { - "name": "testFloatMatmul", - "L1": [2000] - }, - { - "name": "testFloatSoftmax", - "L1": [4000] - }, - { - "name": "testFloatTranspose", - "L1": [2000] - }, - { - "name": "testFloatMul", - "L1": 
[2000] - }, - { - "name": "largeFloatAdd", - "L1": [220000] - } - ] - num-cores: 8 - - siracusa-kernels-tiled-doublebuffer-L2: - uses: ./.github/workflows/TestRunnerTiledSiracusaSequential.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - tests-config: | - [ - { - "name": "testMatMul", - "L1": [64000, 32000, 16000] - }, - { - "name": "test2DRequantizedConv", - "L1": [8000, 6000, 5000] - }, - { - "name": "testRequantizedDWConv", - "L1": [5121] - }, - { - "name": "iSoftmax", - "L1": [1600, 1000, 600] - }, - { - "name": "testConcat", - "L1": [64000, 32000, 16000] - }, - { - "name": "testRMSNorm", - "L1": [4096, 2048, 1024] - }, - { - "name": "Hardswish", - "L1": [750] - }, - { - "name": "RQHardswish", - "L1": [800] - }, - { - "name": "testFloatGEMM", - "L1": [8000] - }, - { - "name": "testFloat2DConvolution", - "L1": [15000] - }, - { - "name": "testFloatLayerNorm", - "L1": [2000] - }, - { - "name": "testFloatRelu", - "L1": [2000] - }, - { - "name": "testFloatMaxPool", - "L1": [5000] - }, - { - "name": "testFloatMatmul", - "L1": [5000] - }, - { - "name": "testFloatSoftmax", - "L1": [8000] - }, - { - "name": "testFloatTranspose", - "L1": [2000] - }, - { - "name": "testFloatMul", - "L1": [2000] - } - ] - num-cores: 8 - double-buffer: true - - siracusa-models-tiled-singlebuffer-L2: - strategy: - fail-fast: false - matrix: - test-data: - - name: "simpleRegression" - L1: [45000, 30000, 15000] - - name: "miniMobileNet" - L1: [60000, 12000, 6000, 3000] - - name: "miniMobileNetv2" - L1: [60000, 16000, 12000, 8000] - - name: "Attention" - L1: [60000, 10000, 5000] - - name: "microLlama/microLlama1" - L1: [60000, 10000, 5000] - - name: "microLlama/microLlama8" - L1: [60000, 10000, 5000] - - name: "microLlama/microLlama8_parallel" - L1: [60000, 10000, 5000] - - name: "MLPerf/KeywordSpotting" - L1: [64000] - - name: 
"MLPerf/ImageClassification" - L1: [64000] - - name: "MLPerf/AnomalyDetection" - L1: [64000] - - name: "CCT/CCT_1_16_16_8" - L1: [64000] - - name: "testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8" - L1: [64000] - num-cores: - - 8 - uses: ./.github/workflows/TestRunnerTiledSiracusa.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-name: ${{ matrix.test-data.name }} - num-cores: ${{ matrix.num-cores }} - L1: ${{ toJson(matrix.test-data.L1) }} - - siracusa-models-tiled-singlebuffer-L3: - strategy: - fail-fast: false - matrix: - test-data: - - name: "simpleRegression" - L1: [45000, 30000, 16000] # SCHEREMO: 15000 leads to non-2d transfers in L3! - - name: "miniMobileNet" - L1: [60000, 12000, 6000] # SCHEREMO: 3000 leads to non-2d transfers in L3! - - name: "miniMobileNetv2" - L1: [60000, 16000, 12000, 8000] - - name: "Attention" - L1: [60000, 10000, 5000, 2500] - - name: "Transformer" - L1: [60000, 30000, 15000] - - name: "microLlama/microLlama1" - L1: [60000, 10000, 5000] - - name: "CCT/CCT_2_32_32_128" - L1: [128000] - - name: "testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128" - L1: [64000] - num-cores: - - 8 - default-memory-level: - - "L3" - uses: ./.github/workflows/TestRunnerTiledSiracusa.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-name: ${{ matrix.test-data.name }} - num-cores: ${{ matrix.num-cores }} - L1: ${{ toJson(matrix.test-data.L1) }} - default-memory-level: ${{ matrix.default-memory-level }} - - # TEMPORARILY DISABLE L3 TRANSFER DUE TO DRIVER BUG CAUSING SPORADIC CRASH - siracusa-models-tiled-doublebuffer-L3: - strategy: - fail-fast: false - matrix: - test-data: - - name: "simpleRegression" - L1: [60000, 45000, 30000] - - name: 
"miniMobileNet" - L1: [60000, 24000, 12000, 6000] - - name: "miniMobileNetv2" - L1: [60000, 32000, 24000, 16000] - - name: "Attention" - L1: [60000, 20000, 10000, 5000] - - name: "Transformer" - L1: [60000, 30000, 15000] - - name: "microLlama/microLlama1" - L1: [60000, 20000, 10000] - - name: "microLlama/microLlama8" - L1: [60000, 20000, 10000] - - name: "microLlama/microLlama8_parallel" - L1: [60000, 20000, 10000] - - name: "CCT/CCT_2_32_32_128" - L1: [128000] - - name: "testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128" - L1: [64000] - num-cores: - - 8 - double-buffer: - - true - default-memory-level: - - "L3" - uses: ./.github/workflows/TestRunnerTiledSiracusa.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-name: ${{ matrix.test-data.name }} - num-cores: ${{ matrix.num-cores }} - L1: ${{ toJson(matrix.test-data.L1) }} - double-buffer: ${{ matrix.double-buffer }} - default-memory-level: ${{ matrix.default-memory-level }} - - siracusa-neureka-kernels-tiled-singlebuffer-L2: - uses: ./.github/workflows/TestRunnerTiledSiracusaWithNeurekaSequential.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - tests-config: | - [ - { - "name": "testRequantizedLinear", - "L1": [16000] - }, - { - "name": "testPointwise", - "L1": [32000] - }, - { - "name": "testPointwiseConvBNReLU", - "L1": [32000] - }, - { - "name": "testPointwiseUnsignedWeights", - "L1": [32000] - } - ] - num-cores: 8 - - siracusa-neureka-kernels-tiled-doublebuffer-L2: - uses: ./.github/workflows/TestRunnerTiledSiracusaWithNeurekaSequential.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ 
needs.select-docker-image-and-runner.outputs.image }} - tests-config: | - [ - { - "name": "testRequantizedLinear", - "L1": [16000] - }, - { - "name": "testPointwise", - "L1": [32000] - }, - { - "name": "testPointwiseConvBNReLU", - "L1": [32000] - }, - { - "name": "testPointwiseUnsignedWeights", - "L1": [32000] - } - ] - num-cores: 8 - double-buffer: true - - siracusa-neureka-models-tiled-singlebuffer-L3: - strategy: - fail-fast: false - matrix: - test-data: - - name: "miniMobileNet" - L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! - - name: "Attention" - L1: [2500] - - name: "Transformer" - L1: [15000] - - name: "microLlama/microLlama1" - L1: [10000] - num-cores: - - 8 - default-memory-level: - - "L3" - uses: ./.github/workflows/TestRunnerTiledSiracusaWithNeureka.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-name: ${{ matrix.test-data.name }} - num-cores: ${{ matrix.num-cores }} - L1: ${{ toJson(matrix.test-data.L1) }} - default-memory-level: ${{ matrix.default-memory-level }} - - siracusa-neureka-models-tiled-doublebuffer-L3: - strategy: - fail-fast: false - matrix: - test-data: - - name: "miniMobileNet" - L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! 
- - name: "Attention" - L1: [5000] - - name: "Transformer" - L1: [30000] - num-cores: - - 8 - double-buffer: - - true - default-memory-level: - - "L3" - uses: ./.github/workflows/TestRunnerTiledSiracusaWithNeureka.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-name: ${{ matrix.test-data.name }} - num-cores: ${{ matrix.num-cores }} - L1: ${{ toJson(matrix.test-data.L1) }} - double-buffer: ${{ matrix.double-buffer }} - default-memory-level: ${{ matrix.default-memory-level }} - - siracusa-neureka-kernels-tiled-singlebuffer-L2-wmem: - uses: ./.github/workflows/TestRunnerTiledSiracusaWithNeurekaSequential.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - tests-config: | - [ - { - "name": "testRequantizedLinear", - "L1": [16000] - }, - { - "name": "testPointwise", - "L1": [32000] - }, - { - "name": "testPointwiseConvBNReLU", - "L1": [32000] - }, - { - "name": "testPointwiseUnsignedWeights", - "L1": [32000] - } - ] - num-cores: 8 - neureka-wmem: true - - siracusa-neureka-models-tiled-doublebuffer-L3-wmem: - strategy: - fail-fast: false - matrix: - test-data: - - name: "miniMobileNet" - L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! 
- - name: "Attention" - L1: [3500] - # - name: "Transformer" - # L1: [30000] - - name: "microLlama/microLlama1" - L1: [10000] - num-cores: - - 8 - double-buffer: - - true - default-memory-level: - - "L3" - neureka-wmem: - - true - uses: ./.github/workflows/TestRunnerTiledSiracusaWithNeureka.yml - needs: select-docker-image-and-runner - with: - runner: ${{ needs.select-docker-image-and-runner.outputs.runner }} - docker-image: ${{ needs.select-docker-image-and-runner.outputs.image }} - test-name: ${{ matrix.test-data.name }} - num-cores: ${{ matrix.num-cores }} - L1: ${{ toJson(matrix.test-data.L1) }} - double-buffer: ${{ matrix.double-buffer }} - default-memory-level: ${{ matrix.default-memory-level }} - neureka-wmem: ${{ matrix.neureka-wmem }} - - - ### Deeploy Extension and Internal Tests ### - deeploy-memory-allocation: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . 
- - name: Run Test - run: | - cd DeeployTest - python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=75000 --memAllocStrategy=MiniMalloc - python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=60000 --memAllocStrategy=MiniMalloc --shouldFail - python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=90000 --memAllocStrategy=TetrisRandom - python testMVP.py -t Tests/CCT/CCT_1_16_16_8 -p Siracusa --defaultMemLevel=L2 --l1=64000 --l2=75000 --memAllocStrategy=TetrisRandom --shouldFail - - deeploy-state-serialization: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Run Test - run: | - cd DeeployTest - python deeployStateEqualityTest.py -t ./Tests/simpleRegression -p QEMU-ARM - python deeployStateEqualityTest.py -t ./Tests/simpleRegression -p Siracusa - python deeployStateEqualityTest.py -t ./Tests/simpleRegression -p MemPool - python deeployStateEqualityTest.py -t ./Tests/simpleRegression -p Generic - shell: bash - - deeploy-memory-level-extension: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . 
- - name: Run Test - run: | - cd DeeployTest - python testMemoryLevelExtension.py -t ./Tests/simpleRegression -p QEMU-ARM - python testMemoryLevelExtension.py -t ./Tests/simpleRegression -p Siracusa - python testMemoryLevelExtension.py -t ./Tests/simpleRegression -p MemPool - python testMemoryLevelExtension.py -t ./Tests/simpleRegression -p Generic - shell: bash - - deeploy-tiler-extension: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Run Test - run: | - cd DeeployTest - python testTilerExtension.py -p Siracusa -t ./Tests/simpleRegression - python testTilerExtension.py -p Siracusa -t ./Tests/simpleCNN - python testTilerExtension.py -p Siracusa -t ./Tests/testMatMul - python testTilerExtension.py -p Siracusa -t ./Tests/testMaxPool - python testTilerExtension.py -p Siracusa -t ./Tests/simpleRegression --l1 2000 --shouldFail - python testTilerExtension.py -p Siracusa -t ./Tests/simpleCNN --l1 2000 --shouldFail - python testTilerExtension.py -p Siracusa -t ./Tests/testMatMul --l1 2000 --shouldFail - python testTilerExtension.py -p Siracusa -t ./Tests/testMaxPool --l1 2000 --shouldFail - shell: bash - - deeploy-memory-allocation-extension: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . 
- - name: Run Test - run: | - cd DeeployTest - python testTilerExtension.py -p Siracusa -t ./Tests/simpleRegression - python testTilerExtension.py -p Siracusa -t ./Tests/simpleCNN - python testTilerExtension.py -p Siracusa -t ./Tests/miniMobileNet - python testTilerExtension.py -p Siracusa -t ./Tests/miniMobileNetv2 - python testTilerExtension.py -p Siracusa -t ./Tests/testMatMul - python testTilerExtension.py -p Siracusa -t ./Tests/testMaxPool - shell: bash - - deeploy-typing: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Run Test - run: | - cd DeeployTest - python testTypes.py - shell: bash - - deeploy-debug: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Run Test - run: | - cd DeeployTest - python testPrintInputOutputTransformation.py -p Generic -t ./Tests/simpleRegression - python testPrintInputOutputTransformation.py -p Siracusa -t ./Tests/simpleRegression - python testDebugPrintPass.py -p Generic -t ./Tests/simpleRegression - shell: bash - - deeploy-regex-matching: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . 
- - name: Run Test - run: | - cd DeeployTest - python testRegexMatching.py - shell: bash - - linting: - runs-on: ${{ needs.select-docker-image-and-runner.outputs.runner }} - needs: select-docker-image-and-runner - container: - image: ${{ needs.select-docker-image-and-runner.outputs.image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: | - pip install -e . - cd DeeployTest - - name: Format Python - run: | - yapf -rpd -e "third_party/" -e "install/" -e "toolchain/" . - shell: bash - - name: Format Python Imports - run: | - isort --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./ -c -v - autoflake -c -r --remove-all-unused-imports --ignore-init-module-imports --exclude "*/third_party/**" ./ - shell: bash - - name: Format C - run: | - python scripts/run_clang_format.py -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" -r --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format ./ scripts - shell: bash - - name: Format Python Licenses - run: | - grep -Lr "SPDX-License-Identifier: Apache-2.0" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude "run_clang_format.py" | grep ".*\.py$" || [[ $? == 1 ]] - shell: bash - - name: Format C Licenses - run: | - grep -Lr "SPDX-License-Identifier: Apache-2.0" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude-dir="runtime" | grep ".*\.c$" || [[ $? == 1 ]] - shell: bash - - name: Format C Header Licenses - run: | - grep -Lr "SPDX-License-Identifier: Apache-2.0" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude-dir="runtime" | grep ".*\.h$" || [[ $? 
== 1 ]] - shell: bash diff --git a/.github/workflows/GenerateCCache.yml b/.github/workflows/GenerateCCache.yml deleted file mode 100644 index b22d04bd1e..0000000000 --- a/.github/workflows/GenerateCCache.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: GenerateCCache - -on: - workflow_dispatch: - inputs: - docker_image_deeploy: - description: 'Deeploy Image to use' - required: false - default: 'ghcr.io/pulp-platform/deeploy:devel' - schedule: - # Runs the workflow on the default branch every day at 1AM CET to keep the cache fresh - - cron: "0 1 * * *" - -jobs: - - generate-ccache: - runs-on: ubuntu-latest - container: - image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy:devel' }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Generate CCache - run: | - cd DeeployTest - mkdir -p /app/.ccache - export CCACHE_DIR=/app/.ccache - python testRunner_generic.py -t ./Tests/Adder - python testRunner_mempool.py -t ./Tests/Adder - python testRunner_cortexm.py -t ./Tests/Adder - python testRunner_snitch.py -t ./Tests/Adder - python testRunner_siracusa.py -t ./Tests/Adder - python testRunner_tiled_siracusa.py -t ./Tests/Adder - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/Adder - - name: Clean and Upload CCache - uses: actions/cache@v4 - with: - path: /app/.ccache - key: ccache-ci - - \ No newline at end of file diff --git a/.github/workflows/GitLabCI.yml b/.github/workflows/GitLabCI.yml deleted file mode 100644 index c7e5b5d93c..0000000000 --- a/.github/workflows/GitLabCI.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: GitLabCI - -on: - # push: - # pull_request: - workflow_dispatch: - -jobs: - gitlab-ci: - runs-on: ubuntu-22.04 - steps: - - name: Check Gitlab CI - uses: pulp-platform/pulp-actions/gitlab-ci@v2 - # Skip on forks or pull requests from forks due to missing secrets. 
- if: (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) - with: - domain: iis-git.ee.ethz.ch - repo: github-mirror/Deeploy - token: ${{ secrets.GITLAB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/TestRunnerChimera.yml b/.github/workflows/TestRunnerChimera.yml deleted file mode 100644 index daffbff246..0000000000 --- a/.github/workflows/TestRunnerChimera.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: TestRunnerChimera - -on: - workflow_call: - inputs: - runner: - required: true - type: string - docker-image: - required: true - type: string - test-names: - required: true - type: string - simulators: - required: true - type: string - -jobs: - test-runner-chimera: - runs-on: ${{ inputs.runner }} - container: - image: ${{ inputs.docker-image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Cache ccache - uses: actions/cache/restore@v4 - with: - path: /app/.ccache - key: ccache-ci - - name: Run Test - run: | - testNames="${{ inputs.test-names }}" - simulators="${{inputs.simulators}}" - cd DeeployTest - mkdir -p /app/.ccache - export CCACHE_DIR=/app/.ccache - export CHIMERA_SDK_HOME=/app/install/chimera-sdk - echo "$simulators" | while IFS= read -r simulator; do - if [[ -n "$simulator" ]]; then - echo "$testNames" | while IFS= read -r testName; do - if [[ -n "$testName" ]]; then - echo "Running test $testName using $simulator" - python testRunner_chimera.py -t Tests/$testName --simulator=$simulator - fi - done - fi - done - shell: bash \ No newline at end of file diff --git a/.github/workflows/TestRunnerSnitch.yml b/.github/workflows/TestRunnerSnitch.yml deleted file mode 100644 index 302b21656a..0000000000 --- a/.github/workflows/TestRunnerSnitch.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: TestRunnerSnitch - -on: - workflow_call: - inputs: - runner: - required: true - type: string - 
docker-image: - required: true - type: string - test-names: - required: true - type: string - num-cores: - required: true - type: number - simulators: - required: true - type: string - -jobs: - test-runner-snitch: - runs-on: ${{ inputs.runner }} - container: - image: ${{ inputs.docker-image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Cache ccache - uses: actions/cache/restore@v4 - with: - path: /app/.ccache - key: ccache-ci - - name: Run Test - run: | - testNames="${{ inputs.test-names }}" - simulators="${{inputs.simulators}}" - cd DeeployTest - mkdir -p /app/.ccache - export CCACHE_DIR=/app/.ccache - echo "$simulators" | while IFS= read -r simulator; do - if [[ -n "$simulator" ]]; then - echo "$testNames" | while IFS= read -r testName; do - if [[ -n "$testName" ]]; then - echo "Running test $testName using $simulator" - python testRunner_snitch.py -t Tests/$testName --simulator=$simulator --cores=${{ inputs.num-cores }} - fi - done - fi - done - shell: bash \ No newline at end of file diff --git a/.github/workflows/TestRunnerTiledSiracusa.yml b/.github/workflows/TestRunnerTiledSiracusa.yml deleted file mode 100644 index da0b8dcd75..0000000000 --- a/.github/workflows/TestRunnerTiledSiracusa.yml +++ /dev/null @@ -1,74 +0,0 @@ -name: TestRunnerTiledSiracusa - -on: - workflow_call: - inputs: - runner: - required: true - type: string - docker-image: - required: true - type: string - test-name: - required: true - type: string - num-cores: - required: false - default: 8 - type: number - L1: - required: false - default: "[64000]" - type: string - default-memory-level: - required: false - default: "L2" - type: string - double-buffer: - required: false - default: false - type: boolean - memory-allocation-strategy: - required: false - default: "MiniMalloc" - type: string - search-strategy: - required: false - default: "random-max" - type: string - -jobs: - - 
test-runner-siracusa-tiled: - strategy: - fail-fast: false - matrix: - L1: ${{ fromJSON(inputs.L1) }} - runs-on: ${{ inputs.runner }} - container: - image: ${{ inputs.docker-image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Cache ccache - uses: actions/cache/restore@v4 - with: - path: /app/.ccache - key: ccache-ci - - name: Run Test - uses: nick-fields/retry@v3 - with: - timeout_minutes: 15 - max_attempts: 3 - retry_on: timeout - command: | - cd DeeployTest - mkdir -p /app/.ccache - export CCACHE_DIR=/app/.ccache - python testRunner_tiled_siracusa.py -t Tests/${{ inputs.test-name }} --cores=${{ inputs.num-cores }} --l1 ${{ matrix.L1 }} --defaultMemLevel=${{ inputs.default-memory-level }} ${{ inputs.double-buffer && '--doublebuffer' || '' }} --memAllocStrategy=${{ inputs.memory-allocation-strategy }} --searchStrategy=${{ inputs.search-strategy }} - shell: bash - \ No newline at end of file diff --git a/.github/workflows/TestRunnerTiledSiracusaSequential.yml b/.github/workflows/TestRunnerTiledSiracusaSequential.yml deleted file mode 100644 index b4dc047003..0000000000 --- a/.github/workflows/TestRunnerTiledSiracusaSequential.yml +++ /dev/null @@ -1,72 +0,0 @@ -name: TestRunnerTiledSiracusaSequential - -on: - workflow_call: - inputs: - runner: - required: true - type: string - docker-image: - required: true - type: string - tests-config: - required: true - type: string - num-cores: - required: false - default: 8 - type: number - default-memory-level: - required: false - default: "L2" - type: string - double-buffer: - required: false - default: false - type: boolean - memory-allocation-strategy: - required: false - default: "MiniMalloc" - type: string - search-strategy: - required: false - default: "random-max" - type: string - -jobs: - - test-runner-siracusa-tiled: - runs-on: ${{ inputs.runner }} - container: - image: ${{ inputs.docker-image }} - steps: - - 
name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Install jq - run: apt-get install -y jq - - name: Cache ccache - uses: actions/cache/restore@v4 - with: - path: /app/.ccache - key: ccache-ci - - name: Run Tests - run: | - cd DeeployTest - echo '${{ inputs.tests-config }}' > tests.json - mkdir -p /app/.ccache - export CCACHE_DIR=/app/.ccache - - jq -c '.[]' tests.json | while read test; do - testName=$(echo "$test" | jq -r '.name') - L1_values=$(echo "$test" | jq -r '.L1[]') - for L1_value in $L1_values; do - echo "Running test: $testName with L1: $L1_value" - python testRunner_tiled_siracusa.py -t Tests/$testName --cores=${{ inputs.num-cores }} --l1 $L1_value --defaultMemLevel=${{ inputs.default-memory-level }} ${{ inputs.double-buffer && '--doublebuffer' || '' }} --memAllocStrategy=${{ inputs.memory-allocation-strategy }} --searchStrategy=${{ inputs.search-strategy }} - done - done - shell: bash - \ No newline at end of file diff --git a/.github/workflows/TestRunnerTiledSiracusaWithNeureka.yml b/.github/workflows/TestRunnerTiledSiracusaWithNeureka.yml deleted file mode 100644 index 621d5d9976..0000000000 --- a/.github/workflows/TestRunnerTiledSiracusaWithNeureka.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: TestRunnerTiledSiracusaWithNeureka - -on: - workflow_call: - inputs: - runner: - required: true - type: string - docker-image: - required: true - type: string - test-name: - required: true - type: string - num-cores: - required: false - default: 8 - type: number - L1: - required: false - default: "[64000]" - type: string - default-memory-level: - required: false - default: "L2" - type: string - double-buffer: - required: false - default: false - type: boolean - memory-allocation-strategy: - required: false - default: "MiniMalloc" - type: string - search-strategy: - required: false - default: "random-max" - type: string - neureka-wmem: - required: false - default: false - type: 
boolean - - -jobs: - - test-runner-siracusa-neureka-tiled: - strategy: - fail-fast: false - matrix: - L1: ${{ fromJSON(inputs.L1) }} - runs-on: ${{ inputs.runner }} - container: - image: ${{ inputs.docker-image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Cache ccache - uses: actions/cache/restore@v4 - with: - path: /app/.ccache - key: ccache-ci - - name: Run Test - uses: nick-fields/retry@v3 - with: - timeout_minutes: 5 - max_attempts: 3 - retry_on: timeout - command: | - cd DeeployTest - mkdir -p /app/.ccache - export CCACHE_DIR=/app/.ccache - python testRunner_tiled_siracusa_w_neureka.py -t Tests/${{ inputs.test-name }} --cores=${{ inputs.num-cores }} --l1 ${{ matrix.L1 }} --defaultMemLevel=${{ inputs.default-memory-level }} ${{ inputs.double-buffer && '--doublebuffer' || '' }} ${{ inputs.neureka-wmem && '--neureka-wmem' || '' }} --memAllocStrategy=${{ inputs.memory-allocation-strategy }} --searchStrategy=${{ inputs.search-strategy }} - shell: bash - \ No newline at end of file diff --git a/.github/workflows/TestRunnerTiledSiracusaWithNeurekaSequential.yml b/.github/workflows/TestRunnerTiledSiracusaWithNeurekaSequential.yml deleted file mode 100644 index 783d0c5785..0000000000 --- a/.github/workflows/TestRunnerTiledSiracusaWithNeurekaSequential.yml +++ /dev/null @@ -1,77 +0,0 @@ -name: TestRunnerTiledSiracusaWithNeurekaSequential - -on: - workflow_call: - inputs: - runner: - required: true - type: string - docker-image: - required: true - type: string - tests-config: - required: true - type: string - num-cores: - required: false - default: 8 - type: number - default-memory-level: - required: false - default: "L2" - type: string - double-buffer: - required: false - default: false - type: boolean - memory-allocation-strategy: - required: false - default: "MiniMalloc" - type: string - search-strategy: - required: false - default: "random-max" - type: string 
- neureka-wmem: - required: false - default: false - type: boolean - -jobs: - - test-runner-siracusa-neureka-tiled: - runs-on: ${{ inputs.runner }} - container: - image: ${{ inputs.docker-image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Install jq - run: apt-get install -y jq - - name: Cache ccache - uses: actions/cache/restore@v4 - with: - path: /app/.ccache - key: ccache-ci - - name: Run Tests - run: | - cd DeeployTest - echo '${{ inputs.tests-config }}' > tests.json - mkdir -p /app/.ccache - export CCACHE_DIR=/app/.ccache - - jq -c '.[]' tests.json | while read test; do - testName=$(echo "$test" | jq -r '.name') - L1_values=$(echo "$test" | jq -r '.L1[]') - for L1_value in $L1_values; do - echo "Running test: $testName with L1: $L1_value" - python testRunner_tiled_siracusa_w_neureka.py -t Tests/$testName --cores=${{ inputs.num-cores }} --l1 $L1_value --defaultMemLevel=${{ inputs.default-memory-level }} ${{ inputs.double-buffer && '--doublebuffer' || '' }} ${{ inputs.neureka-wmem && '--neureka-wmem' || '' }} --memAllocStrategy=${{ inputs.memory-allocation-strategy }} --searchStrategy=${{ inputs.search-strategy }} - done - done - - shell: bash - \ No newline at end of file diff --git a/.github/workflows/TestRunnerTiledSnitchSequential.yml b/.github/workflows/TestRunnerTiledSnitchSequential.yml deleted file mode 100644 index 9a56172f96..0000000000 --- a/.github/workflows/TestRunnerTiledSnitchSequential.yml +++ /dev/null @@ -1,76 +0,0 @@ -name: TestRunnerTiledSnitchSequential - -on: - workflow_call: - inputs: - runner: - required: true - type: string - docker-image: - required: true - type: string - tests-config: - required: true - type: string - num-cores: - required: false - default: 9 - type: number - default-memory-level: - required: false - default: "L2" - type: string - memory-allocation-strategy: - required: false - default: "MiniMalloc" - type: 
string - search-strategy: - required: false - default: "random-max" - type: string - simulators: - required: true - type: string - - -jobs: - - test-runner-snitch-tiled: - runs-on: ${{ inputs.runner }} - container: - image: ${{ inputs.docker-image }} - steps: - - name: Checkout Repo - uses: actions/checkout@v4 - with: - submodules: recursive - - name: Build Deeploy - run: pip install -e . - - name: Install jq - run: apt-get install -y jq - - name: Cache ccache - uses: actions/cache/restore@v4 - with: - path: /app/.ccache - key: ccache-ci - - name: Run Tests - run: | - simulators="${{inputs.simulators}}" - cd DeeployTest - echo '${{ inputs.tests-config }}' > tests.json - mkdir -p /app/.ccache - export CCACHE_DIR=/app/.ccache - echo "$simulators" | while IFS= read -r simulator; do - if [[ -n "$simulator" ]]; then - jq -c '.[]' tests.json | while read test; do - testName=$(echo "$test" | jq -r '.name') - L1_values=$(echo "$test" | jq -r '.L1[]') - for L1_value in $L1_values; do - echo "Running test: $testName with L1: $L1_value using $simulator" - python testRunner_tiled_snitch.py -t Tests/$testName --cores=${{ inputs.num-cores }} --simulator=$simulator --l1 $L1_value --defaultMemLevel=${{ inputs.default-memory-level }} --memAllocStrategy=${{ inputs.memory-allocation-strategy }} --searchStrategy=${{ inputs.search-strategy }} - done - done - fi - done - shell: bash - \ No newline at end of file diff --git a/.github/workflows/_runner-chimera.yml b/.github/workflows/_runner-chimera.yml new file mode 100644 index 0000000000..14e80631d1 --- /dev/null +++ b/.github/workflows/_runner-chimera.yml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-chimera + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + required: true + type: string + +jobs: + test-runner-chimera: + runs-on: ${{ 
inputs.runner }} + container: + image: ${{ inputs.docker-image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . + - name: Cache ccache + uses: actions/cache/restore@v4 + with: + path: /app/.ccache + key: ccache-ci + - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores. + run: | + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + export CHIMERA_SDK_HOME=/app/install/chimera-sdk + pytest test_platforms.py -v -n 4 -m "chimera and ${{ inputs.pytest-marker }}" + shell: bash diff --git a/.github/workflows/TestRunnerCortexM.yml b/.github/workflows/_runner-cortexm.yml similarity index 67% rename from .github/workflows/TestRunnerCortexM.yml rename to .github/workflows/_runner-cortexm.yml index 2b30b8a0b5..3fbdf0ee16 100644 --- a/.github/workflows/TestRunnerCortexM.yml +++ b/.github/workflows/_runner-cortexm.yml @@ -1,6 +1,11 @@ -name: TestRunnerCortexM +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 -on: +--- +name: _runner-cortexm + +"on": workflow_call: inputs: runner: @@ -9,7 +14,7 @@ on: docker-image: required: true type: string - test-names: + pytest-marker: required: true type: string @@ -24,23 +29,17 @@ jobs: with: submodules: recursive - name: Build Deeploy + shell: bash run: pip install -e . - name: Cache ccache uses: actions/cache/restore@v4 with: path: /app/.ccache key: ccache-ci - - name: Run Test + - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores. 
run: | - testNames="${{ inputs.test-names }}" cd DeeployTest mkdir -p /app/.ccache export CCACHE_DIR=/app/.ccache - echo "$testNames" | while IFS= read -r testName; do - if [[ -n "$testName" ]]; then - echo "Running test: $testName" - python testRunner_cortexm.py -t Tests/$testName - fi - done + pytest test_platforms.py -v -n 4 -m "cortexm and ${{ inputs.pytest-marker }}" shell: bash - \ No newline at end of file diff --git a/.github/workflows/TestRunnerGeneric.yml b/.github/workflows/_runner-generic.yml similarity index 67% rename from .github/workflows/TestRunnerGeneric.yml rename to .github/workflows/_runner-generic.yml index 92ec086cbf..6681cbac96 100644 --- a/.github/workflows/TestRunnerGeneric.yml +++ b/.github/workflows/_runner-generic.yml @@ -1,6 +1,11 @@ -name: TestRunnerGeneric +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 -on: +--- +name: _runner-generic + +"on": workflow_call: inputs: runner: @@ -9,7 +14,7 @@ on: docker-image: required: true type: string - test-names: + pytest-marker: required: true type: string @@ -24,23 +29,17 @@ jobs: with: submodules: recursive - name: Build Deeploy + shell: bash run: pip install -e . - name: Cache ccache uses: actions/cache/restore@v4 with: path: /app/.ccache key: ccache-ci - - name: Run Test + - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores. 
run: | - testNames="${{ inputs.test-names }}" cd DeeployTest mkdir -p /app/.ccache export CCACHE_DIR=/app/.ccache - echo "$testNames" | while IFS= read -r testName; do - if [[ -n "$testName" ]]; then - echo "Running test: $testName" - python testRunner_generic.py -t Tests/$testName - fi - done + pytest test_platforms.py -v -n 4 -m "generic and ${{ inputs.pytest-marker }}" shell: bash - \ No newline at end of file diff --git a/.github/workflows/TestRunnerMempool.yml b/.github/workflows/_runner-mempool.yml similarity index 67% rename from .github/workflows/TestRunnerMempool.yml rename to .github/workflows/_runner-mempool.yml index 233964eaf1..deb4809330 100644 --- a/.github/workflows/TestRunnerMempool.yml +++ b/.github/workflows/_runner-mempool.yml @@ -1,6 +1,11 @@ -name: TestRunnerMempool +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 -on: +--- +name: _runner-mempool + +"on": workflow_call: inputs: runner: @@ -9,7 +14,7 @@ on: docker-image: required: true type: string - test-names: + pytest-marker: required: true type: string @@ -24,23 +29,17 @@ jobs: with: submodules: recursive - name: Build Deeploy + shell: bash run: pip install -e . - name: Cache ccache uses: actions/cache/restore@v4 with: path: /app/.ccache key: ccache-ci - - name: Run Test + - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores. 
run: | - testNames="${{ inputs.test-names }}" cd DeeployTest mkdir -p /app/.ccache export CCACHE_DIR=/app/.ccache - echo "$testNames" | while IFS= read -r testName; do - if [[ -n "$testName" ]]; then - echo "Running test: $testName" - python testRunner_mempool.py -t Tests/$testName - fi - done + pytest test_platforms.py -v -n 4 -m "mempool and ${{ inputs.pytest-marker }}" shell: bash - \ No newline at end of file diff --git a/.github/workflows/_runner-siracusa-neureka-tiled.yml b/.github/workflows/_runner-siracusa-neureka-tiled.yml new file mode 100644 index 0000000000..b1f5f2fcb3 --- /dev/null +++ b/.github/workflows/_runner-siracusa-neureka-tiled.yml @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-siracusa-neureka-tiled-sequential + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + required: true + type: string + +jobs: + test-runner-siracusa-neureka-tiled: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . + - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores. 
+ run: | + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest test_platforms.py -v -n 4 -m "siracusa_neureka_tiled and ${{ inputs.pytest-marker }}" + shell: bash diff --git a/.github/workflows/TestRunnerSoftHier.yml b/.github/workflows/_runner-siracusa-tiled.yml similarity index 55% rename from .github/workflows/TestRunnerSoftHier.yml rename to .github/workflows/_runner-siracusa-tiled.yml index 49ba951c7e..ea9c8989af 100644 --- a/.github/workflows/TestRunnerSoftHier.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -1,6 +1,11 @@ -name: TestRunnerSoftHier +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 -on: +--- +name: _runner-siracusa-tiled + +"on": workflow_call: inputs: runner: @@ -9,12 +14,12 @@ on: docker-image: required: true type: string - test-names: + pytest-marker: required: true type: string jobs: - test-runner-softhier: + test-runner-siracusa-tiled: runs-on: ${{ inputs.runner }} container: image: ${{ inputs.docker-image }} @@ -24,17 +29,10 @@ jobs: with: submodules: recursive - name: Build Deeploy + shell: bash run: pip install -e . 
- name: Run Test run: | - testNames="${{ inputs.test-names }}" - export SOFTHIER_INSTALL_DIR=/app/install/softhier cd DeeployTest - echo "$testNames" | while IFS= read -r testName; do - if [[ -n "$testName" ]]; then - echo "Running test: $testName" - python testRunner_softhier.py -t Tests/$testName --toolchain=GCC - fi - done + pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" shell: bash - \ No newline at end of file diff --git a/.github/workflows/TestRunnerSiracusa.yml b/.github/workflows/_runner-siracusa.yml similarity index 64% rename from .github/workflows/TestRunnerSiracusa.yml rename to .github/workflows/_runner-siracusa.yml index edf3adfce5..ea8fe5d405 100644 --- a/.github/workflows/TestRunnerSiracusa.yml +++ b/.github/workflows/_runner-siracusa.yml @@ -1,6 +1,11 @@ -name: TestRunnerSiracusa +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 -on: +--- +name: _runner-siracusa + +"on": workflow_call: inputs: runner: @@ -9,12 +14,10 @@ on: docker-image: required: true type: string - test-names: + pytest-marker: required: true type: string - num-cores: - required: true - type: number + description: "Pytest marker for test selection (e.g., 'kernels', 'models')" jobs: test-runner-siracusa: @@ -27,6 +30,7 @@ jobs: with: submodules: recursive - name: Build Deeploy + shell: bash run: pip install -e . 
- name: Cache ccache uses: actions/cache/restore@v4 @@ -35,15 +39,10 @@ jobs: key: ccache-ci - name: Run Test run: | - testNames="${{ inputs.test-names }}" cd DeeployTest mkdir -p /app/.ccache export CCACHE_DIR=/app/.ccache - echo "$testNames" | while IFS= read -r testName; do - if [[ -n "$testName" ]]; then - echo "Running test: $testName" - python testRunner_siracusa.py -t Tests/$testName --cores=${{ inputs.num-cores }} - fi - done + + # Run tests using pytest markers + pytest test_platforms.py -v -n 4 -m "siracusa and ${{ inputs.pytest-marker }}" shell: bash - \ No newline at end of file diff --git a/.github/workflows/_runner-snitch-tiled-sequential.yml b/.github/workflows/_runner-snitch-tiled-sequential.yml new file mode 100644 index 0000000000..fbd5195b08 --- /dev/null +++ b/.github/workflows/_runner-snitch-tiled-sequential.yml @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-snitch-tiled-sequential + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + required: true + type: string + +jobs: + test-runner-snitch-tiled: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . + - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores. 
+ run: | + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest test_platforms.py -v -n 4 -m "snitch_tiled and ${{ inputs.pytest-marker }}" + shell: bash diff --git a/.github/workflows/_runner-snitch.yml b/.github/workflows/_runner-snitch.yml new file mode 100644 index 0000000000..bc599e4fe7 --- /dev/null +++ b/.github/workflows/_runner-snitch.yml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-snitch + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + required: true + type: string + +jobs: + test-runner-snitch: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . + - name: Cache ccache + uses: actions/cache/restore@v4 + with: + path: /app/.ccache + key: ccache-ci + - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores. 
+ run: | + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest test_platforms.py -v -n 4 -m "snitch and ${{ inputs.pytest-marker }}" + shell: bash diff --git a/.github/workflows/_runner-softhier.yml b/.github/workflows/_runner-softhier.yml new file mode 100644 index 0000000000..b067664f40 --- /dev/null +++ b/.github/workflows/_runner-softhier.yml @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-softhier + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + required: true + type: string + +jobs: + test-runner-softhier: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . + - name: Run Test # VJUNG: Run tests with 4 parallel threads as GitHub action VM has 4 cores. 
+ run: | + export SOFTHIER_INSTALL_DIR=/app/install/softhier + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest test_platforms.py -v -n 4 -m "softhier and ${{ inputs.pytest-marker }}" --toolchain=GCC --toolchain-install-dir=$SOFTHIER_INSTALL_DIR/third_party/toolchain/install + shell: bash diff --git a/.github/workflows/_select-env.yml b/.github/workflows/_select-env.yml new file mode 100644 index 0000000000..1085c7eaa1 --- /dev/null +++ b/.github/workflows/_select-env.yml @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _select-env +"on": + workflow_call: + inputs: + docker_image_deeploy: + type: string + required: true + outputs: + image: + value: ${{ jobs.select.outputs.image }} + runner: + value: ${{ jobs.select.outputs.runner }} + +jobs: + select: + runs-on: ubuntu-latest + outputs: + image: ${{ steps.set-docker-image.outputs.image }} + runner: ${{ steps.set-runner.outputs.runner }} + steps: + - id: set-docker-image + shell: bash + run: | + if [[ -n "${{ inputs.docker_image_deeploy }}" ]]; then + IMAGE="${{ inputs.docker_image_deeploy }}" + elif [[ "${{ github.ref }}" == refs/tags/* ]]; then + TAG_NAME="${GITHUB_REF##refs/tags/}" + IMAGE="ghcr.io/pulp-platform/deeploy:${TAG_NAME}" + elif [[ "${{ github.ref_name }}" == "main" ]]; then + IMAGE="ghcr.io/pulp-platform/deeploy:main" + else + IMAGE="ghcr.io/pulp-platform/deeploy:devel" + fi + echo "image=${IMAGE}" >> "$GITHUB_OUTPUT" + + - id: set-runner + shell: bash + run: | + if [[ "${{ github.repository }}" == "pulp-platform/Deeploy" ]]; then + echo "runner=self-hosted" >> "$GITHUB_OUTPUT" + else + echo "runner=ubuntu-latest" >> "$GITHUB_OUTPUT" + fi diff --git a/.github/workflows/ci-deeploy.yml b/.github/workflows/ci-deeploy.yml new file mode 100644 index 0000000000..fc468306b1 --- /dev/null +++ b/.github/workflows/ci-deeploy.yml @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich 
and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Deeploy + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + build-deeploy: + needs: select-env + runs-on: ${{ needs.select-env.outputs.runner }} + container: + image: ${{ needs.select-env.outputs.image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . + + deeploy-internal-tests: + needs: select-env + runs-on: ${{ needs.select-env.outputs.runner }} + container: + image: ${{ needs.select-env.outputs.image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . 
+ - name: Run Internal Tests + shell: bash + run: | + cd DeeployTest + pytest -v -m deeploy_internal -n 4 diff --git a/.github/workflows/ci-lint.yml b/.github/workflows/ci-lint.yml new file mode 100644 index 0000000000..75163aafaf --- /dev/null +++ b/.github/workflows/ci-lint.yml @@ -0,0 +1,67 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Lint & Licenses + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + linting: + needs: select-env + runs-on: ${{ needs.select-env.outputs.runner }} + container: + image: ${{ needs.select-env.outputs.image }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: | + pip install . --extra-index-url=https://pypi.ngc.nvidia.com + pip install -r requirements-dev.txt + - name: Format Python + shell: bash + run: | + yapf -rpd -e "*/TEST_*/" -e "*/third_party/" -e "install/" -e "toolchain/" . + - name: Format Python Imports + shell: bash + run: | + isort --quiet --sg "**/TEST_*/*" --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./ -c + autoflake --quiet -c -r --remove-all-unused-imports --ignore-init-module-imports --exclude "**/third_party/*,**/install/*,**/toolchain/*" . + - name: Format C + shell: bash + run: | + python scripts/run_clang_format.py -e "*/TEST_*/*" -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" -r --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format . scripts + - name: Format YAML + shell: bash + run: | + yamllint . 
+ - name: Check Licenses + shell: bash + run: | + python scripts/reuse_skip_wrapper.py $(find . \( -name '*.py' -o -name '*.c' -o -name '*.h' -o -name '*.html' -o -name '*.rst' -o -name '*.yml' -o -name '*.yaml' \) -not -path "*toolchain*" -not -path "*third_party*" -not -path "*.git/*" -not -path "*install/*" -type f) diff --git a/.github/workflows/ci-platform-chimera.yml b/.github/workflows/ci-platform-chimera.yml new file mode 100644 index 0000000000..aad065ae78 --- /dev/null +++ b/.github/workflows/ci-platform-chimera.yml @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Chimera + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + chimera-kernels: + needs: select-env + uses: ./.github/workflows/_runner-chimera.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels" diff --git a/.github/workflows/ci-platform-cortexm.yml b/.github/workflows/ci-platform-cortexm.yml new file mode 100644 index 0000000000..0e03e17d0b --- /dev/null +++ b/.github/workflows/ci-platform-cortexm.yml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Cortex-M + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + 
group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + cortexm-kernels: + needs: select-env + uses: ./.github/workflows/_runner-cortexm.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels" + + cortexm-models: + needs: select-env + uses: ./.github/workflows/_runner-cortexm.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models" diff --git a/.github/workflows/ci-platform-generic.yml b/.github/workflows/ci-platform-generic.yml new file mode 100644 index 0000000000..83c191180f --- /dev/null +++ b/.github/workflows/ci-platform-generic.yml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Generic + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + generic-kernels: + needs: select-env + uses: ./.github/workflows/_runner-generic.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels" + + generic-models: + needs: select-env + uses: ./.github/workflows/_runner-generic.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models" diff --git a/.github/workflows/ci-platform-mempool.yml 
b/.github/workflows/ci-platform-mempool.yml new file mode 100644 index 0000000000..efda508257 --- /dev/null +++ b/.github/workflows/ci-platform-mempool.yml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Mempool + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + mempool-kernels: + needs: select-env + uses: ./.github/workflows/_runner-mempool.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels" + + mempool-models: + needs: select-env + uses: ./.github/workflows/_runner-mempool.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models" diff --git a/.github/workflows/ci-platform-siracusa-neureka-tiled.yml b/.github/workflows/ci-platform-siracusa-neureka-tiled.yml new file mode 100644 index 0000000000..e76ee648c0 --- /dev/null +++ b/.github/workflows/ci-platform-siracusa-neureka-tiled.yml @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Siracusa + Neureka (Tiled) + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + 
select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + siracusa-neureka-kernels-tiled-singlebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-neureka-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and singlebuffer and l2 and not wmem" + + siracusa-neureka-kernels-tiled-doublebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-neureka-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and doublebuffer and l2 and not wmem" + + siracusa-neureka-models-tiled-singlebuffer-L3: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-neureka-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models and singlebuffer and l3 and not wmem" + + siracusa-neureka-models-tiled-doublebuffer-L3: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-neureka-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models and doublebuffer and l3 and not wmem" + + siracusa-neureka-kernels-tiled-singlebuffer-L2-wmem: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-neureka-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and singlebuffer and l2 and wmem" + + siracusa-neureka-models-tiled-doublebuffer-L3-wmem: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-neureka-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models and doublebuffer and l3 and wmem" diff --git 
a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml new file mode 100644 index 0000000000..6597f3e625 --- /dev/null +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -0,0 +1,84 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Siracusa (Tiled) + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + # Kernel tests - L2 singlebuffer + siracusa-kernels-tiled-l2-singlebuffer: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and l2 and singlebuffer" + + # Kernel tests - L2 doublebuffer + siracusa-kernels-tiled-l2-doublebuffer: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and l2 and doublebuffer" + + # Model tests - L2 singlebuffer + siracusa-models-tiled-l2-singlebuffer: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models and l2 and singlebuffer" + + # Model tests - L2 doublebuffer + siracusa-models-tiled-l2-doublebuffer: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner 
}} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models and l2 and doublebuffer" + + # Model tests - L3 singlebuffer + siracusa-models-tiled-l3-singlebuffer: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models and l3 and singlebuffer" + + # Model tests - L3 doublebuffer + siracusa-models-tiled-l3-doublebuffer: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "models and l3 and doublebuffer" diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml new file mode 100644 index 0000000000..8e102cdc78 --- /dev/null +++ b/.github/workflows/ci-platform-siracusa.yml @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Siracusa + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + siracusa-kernels: + needs: select-env + uses: ./.github/workflows/_runner-siracusa.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: kernels + + siracusa-models: + needs: select-env + uses: ./.github/workflows/_runner-siracusa.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + 
pytest-marker: models diff --git a/.github/workflows/ci-platform-snitch-tiled.yml b/.github/workflows/ci-platform-snitch-tiled.yml new file mode 100644 index 0000000000..5390d8ad16 --- /dev/null +++ b/.github/workflows/ci-platform-snitch-tiled.yml @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Snitch (Tiled) + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + snitch-kernels-tiled-singlebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-snitch-tiled-sequential.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and singlebuffer and l2" diff --git a/.github/workflows/ci-platform-snitch.yml b/.github/workflows/ci-platform-snitch.yml new file mode 100644 index 0000000000..c1ae694148 --- /dev/null +++ b/.github/workflows/ci-platform-snitch.yml @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Snitch + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + 
snitch-kernels: + needs: select-env + uses: ./.github/workflows/_runner-snitch.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels" diff --git a/.github/workflows/ci-platform-softhier.yml b/.github/workflows/ci-platform-softhier.yml new file mode 100644 index 0000000000..28a85160be --- /dev/null +++ b/.github/workflows/ci-platform-softhier.yml @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • SoftHier + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + docker_image_deeploy: ${{ inputs.docker_image_deeploy }} + + softhier-kernels: + needs: select-env + uses: ./.github/workflows/_runner-softhier.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels" diff --git a/.github/workflows/BuildDockerDeeploy.yml b/.github/workflows/docker-build-deeploy.yml similarity index 67% rename from .github/workflows/BuildDockerDeeploy.yml rename to .github/workflows/docker-build-deeploy.yml index 2b2bf5d169..9edb90f103 100644 --- a/.github/workflows/BuildDockerDeeploy.yml +++ b/.github/workflows/docker-build-deeploy.yml @@ -1,12 +1,17 @@ -name: BuildDockerDeeploy +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 -on: +--- +name: Docker • Build Deeploy Container + +"on": workflow_dispatch: inputs: docker_image_toolchain: - description: 'Deeploy Toolchain Image to use' + description: "Deeploy Toolchain Image to 
use" required: false - default: 'ghcr.io/pulp-platform/deeploy-toolchain:latest' + default: "ghcr.io/pulp-platform/deeploy-toolchain:latest" jobs: prepare: @@ -34,8 +39,11 @@ jobs: build-deeploy: name: Build Deploy Image - needs: [ prepare ] + needs: [prepare] runs-on: ${{ matrix.runner }} + outputs: + digest-amd64: ${{ steps.digest.outputs.digest-amd64 }} + digest-arm64: ${{ steps.digest.outputs.digest-arm64 }} strategy: fail-fast: false matrix: @@ -57,17 +65,18 @@ jobs: haskell: true large-packages: true - - uses: docker/setup-buildx-action@v1 + - uses: docker/setup-buildx-action@v3 - name: GHCR Log-in - uses: docker/login-action@v1 + uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build Cache for Docker - uses: actions/cache@v3 + id: cache + uses: actions/cache@v4 with: path: var-ccache key: ${{ runner.os }}-${{ matrix.platform }}-build-cache-deeploy @@ -85,9 +94,10 @@ jobs: run: | echo "OWNER_LC=${OWNER,,}" >>${GITHUB_ENV} env: - OWNER: '${{ github.repository_owner }}' + OWNER: "${{ github.repository_owner }}" - name: Build and push final deploy image + id: build uses: docker/build-push-action@v6 with: platforms: linux/${{ matrix.platform }} @@ -98,17 +108,19 @@ jobs: push: true build-args: | BASE_IMAGE=${{ github.event.inputs.docker_image_toolchain }} - tags: | - ghcr.io/${{ env.OWNER_LC }}/deeploy:latest-${{ matrix.platform }} - ghcr.io/${{ env.OWNER_LC }}/deeploy:${{ needs.prepare.outputs.docker_tag }}-${{ matrix.platform }} + outputs: type=image,name=ghcr.io/${{ env.OWNER_LC }}/deeploy,annotation-index=true,name-canonical=true,push=true + + - name: Extract image digest + id: digest + run: echo "digest-${{ matrix.platform }}=${{ steps.build.outputs.digest }}" >> $GITHUB_OUTPUT merge-deeploy-images: name: Merge Deeploy Images runs-on: ubuntu-latest - needs: [ prepare, build-deeploy ] + needs: [prepare, build-deeploy] steps: - name: GHCR Log-in - uses: docker/login-action@v1 + 
uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} @@ -118,10 +130,15 @@ jobs: run: | echo "OWNER_LC=${OWNER,,}" >>${GITHUB_ENV} env: - OWNER: '${{ github.repository_owner }}' + OWNER: "${{ github.repository_owner }}" - - uses: Noelware/docker-manifest-action@v1 + - name: Merge Deeploy Images + uses: Noelware/docker-manifest-action@v1 with: - inputs: ghcr.io/${{ env.OWNER_LC }}/deeploy:latest-amd64,ghcr.io/${{ env.OWNER_LC }}/deeploy:latest-arm64 - tags: ghcr.io/${{ env.OWNER_LC }}/deeploy:latest,ghcr.io/${{ env.OWNER_LC }}/deeploy:${{ needs.prepare.outputs.docker_tag }} - push: true \ No newline at end of file + inputs: | + ghcr.io/${{ env.OWNER_LC }}/deeploy@${{ needs.build-deeploy.outputs.digest-amd64 }}, + ghcr.io/${{ env.OWNER_LC }}/deeploy@${{ needs.build-deeploy.outputs.digest-arm64 }} + tags: | + ghcr.io/${{ env.OWNER_LC }}/deeploy:latest, + ghcr.io/${{ env.OWNER_LC }}/deeploy:${{ needs.prepare.outputs.docker_tag }} + push: true diff --git a/.github/workflows/BuildDockerToolchain.yml b/.github/workflows/docker-build-toolchain.yml similarity index 67% rename from .github/workflows/BuildDockerToolchain.yml rename to .github/workflows/docker-build-toolchain.yml index 7b128948d1..d2b8fc7d63 100644 --- a/.github/workflows/BuildDockerToolchain.yml +++ b/.github/workflows/docker-build-toolchain.yml @@ -1,6 +1,11 @@ -name: BuildDockerToolchain +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 -on: +--- +name: Docker • Build Toolchain Container + +"on": workflow_dispatch: jobs: @@ -29,8 +34,11 @@ jobs: build-toolchain: name: Build Deeploy Toolchain Image - needs: [ prepare ] + needs: [prepare] runs-on: ${{ matrix.runner }} + outputs: + digest-amd64: ${{ steps.digest.outputs.digest-amd64 }} + digest-arm64: ${{ steps.digest.outputs.digest-arm64 }} strategy: fail-fast: false matrix: @@ -41,7 +49,7 @@ jobs: - platform: arm64 runner: ubuntu-22.04-arm steps: - - uses: 
actions/checkout@v2 + - uses: actions/checkout@v4 - name: Free up disk space uses: jlumbroso/free-disk-space@v1.3.1 @@ -51,17 +59,18 @@ jobs: haskell: true large-packages: true - - uses: docker/setup-buildx-action@v1 + - uses: docker/setup-buildx-action@v3 - name: GHCR Log-in - uses: docker/login-action@v1 + uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Build Cache for Docker - uses: actions/cache@v3 + id: cache + uses: actions/cache@v4 with: path: var-ccache key: ${{ runner.os }}-${{ matrix.platform }}-build-cache-toolchain @@ -79,26 +88,29 @@ jobs: run: | echo "OWNER_LC=${OWNER,,}" >>${GITHUB_ENV} env: - OWNER: '${{ github.repository_owner }}' + OWNER: "${{ github.repository_owner }}" - name: Build and push toolchain image + id: build uses: docker/build-push-action@v6 with: platforms: linux/${{ matrix.platform }} context: . file: Container/Dockerfile.toolchain push: true - tags: | - ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain:latest-${{ matrix.platform }} - ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain:${{ needs.prepare.outputs.docker_tag }}-${{ matrix.platform }} + outputs: type=image,name=ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain,annotation-index=true,name-canonical=true,push=true + + - name: Extract image digest + id: digest + run: echo "digest-${{ matrix.platform }}=${{ steps.build.outputs.digest }}" >> $GITHUB_OUTPUT merge-toolchain-images: name: Merge Deeploy Toolchain Images runs-on: ubuntu-latest - needs: [ prepare, build-toolchain ] + needs: [prepare, build-toolchain] steps: - name: GHCR Log-in - uses: docker/login-action@v1 + uses: docker/login-action@v3 with: registry: ghcr.io username: ${{ github.actor }} @@ -108,10 +120,15 @@ jobs: run: | echo "OWNER_LC=${OWNER,,}" >>${GITHUB_ENV} env: - OWNER: '${{ github.repository_owner }}' + OWNER: "${{ github.repository_owner }}" - - uses: Noelware/docker-manifest-action@v1 + - name: Merge Toolchain Images + uses: 
Noelware/docker-manifest-action@v1 with: - inputs: ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain:latest-amd64,ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain:latest-arm64 - tags: ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain:latest,ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain:${{ needs.prepare.outputs.docker_tag }} - push: true \ No newline at end of file + inputs: | + ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain@${{ needs.build-toolchain.outputs.digest-amd64 }}, + ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain@${{ needs.build-toolchain.outputs.digest-arm64 }} + tags: | + ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain:latest, + ghcr.io/${{ env.OWNER_LC }}/deeploy-toolchain:${{ needs.prepare.outputs.docker_tag }} + push: true diff --git a/.github/workflows/infra-generate-ccache.yml b/.github/workflows/infra-generate-ccache.yml new file mode 100644 index 0000000000..e4d00ea911 --- /dev/null +++ b/.github/workflows/infra-generate-ccache.yml @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: Infrastructure • Generate CCache + +"on": + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/pulp-platform/deeploy:devel" + schedule: + # Runs the workflow on the default branch every day at 1AM CET to keep the cache fresh + - cron: "0 1 * * *" + +jobs: + generate-ccache: + runs-on: ubuntu-latest + container: + image: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy:devel' }} + steps: + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . 
+ + - name: Generate CCache + run: | + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest 'test_platforms.py::test_generic_kernels[Kernels/Integer/Add/Regular]' --skipsim + pytest 'test_platforms.py::test_mempool_kernels[Kernels/Integer/Add/Regular]' --skipsim + pytest 'test_platforms.py::test_cortexm_kernels[Kernels/Integer/Add/Regular]' --skipsim + pytest 'test_platforms.py::test_snitch_kernels[Kernels/Integer/Add/Regular]' --skipsim + pytest 'test_platforms.py::test_snitch_tiled_kernels_l2_singlebuffer[Kernels/Integer/Add/Large-5000-L2-singlebuffer]' --skipsim + pytest 'test_platforms.py::test_siracusa_kernels[Kernels/Integer/Add/Regular]' --skipsim + pytest 'test_platforms.py::test_siracusa_tiled_kernels_l2_singlebuffer[Kernels/Integer/MatMul/Regular-64000-L2-singlebuffer]' --skipsim + pytest 'test_platforms.py::test_siracusa_neureka_tiled_kernels_l2_singlebuffer[Kernels/Integer/GEMM/Regular_RQPerColumn-16000-L2-singlebuffer]' --skipsim + pytest 'test_platforms.py::test_chimera_kernels[Kernels/Integer/Add/Regular]' --skipsim + + - name: Clean and Upload CCache + uses: actions/cache@v4 + with: + path: /app/.ccache + key: ccache-ci diff --git a/.github/workflows/documentation.yml b/.github/workflows/infra-generate-documentation.yml similarity index 81% rename from .github/workflows/documentation.yml rename to .github/workflows/infra-generate-documentation.yml index ddd2c8a5c5..8b0ae2b5ce 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/infra-generate-documentation.yml @@ -1,7 +1,16 @@ -name: documentation +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 -on: +--- +name: Infrastructure • Generate Documentation + +"on": push: + branches: + - "**" + tags: + - "v*.*.*" pull_request: workflow_dispatch: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000000..93c0147fd2 --- /dev/null +++ 
b/.github/workflows/publish.yml @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +name: Publish + +on: + push: + tags: + - "v*" + workflow_dispatch: + +permissions: + contents: read + id-token: write + +env: + UV_EXTRA_INDEX_URL: https://pypi.ngc.nvidia.com + +jobs: + publish-pypi: + name: Publish to PyPI + if: github.event_name == 'push' + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v5 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + + - name: Build artifacts + run: uv build + + - name: Test wheel installation + run: uv run --isolated --no-project --with dist/*.whl python -c "import Deeploy" + + - name: Test sdist installation + run: uv run --isolated --no-project --with dist/*.tar.gz python -c "import Deeploy" + + - name: Publish to PyPI + run: uv publish + + publish-test-pypi: + name: Publish to Test PyPI + if: github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v5 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + + - name: Build artifacts + run: uv build + + - name: Test wheel installation + run: uv run --isolated --no-project --with dist/*.whl python -c "import Deeploy" + + - name: Test sdist installation + run: uv run --isolated --no-project --with dist/*.tar.gz python -c "import Deeploy" + + - name: Publish to Test PyPI + run: uv publish --publish-url https://test.pypi.org/legacy/ diff --git a/.gitignore b/.gitignore index d6ecb06744..dc93328e4a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,31 +1,42 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +# Editor and OS files *~ -__pycache__ -build -dist -**/*.egg* +*# *.vscode .DS_Store -*.html -!docs/_templates/* -*.csv + +# Python +__pycache__ +.venv/* +.mypy_cache .ipynb_checkpoints/ -*# -install/ +**/*.egg* *.pkl *.data -*# + +# Build artifacts +build 
+dist +install/ +compile_commands.json toolchain/**/*/ + +# Node package.json package-lock.json -.mypy_cache node_modules -compile_commands.json - +# Documentation docs/_autosummary docs/_build +*.html +!docs/_templates/* +*.csv - +# DeeployTest DeeployTest/TestFiles/ DeeployTest/Tests/**/*.txt DeeployTest/**/BUILD/* diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml deleted file mode 100644 index 94318a4e9d..0000000000 --- a/.gitlab-ci.yml +++ /dev/null @@ -1,717 +0,0 @@ -variables: - GIT_SUBMODULE_STRATEGY: recursive - FF_USE_FASTZIP: "true" - # These can be specified per job or per pipeline - ARTIFACT_COMPRESSION_LEVEL: "fastest" - CACHE_COMPRESSION_LEVEL: "fastest" - TOOLCHAIN: "LLVM" - CMAKE_GENERATOR: "Ninja" - -stages: # List of stages for jobs, and their order of execution - - test - -.setup_test: - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - cd DeeployTest - - git lfs pull - -build_deeploy: # This job runs in the build stage, which runs first. - stage: test - resource_group: install - artifacts: - untracked: true - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - pip install -e . - - rm -f DeeployTest/out.txt - -gen_docs: - stage: test - resource_group: install - artifacts: - untracked: true - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - make docs - -run_cmsis_test_models: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. 
- tags: - - qemu-arm - parallel: - matrix: - - TEST: [simpleRegression, WaveFormer] - script: - - !reference [.setup_test, script] - - python testRunner_cortexm.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_QEMU/Tests/$TEST/*.c - - ./DeeployTest/TEST_QEMU/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_cmsis_test_kernels: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. - tags: - - qemu-arm - parallel: - matrix: - - TEST: [Adder, MultIO, test1DPad, test2DPad, testMatMul, testMatMulAdd, testMaxPool, testRQConv, testReduceSum, testReduceMean, testSlice] - script: - - !reference [.setup_test, script] - - python testRunner_cortexm.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_QEMU/Tests/$TEST/*.c - - ./DeeployTest/TEST_QEMU/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_test_models: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. 
- tags: - - PULP - parallel: - matrix: - - TEST: [simpleRegression, miniMobileNet, miniMobileNetv2, Attention, MLPerf/KeywordSpotting, MLPerf/ImageClassification, MLPerf/AnomalyDetection] - script: - - !reference [.setup_test, script] - - python testRunner_siracusa.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_test_kernels: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. - tags: - - PULP - parallel: - matrix: - - TEST: [Adder, MultIO, test1DPad, test2DPad, testMatMul, testMatMulAdd, testRequantizedDWConv, test2DRequantizedConv, iSoftmax, testConcat, testRMSNorm, trueIntegerDivSandwich, Hardswish, RQHardswish, testBacktracking] - script: - - !reference [.setup_test, script] - - python testRunner_siracusa.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_DMA_slice_L2: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. 
- tags: - - PULP - script: - - !reference [.setup_test, script] - - python testSlice_PULP.py --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA/Tests/testSlice/*.c - - ./DeeployTest/TEST_SIRACUSA/Tests/testSlice/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_tiled_kernels_singlebuffer_L2: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. - tags: - - PULP - parallel: - matrix: - - TEST: "testMatMul" - L1: [64000, 32000, 16000] - - TEST: "test2DRequantizedConv" - L1: [8000, 6000, 4000] - - TEST: "testRequantizedDWConv" - L1: [2561] # SCHEREMO: The implicit transpose after the conv is untiled; need at least 2560 - - TEST: "iSoftmax" - L1: [800, 500, 300] - - TEST: "testConcat" - L1: [32000, 16000, 8000] - - TEST: "testRMSNorm" - L1: [2048, 1024, 512] - - TEST: "Hardswish" - L1: [750] - - TEST: "RQHardswish" - L1: [750] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_tiled_kernels_doublebuffer_L2: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. 
- tags: - - PULP - parallel: - matrix: - - TEST: "testMatMul" - L1: [64000, 32000, 16000] - - TEST: "test2DRequantizedConv" - L1: [8000, 6000, 5000] - - TEST: "testRequantizedDWConv" - L1: [5121] # SCHEREMO: The implicit transpose after the conv is untiled; need at least 2560 * 2 for DB - - TEST: "iSoftmax" - L1: [1600, 1000, 600] - - TEST: "testConcat" - L1: [64000, 32000, 16000] - - TEST: "testRMSNorm" - L1: [4096, 2048, 1024] - - TEST: "Hardswish" - L1: [750] - - TEST: "RQHardswish" - L1: [750] - - TEST: "microLlama/microLlama1" - L1: [60000, 20000, 10000] - - TEST: "microLlama/microLlama8" - L1: [60000, 20000, 10000] - - TEST: "microLlama/microLlama8_parallel" - L1: [60000, 20000, 10000] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --doublebuffer - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_tiled_models_singlebuffer_L2: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. 
- tags: - - PULP - parallel: - matrix: - - TEST: "simpleRegression" - L1: [45000, 30000, 15000] - - TEST: "miniMobileNet" - L1: [60000, 12000, 6000, 3000] - - TEST: "miniMobileNetv2" - L1: [60000, 16000, 12000, 8000] - - TEST: "Attention" - L1: [60000, 10000, 5000] - - TEST: "microLlama/microLlama1" - L1: [60000, 10000, 5000] - - TEST: "microLlama/microLlama8" - L1: [60000, 10000, 5000] - - TEST: "microLlama/microLlama8_parallel" - L1: [60000, 10000, 5000] - - TEST: "MLPerf/KeywordSpotting" - L1: [64000] - - TEST: "MLPerf/ImageClassification" - L1: [64000] - - TEST: "MLPerf/AnomalyDetection" - L1: [64000] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_tiled_models_singlebuffer_L3: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. - tags: - - PULP - parallel: - matrix: - - TEST: "simpleRegression" - L1: [45000, 30000, 16000] # SCHEREMO: 15000 leads to non-2d transfers in L3! - - TEST: "miniMobileNet" - L1: [60000, 12000, 6000] # SCHEREMO: 3000 leads to non-2d transfers in L3! 
- - TEST: "miniMobileNetv2" - L1: [60000, 16000, 12000, 8000] - - TEST: "Attention" - L1: [60000, 10000, 5000, 2500] - - TEST: "Transformer" - L1: [60000, 30000, 15000] - - TEST: "microLlama/microLlama1" - L1: [60000, 10000, 5000] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --defaultMemLevel=L3 - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - - -run_siracusa_tiled_models_doublebuffer_L3: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. - tags: - - PULP - parallel: - matrix: - - TEST: "simpleRegression" - L1: [60000, 45000, 30000] - - TEST: "miniMobileNet" - L1: [60000, 24000, 12000, 6000] - - TEST: "miniMobileNetv2" - L1: [60000, 32000, 24000, 16000] - - TEST: "Attention" - L1: [60000, 20000, 10000, 5000] - - TEST: "Transformer" - L1: [60000, 30000, 15000] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --doublebuffer --defaultMemLevel=L3 - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - - -run_siracusa_w_neureka_tiled_kernels_singlebuffer_L2: - stage: test - tags: - - PULP - parallel: - matrix: - - TEST: "testRequantizedLinear" - L1: [16000] - - TEST: "testPointwise" - L1: 
[32000] - - TEST: "testPointwiseConvBNReLU" - L1: [32000] - - TEST: "testPointwiseUnsignedWeights" - L1: [32000] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --defaultMemLevel=L2 - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - - -run_siracusa_w_neureka_tiled_kernels_doublebuffer_L2: - stage: test - tags: - - PULP - parallel: - matrix: - - TEST: "testRequantizedLinear" - L1: [16000] - - TEST: "testPointwise" - L1: [32000] - - TEST: "testPointwiseConvBNReLU" - L1: [32000] - - TEST: "testPointwiseUnsignedWeights" - L1: [32000] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --defaultMemLevel=L2 --doublebuffer - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_w_neureka_tiled_models_singlebuffer_L3: - stage: test - tags: - - PULP - parallel: - matrix: - - TEST: "miniMobileNet" - L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! 
- - TEST: "Attention" - L1: [2500] - - TEST: "Transformer" - L1: [15000] - - TEST: "microLlama/microLlama1" - L1: [10000] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --defaultMemLevel=L3 - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_w_neureka_tiled_models_doublebuffer_L3: - stage: test - tags: - - PULP - parallel: - matrix: - - TEST: "miniMobileNet" - L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! - - TEST: "Attention" - L1: [5000] - - TEST: "Transformer" - L1: [30000] - script: - - !reference [.setup_test, script] - - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --defaultMemLevel=L3 --doublebuffer - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_w_neureka_tiled_kernels_singlebuffer_L2_wmem: - stage: test - tags: - - PULP - parallel: - matrix: - - TEST: "testRequantizedLinear" - L1: [16000] - - TEST: "testPointwise" - L1: [32000] - - TEST: "testPointwiseConvBNReLU" - L1: [32000] - - TEST: "testPointwiseUnsignedWeights" - L1: [32000] - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - cd DeeployTest - - python 
testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --defaultMemLevel=L2 --neureka-wmem - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_siracusa_w_neureka_tiled_models_doublebuffer_L3_wmem: - stage: test - tags: - - PULP - parallel: - matrix: - - TEST: "miniMobileNet" - L1: [2000] # LMACAN: 1000 leads to non-2d transfers in L3! - - TEST: "Attention" - L1: [2500] - - TEST: "Transformer" - L1: [30000] - - TEST: "microLlama/microLlama1" - L1: [10000] - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - cd DeeployTest - - python testRunner_tiled_siracusa_w_neureka.py -t ./Tests/$TEST --l1 $L1 --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR --cores=8 --defaultMemLevel=L3 --doublebuffer --neureka-wmem - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.c - - ./DeeployTest/TEST_SIRACUSA_W_NEUREKA/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_mempool_test_kernels: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. 
- tags: - - banshee - retry: 2 - parallel: - matrix: - - TEST: [Adder, MultIO, test1DConvolution, test2DConvolution, test1DDWConvolution, test2DDWConvolution, test1DPad, test2DPad, testGEMM, testMatMul, testMatMulAdd, testMaxPool, testRQConv, testRQGEMM, testRQMatMul, testReduceSum, testReduceMean, testSlice, testRequantizedDWConv, test2DRequantizedConv] - script: - - !reference [.setup_test, script] - - python testRunner_mempool.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_MEMPOOL/Tests/$TEST/*.c - - ./DeeployTest/TEST_MEMPOOL/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_mempool_test_models: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. - tags: - - banshee - retry: 2 - parallel: - matrix: - - TEST: [simpleRegression, simpleCNN, ICCT, ICCT_ITA, ICCT_8, ICCT_ITA_8, miniMobileNet, miniMobileNetv2] - script: - - !reference [.setup_test, script] - - python testRunner_mempool.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR - # - python testRunner_mempool.py -t ./Tests/WaveFormer -DGCC_INSTALL_DIR=$MEMPOOL_GCC_INSTALL_DIR # Boken with ITA (heap is too small) - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_MEMPOOL/Tests/$TEST/*.c - - ./DeeployTest/TEST_MEMPOOL/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_generic_test_kernels: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. 
- parallel: - matrix: - - TEST: [Adder, MultIO, test1DConvolution, test2DConvolution, test1DDWConvolution, test2DDWConvolution, test1DPad, test2DPad, testConcat, testGEMM, testMatMul, testMatMulAdd, testMaxPool, testRQConv, testRQMatMul, testReduceSum, testReduceMean, testSlice, testRequantizedDWConv, test2DRequantizedConv, iSoftmax] - script: - - !reference [.setup_test, script] - - python testRunner_generic.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_GENERIC/Tests/$TEST/*.c - - ./DeeployTest/TEST_GENERIC/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -run_generic_test_models: # This job runs in the test stage. - stage: test # It only starts when the job in the build stage completes successfully. - parallel: - matrix: - - TEST: [simpleRegression, WaveFormer, simpleCNN, ICCT, ICCT_ITA, ICCT_8, ICCT_ITA_8, miniMobileNet, miniMobileNetv2] - script: - - !reference [.setup_test, script] - - python testRunner_generic.py -t ./Tests/$TEST --toolchain=$TOOLCHAIN --toolchain_install_dir=$LLVM_INSTALL_DIR - artifacts: - name: "$CI_JOB_NAME-$CI_COMMIT_REF_NAME-$CI_COMMIT_SHORT_SHA" - paths: - - ./DeeployTest/out.txt - - ./DeeployTest/TEST_GENERIC/Tests/$TEST/*.c - - ./DeeployTest/TEST_GENERIC/Tests/$TEST/*.h - expire_in: 4 weeks - cache: - key: $CI_PROJECT_DIR-$CI_COMMIT_REF_SLUG - paths: - - ./DeeployTest/TEST_*/build - -test_deeploy_state_serialization: - stage: test - parallel: - matrix: - - TEST: [simpleRegression] - PLATFORM: ['QEMU-ARM', 'Siracusa', 'MemPool', 'Generic'] - script: - - !reference [.setup_test, script] - - python deeployStateEqualityTest.py -t ./Tests/$TEST -p $PLATFORM - -test_memory_level_extension: - stage: test - parallel: - matrix: - - TEST: [simpleRegression] - PLATFORM: ['QEMU-ARM', 
'Siracusa', 'MemPool', 'Generic'] - script: - - !reference [.setup_test, script] - - python testMemoryLevelExtension.py -t ./Tests/$TEST -p $PLATFORM - -test_tiler_extension: - stage: test - parallel: - matrix: - - TEST: [simpleRegression, simpleCNN, testMatMul, testMaxPool] - PLATFORM: ['Siracusa'] - script: - - !reference [.setup_test, script] - - python testTilerExtension.py -t ./Tests/$TEST -p $PLATFORM - -test_tiler_extension_fails: - stage: test - parallel: - matrix: - - TEST: [simpleRegression, simpleCNN, testMatMul] - PLATFORM: ['Siracusa'] - script: - - !reference [.setup_test, script] - - python testTilerExtension.py -t ./Tests/$TEST -p $PLATFORM --l1 2000 --shouldFail - -test_memory_allocation_extension: - stage: test - parallel: - matrix: - - TEST: [simpleRegression, simpleCNN, miniMobileNet, miniMobileNetv2, testMatMul, testMaxPool] - PLATFORM: ['Siracusa'] - script: - - !reference [.setup_test, script] - - python testTilerExtension.py -t ./Tests/$TEST -p $PLATFORM - -test_deeploy_typing: - stage: test - script: - - !reference [.setup_test, script] - - python testTypes.py - -test_regex_matching: - stage: test - script: - - !reference [.setup_test, script] - - python testRegexMatching.py - -format_python: - stage: test - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - yapf -rpd -e "third_party/" -e "install/" -e "toolchain/" . 
- -format_python_imports: - stage: test - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - isort --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./ -c -v - - autoflake -c -r --remove-all-unused-imports --ignore-init-module-imports --exclude "*/third_party/**" ./ - -format_c: - stage: test - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - python scripts/run_clang_format.py -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" -ir --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format ./ scripts - -lint_python_licenses: - stage: test - variables: - LICENSE_STRING: "SPDX-License-Identifier: Apache-2.0" - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - grep -Lr "$LICENSE_STRING" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude "run_clang_format.py" | grep ".*\.py$" || [[ $? == 1 ]] - -lint_c_licenses: - stage: test - variables: - LICENSE_STRING: "SPDX-License-Identifier: Apache-2.0" - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - grep -Lr "$LICENSE_STRING" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude-dir="runtime" | grep ".*\.c$" || [[ $? == 1 ]] - -lint_c_header_licenses: - stage: test - variables: - LICENSE_STRING: "SPDX-License-Identifier: Apache-2.0" - script: - - bash && source ~/.bashrc - - $CONDA activate dumpoci - - export PYTHONPATH=`pwd`:$PYTHONPATH - - grep -Lr "$LICENSE_STRING" --exclude-dir="toolchain" --exclude-dir="install" --exclude-dir=".git" . --exclude-dir="third_party" --exclude-dir="TEST_*" --exclude-dir="runtime" | grep ".*\.h$" || [[ $? 
== 1 ]] diff --git a/.gitlab/issue_templates/issue_template.md b/.gitlab/issue_templates/issue_template.md deleted file mode 100644 index 92cae8b04d..0000000000 --- a/.gitlab/issue_templates/issue_template.md +++ /dev/null @@ -1,24 +0,0 @@ -## Summary - -Give a *short* description of the problem, at most two paragraphs. - -## Steps to reproduce - -If possible create an example project that exhibits the problematic behaviour and reference it here. Please be as specific as possible. - -## Bug Behaviour - -Describe what is happening in your minimal example. - -## Expected Behaviour - -Describe what you expect to happen. - -## Relevant logs and/or screenshots - -If available, paste any relevant logs - use code blocks (```) to format console output, logs, and code, as -it's very hard to read otherwise. - -## Possible fixes - -If you can, link to the line of code that might be responsible for the problem. diff --git a/.gitlab/merge_request_templates/MRTemplate.md b/.gitlab/merge_request_templates/MRTemplate.md deleted file mode 100644 index f5321172a1..0000000000 --- a/.gitlab/merge_request_templates/MRTemplate.md +++ /dev/null @@ -1,19 +0,0 @@ -# Changelog - -Describe the intent of your merge request here. - -## Added - -## Changed - -## Fixed - - -## PR Merge Checklist - -1. [ ] Is your PR rebased on the latest `devel` commit and pointing to `devel`? -2. [ ] Was your PR reviewed and accepted? -3. [ ] Does your latest pipeline pass? -4. [ ] Are all dependencies merged onto their respective `main` branches? -5. [ ] Did you reset all .gitmodules URLs to point to the `deeploy` group? -6. [ ] Did you check in the latest commits for all dependencies available on their `main` branches? 
diff --git a/.gitmodules b/.gitmodules index def05e2adc..ea01f2734d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + [submodule "pulp-nn-mixed"] path = TargetLibraries/PULPOpen/third_party/pulp-nn-mixed url = https://github.com/pulp-platform/pulp-nn-mixed.git diff --git a/.isort.cfg b/.isort.cfg index 59aa31d957..217d3f7dc1 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + [settings] line_length=120 multi_line_output=2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 799faa5dbe..75f2a421bd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,32 +1,71 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks -exclude: .*third_party.* - -# By default, all hooks will be installed as pre-push -default_stages: [pre-push] +--- +exclude: | + (?x)^( + .*third_party.* + | .*install/.* + | .*toolchain/.* + | .*TEST_.* + | .*TestFiles.* + | .*runtime.* + ) repos: -- repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: check-added-large-files + name: Check for added large files + - id: trailing-whitespace + name: Check for trailing whitespace + - repo: local + hooks: + - id: reuse + name: Check SPDX License Headers + entry: scripts/reuse_skip_wrapper.py + language: python + stages: [pre-commit, pre-merge-commit, pre-push, manual] + types: [text] + exclude_types: [batch, svg, json, markdown] + - repo: https://github.com/google/yapf + rev: v0.43.0 hooks: - - id: check-added-large-files -- repo: https://github.com/PyCQA/autoflake + - id: yapf 
+ name: Autoformat Python Files + args: ["--in-place", "--parallel"] + stages: [pre-commit, pre-merge-commit, pre-push, manual] + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + name: Autoformat Python Imports + args: ["--quiet"] + stages: [pre-commit, pre-merge-commit, pre-push, manual] + - repo: https://github.com/PyCQA/autoflake rev: v2.3.0 hooks: - - id: autoflake + - id: autoflake + name: Remove Unused Python Imports args: - - "--remove-all-unused-imports" - - "--ignore-init-module-imports" - - "--in-place" -- repo: https://github.com/google/yapf - rev: v0.33.0 + - "--remove-all-unused-imports" + - "--ignore-init-module-imports" + - "--in-place" + stages: [pre-commit, pre-merge-commit, pre-push, manual] + - repo: https://github.com/pre-commit/mirrors-clang-format + rev: v15.0.7 hooks: - - id: yapf - args: - - "--in-place" - - "--parallel" -- repo: https://github.com/pycqa/isort - rev: 5.12.0 + - id: clang-format + name: Autoformat C/C++ Files + args: ["-i"] + stages: [pre-commit, pre-merge-commit, pre-push, manual] + - repo: https://github.com/adrienverge/yamllint.git + rev: v1.33.0 hooks: - - id: isort - name: isort (python) + - id: yamllint + name: Lint YAML Files + stages: [pre-commit, pre-merge-commit, pre-push, manual] diff --git a/.style.yapf b/.style.yapf index 3389b2a67b..2aa801bde2 100644 --- a/.style.yapf +++ b/.style.yapf @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + [style] based_on_style = google column_limit = 120 diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json index 1874c6aee9..7fe57401ba 100644 --- a/.vscode/c_cpp_properties.json +++ b/.vscode/c_cpp_properties.json @@ -1,12 +1,12 @@ { - "configurations": [ - { - "name": "cMake", - "configurationProvider": "ms-vscode.cmake-tools", - "compileCommands": [ - "${workspaceFolder}/DeeployTest/TEST_RECENT/build/compile_commands.json" - ] - } - ], - "version": 4 + 
"configurations": [ + { + "name": "cMake", + "configurationProvider": "ms-vscode.cmake-tools", + "compileCommands": [ + "${workspaceFolder}/DeeployTest/TEST_RECENT/build/compile_commands.json" + ] + } + ], + "version": 4 } \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index 6703ff81b6..554ca8cd91 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,80 +1,91 @@ { - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Deeploy Generate Untiled", - "type": "debugpy", - "request": "launch", - "program": "generateNetwork.py", - "console": "integratedTerminal", - "cwd": "${workspaceFolder}/DeeployTest", - "justMyCode": false, - "args": "-p${input:platformUntiled} -t${input:model} ${input:additionalArgsUntiled}" + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Deeploy Generate Untiled", + "type": "debugpy", + "request": "launch", + "program": "generateNetwork.py", + "console": "integratedTerminal", + "cwd": "${workspaceFolder}/DeeployTest", + "justMyCode": false, + "args": + "-p${input:platformUntiled} -t${input:model} ${input:additionalArgsUntiled}" + }, + { + "name": "Deeploy Generate Tiled", + "type": "debugpy", + "request": "launch", + "program": "testMVP.py", + "console": "integratedTerminal", + "cwd": "${workspaceFolder}/DeeployTest", + "justMyCode": false, + "args": + "-p${input:platformTiled} -t${input:model} ${input:additionalArgsTiled}" + } + ], + "inputs": [ + { + "id": "platformUntiled", + "type": "pickString", + "description": "Problem", + "options": [ + "QEMU-ARM", + "Generic", + "MemPool", + "Apollo3", + "Apollo4", + "Snitch", + "Siracusa" + ], + "default": "Generic" + }, + { + "id": "platformTiled", + "type": "pickString", + "description": "Problem", + "options": [ + "Snitch", + "Siracusa", + "Siracusa_w_neureka" + ], + "default": "Siracusa" + }, + { + "id": "model", + "type": "command", + "command": "extension.commandvariable.file.pickFile", + "args": { + "description": "Select ONNX File", + "include": "**/*.onnx", + "display": "transform", + "fromFolder": { + "fixed": "${workspaceFolder}/DeeployTest/Tests" }, - { - "name": "Deeploy Generate Tiled", - "type": "debugpy", - "request": "launch", - "program": "testMVP.py", - "console": "integratedTerminal", - "cwd": "${workspaceFolder}/DeeployTest", - "justMyCode": false, - "args": "-p${input:platformTiled} -t${input:model} ${input:additionalArgsTiled}" - } - ], - "inputs": [ - { - "id": "platformUntiled", - "type": "pickString", - "description": "Problem", - "options": [ - "QEMU-ARM", - "Generic", - "MemPool", - "Apollo3", - "Apollo4", - "Snitch", - "Siracusa" - ], - "default": "Generic" - }, - { - "id": 
"platformTiled", - "type": "pickString", - "description": "Problem", - "options": [ - "Snitch", - "Siracusa", - "Siracusa_w_neureka" - ], - "default": "Siracusa" - }, - { - "id": "model", - "type": "command", - "command": "extension.commandvariable.file.pickFile", - "args": { - "description": "Select ONNX File", - "include": "DeeployTest/Tests/**/*.onnx", - "transform": { - "text": "${fileDirname}" - } - } - }, - { - "id": "additionalArgsUntiled", - "type": "promptString", - "description": "Additional Arguments", - "default": "-v" + "labelTransform": { + "text": "${fileDirname}", + "find" : "${workspaceFolder}/DeeployTest/Tests/", + "replace": "" }, - { - "id": "additionalArgsTiled", - "type": "promptString", - "description": "Additional Arguments", - "default": "-v --doublebuffer" + "valueTransform": { + "text": "${fileDirname}" } - ] + } + }, + { + "id": "additionalArgsUntiled", + "type": "promptString", + "description": "Additional Arguments", + "default": "-v" + }, + { + "id": "additionalArgsTiled", + "type": "promptString", + "description": "Additional Arguments", + "default": "-v --doublebuffer" + } + ] } \ No newline at end of file diff --git a/.yamllint b/.yamllint new file mode 100644 index 0000000000..ca8d1f606b --- /dev/null +++ b/.yamllint @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +extends: default + +rules: + line-length: disable + indentation: + spaces: 2 + indent-sequences: consistent + braces: + forbid: false + min-spaces-inside: 0 + max-spaces-inside: 2 + comments: + min-spaces-from-content: 1 + + +ignore: + # Ignore all files in third_party + - "**/third_party/" + # Ignore all files in runtime + - "**/runtime/" + # Ignore all files in TEST_* + - "**/TEST_*/" + # Ignore all files in install + - "**/install/" + # Ignore all files in toolchain + - "**/toolchain/" + # Ignore all files in .git + - "**/.git/**" diff --git a/.yapfignore b/.yapfignore index 
200637b4e2..459507ebde 100644 --- a/.yapfignore +++ b/.yapfignore @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + *third_party/ *install/ *toolchain/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 66b3b3892d..cc185c7459 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,197 @@ # Changelog This file contains the changelog for the Deeploy project. The changelog is divided into sections based on the version of the project. Each section contains a list of pull requests, features, changes, fixes, and removals that were made in that version. -## Release v0.2.0 (2025-07-09) +## Unreleased (Planned Release Target: v0.2.2) + + +### List of Pull Requests +- + +### Added +- + +### Changed +- + +### Fixed +- + +### Removed +- + +## Release v0.2.1 (2026-02-05) [#158](https://github.com/pulp-platform/Deeploy/pull/158) + +### List of Pull Requests +- PyPi Package Deployment + Remove Banshee Dept [#154](https://github.com/pulp-platform/Deeploy/pull/154) +- PyTest Migration [#144](https://github.com/pulp-platform/Deeploy/pull/144) +- Update submodule `pulp-nn-mixed` [#145](https://github.com/pulp-platform/Deeploy/pull/145) +- Improve Profiling [#138](https://github.com/pulp-platform/Deeploy/pull/138) +- FP32 ReduceMean operator improvement [#137](https://github.com/pulp-platform/Deeploy/pull/137) +- Support for RMSNorm (Pow and Sqrt operators) [#136](https://github.com/pulp-platform/Deeploy/pull/136) +- Demo TinyViT compatibility with tiled Siracusa [#124](https://github.com/pulp-platform/Deeploy/pull/124) +- TinyViT on non-tiled Siracusa [#117](https://github.com/pulp-platform/Deeploy/pull/117) +- Support Fully Asynchronous DMAs [#114](https://github.com/pulp-platform/Deeploy/pull/114) +- Disallow shape inference [#128](https://github.com/pulp-platform/Deeploy/pull/128) +- Remove memory-aware node bindings [#123](https://github.com/pulp-platform/Deeploy/pull/123) +- 
Fix missing const's layout transformation and refactor NCHWtoNHWC passes [#122](https://github.com/pulp-platform/Deeploy/pull/122) +- Fix aliasing [#125](https://github.com/pulp-platform/Deeploy/pull/125) +- Support for 1D Autoencoder [#98](https://github.com/pulp-platform/Deeploy/pull/98) +- Refactor Logging for Improved Debugging [#115](https://github.com/pulp-platform/Deeploy/pull/115) +- Add reuse-tool as an SPDX license header linter [#113](https://github.com/pulp-platform/Deeploy/pull/113) +- Bug fixes, API Cleanup and Reduce Compiler Warning on PULP [#112](https://github.com/pulp-platform/Deeploy/pull/112) +- Fix PULP GEMM `batch` serialization [#109](https://github.com/pulp-platform/Deeploy/pull/109) +- Split CI Workflows by Platform and Task, Improve Formatting and Linting Reliability [#108](https://github.com/pulp-platform/Deeploy/pull/108) +- Refactor tiling code generation [#105](https://github.com/pulp-platform/Deeploy/pull/105) +- Change order of typeMatching entries [#68](https://github.com/pulp-platform/Deeploy/pull/68) +- Node Mangling to avoid duplication [#93](https://github.com/pulp-platform/Deeploy/pull/93) +- Prepare Post v0.2.0 Release [#104](https://github.com/pulp-platform/Deeploy/pull/104) +- Use Docker digests instead of arch-specific tags [#106](https://github.com/pulp-platform/Deeploy/pull/106) +- Fix `Unsqueeze` Op. when using ONNX opset 13 or higher (from attribute to input) [#119](https://github.com/pulp-platform/Deeploy/pull/119) +- Fix bias hoisting in generic GEMM with no bias [#126](https://github.com/pulp-platform/Deeploy/pull/126) + +### Added +- The `publish.yml` action to build a branch and push it to PyPi. The action is automatically triggered when a tag with the "v*" format is emitted. +- I created a release of [Banshee](https://github.com/pulp-platform/banshee/releases/tag/v0.5.0-prebuilt) so we don't need to rebuild it over and over. The `Makefile` now pulls that release depending on the platform. 
+- I bumped the onnx-graphsurgeon version such that we don't need to use NVIDIA's PyPi index anymore. +- `_export_graph` assigns their export type to the tensors before export. +- `pytest` and `pytest-xdist` as dependencies of Deeploy. +- A `pytest.ini` for the global configuration of PyTest for the project. +- `conftest.py` to define CLI args for PyTest for the whole project, it also defines a set of global fixtures and markers. +- `pytestRunner.py` contains helper functions and fixtures for the whole project. +- `test_platforms.py` lists the E2E tests and sorts them into marked categories (per platform and per kernel/model). +- Each platform has a test config file where a list or a dict describes the tests. +- Support for unknown number of data dimensions in the tiler +- Parallelization support for the FP32 ReduceMean operator on PULPOpen +- Extensive testing for the ReduceMean operator +- Pass to remove ReduceMean operators that don't change data content, but only its shape +- Support for RMSNorm operation via operator decomposition. +- Added `Pow` (Power) and `Sqrt` (Square Root) operation support (Parsers, Layers, Bindings, Templates, and FP32 Kernels) for the Generic platform. +- Support for input tiling for PULP FP regular and DW conv 2D. +- CI tests for tiled Siracusa FP regular and DW conv 2D, with and without bias, for skip connections, and for the demo version of TinyViT. +- Documentation for PULP FP regular and DW conv 2D and MatMul tile constraints. +- PULP ReduceMean and Slice tile constraints. +- PULP 2D FP DW conv Im2Col template and kernel, with bias support. +- Bias support for PULP 2D FP regular conv Im2Col in template & kernel. +- PULP FP DW conv 2D parser. +- FP conv 2D (simple & DW), reshape & skip connection, and TinyViT demo tests to the non-tiled Siracusa CI pipeline. +- FP bindings and mappings for PULP slice, DW conv 2D, and reduce mean operations. 
+- FP PULP DW conv lowering optimization pass similar to the existing one for the integer version.
+- RemoveEmptyConvBiasPass to the PULP optimizer.
+- Add manual type inference feature (CLI: `--input-type-map`/`--input-offset-map`) to resolve ambiguities when test inputs are not representative enough
+- Added a `testTypeInferenceDifferentTypes` test case to validate type inference for different input types
+- Added `_mangleNodeNames` function to avoid duplicate node mappings
+- Output Docker image digests per platform (`amd64`, `arm64`) after build, which is used to construct the multi-arch Docker manifest. This prevents registry clutter caused by unnecessary per-architecture Docker tags.
+- AsyncDma abstraction of DMA's
+- test runner per DMA and a script that tests all the DMA's
+- generic Single/DoubleBufferingTilingCodeGeneration classes
+- TilingVariableReplacementUpdate class that updates the variable replacement refs
+- TilingHoistingMixIn class that encapsulates all the hoisting helper functions of tiling
+- sorting of input memory allocations to allow references that live in the same memory level as the memory they are referencing
+- a function that tests the tiling solution for correctness which currently only tests buffer allocation for byte alignment
+- IntrospectiveCodeTransformation: `_indexPointer()`, `indexVars()`, `dereferenceVars()`.
The `*Vars` functions index/dereference a list of variables (useful for tiling) +- NetworkContext: `unravelReference()` that unravels a `_ReferenceBuffer` until the base buffer +- NetworkContext: `is_object()` - helper function that determines whether the string represents a name of a local or global object +- NetworkContext: `is_buffer()` - helper function that determines whether the string represents a name of a buffer +- missing checks for environment variables +- `_permuteHyperRectangle` helper function +- Added CI badges to the README +- Added YAML linting to CI +- Added missing license headers and C header include guards +- Extended the pre-commit hooks to remove trailing whitespace, check licenses, format and lint files +- Reshape operator support for PULP (`ReshapeTemplate` in bindings) +- Missing class attributes in `Closure.py` +- reuse_skip_wrapper.py to manually skip files +- Centralized logging with `DEFAULT_LOGGER`, replacing `print` statements +- Debug logs for type checking/parsing; `__repr__` for core classes +- Buffer utilities: `checkNumLevels` validation and `sizeInBytes` method +- Per–memory-level usage tracking and worst-case reporting in `NetworkContext` +- Memory/I/O summaries and input/output logging in deployers +- RequantHelpers.py for Neureka's TileConstraints +- Added assertion that all the graph tensors after lowering have a shape annotated +- Added testFloatGEMMnobias +- Profiling support and optional comments in generated DMA code for better traceability +- Added new waiting-strategy logic with fine-grained `PerTensorWaitingStrategy` +- PULPClusterEngine now accepts a `n_cores` parameter to set the number of cores used +- annotateNCores method to PULPDeployer that adds an `n_cores` key to all PULPClusterEngine templates' operatorRepresentations +- Calculate non-kernel overhead and show total time spent during profiling + +### Changed +- Rename package name from `PULP-Deeploy` to `deeploy-pulp`. 
+- Each CI workflow has been simplified to call the pytest suite with certain markers. +- Structure of Tests subdir for improved ordering +- Structure of .gitignore file for improved ordering +- Decreased L1 maximal memory limit for CI pipeline tests where compatible thanks to the implementation of Conv2D input tiling support. +- Reduced size of reshape & skip connection test, for non-tiled Siracusa memory compatibility. +- Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`. +- mchan HAL is now reduced to bare-bones +- refactor of the IntrospectiveCodeTransformation to work on the Mako template +- refactor of memory allocation code transformation passes +- _ReferenceBuffer accepts an optional `offset` argument to offset the reference +- NetworkContext: `hoistReference` - accepts the actual buffer as reference instead of name, accepts shape, offset, and override_type arguments, and returns the actual buffer, not its name +- `_mangleNodeRep` -> `_mangleOpRepr` - the canonical name we use is `OperatorRepresentation`. `NodeRep` and `ParseDict` are old iterations of the name. 
+- rename of permutation functions to follow this convention: `permute` is an action that permutes something, `permutation` is a function that generates a permutation +- `_permuteList` to just `_permute` +- removed manual buffer name mangling since we do it in the ExecutionBlock generate() function, simplifies templates +- we now check that buffer shapes/hyperrectangles/tiling ranks match which required changing a few `serializeTilingSolution` functions to preserve the same shape rank +- big refactor of the code generation part of the TilingExtension and needed changes to PULPOpen and Snitch due to it +- PULPClusterTilingSB and PULPClusterTilingDB now allow for transfers of any rank (dimensionality) +- PULP's final output diff is now calculated as absolute error, instead of just subtraction +- common code generation code between testMVP/generateNetwork/... was extracted into a single `generateTestNetwork` function +- in some functions, instead of passing the name of a buffer, the actual buffer is just passed +- tile function allows overriding the optimizer with external tilingSolution and memoryMap +- refactor of the permutation functions for clarity +- Split CI into multiple workflow files: one per platform, one for lint & license, one for general Deeploy tests, one for infrastructure, and two for Docker flows, improving maintainability and status reporting +- Extended CI to check license in cMake and YAML files +- Removed all trailing whitespace +- Removed unnecessary includes from the PULP platform header list, such as `DeeployBasicMath.h`, for cleaner code generation +- Changed types and added correct casts to fix many compiler warnings in the PULP target library +- Use [reuse-tool](https://github.com/fsfe/reuse-tool) in pre-commit, CI, and Makefile for SPDX license header linting +- Deployer workflow now uses `prepare(...)` instead of `generateFunction(...)`. 
+- Removed `fromVariableBuffer` +- Refactored `hoistConstant` +- Refactored TransientBuffer's `__init__` +- Refactor of the NCHWtoNHWC passes +- Removed NodeMemoryLevelChecker, MemoryAwareNodeBinding +- Removed _parseNode from MemoryNetworkDeployer since we don't need the annotations before typeChecking anymore +- Removed Wmem variants of bindings and tile constraints from Neureka +- Disabled ICCT_ITA_8 MemPool test because it was using a lowering that created shapeless tensors +- Added missing shape annotation to the testTypeInferenceDifferentTypes +- Refactored DMA code generation (`SnitchDma`, `Mchan`) to correctly overlap transfers and compute in double-buffering mode +- changed `_mapNode` to `_selectEngine` which reduces the responsibility of that function to, as the name states, just engine selection +- Print kernel profiling information for all memory levels + +### Fixed +- Update `install.md` to remove rust mention and fix test command. +- Update `README.md` to remove reference to NVIDIA's PyPi index. +- `nvidia-pyindex` was broken as it now tries to build the wheel to respect the new policy on packages using `pyproject`. Instead of installing this package, we just add the `https://pypi.ngc.nvidia.com` channel to the pip config file. +- Pin versions of broken dependencies of Banshee. +- Fixed ReduceMean parallelization and tiling issues described in Issue [#134](https://github.com/pulp-platform/Deeploy/issues/134). +- Fixed PULP FP32 regular and DW Conv2D, and MatMul tile constraints. +- Fixed type casting for tiling code generation. +- Fixed bug in buffer name identification in code generation for tests with L3 default memory level. +- PULP GELU kernel to use tanh approximation. +- Fixed bug for non-batched elements in the PULPOpen FP GEMM and matmul templates. +- Added underscore to the beginning of closure names to avoid naming issues when they start with unsupported first characters (like numbers). +- Data types in the PULPOpen FP add and mul templates. 
+- Prevent node duplication for graphs generated via GraphSurgeon +- Resolved issue with missing `id` in the `Build Cache for Docker` step, used in the `Inject build-cache` step. +- Fix license CI check and prevent potential issues with `jq` installation +- PULP Gemm `batch` variable serialization +- Fixed multiple typos in variable and method names, such as changing `includeGobalReferences` to `includeGlobalReferences` and `dicardedMappers` to `discardedMappers` +- Corrected method usage in `importDeeployState` to call `NetworkContext.importNetworkContext` instead of the incorrect method name +- Correctly return `signProp` from `setupDeployer` instead of hardcoding the value to `False` in `testMVP.py` +- Fixed `Unsqueeze` Op. when using ONNX opset 13 or higher (from attribute to input) +- Fixed aliasing +- Missing layout transformation of the consts (bias, mul, add, shift in Conv/RequantizedConv) +- Keep mul/add rank of requantized Neureka tile constraints +- Fix bias hoisting in generic GEMM with no bias +- DMA synchronization bug causing reduced DB performance on memory-bound kernels. + +### Removed +- Delete outdated and unused `.gitlab-ci.yml` file +- dory_dma.c and dory_dma.h + +## Release v0.2.0 (2025-07-08) [#103](https://github.com/pulp-platform/Deeploy/pull/103) This release contains major architectural changes, new platform support, enhanced simulation workflows, floating-point kernel support, training infrastructure for CCT models, memory allocation strategies, and documentation improvements. 
### List of Pull Requests @@ -71,6 +261,13 @@ This release containing major architectural changes, new platform support, enhan ### Added +- BatchNorm kernel +- ConvTranspose kernel +- MaxPool1D kernel +- Template for 1D Convolution +- Support for float32 data type in the previous kernels +- Float binding for Pad1D kernel +- Test for Autoencoder1D in the CI pipeline - ChimeraDeployer, currently mainly a placeholder - Allocate templates for Chimera - ChimeraPlatform, using appropriate allocation templates and using the generic Parser + Binding for the Add node @@ -204,6 +401,8 @@ This release containing major architectural changes, new platform support, enhan - `dev-requirements.txt` tracking the dependencies of the build system, linting, documentation, and QOL. ### Changed +- FloatConvTemplate file +- Platform.py file - Bump the CMake version to 3.24 as required for the chimera-sdk - Bump GVSoC's version and add chimera simulation target - Rename the generic source util to utils to avoid name collision with chimera-sdk diff --git a/CMakeLists.txt b/CMakeLists.txt index 04a3121baf..70dec13084 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + cmake_minimum_required(VERSION 3.12) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) @@ -106,7 +110,7 @@ if(platform STREQUAL SoftHier) if(TOOLCHAIN STREQUAL GCC) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/softhier/toolchain_gcc.cmake) endif() - + include(${CMAKE_CURRENT_LIST_DIR}/cmake/softhier/softhier_gvsoc.cmake) project(deeploy LANGUAGES C ASM) diff --git a/CODEOWNERS b/CODEOWNERS index 40e51a533d..baeacab34e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1 +1,5 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: CC-BY-ND-4.0 + * @victor-jung @xeratec @lukamac diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 
08c2486cd6..2285670366 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,7 +1,6 @@ # Contribution Guide We encourage submitting your issues and work in pull requests against the `devel` branch. Please understand that we are trying to maintain a consistent minimal quality standard. -Any and all pull requests you submit can only be accepted under the Apache 2.0 License. ## Overview @@ -42,24 +41,16 @@ Additionally, add the title and link to the pull request in the list of pull req Deeploy mainly consists of code implemented in C, Makefile, and Python. To facilitate efficient collaboration among users and contributors, it is important to maintain a consistent coding style. To achieve this, it is strongly recommend to use autoformatting tools with the provided configuration files. Additionally, the Continuous Integration (CI) system checks the adherence to the style guide for each pushed commit. Currently configuration for C using `clang-format` and for Python using `yapf` and `isort` are provided. -To recursively format all Python files run: +You can format all relevant files by running: ```bash -autoflake -i -r --remove-all-unused-imports --ignore-init-module-imports --exclude "*/third_party/**" . -yapf -ipr . -isort . +make format ``` -And for C files: +Alternatively, to only lint the files without modifying them, you can run: ```bash -python scripts/run_clang_format.py -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" -ir --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format ./ +make lint ``` -Note that third party applications should not be formatted. You can alternatively also run: -```bash -make format -``` -to format all C and Python files. - ### Pre-commit Additionally, we provide the [pre-commit](https://pre-commit.com) configuration file which you can use to install github hooks that execute the formatting commands on your changes. @@ -81,3 +72,7 @@ pre-commit uninstall ``` _Note:_ This configures only the python formatting git hooks. 
The c formatting is not supported at the moment. + +## Licensing + +Any and all pull requests you submit can only be accepted under the Apache 2.0 License. Every file needs to have an SPDX license header. We use the [reuse-tool](https://github.com/fsfe/reuse-tool) to check for the license header. You can use the same tool to add the license by calling it with the `annotate` command. diff --git a/Container/Dockerfile.deeploy b/Container/Dockerfile.deeploy index 6222492f9c..5266b84982 100644 --- a/Container/Dockerfile.deeploy +++ b/Container/Dockerfile.deeploy @@ -1,16 +1,20 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + ########## Stage 1: Large image to build toolchains and emulator ########## ARG BASE_IMAGE=ghcr.io/pulp-platform/deeploy-toolchain FROM ${BASE_IMAGE} AS toolchain # Intermediate Stage ARG DEBIAN_FRONTEND=noninteractive -ARG BENDER_VERSION=0.28.1 +ARG BENDER_VERSION=0.29.1 ARG UBUNTU_VERSION=22.04 ARG TARGETPLATFORM ENV DEBIAN_FRONTEND=noninteractive ENV TZ=Etc/UTC -ENV PATH="/root/.cargo/bin:/app/bender:${PATH}" +ENV PATH="/app/install/bender:${PATH}" ENV LLVM_INSTALL_DIR=/app/install/llvm WORKDIR /app @@ -38,14 +42,13 @@ RUN --mount=type=cache,target=/ccache \ ccache -s # Install Bender -RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ +RUN mkdir -p /app/install/bender && \ +if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ wget https://github.com/pulp-platform/bender/releases/download/v${BENDER_VERSION}/bender-${BENDER_VERSION}-x86_64-linux-gnu-ubuntu${UBUNTU_VERSION}.tar.gz && \ - tar xzf bender-${BENDER_VERSION}-x86_64-linux-gnu-ubuntu${UBUNTU_VERSION}.tar.gz && \ - cp /app/bender /bin; \ + tar xzf bender-${BENDER_VERSION}-x86_64-linux-gnu-ubuntu${UBUNTU_VERSION}.tar.gz -C /app/install/bender; \ elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ - wget 
https://github.com/Xeratec/bender/releases/download/v0.28.3-rc1/bender-0.28.3-rc1-arm64-linux-gnu-ubuntu${UBUNTU_VERSION}.tar.gz && \ - tar xzf bender-0.28.3-rc1-arm64-linux-gnu-ubuntu${UBUNTU_VERSION}.tar.gz && \ - cp /app/bender /bin; \ + wget https://github.com/pulp-platform/bender/releases/download/v${BENDER_VERSION}/bender-${BENDER_VERSION}-arm64-linux-gnu-ubuntu${UBUNTU_VERSION}.tar.gz && \ + tar xzf bender-${BENDER_VERSION}-arm64-linux-gnu-ubuntu${UBUNTU_VERSION}.tar.gz -C /app/install/bender; \ fi # Compile Snitch Runtime @@ -73,18 +76,22 @@ ENV SNITCH_HOME=/app/install/snitch_cluster ENV CHIMERA_SDK_HOME=/app/install/chimera-sdk ENV LLVM_INSTALL_DIR=/app/install/llvm ENV GVSOC_INSTALL_DIR=/app/install/gvsoc +ENV BANSHEE_INSTALL_DIR=/app/install/banshee ENV SOFTHIER_INSTALL_DIR=/app/install/softhier ENV MINIMALLOC_INSTALL_DIR=/app/install/minimalloc ENV MEMPOOL_HOME=/app/install/mempool -ENV PATH=/root/.cargo/bin:/app/install/qemu/bin:/app/install/banshee:$PATH +ENV BENDER_INSTALL_DIR=/app/install/bender +ENV PATH=/app/install/qemu/bin:/app/install/banshee:/app/install/bender:$PATH WORKDIR /app COPY pyproject.toml ./ +# Add nvidia channel to the pip configuration +RUN mkdir -p /etc && printf "[global]\nextra-index-url = https://pypi.ngc.nvidia.com\n" > /etc/pip.conf + # Install dependencies -RUN mkdir -p /root/.cargo/bin/ && \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y git-lfs \ wget \ ccache \ @@ -99,7 +106,6 @@ RUN mkdir -p /root/.cargo/bin/ && \ curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ python get-pip.py && \ rm get-pip.py && \ - pip install nvidia-pyindex && \ pip install toml-to-requirements && \ toml-to-req --toml-file pyproject.toml && \ pip install -r requirements.txt @@ -122,4 +128,3 @@ RUN pip install -r requirements-dev.txt -r core-requirements.txt -r gapy-require # Copy pre-built toolchains and emulators COPY --from=toolchain /app/install ./install -COPY --from=toolchain /root/.cargo/bin/banshee 
/root/.cargo/bin/banshee \ No newline at end of file diff --git a/Container/Dockerfile.toolchain b/Container/Dockerfile.toolchain index 6e15d32197..3bd03b14da 100644 --- a/Container/Dockerfile.toolchain +++ b/Container/Dockerfile.toolchain @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + ########## Stage 1: Large image to build toolchains and emulator ########## FROM ubuntu:22.04 AS toolchain @@ -9,7 +13,6 @@ ENV TZ=Etc/UTC ENV CC="ccache gcc" ENV CXX="ccache g++" ENV CCACHE_DIR=/ccache -ENV PATH="/root/.cargo/bin:${PATH}" # Change the working directory WORKDIR /app @@ -64,14 +67,6 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ python get-pip.py && \ rm get-pip.py -# Build Rust tools -RUN apt remove cargo -y && \ - apt autoremove -y && \ - curl https://sh.rustup.rs -sSf | bash -s -- -y && \ - rustup install 1.63.0 && \ - rustup default 1.63.0 && \ - rustup component add rust-src - # Install meson RUN pip install meson diff --git a/Container/Makefile b/Container/Makefile index 7432aa059e..cd07d8632b 100644 --- a/Container/Makefile +++ b/Container/Makefile @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # -# File: Makefile -# -# Created: 20.05.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Authors: -# - Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. # Variables TOOLCHAIN_IMAGE ?= ghcr.io/pulp-platform/deeploy-toolchain:latest diff --git a/Deeploy.code-workspace b/Deeploy.code-workspace index 25b484c3fc..e601282bd3 100644 --- a/Deeploy.code-workspace +++ b/Deeploy.code-workspace @@ -1,3 +1,7 @@ +// SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + { "folders": [ { diff --git a/Deeploy/AbstractDataTypes.py b/Deeploy/AbstractDataTypes.py index 6af502c0f7..feeebe939b 100644 --- a/Deeploy/AbstractDataTypes.py +++ b/Deeploy/AbstractDataTypes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: AbstractDataTypes.py -# -# Last edited: 25.04.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from __future__ import annotations @@ -42,6 +21,8 @@ _DeeployType = TypeVar("_DeeployType", _PointerType, _ImmediateType, _StructType) _PythonType = TypeVar("_PythonType", str, int, float, Dict[str, "_PythonType"], Iterable["_PythonType"]) +from Deeploy.Logging import DEFAULT_LOGGER as log + class _ClassPropertyDescriptor(object): @@ -209,6 +190,10 @@ def typeMin(cls) -> int: else: return 0 + @_classproperty + def nLevels(cls) -> int: + return cls.typeMax - cls.typeMin + 1 + @classmethod def partialOrderUpcast(cls, otherCls: Type[Immediate]) -> bool: if issubclass(otherCls, IntegerImmediate): @@ -234,6 +219,10 @@ def checkValue(cls, value: Union[int, Iterable[int], np.ndarray], ctxt: Optional return False return True + @classmethod + def fitsNumLevels(cls, nLevels: int) -> bool: + return nLevels <= cls.nLevels + class FloatImmediate(Immediate[Union[float, Iterable[float]], _ImmediateType]): typeMantissa: int #: int: Represents the number of bits reserved for the mantissa part @@ -267,8 +256,8 @@ def partialOrderUpcast(cls, otherCls: Type[Immediate]) -> bool: @classmethod def checkValue(cls, value: Union[float, Iterable[float], np.ndarray], ctxt: Optional[_NetworkContext] = None): """ - This method tries to manually cast standard python's standard immediate float precision values - (64 bits) to an arbitrary FP representation and check if the new representation is close enough + This method tries to manually cast standard python's standard immediate float precision values + (64 bits) to an arbitrary FP representation and check if the new representation is close enough to the original value. 
""" _val_list = [] @@ -329,7 +318,7 @@ def checkValue(cls, value: Optional[str], ctxt: Optional[_NetworkContext] = None return False if value is None or value == "NULL": - print("WARNING: Setting pointer value to NULL - Referenced data is invalid!") + log.warning("Setting pointer value to NULL - Referenced data is invalid!") return True reference = ctxt.lookup(value) @@ -353,6 +342,10 @@ def checkPromotion(cls, _value: Union[Optional[str], Pointer], ctxt: Optional[_N value = _value return cls.checkValue(value, ctxt) + @classmethod + def fitsNumLevels(cls, nLevels: int) -> bool: + return cls.referencedType.fitsNumLevels(nLevels) + def __init__(self, _value: Union[Optional[str], Pointer], ctxt: Optional[_NetworkContext] = None): """Initializes a pointer to a registered object in the NetworkContext diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py index b137266de6..70a91fd0ce 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/Closure.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: Closure.py -# -# Last edited: 12.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, Optional, Tuple, Type, Union @@ -41,7 +20,10 @@ _closureTemplate = NodeTemplate(""" static void ${closureName}(void* ${closureName}_args){ // CLOSURE ARG CAST +% if len(closureStructArgs.value) > 0: ${closureStructArgs.typeName}* args = (${closureStructArgs.typeName}*) ${closureStructArgName}; +% endif + % for argName, argType in closureStructArgs.value.items(): ${argType.typeName} ${argName} = args->${argName}; % endfor @@ -85,7 +67,8 @@ def baseBlock(self): class ClosureGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): - closureStructArgs: Struct + closureStructArgType: Dict[str, Type[Union[Pointer, Immediate, Struct]]] + closureStructArgs: Dict[str, Union[Pointer, Immediate, Struct]] def __init__(self, closureCallTemplate: NodeTemplate = _closureCallTemplate, @@ -109,7 +92,7 @@ def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: Execution closureStruct: Dict[str, Union[Pointer, Immediate, Struct]] = {} makoDynamicReferences = self.extractDynamicReferences(ctxt, executionBlock, True) - for arg in list(dict.fromkeys(makoDynamicReferences)): + for arg in makoDynamicReferences: ref = ctxt.lookup(arg) if isinstance(ref, TransientBuffer): closureStructArgsType[ctxt._mangle(arg)] = PointerClass(VoidType) @@ -175,7 +158,8 @@ def apply(self, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - self.closureName = name + self.closureSuffix + # Prepend underscore to avoid name issues when beginning with problematic characters (like numbers) + self.closureName = "_" + name + self.closureSuffix self.functionCall = executionBlock.generate(ctxt) self._generateClosureStruct(ctxt, executionBlock) ctxt = self._generateClosureCtxt(ctxt, name) @@ -202,7 +186,7 @@ def _generateClosureStruct(self, ctxt: NetworkContext, executionBlock: 
Execution # Add closure struct info to operatorRepresentation closureStructArgsType = {} closureStruct = {} - makoDynamicReferences = self.extractDynamicReferences(ctxt, executionBlock, True) + makoDynamicReferences = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True) filteredMakoDynamicReferences = [] diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py b/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py index eb74e5a70f..42f5d57b1a 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/CycleMeasurement.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: CycleMeasurement.py -# -# Last edited: 13.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Tuple diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/IntrospectiveCodeTransformation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/IntrospectiveCodeTransformation.py index acdcc0d09c..7e682b2644 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/IntrospectiveCodeTransformation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/IntrospectiveCodeTransformation.py @@ -1,38 +1,17 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: IntrospectiveBinding.py -# -# Last edited: 10.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import copy import types from typing import Dict, List import mako.codegen as codegen from mako.lexer import Lexer -from mako.parsetree import Expression, TemplateNode +from mako.parsetree import Expression, TemplateNode, Text +from mako.template import Template from Deeploy.AbstractDataTypes import Pointer, Struct -from Deeploy.DeeployTypes import ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.DeeployTypes import ExecutionBlock, NetworkContext, OperatorRepresentation, VariableBuffer _NULL: str = "NULL" @@ -42,78 +21,89 @@ class IntrospectiveCodeTransformationMixIn(): parseTreeDict: Dict[int, TemplateNode] = {} @staticmethod - def _generateParseTree(template: NodeTemplate) -> TemplateNode: - return Lexer(template.template._source).parse() + def _generateParseTree(template: Template) -> TemplateNode: + return Lexer(template._source).parse() @staticmethod - def _reconstructCode(template: NodeTemplate, node: TemplateNode): - - def fixupParseTree(parseTree: TemplateNode) -> TemplateNode: - nodes = [] - prevLine = 0 - prevPos = 0 - for node in parseTree.nodes: - - newNode = copy.copy(node) - offset = len(node.source) - - # Expression contain the actual expression + the symbols "${}", i.e. 
3 offset symbols - if isinstance(newNode, Expression): - offset += 3 + def _reconstructCode(template: Template, node: TemplateNode) -> Template: + lexer = Lexer(template._source) + source = codegen.compile( + node, + template.uri, + None, + default_filters = template.default_filters, + buffer_filters = template.buffer_filters, + imports = template.imports, + future_imports = template.future_imports, + source_encoding = lexer.encoding, + generate_magic_comment = True, + strict_undefined = template.strict_undefined, + enable_loop = template.enable_loop, + reserved_names = template.reserved_names, + ) + module = types.ModuleType(template.module_id) + code = compile(source, template.module_id, "exec") + exec(code, module.__dict__, module.__dict__) - prevPos = prevPos + offset + template._code = code + template.module = module + template.callable_ = template.module.render_body + return template - if prevLine != node.lineno: - prevPos = node.pos + @staticmethod + def _indexPointer(parseTree: TemplateNode, ptrName: str, index: str) -> TemplateNode: + indexes = [i for i, node in enumerate(parseTree.nodes) if isinstance(node, Expression) and node.text == ptrName] - newNode.pos = prevPos - prevLine = node.lineno + for offset, idx in enumerate(indexes): + bracketOpen = Text("[", source = "[", lineno = 0, pos = 0, filename = None) + indexExpr = Expression(index, '', source = index, lineno = 0, pos = 0, filename = None) + bracketClose = Text("]", source = "]", lineno = 0, pos = 0, filename = None) + parseTree.nodes.insert(idx + 3 * offset + 1, bracketOpen) + parseTree.nodes.insert(idx + 3 * offset + 2, indexExpr) + parseTree.nodes.insert(idx + 3 * offset + 3, bracketClose) - nodes.append(newNode) + return parseTree - parseTree.nodes = nodes + @staticmethod + def indexVars(template: Template, varNames: List[str], index: str) -> None: + if len(varNames) == 0: + return + parseTree = IntrospectiveCodeTransformationMixIn._generateParseTree(template) + for name in varNames: + 
parseTree = IntrospectiveCodeTransformationMixIn._indexPointer(parseTree, name, index) + IntrospectiveCodeTransformationMixIn._reconstructCode(template, parseTree) - return parseTree + @staticmethod + def _dereferencePointer(parseTree: TemplateNode, ptrName: str) -> TemplateNode: + indexes = [i for i, node in enumerate(parseTree.nodes) if isinstance(node, Expression) and node.text == ptrName] - node = fixupParseTree(node) + for offset, idx in enumerate(indexes): + text = Text("*", source = "*", lineno = 0, pos = 0, filename = None) + parseTree.nodes.insert(idx + offset, text) - temp = template.template - lexer = Lexer(temp._source) - source = codegen.compile( - node, - temp.uri, - None, - default_filters = temp.default_filters, - buffer_filters = temp.buffer_filters, - imports = temp.imports, - future_imports = temp.future_imports, - source_encoding = lexer.encoding, - generate_magic_comment = True, - strict_undefined = temp.strict_undefined, - enable_loop = temp.enable_loop, - reserved_names = temp.reserved_names, - ) - module = types.ModuleType(temp.module_id) - code = compile(source, temp.module_id, "exec") - exec(code, module.__dict__, module.__dict__) + return parseTree - temp._code = code - temp.module = module - temp.callable_ = temp.module.render_body - template.template = temp + @staticmethod + def dereferenceVars(template: Template, varNames: List[str]) -> None: + if len(varNames) == 0: + return + parseTree = IntrospectiveCodeTransformationMixIn._generateParseTree(template) + for name in varNames: + parseTree = IntrospectiveCodeTransformationMixIn._dereferencePointer(parseTree, name) + IntrospectiveCodeTransformationMixIn._reconstructCode(template, parseTree) def extractDynamicReferences(self, ctxt: NetworkContext, executionBlock: ExecutionBlock = None, unrollStructs = False, - includeGobalReferences = False): + includeGlobalReferences = False): makoDynamicReferences = [] for codeSnippet in executionBlock.codeSnippets: template, operatorRepresentation = 
codeSnippet.template, codeSnippet.operatorRepresentation - newRefs = self._extractDynamicExpressions(ctxt, operatorRepresentation, template, unrollStructs, - includeGobalReferences) + newRefs = self._extractDynamicExpressions(ctxt, operatorRepresentation, template.template, unrollStructs, + includeGlobalReferences) makoDynamicReferences += newRefs @@ -132,11 +122,10 @@ def _fixCtxtOrdering(ctxt: NetworkContext, nameList: List[str]) -> List[str]: def _extractDynamicExpressions(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, - template: NodeTemplate, + template: Template, unrollStructs = False, - includeGobalReferences = False): - - codeHash = hash(template.template._source) + includeGlobalReferences = False): + codeHash = hash(template._source) if codeHash in self.parseTreeDict.keys(): makoParseTree = self.parseTreeDict[codeHash] @@ -146,62 +135,66 @@ def _extractDynamicExpressions(self, self.parseTreeDict[codeHash] = makoParseTree # Filter parsing tree for expressions - makoExpressions = [node.text for node in makoParseTree.nodes if type(node) == Expression] + makoExpressions = [node.text for node in makoParseTree.nodes if isinstance(node, Expression)] - # Filter expressions for local variables contained in operatorRepresentation - makoLocalReferences = [ - node for node in makoExpressions - if ((node in operatorRepresentation) and type(operatorRepresentation[node]) == str and ( - operatorRepresentation[node] in ctxt.localObjects.keys())) + # Filter represented expressions + representedExpressions = [ + operatorRepresentation[expr] for expr in makoExpressions if expr in operatorRepresentation ] - # Filter expressions for global variables contained in operatorRepresentation - makoGlobalReferences = [ - node for node in makoExpressions - if ((node in operatorRepresentation) and type(operatorRepresentation[node]) == str and ( - operatorRepresentation[node] in ctxt.globalObjects.keys())) - ] + # Add in mako expressions that are accessed 
through pageargs + # Required for unknown number of data dimensions + for expr in makoExpressions: + if expr.startswith("pageargs["): + # Extract key inside pageargs[] + key = expr[len("pageargs["):-1] + assert key.startswith("'") or key.startswith( + "\""), f"pageargs key must begin with a string literal, got: {key}" + + # Extract initial string literal (between first 2 " or ' characters) + quoteChar = key[0] + endIdx = key.find(quoteChar, 1) + key = key[1:endIdx] + + assert endIdx != -1, f"pageargs key missing closing quote: {expr}" + + # Search for all expressions that begin with the given key + for exprKey in operatorRepresentation.keys(): + if exprKey.startswith(key): + representedExpressions.append(operatorRepresentation[exprKey]) + + # Filter buffers from expressions + references = [expr for expr in representedExpressions if ctxt.is_buffer(expr)] + + if unrollStructs: + + def _unrollStructReferences(val: Struct) -> List[str]: + assert isinstance(val, Struct) + # Recursively unroll struct references + structReferences = [] + for field in val.value.values(): + if isinstance(field, Struct): + structReferences += _unrollStructReferences(field) + elif isinstance(field, Pointer) and field.referenceName != _NULL: + structReferences.append(field.referenceName) + return structReferences + + # Unroll local struct references + for ref in references: + if hasattr(ctxt.lookup(ref), "structDict"): + references += _unrollStructReferences(ctxt.lookup(ref).structDict) - def _unrollStructReferences(val) -> List[str]: - # Unroll struct references - structReferences = [] - if isinstance(val, Struct): - for key, _type in val.value.items(): - if isinstance(_type, Struct): - structReferences += _unrollStructReferences(val.value[key]) - elif isinstance(_type, Pointer) and val.value[key].referenceName != _NULL: - structReferences.append(val.value[key].referenceName) - return structReferences - - # Unroll local struct references - localReferences = [] - localStructReferences = [] - 
for ref in makoLocalReferences: - localReferences.append(operatorRepresentation[ref]) - if unrollStructs: - if ctxt.is_local(operatorRepresentation[ref]) and hasattr(ctxt.lookup(operatorRepresentation[ref]), - "structDict"): - localStructReferences += _unrollStructReferences( - ctxt.lookup(operatorRepresentation[ref]).structDict) - - # Unroll global struct references - globalReferences = [] - globalStructReferences = [] - for ref in makoGlobalReferences: - globalReferences.append(operatorRepresentation[ref]) - if unrollStructs: - if ctxt.is_global(operatorRepresentation[ref]) and hasattr(ctxt.lookup(operatorRepresentation[ref]), - "structDict"): - globalStructReferences += _unrollStructReferences( - ctxt.lookup(operatorRepresentation[ref]).structDict) + # Filter expressions for local variables contained in operatorRepresentation + localReferences = [ref for ref in references if ctxt.is_local(ref)] + + # Filter expressions for global variables contained in operatorRepresentation + globalReferences = [ref for ref in references if ctxt.is_global(ref)] # Filter for dynamically allocated tensors - dynamicLocalReferences = [ref for ref in localReferences + localStructReferences if ctxt.lookup(ref)._deploy] - dynamicGlobalReferences = [ - ref for ref in globalReferences + globalStructReferences if isinstance(ctxt.lookup(ref), VariableBuffer) - ] + dynamicLocalReferences = [ref for ref in localReferences if ctxt.lookup(ref)._deploy] + dynamicGlobalReferences = [ref for ref in globalReferences if isinstance(ctxt.lookup(ref), VariableBuffer)] - if includeGobalReferences: + if includeGlobalReferences: return dynamicLocalReferences + dynamicGlobalReferences else: return dynamicLocalReferences diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py index b95d31a01b..f10d333502 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py +++ 
b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MemoryAllocation.py -# -# Last edited: 12.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import re from functools import partial @@ -30,7 +9,7 @@ from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \ IntrospectiveCodeTransformationMixIn from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ - NodeTemplate, StructBuffer, TransientBuffer, _NoVerbosity + NodeTemplate, StructBuffer, TransientBuffer, VariableBuffer, _NoVerbosity, _ReferenceBuffer class _ArgStructAllocateTemplate(NodeTemplate): @@ -77,112 +56,103 @@ def apply(self, class MemoryManagementGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): - def __init__(self, memoryHierarchyRegex: Optional[str] = None): + def __init__(self, memoryLevelRegex: Optional[str] = None): super().__init__() - if memoryHierarchyRegex is not None: - self.regex = re.compile(memoryHierarchyRegex) + if memoryLevelRegex is not None: + self.regex = re.compile(memoryLevelRegex) else: self.regex = None - def _matchesRegex(self, ctxt: NetworkContext, key: str) -> bool: - _buffer = ctxt.lookup(key) - + def is_memory_level(self, buffer: VariableBuffer) -> bool: if self.regex is None: - return not hasattr(_buffer, "_memoryLevel") - - if not hasattr(_buffer, "_memoryLevel"): - return False - - ret = self.regex.findall(ctxt.lookup(key)._memoryLevel) - return ret != [] - - def _extractTransientBuffers(self, ctxt: NetworkContext, name: str) -> List[str]: - names = [] - - for key, _buffer in ctxt.localObjects.items(): - if isinstance(_buffer, TransientBuffer) and name in _buffer._users: - names.append(key) - - filteredNames = [key for key in names if self._matchesRegex(ctxt, key)] - - return filteredNames + return not hasattr(buffer, "_memoryLevel") + else: + return hasattr(buffer, "_memoryLevel") and self.regex.fullmatch(buffer._memoryLevel) is not None - def _getOutputNames(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str) -> List[str]: - outputs = [] - references = 
self.extractDynamicReferences(ctxt, executionBlock, True) - localKeys = [key for key in references if ctxt.is_local(key)] + @staticmethod + def is_final_input(buffer: VariableBuffer, nodeName: str) -> bool: + return not isinstance(buffer, (StructBuffer, TransientBuffer)) and \ + len(buffer._users) > 0 and nodeName == buffer._users[-1] - filteredKeys = [key for key in localKeys if self._matchesRegex(ctxt, key)] + @staticmethod + def is_output(buffer: VariableBuffer, nodeName: str) -> bool: + return not isinstance(buffer, (StructBuffer, TransientBuffer)) and nodeName not in buffer._users - for key in filteredKeys: - _buffer = ctxt.lookup(key) - if isinstance(_buffer, (StructBuffer, TransientBuffer)): - continue - if name not in _buffer._users: - outputs.append(_buffer.name) + @staticmethod + def is_transient(buffer: VariableBuffer, nodeName: str) -> bool: + return isinstance(buffer, TransientBuffer) and nodeName in buffer._users - return list(dict.fromkeys(outputs)) + @staticmethod + def topologicallySortBuffers(buffers: List[VariableBuffer]) -> List[VariableBuffer]: + sortedBuffers = [] + unsortedBufferNames = [buff.name for buff in buffers] + lastLen = len(unsortedBufferNames) - def _getFinalInputNames(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str) -> List[str]: - inputs = [] - references = self.extractDynamicReferences(ctxt, executionBlock, True) - localKeys = [key for key in references if ctxt.is_local(key)] + while len(unsortedBufferNames) > 0: + for buffer in buffers: + if isinstance(buffer, _ReferenceBuffer) and buffer._referenceName in unsortedBufferNames: + continue - filteredKeys = [key for key in localKeys if self._matchesRegex(ctxt, key)] + sortedBuffers.append(buffer) + unsortedBufferNames.remove(buffer.name) - for key in filteredKeys: - _buffer = ctxt.lookup(key) - if isinstance(_buffer, (StructBuffer, TransientBuffer)) or _buffer._users == []: - continue - if name == _buffer._users[-1]: - inputs.append(_buffer.name) + assert 
len( + unsortedBufferNames) != lastLen, f"Circular reference detected among buffers: {unsortedBufferNames}" + lastLen = len(unsortedBufferNames) - return list(dict.fromkeys(inputs)) + return sortedBuffers def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + references = self.extractDynamicReferences(ctxt, + executionBlock, + unrollStructs = True, + includeGlobalReferences = False) + localBuffers = [ctxt.localObjects[ref] for ref in references] + memoryLevelBuffers = [buff for buff in localBuffers if self.is_memory_level(buff)] - outputNames = self._getOutputNames(ctxt, executionBlock, name) - inputNames = self._getFinalInputNames(ctxt, executionBlock, name) - transientBuffers = self._extractTransientBuffers(ctxt, name) + transients = [buff for buff in memoryLevelBuffers if self.is_transient(buff, name)] + outputs = [buff for buff in memoryLevelBuffers if self.is_output(buff, name)] + inputs = [buff for buff in memoryLevelBuffers if self.is_final_input(buff, name)] # We have to allocate the output buffers, unless they are global - - for buffer in list(reversed(outputNames)) + transientBuffers: - # Extract buffer info from context - nb = ctxt.lookup(buffer) - - # Check that it was not already allocated - assert ctxt.localObjects[nb.name]._live == False, f"Tried to allocate already live buffer {nb.name}" - - # Mark it as live - ctxt.localObjects[nb.name]._live = True - - # Add the allocation code to the execution block - executionBlock.addLeft(nb.allocTemplate, nb._bufferRepresentation()) - - for buffer in inputNames + transientBuffers: - # Extract buffer info from context - nb = ctxt.lookup(buffer) - - # Check that it was not already deallocated - assert ctxt.localObjects[nb.name]._live == True, f"Tried to deallocate already dead buffer {nb.name}" - - # Mark it as dead (not useful anymore) - ctxt.localObjects[nb.name]._live = False - - # Check for live 
ancestors (buffers that this is an alias of, that are still live), - # and add the deallocation code to the execution block if none found - if not nb.has_live_ancestors(ctxt = ctxt): - executionBlock.addRight(nb.deallocTemplate, nb._bufferRepresentation()) + for buffer in reversed(self.topologicallySortBuffers(outputs + transients)): + assert buffer._live == False, f"Tried to allocate already live buffer {buffer.name}" + buffer._live = True + + memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel + if memoryLevel not in ctxt._dynamicSize: + ctxt._dynamicSize[memoryLevel] = int(buffer.sizeInBytes()) + else: + ctxt._dynamicSize[memoryLevel] += int(buffer.sizeInBytes()) + + executionBlock.addLeft(buffer.allocTemplate, buffer._bufferRepresentation()) + + for levels in ctxt._dynamicSize.keys(): + if levels not in ctxt._maxDynamicSize: + ctxt._maxDynamicSize[levels] = max(0, ctxt._dynamicSize[levels]) + else: + ctxt._maxDynamicSize[levels] = max(ctxt._maxDynamicSize.get(levels, 0), ctxt._dynamicSize[levels]) + + for buffer in inputs + transients: + assert buffer._live == True, f"Tried to deallocate already dead buffer {buffer.name}" + buffer._live = False + # Don't deallocate if it's an alias of a live buffer + if not buffer.has_live_aliases(ctxt): + memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel + if memoryLevel not in ctxt._dynamicSize: + ctxt._dynamicSize[memoryLevel] = 0 + else: + ctxt._dynamicSize[memoryLevel] -= int(buffer.sizeInBytes()) + executionBlock.addRight(buffer.deallocTemplate, buffer._bufferRepresentation()) return ctxt, executionBlock -class MemoryPassthroughGeneration(MemoryManagementGeneration, IntrospectiveCodeTransformationMixIn): +class MemoryPassthroughGeneration(MemoryManagementGeneration): def __init__(self, memoryHierarchyRegex: Optional[str] = None): super().__init__(memoryHierarchyRegex) @@ -192,22 +162,43 @@ def apply(self, executionBlock: ExecutionBlock, name: str, 
verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - - outputNames = self._getOutputNames(ctxt, executionBlock, name) - inputNames = self._getFinalInputNames(ctxt, executionBlock, name) - transientBuffers = self._extractTransientBuffers(ctxt, name) - - # We have to allocate the output buffers, unless they are global - for buffer in outputNames + transientBuffers: - nb = ctxt.lookup(buffer) - - assert ctxt.localObjects[nb.name]._live == False, f"Tried to allocate already live buffer {nb.name}" - ctxt.localObjects[nb.name]._live = True - - for buffer in inputNames + transientBuffers: - nb = ctxt.lookup(buffer) - - assert ctxt.localObjects[nb.name]._live == True, f"Tried to deallocate already dead buffer {nb.name}" - ctxt.localObjects[nb.name]._live = False + references = self.extractDynamicReferences(ctxt, + executionBlock, + unrollStructs = True, + includeGlobalReferences = False) + localBuffers = [ctxt.localObjects[ref] for ref in references] + memoryLevelBuffers = [buff for buff in localBuffers if self.is_memory_level(buff)] + + transients = [buff for buff in memoryLevelBuffers if self.is_transient(buff, name)] + outputs = [buff for buff in memoryLevelBuffers if self.is_output(buff, name)] + inputs = [buff for buff in memoryLevelBuffers if self.is_final_input(buff, name)] + + for buffer in outputs + transients: + assert buffer._live == False, f"Tried to allocate already live buffer {buffer.name}" + + memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel + if memoryLevel not in ctxt._dynamicSize: + ctxt._dynamicSize[memoryLevel] = int(buffer.sizeInBytes()) + else: + ctxt._dynamicSize[memoryLevel] += int(buffer.sizeInBytes()) + + buffer._live = True + + for levels in ctxt._dynamicSize.keys(): + if levels not in ctxt._maxDynamicSize: + ctxt._maxDynamicSize[levels] = max(0, ctxt._dynamicSize[levels]) + else: + ctxt._maxDynamicSize[levels] = max(ctxt._maxDynamicSize.get(levels, 0), 
ctxt._dynamicSize[levels]) + + for buffer in inputs + transients: + assert buffer._live == True, f"Tried to deallocate already dead buffer {buffer.name}" + + memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel + if memoryLevel not in ctxt._dynamicSize: + ctxt._dynamicSize[memoryLevel] = 0 + else: + ctxt._dynamicSize[memoryLevel] -= int(buffer.sizeInBytes()) + + buffer._live = False return ctxt, executionBlock diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py b/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py index d1f74e43d0..300c5d2ad9 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/PrintInputs.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PrintInput.py -# -# Last edited: 13.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import re from typing import Optional, Tuple @@ -40,9 +19,9 @@ accessStr += "[" + f"print_iter_{idx}" + "]" if idx > 0: dimStr += "[" + f"{dim}" + "]" -formatSpecifier = "%*i" +formatSpecifier = "%*i" if "float" in bufferType.referencedType.typeName or "double" in bufferType.referencedType.typeName: - formatSpecifier = "%*.6f" + formatSpecifier = "%*.6f" %> printf("${nodeName} ${bufferName}: ${bufferType.referencedType.typeName}, ${bufferShape}, %p\\n", ${bufferName}); % for idx, dim in enumerate(bufferShape): @@ -83,7 +62,7 @@ def apply(self, references = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True, - includeGobalReferences = True) + includeGlobalReferences = True) for ref in references: refDict = self._getRepDict(ctxt, ref, name) @@ -126,7 +105,7 @@ def apply(self, references = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True, - includeGobalReferences = True) + includeGlobalReferences = True) filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] @@ -167,7 +146,7 @@ def apply(self, references = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True, - includeGobalReferences = True) + includeGlobalReferences = True) for ref in references: rep = self._getRepDict(ctxt, ref, name) @@ -188,7 +167,7 @@ def apply(self, references = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True, - includeGobalReferences = True) + includeGlobalReferences = True) filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] @@ -220,7 +199,7 @@ def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, references = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True, - includeGobalReferences = True) + includeGlobalReferences = True) for ref in references: rep = self._getRepDict(ctxt, ref, name) @@ -241,7 +220,7 @@ def apply(self, references = self.extractDynamicReferences(ctxt, executionBlock, unrollStructs = True, - 
includeGobalReferences = True) + includeGlobalReferences = True) filteredReferences = [ref for ref in references if self._matchesRegex(ctxt, ref)] diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/__init__.py b/Deeploy/CommonExtensions/CodeTransformationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/__init__.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/CommonExtensions/DataTypes.py b/Deeploy/CommonExtensions/DataTypes.py index f88eef2851..c05ea3b9d9 100644 --- a/Deeploy/CommonExtensions/DataTypes.py +++ b/Deeploy/CommonExtensions/DataTypes.py @@ -1,29 +1,10 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: BasicDataTypes.py -# -# Last edited: 31.08.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Tuple, Type +from typing import Iterable, Tuple, Type, Union + +import numpy.typing as npt from Deeploy.AbstractDataTypes import FloatImmediate, IntegerImmediate @@ -106,9 +87,37 @@ class float64_t(FloatImmediate): SignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (int8_t, int16_t, int32_t, int64_t) UnsignedIntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (uint8_t, uint16_t, uint32_t, uint64_t) -IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = (sorted(( - *SignedIntegerDataTypes, - *UnsignedIntegerDataTypes, -), - key = lambda _type: _type.typeWidth)) -FloatDataTypes: Tuple[Type[FloatImmediate], ...] = (bfloat16_t, float16_t, float32_t, float64_t) \ No newline at end of file +IntegerDataTypes: Tuple[Type[IntegerImmediate], ...] = tuple( + sorted(( + *SignedIntegerDataTypes, + *UnsignedIntegerDataTypes, + ), key = lambda _type: _type.typeWidth)) +FloatDataTypes: Tuple[Type[FloatImmediate], ...] 
= (bfloat16_t, float16_t, float32_t, float64_t) + + +def minimalIntegerType(value: Union[int, Iterable[int], npt.NDArray]) -> Type[IntegerImmediate]: + # Sort data types by typeWidth and signedness (unsigned types go first) + sorted_types = sorted( + IntegerDataTypes, + key = lambda t: (t.typeWidth, t.typeMin < 0), + ) + + for _type in sorted_types: + if _type.checkValue(value): + return _type + + raise RuntimeError(f"Couldn't find appropriate integer type for value: {value}") + + +def minimalFloatType(value: Union[float, Iterable[float], npt.NDArray]) -> Type[FloatImmediate]: + # Sort data types by typeWidth + sorted_types = sorted( + FloatDataTypes, + key = lambda t: t.typeWidth, + ) + + for _type in sorted_types: + if _type.checkValue(value): + return _type + + raise RuntimeError(f"Couldn't find appropriate float type for value: {value}") diff --git a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py index 6a99627a1b..a8f27b5463 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py +++ b/Deeploy/CommonExtensions/NetworkDeployers/NetworkDeployerWrapper.py @@ -1,33 +1,10 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: NetworkDeployerWrapper.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any, Tuple, Union import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkContext, NetworkDeployer, ONNXLayer, _NoVerbosity +from Deeploy.DeeployTypes import CodeGenVerbosity, DeploymentEngine, NetworkContext, NetworkDeployer, _NoVerbosity class NetworkDeployerWrapper(NetworkDeployer): @@ -84,15 +61,16 @@ def lower(self, graph: gs.Graph) -> gs.Graph: def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): return self._innerObject.codeTransform(verbose) - # MemoryAwareDeployer augment - def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, - default_channels_first: bool) -> Tuple[NetworkContext, bool]: - return self._innerObject._parseNode(node, ctxt, default_channels_first) - # PULPDeployer augment def generateBufferAllocationCode(self) -> str: return self._innerObject.generateBufferAllocationCode() # MultiEngineDeployer augment - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: - return self._innerObject._mapNode(node) + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: + return self._innerObject._selectEngine(node) + + def _printMemorySummary(self): + return self._innerObject._printMemorySummary() + + def _printInputOutputSummary(self): + return self._innerObject._printInputOutputSummary() diff --git a/Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py b/Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py index c7174e175c..7a9fbea1ae 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py +++ 
b/Deeploy/CommonExtensions/NetworkDeployers/SignPropDeployer.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: SignPropDeployer.py -# -# Last edited: 11.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Callable, Dict, Type @@ -29,6 +8,7 @@ from Deeploy.AbstractDataTypes import Pointer from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer +from Deeploy.Logging import DEFAULT_LOGGER as log class SignPropDeployer(NetworkDeployer): @@ -62,3 +42,16 @@ def _createIOBindings(self, ctxt, graph): nb.nLevels = (2**data_type.referencedType.typeWidth) return ctxt + + def _printInputOutputSummary(self): + log.info('Input:') + for buf in self.inputs(): + log.info( + f" - '{buf.name}': Type: {buf._type.referencedType.typeName}, nLevels: {buf.nLevels}, Signed: {buf._signed}, Offset: {self.inputOffsets[buf.name]}" + ) + + log.info('Output:') + for buf in self.outputs(): + log.info( + f" - '{buf.name}': Type: {buf._type.referencedType.typeName}, nLevels: {buf.nLevels}, Signed: {buf._signed}" + ) diff --git a/Deeploy/CommonExtensions/NetworkDeployers/__init__.py b/Deeploy/CommonExtensions/NetworkDeployers/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/CommonExtensions/NetworkDeployers/__init__.py +++ b/Deeploy/CommonExtensions/NetworkDeployers/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/AutoTranspose.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/AutoTranspose.py index 5ff95db746..8a7191ad2a 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/AutoTranspose.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/AutoTranspose.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: AutoTranspose.py -# -# Last edited: 20.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import onnx_graphsurgeon as gs diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/BindingsOptimization.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/BindingsOptimization.py index df97a96c5c..8740b8d296 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/BindingsOptimization.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/BindingsOptimization.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: BindingsOptimization.py -# -# Last edited: 21.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, Tuple diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/PassClasses.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/PassClasses.py index 475809ba80..738b6d60f5 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/PassClasses.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/PassClasses.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PassClasses.py -# -# Last edited: 21.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import onnx_graphsurgeon as gs diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/__init__.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/__init__.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/bindingUtils.py b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/bindingUtils.py index 01958ccad2..4737a6bd5d 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/bindingUtils.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/BindingsOptimizationPasses/bindingUtils.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: bindingUtils.py -# -# Last edited: 21.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import copy from typing import Any, Dict, List, Tuple, Union diff --git a/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py b/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py index 54ec01fbaf..cec95ec134 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/Matchers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: Matchers.py -# -# Last edited: 28.04.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import re from typing import Dict, Literal, NamedTuple, Optional diff --git a/Deeploy/CommonExtensions/OptimizationPasses/PassClasses.py b/Deeploy/CommonExtensions/OptimizationPasses/PassClasses.py index e027e1dafa..f9c784b2e4 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/PassClasses.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/PassClasses.py @@ -1,35 +1,13 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: PassClasses.py -# -# Last edited: 28.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: -# Moritz Scherer, ETH Zurich -# Georg Rutishauser, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import List, Optional import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import NetworkContext +from Deeploy.Logging import DEFAULT_LOGGER as log from .Matchers import Match, NonBranchingMatcher, SubgraphMatcher @@ -152,7 +130,7 @@ def remove_subpass(self, name): try: del self._subpasses[name] except KeyError: - print(f"No subpass with name {name}, cannot remove!") + log.error(f"No subpass with name {name}, cannot remove!") except AttributeError: raise AttributeError("Cannot remove sub-pass before calling Pass.__init__!") diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/DebugPasses.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/DebugPasses.py index 7525bbbaa9..bd632cefdd 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/DebugPasses.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/DebugPasses.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: DebugPasses.py -# -# Last edited: 28.04.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import copy from functools import partial diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py index c3887ab54d..aba6740d49 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/LoweringOptimizationPasses.py @@ -1,37 +1,36 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: LoweringOptimizationPasses.py -# -# Last edited: 07.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from functools import partial -from typing import Iterable, List, Optional, Sequence, Tuple, Union +from typing import List, Optional, Sequence, Tuple, TypeVar, Union import numpy as np import onnx_graphsurgeon as gs -from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match, NonBranchingMatcher from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, SequentialPass, \ contextagnostic +from Deeploy.TilingExtension.TilingCodegen import HyperRectangle + + +def _singleNodePattern(op: str) -> gs.Graph: + tensorIn = gs.Variable("input") + tensorOut = gs.Variable("output") + node = gs.Node(op = op, name = "node", inputs = [tensorIn], outputs = [tensorOut]) + graph = gs.Graph([node], inputs = [tensorIn], outputs = [tensorOut]) + return graph + + +def _isDepthwise(node: gs.Node) -> bool: + if node.op not in ["Conv", "RequantizedConv"]: + return False + + channels_first = node.attrs.get("channels_first", True) + spatialDims = len(node.inputs[1].shape) - 2 + shapeIn = node.inputs[0].shape + chIn = shapeIn[-spatialDims - 1] if channels_first else shapeIn[-1] + return chIn != 1 and node.attrs.get("group", 1) == chIn def _createReshape(tensorIn: gs.Tensor, @@ -106,122 +105,105 @@ def _prependSqueezeDims(tensor: gs.Tensor, name: str, axis: Union[int, Sequence[ # Permute (0,1,2,3,...,N-2,N-1) -> (0,1,2,3,...,N-1,N-2) -def _permuteLastTwoDims(length: int) -> List[int]: - outList = list(range(length)) - tmp = outList[-1] - outList[-1] = outList[-2] - 
outList[-2] = tmp - return outList +def _swapLastTwoDimsPermutation(N: int) -> List[int]: + assert N >= 2, "N needs to be larger then 2" + return [*range(N - 2), N - 1, N - 2] + + +# Permute channels first <-> channels last: +# (*, ch, *) <-> (*, *, ch) +def _transformLayoutPermutation(dims: int, spatialDims: int, targetChannelsFirst: bool) -> List[int]: + batchDims = dims - spatialDims - 1 + if targetChannelsFirst: + ch = dims - 1 + nonBatchPerm = [ch, *range(batchDims, ch)] + else: + ch = batchDims + nonBatchPerm = [*range(ch + 1, dims), ch] + return list(range(batchDims)) + nonBatchPerm -# Permute (0,1,2,3,...,N-1) -> (0,2,3,...,N-1,1) -def _permuteNCHWtoNHWC(length: int) -> List[int]: - outList = list(range(length)) - outList.remove(1) - outList.append(1) - return outList +# Calculate permutation q = p^(-1) s.t. q(p(i)) = i +def _invertPermutation(permutation: Sequence[int]) -> List[int]: + return [permutation.index(i) for i in range(len(permutation))] -# Permute (0,1,2,3,...,N-1) -> (0,N-1,1,2,3,...,N-2) -def _permuteNHWCtoNCHW(length: int) -> List[int]: - outList = list(range(length)) - outList.remove(length - 1) - outList.insert(1, length - 1) - return outList +T = TypeVar('T') -# Calculate permutation q = p^(-1) s.t. q(p(i)) = i -def _invertPermutation(permutation: List[int]) -> List[int]: - tuples = [] - for idx, i in enumerate(permutation): - tuples.append((i, idx)) - sortedTuples = sorted(tuples, key = lambda x: x[0]) - outPermutation = [] - for i in sortedTuples: - outPermutation.append(i[1]) - return outPermutation - - -def _permuteList(inputList: List, permutation: List[int]): - assert len(inputList) == len(permutation), "Permuted list and permutation must have equal length!" 
- outList = [] - for i in permutation: - outList.append(inputList[i]) - return outList - - -def _prependTransposeNode(anchor: gs.Variable, - nodeName: str, - permutation: Iterable[int], - invert: bool = False) -> (gs.Node, gs.Variable): - - if invert: - outShape = _permuteList(anchor.shape, _invertPermutation(permutation)) - else: - outShape = _permuteList(anchor.shape, permutation) +def _permute(_list: Sequence[T], permutation: Sequence[int]) -> List[T]: + assert len(_list) == len(permutation), "Permuted list and permutation must have equal length!" + return [_list[i] for i in permutation] - anchorTransposeInput = gs.Variable(nodeName + "_Out", dtype = np.float32, shape = outShape) - anchorTransposeNode = gs.Node(name = nodeName, - op = "Transpose", - inputs = [anchorTransposeInput], - outputs = [anchor], - attrs = {'perm': permutation}) - return anchorTransposeNode, anchorTransposeInput +def _permuteHyperRectangle(rect: HyperRectangle, permutation: List[int]) -> HyperRectangle: + assert len(rect.dims) == len(permutation), "Permutation list and HyperRectangle must have equal dimensionality!" 
+ return HyperRectangle(tuple(_permute(rect.offset, permutation)), tuple(_permute(rect.dims, permutation))) -def _appendTransposeNode(anchor: gs.Variable, - nodeName: str, - permutation: Iterable[int], - invert: bool = False) -> (gs.Node, gs.Variable): +def _prependTranspose(tensor: gs.Variable, prevNode: gs.Node, perm: List[int]) -> gs.Node: + prevNodeTensorIdx = prevNode.outputs.index(tensor) + preTransposeTensor = gs.Variable(f"{prevNode.name}_{tensor.name}_pre_transposed", tensor.dtype, + _permute(tensor.shape, _invertPermutation(perm))) + transposeNode = gs.Node(op = "Transpose", + name = f"{prevNode.name}_{tensor.name}_pre_transpose", + attrs = {"perm": perm}, + inputs = [preTransposeTensor], + outputs = [tensor]) + prevNode.outputs[prevNodeTensorIdx] = preTransposeTensor + return transposeNode - if invert: - outShape = _permuteList(anchor.shape, _invertPermutation(permutation)) - else: - outShape = _permuteList(anchor.shape, permutation) - anchorTransposeOutput = gs.Variable(nodeName + "_In", dtype = np.float32, shape = outShape) - anchorTransposeNode = gs.Node(name = nodeName, - op = "Transpose", - inputs = [anchor], - outputs = [anchorTransposeOutput], - attrs = {'perm': permutation}) +def _appendTranspose(tensor: gs.Variable, nextNode: gs.Node, perm: List[int]) -> gs.Node: + nextNodeTensorIdx = nextNode.inputs.index(tensor) + transposedTensor = gs.Variable(f"{nextNode.name}_{tensor.name}_transposed", tensor.dtype, + _permute(tensor.shape, perm)) + transposeNode = gs.Node(op = "Transpose", + name = f"{nextNode.name}_{tensor.name}_transpose", + attrs = {"perm": perm}, + inputs = [tensor], + outputs = [transposedTensor]) + nextNode.inputs[nextNodeTensorIdx] = transposedTensor + return transposeNode - return anchorTransposeNode, anchorTransposeOutput +def _transformLayoutConst(const: gs.Constant, spatialDims: int, targetChannelsFirst: bool) -> None: + assert isinstance(const, gs.Constant) + if len(const.shape) < 2: + return + perm = 
_transformLayoutPermutation(len(const.shape), spatialDims, targetChannelsFirst) + const.values = const.values.transpose(perm) -def _transposeMatMulInputs_fun(graph: gs.Graph, match: Match, name: str): - matched_nodes = [m for k, m in match.nodes_map.items()] - gemmNode = matched_nodes[0] +def _transformLayoutDwWeightConst(const: gs.Constant, targetChannelsFirst: bool) -> None: + assert not targetChannelsFirst, "Target layout should be channels_last!" + assert isinstance(const, gs.Constant) + dims = len(const.shape) + perm = [*range(1, dims), 0] + const.values = const.values.transpose(perm) - inputA = gemmNode.inputs[0] - inputB = gemmNode.inputs[1] - if 'transA' not in gemmNode.attrs: - gemmNode.attrs['transA'] = 0 - if 'transB' not in gemmNode.attrs: - gemmNode.attrs['transB'] = 0 - if 'alpha' not in gemmNode.attrs: - gemmNode.attrs['alpha'] = 1.0 - if 'beta' not in gemmNode.attrs: - gemmNode.attrs['beta'] = 1.0 +def _transposeMatMulInputs_fun(graph: gs.Graph, match: Match, name: str): + node = next(iter((match.nodes_map.values()))) + + node.attrs['transA'] = node.attrs.get('transA', 0) + node.attrs['transB'] = node.attrs.get('transB', 0) + node.attrs['alpha'] = node.attrs.get('alpha', 1.0) + node.attrs['beta'] = node.attrs.get('beta', 1.0) # Prepend transpose on A if it's transposed - if gemmNode.attrs['transA'] != 0: - anchorTransposeNode, anchorTransposeOutput = _appendTransposeNode(inputA, name + "_A", - _permuteLastTwoDims(len(inputA.shape))) - gemmNode.inputs[0] = anchorTransposeOutput - gemmNode.attrs['transA'] = 0 - graph.nodes.append(anchorTransposeNode) + if node.attrs['transA'] == 1: + tensorA = node.inputs[0] + perm = _swapLastTwoDimsPermutation(len(tensorA.shape)) + graph.nodes.append(_appendTranspose(tensorA, node, perm)) + node.attrs['transA'] = False # Prepend transpose on B if it's not transposed - if gemmNode.attrs['transB'] != 1: - anchorTransposeNode, anchorTransposeOutput = _appendTransposeNode(inputB, name + "_B", - 
_permuteLastTwoDims(len(inputB.shape))) - gemmNode.inputs[1] = anchorTransposeOutput - gemmNode.attrs['transB'] = 1 - graph.nodes.append(anchorTransposeNode) + if node.attrs['transB'] == 0: + tensorB = node.inputs[1] + perm = _swapLastTwoDimsPermutation(len(tensorB.shape)) + graph.nodes.append(_appendTranspose(tensorB, node, perm)) + node.attrs['transB'] = True return graph @@ -232,62 +214,40 @@ def _transposeMatMulInputs_fun(graph: gs.Graph, match: Match, name: str): class TransposeMatmulInputsPass(ReplaceSequentialPatternPass): def __init__(self): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['gemmOut'], op = 'RequantizedGemm', name = 'requantizedGemm') - graph.outputs.append(output) - graph.inputs.append(_input) - + graph = _singleNodePattern("RequantizedGemm") name = "_TRANSPOSE_MATMUL_INPUTS_PASS" super().__init__(graph, _transposeMatMulInputs_fun, name) def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): + node = next(iter((match.nodes_map.values()))) - matched_nodes = [m for k, m in match.nodes_map.items()] - opNode = matched_nodes[0] - node_op = opNode.op - - # Default for non-existent channels_first: True - channels_first = opNode.attrs["channels_first"] if "channels_first" in opNode.attrs else True - + channels_first = node.attrs.get("channels_first", True) if (channels_first != default_channels_first): + tensorIn = node.inputs[0] + tensorOut = node.outputs[0] - inputNode = opNode.inputs[0] - outputNode = opNode.outputs[0] - - inPermute = _permuteNCHWtoNHWC(len(inputNode.shape)) - outPermute = _permuteNHWCtoNCHW(len(outputNode.shape)) - - inputTransposeNode, inputTransposeOutput = _appendTransposeNode(inputNode, name + "_TransposeIn", inPermute) - outputTransposeNode, outputTransposeInput = _prependTransposeNode(outputNode, - name + "_TransposeOut", - outPermute, - invert = True) + if node.op in ["RequantizedConv", "Conv"]: + spatialDims = 
len(node.inputs[1].shape) - 2 + elif node.op == "MaxPool": + spatialDims = len(node.attrs["kernel_shape"]) + elif node.op == "Pad": + spatialDims = 2 # Hack based on current status + else: + raise ValueError(f"Cannot determine spatialDims for node {node.name} with operator {node.op}") - opNode.inputs[0] = inputTransposeOutput - opNode.outputs[0] = outputTransposeInput - graph.nodes.append(inputTransposeNode) - graph.nodes.append(outputTransposeNode) + permuteIn = _transformLayoutPermutation(len(tensorIn.shape), spatialDims, default_channels_first) + graph.nodes.append(_appendTranspose(tensorIn, node, permuteIn)) - if node_op in ["RequantizedConv", "Conv"]: + permuteOut = _transformLayoutPermutation(len(tensorOut.shape), spatialDims, channels_first) + graph.nodes.append(_prependTranspose(tensorOut, node, permuteOut)) - # Non DW-Type: - if opNode.attrs['group'] == 1: - weightNode = opNode.inputs[1] - weightTransposeNode, weightTransposeOutput = _appendTransposeNode(weightNode, name + "TransposeWeight", - inPermute) + if node.op in ["Conv", "RequantizedConv"]: + # In the case of Conv: [weights, opt. bias], RequantizedConv: [weights, mul, add, opt. 
shift] + for tensor in node.inputs[1:]: + _transformLayoutConst(tensor, spatialDims, default_channels_first) - else: - DWPermute = [inPermute[-1]] + inPermute[1:-1] + [inPermute[0]] - weightNode = opNode.inputs[1] - weightTransposeNode, weightTransposeOutput = _appendTransposeNode(weightNode, name + "TransposeWeight", - DWPermute) - - opNode.inputs[1] = weightTransposeOutput - graph.nodes.append(weightTransposeNode) - - opNode.attrs["channels_first"] = default_channels_first + node.attrs["channels_first"] = default_channels_first return graph @@ -296,12 +256,7 @@ def _NCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_f class NCHWtoNHWCMaxPoolPass(ReplaceSequentialPatternPass): def __init__(self, default_channels_first: bool = True): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['maxPool'], op = 'MaxPool', name = 'MaxPool') - graph.outputs.append(output) - graph.inputs.append(_input) - + graph = _singleNodePattern(op = "MaxPool") name = "_NCHW_TO_NHWC_MAXPOOL_PASS" super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) @@ -310,214 +265,129 @@ def __init__(self, default_channels_first: bool = True): class NCHWtoNHWCConvPass(ReplaceSequentialPatternPass): def __init__(self, default_channels_first: bool = True): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'Conv', name = 'conv') - graph.outputs.append(output) - graph.inputs.append(_input) - + graph = _singleNodePattern(op = "Conv|RequantizedConv") name = "_NCHW_TO_NHWC_CONV_PASS" - super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) - - -@contextagnostic -class NCHWtoNHWCRequantizedConvPass(ReplaceSequentialPatternPass): - - def __init__(self, default_channels_first: bool = True): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') 
- output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv') - graph.outputs.append(output) - graph.inputs.append(_input) - - name = "_NCHW_TO_NHWC_CONV_PASS" - super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) + super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name, + NonBranchingMatcher(regex_op = True)) @contextagnostic class NCHWtoNHWCPadPass(ReplaceSequentialPatternPass): def __init__(self, default_channels_first: bool = True): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['padOut'], op = 'Pad', name = 'pad') - graph.outputs.append(output) - graph.inputs.append(_input) - + graph = _singleNodePattern(op = "Pad") name = "_NCHW_TO_NHWC_PAD_PASS" super().__init__(graph, partial(_NCHWtoNHWC_fun, default_channels_first = default_channels_first), name) -@contextagnostic -class NCHWtoNHWCPass(SequentialPass): - - def __init__(self, default_channels_first: bool = True): - passes = [ - NCHWtoNHWCPadPass(default_channels_first), - NCHWtoNHWCMaxPoolPass(default_channels_first), - NCHWtoNHWCConvPass(default_channels_first), - NCHWtoNHWCRequantizedConvPass(default_channels_first), - ] - super().__init__(*passes) - - -def _PULPDWNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): - - matched_nodes = [m for k, m in match.nodes_map.items()] - opNode = matched_nodes[0] - node_op = opNode.op +def _NCWHtoNHWC_dw_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool) -> gs.Graph: + node = next(iter((match.nodes_map.values()))) - if opNode.attrs['group'] == 1: + if not _isDepthwise(node): return graph - if (("channels_first" in opNode.attrs and opNode.attrs["channels_first"] != default_channels_first) - or ("channels_first" not in opNode.attrs and default_channels_first == 0)): - - inputNode = 
opNode.inputs[0] - outputNode = opNode.outputs[0] + channels_first = node.attrs.get("channels_first", True) + if (channels_first != default_channels_first): + tensorIn = node.inputs[0] + tensorOut = node.outputs[0] - inPermute = _permuteNCHWtoNHWC(len(inputNode.shape)) - outPermute = _permuteNHWCtoNCHW(len(outputNode.shape)) + spatialDims = len(node.inputs[1].shape) - 2 - outputTransposeNode, outputTransposeInput = _prependTransposeNode(outputNode, - name + "_TransposeOut", - outPermute, - invert = True) + permuteIn = _transformLayoutPermutation(len(tensorIn.shape), spatialDims, default_channels_first) + permuteOut = _transformLayoutPermutation(len(tensorOut.shape), spatialDims, channels_first) - opNode.outputs[0] = outputTransposeInput - graph.nodes.append(outputTransposeNode) + graph.nodes.append(_appendTranspose(tensorIn, node, permuteIn)) + graph.nodes.append(_prependTranspose(tensorOut, node, permuteOut)) - if node_op == "RequantizedConv": + _transformLayoutDwWeightConst(node.inputs[1], default_channels_first) # weights - weightNode = opNode.inputs[1] - weightTransposeNode, weightTransposeOutput = _appendTransposeNode(weightNode, name + "TransposeWeight", - inPermute) - opNode.inputs[1] = weightTransposeOutput - graph.nodes.append(weightTransposeNode) + if len(node.inputs) > 2: + # In the case of Conv: [opt. bias], RequantizedConv: [mul, add, opt. 
shift] + for tensor in node.inputs[2:]: + _transformLayoutConst(tensor, spatialDims, default_channels_first) # bias - opNode.attrs["channels_first"] = default_channels_first + node.attrs["channels_first"] = default_channels_first return graph @contextagnostic -class PULPDWConvPass(ReplaceSequentialPatternPass): +class NCHWtoNHWCDwConvPass(ReplaceSequentialPatternPass): def __init__(self, default_channels_first: bool = True): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv') - graph.outputs.append(output) - graph.inputs.append(_input) - - name = "_NCHW_TO_NHWC_CONV_PASS" - super().__init__(graph, partial(_PULPDWNCHWtoNHWC_fun, default_channels_first = default_channels_first), name) - + graph = _singleNodePattern(op = "Conv|RequantizedConv") + name = "_NCHW_TO_NHWC_DW_CONV_PASS" + super().__init__(graph, partial(_NCWHtoNHWC_dw_fun, default_channels_first = default_channels_first), name, + NonBranchingMatcher(regex_op = True)) -def _PULPDenseNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): - matched_nodes = [m for k, m in match.nodes_map.items()] - opNode = matched_nodes[0] +def _PULP_NCHWtoNHWC_dw_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): + node = next(iter((match.nodes_map.values()))) - node_group = opNode.attrs['group'] if 'group' in opNode.attrs else 1 - if node_group != 1: + if not _isDepthwise(node): return graph - return _NCHWtoNHWC_fun(graph, match, name, default_channels_first) - - -@contextagnostic -class PULPNCHWtoNHWCDenseRequantizedConvPass(ReplaceSequentialPatternPass): - - def __init__(self, default_channels_first: bool = True): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv') - graph.outputs.append(output) - 
graph.inputs.append(_input) - - name = "_NCHW_TO_NHWC_CONV_PASS" - super().__init__(graph, partial(_PULPDenseNCHWtoNHWC_fun, default_channels_first = default_channels_first), - name) - + channels_first = node.attrs.get("channels_first", True) + if (channels_first != default_channels_first): + tensorOut = node.outputs[0] -def _NeurekaDWNCHWtoNHWC_fun(graph: gs.Graph, match: Match, name: str, default_channels_first: bool = True): + spatialDims = len(node.inputs[1].shape) - 2 - matched_nodes = [m for k, m in match.nodes_map.items()] - opNode = matched_nodes[0] + # LMACAN: PULP DW doesn't transpose the input - node_group = opNode.attrs['group'] if 'group' in opNode.attrs else 1 - if node_group == 1: - return graph + permuteOut = _transformLayoutPermutation(len(tensorOut.shape), spatialDims, channels_first) + graph.nodes.append(_prependTranspose(tensorOut, node, permuteOut)) - return _NCHWtoNHWC_fun(graph, match, name, default_channels_first) + # RequantizedConv: [weights, mul, add, opt. shift] + for tensor in node.inputs[1:]: + _transformLayoutConst(tensor, spatialDims, default_channels_first) + node.attrs["channels_first"] = default_channels_first -@contextagnostic -class NeurekaNCHWtoNHWCDWRequantizedConvPass(ReplaceSequentialPatternPass): - - def __init__(self, default_channels_first: bool = True): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'RequantizedConv', name = 'requantizedConv') - graph.outputs.append(output) - graph.inputs.append(_input) - - name = "_NCHW_TO_NHWC_CONV_PASS" - super().__init__(graph, partial(_NeurekaDWNCHWtoNHWC_fun, default_channels_first = default_channels_first), - name) + return graph @contextagnostic -class PULPNCHWtoNHWCDenseConvPass(ReplaceSequentialPatternPass): +class PULPNCHWtoNHWCDwConvPass(ReplaceSequentialPatternPass): def __init__(self, default_channels_first: bool = True): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - 
output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'Conv', name = 'conv') - graph.outputs.append(output) - graph.inputs.append(_input) - - name = "_NCHW_TO_NHWC_CONV_PASS" - super().__init__(graph, partial(_PULPDenseNCHWtoNHWC_fun, default_channels_first = default_channels_first), - name) + graph = _singleNodePattern(op = "RequantizedConv") + name = "_PULP_NCHW_TO_NHWC_DW_CONV_PASS" + super().__init__(graph, partial(_PULP_NCHWtoNHWC_dw_fun, default_channels_first = default_channels_first), name) @contextagnostic -class PULPNCHWtoNHWCPass(SequentialPass): +class NCHWtoNHWCPass(SequentialPass): def __init__(self, default_channels_first: bool = True): passes = [ NCHWtoNHWCPadPass(default_channels_first), NCHWtoNHWCMaxPoolPass(default_channels_first), - PULPDWConvPass(default_channels_first), - PULPNCHWtoNHWCDenseConvPass(default_channels_first), - PULPNCHWtoNHWCDenseRequantizedConvPass(default_channels_first), + NCHWtoNHWCDwConvPass(default_channels_first), + NCHWtoNHWCConvPass(default_channels_first), ] super().__init__(*passes) @contextagnostic -class NeurekaNCHWtoNHWCPass(SequentialPass): +class PULPNCHWtoNHWCPass(SequentialPass): def __init__(self, default_channels_first: bool = True): passes = [ NCHWtoNHWCPadPass(default_channels_first), NCHWtoNHWCMaxPoolPass(default_channels_first), - NeurekaNCHWtoNHWCDWRequantizedConvPass(default_channels_first), - PULPNCHWtoNHWCDenseConvPass(default_channels_first), - PULPNCHWtoNHWCDenseRequantizedConvPass(default_channels_first), + PULPNCHWtoNHWCDwConvPass(default_channels_first), + NCHWtoNHWCConvPass(default_channels_first), ] super().__init__(*passes) def _requantized_gemm_to_pw_fun(graph: gs.Graph, match: Match, name: str): - matched_nodes = list(match.nodes_map.values()) - requantizedGemm = matched_nodes[0] + node = next(iter((match.nodes_map.values()))) - matrixA: gs.Variable = requantizedGemm.inputs[0] - matrixB: gs.Constant = requantizedGemm.inputs[1] - matrixY: gs.Variable = requantizedGemm.outputs[0] 
+ matrixA: gs.Variable = node.inputs[0] + matrixB: gs.Constant = node.inputs[1] + matrixY: gs.Variable = node.outputs[0] # Check matrixB is a constant, otherwise don't transform if not isinstance(matrixB, gs.Constant): @@ -532,10 +402,22 @@ def _requantized_gemm_to_pw_fun(graph: gs.Graph, match: Match, name: str): # Pointwise with HWC layout (channels_first == False) + # Defaults + node.attrs['transA'] = node.attrs.get('transA', 0) + node.attrs['transB'] = node.attrs.get('transB', 0) + node.attrs['alpha'] = node.attrs.get('alpha', 1.0) + node.attrs['beta'] = node.attrs.get('beta', 1.0) + # If transA is set then the matrix is of shape [B x K x M] and it needs to be transposed, otherwise its shape is [B x M x K] - if 'transA' in requantizedGemm.attrs and requantizedGemm.attrs['transA'] == 1: - matrixATransposeNode, matrixA = _appendTransposeNode(matrixA, name, _permuteLastTwoDims(len(matrixA.shape))) - graph.nodes.append(matrixATransposeNode) + if node.attrs['transA'] == 1: + perm = _swapLastTwoDimsPermutation(len(matrixA.shape)) + graph.nodes.append(_appendTranspose(matrixA, node, perm)) + matrixA = node.inputs[0] + + # If transB is set then the matrix is of shape [N x K] and it doesn't need to be transposed, otherwise its shape is [K x N] and it has to be transposed + if node.attrs['transB'] == 0: + perm = _swapLastTwoDimsPermutation(len(matrixB.shape)) + matrixB.values = matrixB.values.transpose(perm) # Align dimensions for convolution expandAxis = [] @@ -548,11 +430,6 @@ def _requantized_gemm_to_pw_fun(graph: gs.Graph, match: Match, name: str): matrixAExpandDimsNode, pwIn = _appendExpandDims(matrixA, name, axis = expandAxis) graph.nodes.append(matrixAExpandDimsNode) - # If transB is set then the matrix is of shape [N x K] and it doesn't need to be transposed, otherwise its shape is [K x N] and it has to be transposed - if not 'transB' in requantizedGemm.attrs or requantizedGemm.attrs['transB'] == 0: - # matrixBTransposed, shape [N x K] - matrixBTransposeNode, 
matrixB = _appendTransposeNode(matrixB, name, _permuteLastTwoDims(len(matrixB.shape))) - graph.nodes.append(matrixBTransposeNode) # pwWeight, shape [N x 1 x 1 x K] matrixBExpandDimsNode, pwWeight = _appendExpandDims(matrixB, name, axis = (1, 2)) graph.nodes.append(matrixBExpandDimsNode) @@ -574,14 +451,14 @@ def _requantized_gemm_to_pw_fun(graph: gs.Graph, match: Match, name: str): 'kernel_shape': [1, 1], 'pads': [0, 0, 0, 0], 'strides': [1, 1], - 'div': requantizedGemm.attrs['div'], - 'n_levels_out': requantizedGemm.attrs['n_levels_out'], - 'shift': requantizedGemm.attrs['shift'], - 'signed': requantizedGemm.attrs['signed'], + 'div': node.attrs['div'], + 'n_levels_out': node.attrs['n_levels_out'], + 'shift': node.attrs['shift'], + 'signed': node.attrs['signed'], } - add = requantizedGemm.inputs[2] - mul = requantizedGemm.inputs[3] + add = node.inputs[2] + mul = node.inputs[3] _inputs = [pwIn, pwWeight, mul, add] @@ -592,9 +469,9 @@ def _requantized_gemm_to_pw_fun(graph: gs.Graph, match: Match, name: str): attrs = pwAttrs) graph.nodes.append(pw) - requantizedGemm.inputs.clear() - requantizedGemm.outputs.clear() - graph.nodes.remove(requantizedGemm) + node.inputs.clear() + node.outputs.clear() + graph.nodes.remove(node) return graph @@ -603,23 +480,16 @@ def _requantized_gemm_to_pw_fun(graph: gs.Graph, match: Match, name: str): class RequantizedGemmToPwPass(ReplaceSequentialPatternPass): def __init__(self): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['out'], op = 'RequantizedGemm', name = 'requantizedGemm') - graph.outputs.append(output) - graph.inputs.append(_input) - + graph = _singleNodePattern("RequantizedGemm") super().__init__(graph, _requantized_gemm_to_pw_fun, "_REQUANTIZED_GEMM_TO_PW_PASS") def _remove_global_output_reshape_fun(graph: gs.Graph, match: Match, name: str): - matched_nodes = list(match.nodes_map.values()) - reshape = matched_nodes[0] - - isGlobalOutput = 
len(reshape.outputs[0].outputs) == 0 + node = next(iter((match.nodes_map.values()))) + isGlobalOutput = len(node.outputs[0].outputs) == 0 if isGlobalOutput: - graph.deleteNode(reshape) + graph.deleteNode(node) return graph @@ -628,26 +498,19 @@ def _remove_global_output_reshape_fun(graph: gs.Graph, match: Match, name: str): class RemoveGlobalOutputReshapePass(ReplaceSequentialPatternPass): def __init__(self): - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['out'], op = 'Reshape', name = 'reshape') - graph.outputs.append(output) - graph.inputs.append(_input) - + graph = _singleNodePattern("Reshape") super().__init__(graph, _remove_global_output_reshape_fun, "_REMOVE_GLOBAL_OUTPUT_RESHAPE_PASS") def _remove_empty_conv_bias_fun(graph: gs.Graph, match: Match, name: str): - # Extract matched convolution - matched_nodes = list(match.nodes_map.values()) - opNode = matched_nodes[0] + node = next(iter((match.nodes_map.values()))) - # Check if the Conv node has a bias input - # If it does, check if the bias only contains zeros - if len(opNode.inputs) > 2 and np.all(opNode.inputs[2].values == 0): - del opNode.inputs[2] + # Check if the node has an all-zero bias and remove it + if len(node.inputs) == 3: + bias = node.inputs[2] + if isinstance(bias, gs.Constant) and np.all(bias.values == 0): + del node.inputs[2] - # Return updated graph return graph @@ -655,13 +518,35 @@ def _remove_empty_conv_bias_fun(graph: gs.Graph, match: Match, name: str): class RemoveEmptyConvBiasPass(ReplaceSequentialPatternPass): def __init__(self): - # Initialized graph with a Conv node - graph = gs.Graph() - _input = gs.Variable(name = 'input_1') - output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'Conv', name = 'conv') - graph.outputs.append(output) - graph.inputs.append(_input) - - # Apply function + graph = _singleNodePattern("Conv") name = "_REMOVE_EMPTY_CONV_BIAS_PASS" super().__init__(graph, 
_remove_empty_conv_bias_fun, name) + + +def _remove_only_singleton_reduce_mean(graph: gs.Graph, match: Match, name: str): + node = next(iter((match.nodes_map.values()))) + + # Keep node if only one in the graph + if len(graph.nodes) == 1: + return graph + + # Delete node if only reduction over singleton dimensions + if 'axis' in node.attrs: + axis = node.attrs['axis'] + else: + axis = node.inputs[1].values + + # Check if shape information is available + if node.inputs[0].shape is not None and all(node.inputs[0].shape[ax] == 1 for ax in axis): + graph.deleteNode(node) + + return graph + + +@contextagnostic +class RemoveOnlySingletonReduceMeanPass(ReplaceSequentialPatternPass): + + def __init__(self): + graph = _singleNodePattern("ReduceMean") + name = "_REMOVE_ONLY_SINGLETON_REDUCE_MEAN_PASS" + super().__init__(graph, _remove_only_singleton_reduce_mean, name) diff --git a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/__init__.py b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/__init__.py index 898895b895..d6474eb128 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/__init__.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/TopologyOptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 28.04.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/CommonExtensions/OptimizationPasses/__init__.py b/Deeploy/CommonExtensions/OptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/CommonExtensions/OptimizationPasses/__init__.py +++ b/Deeploy/CommonExtensions/OptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py index bdcb6ea5f9..c70628729b 100644 --- a/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py +++ b/Deeploy/CommonExtensions/TypeCheckers/SignPropTypeChecker.py @@ -1,33 +1,14 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: SignPropChecker.py -# -# Last edited: 19.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import List, Optional import onnx_graphsurgeon as gs +from Deeploy.AbstractDataTypes import IntegerImmediate from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTypeChecker, OperatorRepresentation, VariableBuffer +from Deeploy.Logging import DEFAULT_LOGGER as log class SignPropTypeChecker(NodeTypeChecker): @@ -67,8 +48,15 @@ def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node, nLevels = self._inferNumLevels(inputs, operatorRepresentation) signedness = self._inferSignedness(inputs, operatorRepresentation) - for obj, nLevels, sign in zip(outputs, nLevels, signedness): - obj.nLevels = nLevels + if nLevels is None or signedness is None: + return ctxt + for obj, nLevel, sign in zip(outputs, nLevels, signedness): + obj.nLevels = nLevel obj._signed = sign + if issubclass(obj._type.referencedType, IntegerImmediate) and not obj._type.fitsNumLevels(nLevel): + log.warning( + f"{obj.name} has {nLevel} levels, but {obj._type.referencedType.typeName} only supports {obj._type.referencedType.nLevels} levels." + ) + return ctxt diff --git a/Deeploy/CommonExtensions/TypeCheckers/__init__.py b/Deeploy/CommonExtensions/TypeCheckers/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/CommonExtensions/TypeCheckers/__init__.py +++ b/Deeploy/CommonExtensions/TypeCheckers/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/CommonExtensions/__init__.py b/Deeploy/CommonExtensions/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/CommonExtensions/__init__.py +++ b/Deeploy/CommonExtensions/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index bd6145efe6..d9d768fabc 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -1,41 +1,20 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: DeeployTypes.py -# -# Last edited: 17.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from __future__ import annotations import copy +import math import os import pickle import re +import time from abc import abstractmethod from collections import OrderedDict, deque from dataclasses import dataclass from functools import reduce -from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, Type, TypeVar, Union +from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Set, Tuple, Type, TypeVar, Union import mako import numpy as np @@ -45,6 +24,9 @@ from onnx.external_data_helper import convert_model_to_external_data from ortools.constraint_solver.pywrapcp import IntVar +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Logging import FAILURE_MARK, SUCCESS_MARK + from .AbstractDataTypes import BaseType, FloatImmediate, IntegerImmediate, Pointer, PointerClass, Struct, VoidType Shape = TypeVar("Shape", bound = Any) @@ -118,7 +100,8 @@ def __init__(self, templateStr: str): """ self.template = _Template(templateStr, strict_undefined = True) - self.subTemplates = {} + self.subTemplates: Dict[str, Tuple[NodeTemplate, Callable[[NetworkContext, OperatorRepresentation], + Tuple[NetworkContext, OperatorRepresentation]]]] = {} self.subTemplateGenerators = {} def internalSize(self) -> int: @@ -240,8 +223,8 @@ def generate(self, operatorRepresentation = {}, **kwargs) -> str: operatorRepresentation[f'RENDER_{key}'] = template.generate(**subNodeRep, **kwargs) callStack += self.template.render(**operatorRepresentation, **kwargs) except: - print(operatorRepresentation) - print(mako.exceptions.text_error_template().render()) + log.error(operatorRepresentation) + log.error(mako.exceptions.text_error_template().render()) raise KeyError(f"Template {self} failed!") return callStack @@ -255,7 +238,7 @@ class VariableBuffer(): allocTemplate: NodeTemplate #: NodeTemplate: Holds the buffer's allocation code deallocTemplate: NodeTemplate #: NodeTemplate: Holds the buffer's deallocation code - def __init__(self, name: str = '', 
shape = [1], alias_of: Optional[List[str]] = []): + def __init__(self, name: str = '', shape = [1], aliases: Optional[List[str]] = None): self.name: str = name #: str: Canonical name that this buffer is registered as in the NetworkContext self.shape: Sequence[ int] = shape #: Sequence[int]: Represents the dimensions of the underlying tensor as a sequence of dimension sizes @@ -274,7 +257,7 @@ def __init__(self, name: str = '', shape = [1], alias_of: Optional[List[str]] = self.is_input: bool = False self.is_output: bool = False - self.alias_of: List[str] = alias_of if alias_of is not None else [] + self.aliases: Set[str] = set(aliases) if aliases is not None else set() def _bufferRepresentation(self) -> Dict: return {"type": self._instance, "name": self.name, "size": int(np.prod(self.shape))} @@ -341,60 +324,40 @@ def __getstate__(self): def fromNode(cls, node: gs.Node): return (cls(name = node.name, shape = node.shape if not isinstance(node, gs.Constant) else node.values.shape)) - def add_aliases(self, aliases_to_add: List[str]): - """ - Adds list of aliases to the alias_of attribute. + def has_live_aliases(self, ctxt: NetworkContext) -> bool: + """Checks whether this VariableBuffer has any live aliases, i.e. buffers that are still live and are aliased by this buffer. Parameters ---------- - alias_to_add : List[str] - List of names of aliases to add to the alias_of attribute. 
+ ctxt : NetworkContext + Current NetworkContext Returns ------- - None + bool + True if this VariableBuffer has any live aliases, False otherwise """ + # Do a breadth-first search across the aliasing double-linked list + live = self._live + queue = set(self.aliases) + visited = set(self.name) + while len(queue) > 0: + next = queue.pop() + buffNext = ctxt.lookup(next) + assert isinstance(buffNext, VariableBuffer) + live |= buffNext._live + visited.add(next) + queue |= buffNext.aliases - visited + return live - if not hasattr(self, "alias_of"): - return None - - for alias in aliases_to_add: - if alias not in self.alias_of: - self.alias_of.append(alias) + def sizeInBytes(self) -> int: + """Returns the size of this VariableBuffer in bytes - return None - - def get_aliases_of(self): - """ - Getter function for the alias_of attribute. Returns ------- - List[str] - List of names o all aliases of this VariableBuffer. - """ - - if hasattr(self, "alias_of"): - return self.alias_of - else: - return list() + int + Size of this VariableBuffer in bytes - def has_live_ancestors(self, ctxt: NetworkContext) -> bool: - """Checks whether this VariableBuffer has any live ancestors, i.e. buffers that are still live and are aliased by this buffer. 
- Parameters - ---------- - ctxt : NetworkContext - Current NetworkContext - Returns - ------- - bool - True if this VariableBuffer has any live ancestors, False otherwise """ - if not hasattr(self, "alias_of"): - return False - - for alias in self.alias_of: - if ctxt.lookup(alias)._live: - return True - - return False + return (math.prod(self.shape) * (self._type.referencedType.typeWidth)) // 8 class TransientBuffer(VariableBuffer): @@ -404,28 +367,13 @@ class TransientBuffer(VariableBuffer): """ def __init__(self, name: str = '', size = 0): - self.name = name - self.size = size #: int: Total BYTE size of this TransientBuffer - - # Do not override - Should be written in the parsing passes - self._users = [] + super().__init__(name, shape = (size,)) # Do not override - Should be written in the parsing passes self._type: Type[Pointer] = PointerClass(VoidType) - - # Do not override - Should be written in the deployment passes - self._live = False - - # Do not override - Set in Templates depending on platform - self._deploy = True - - self.is_input: bool = False - self.is_output: bool = False - - self.alias_of: List[str] = [] + self.size = size def __eq__(self, other): - ret = all([self.name == other.name, self.size == other.size]) return ret @@ -438,9 +386,8 @@ def __str__(self) -> str: def __repr__(self) -> str: return f'TransientBuffer: name: {self.name}, size: {self.size}' - @classmethod - def fromVariableBuffer(cls, buffer: VariableBuffer): - ret = cls(name = buffer.name, size = np.prod(buffer.shape) * buffer._type.typeWidth // 8) + def sizeInBytes(self) -> int: + return int(self.size) class ConstantBuffer(VariableBuffer): @@ -482,12 +429,6 @@ def __repr__(self) -> str: def _bufferRepresentation(self) -> Dict: return {"type": self._type, "name": self.name, "size": int(np.prod(self.shape)), "values": self._valueString()} - @classmethod - def fromVariableBuffer(cls, buffer: VariableBuffer, values): - ret = cls(name = buffer.name, shape = buffer.shape, values = 
values) - - return ret - class StructBuffer(VariableBuffer): """Class to represent Struct object needed by the generated C Code @@ -535,22 +476,33 @@ class _ReferenceBuffer(VariableBuffer): """Helper class to hoist references to pre-established pointers; this is used most frequently in tiling to express an offset with respect to input or output tensors """ - allocTemplate = NodeTemplate("${type.typeName} ${name} = (${type.typeName}) ${objectName};") + allocTemplate = NodeTemplate("""\\ + % if offset is None: + ${type.typeName} ${name} = (${type.typeName}) ${referenceName};\\ + % else: + ${type.typeName} ${name} = (${type.typeName})((char*) ${referenceName} + ${offset});\\ + % endif + """) deallocTemplate = NodeTemplate("") initTemplate = NodeTemplate("") - def __init__(self, name: str = '', shape = [1], reference: Optional[VariableBuffer] = None): - - assert reference is not None, "Can't have a reference to None!" - + def __init__(self, + name: str, + reference: VariableBuffer, + shape: Tuple[int, ...] 
= (1,), + offset: Optional[Union[int, str, VariableBuffer]] = None): super().__init__(name, shape) - self._referencedBuffer = str(reference._instance) self._referenceName = reference.name + if isinstance(offset, VariableBuffer): + self._offset = offset.name + else: + self._offset = offset def _bufferRepresentation(self) -> Dict: - rep = super()._bufferRepresentation() - rep['objectName'] = self._referencedBuffer - return rep + repr = super()._bufferRepresentation() + repr['referenceName'] = self._referenceName + repr['offset'] = self._offset + return repr class NetworkContext(): @@ -573,13 +525,16 @@ def __init__(self, self.TransientBuffer = transientBuffer self.name = name - def dealiasBuffer(self, referenceName: str) -> str: - """Function to unravel reference instantiated in _ReferenceBuffer objects until the underlying VariableBuffer's name is returned + self._maxDynamicSize = {} #: int: Maximum dynamic memory size occupied by live buffers at any point in time + self._dynamicSize = {} #: int: Current dynamic memory size occupied by live buffers + + def dealiasBuffer(self, name: str) -> str: + """Function to find the underlying aliased VariableBuffer Parameters ---------- - referenceName : str - Name of the _ReferenceBuffer to unravel + name: str + Name of the VariableBuffer to dealias Returns ------- @@ -589,25 +544,42 @@ def dealiasBuffer(self, referenceName: str) -> str: Raises ------ Exception - Raises an Exception if references are circular, i.e. there - is no underlying VariableBuffer + Raises an Exception if aliases are circular """ - _buffer = self.lookup(referenceName) - if not hasattr(_buffer, "_alias"): - return referenceName - seenAliases: Set[str] = set() + alias = self.lookup(name) + while hasattr(alias, "_alias"): + seenAliases.add(alias.name) + alias = self.lookup(alias._alias) + assert alias.name not in seenAliases, "Circular aliasing detected!" 
+ return alias.name - alias = _buffer._alias - while hasattr(self.lookup(alias), "_alias"): - seenAliases.add(alias) - alias = self.lookup(alias)._alias + def unravelReference(self, ref: VariableBuffer) -> VariableBuffer: + """Function to find the underlying referenced VariableBuffer - if alias in seenAliases: - raise Exception("Circular aliasing detected!") + Parameters + ---------- + ref : VariableBuffer + Buffer to unravel - return alias + Returns + ------- + str + Name of the original VariableBuffer that was referenced + + Raises + ------ + Exception + Raises an Exception if references are circular + + """ + seenRefs = set() + while isinstance(ref, _ReferenceBuffer): + seenRefs.add(ref.name) + ref = self.lookup(ref._referenceName) + assert ref.name not in seenRefs, "Circular reference found" + return ref def exportNetworkContext(self, folderPath: str, fileName: str): """Exports the NetworkContext as a pickled dictionary @@ -706,7 +678,7 @@ def _mangle(self, name: str, repr: bool = True) -> str: repStr = re.sub('\.', '_', self.name) + '_' + repStr return repStr - def add(self, obj: VariableBuffer, ctxt: str = 'local', _id: str = ""): + def add(self, obj: VariableBuffer, ctxt: Literal['local', 'global'] = 'local', _id: str = ""): """Adds a VariableBuffer object to the NetworkContext Parameters @@ -793,10 +765,7 @@ def is_global(self, name: str) -> bool: Returns true if the name matches with any global buffer """ - if name in self.globalObjects.keys(): - return True - else: - return False + return name in self.globalObjects def is_local(self, name: str) -> bool: """Checks whether a name is associated with a local buffer @@ -812,11 +781,42 @@ def is_local(self, name: str) -> bool: Returns ture if the name matches with any local buffer """ + return name in self.localObjects - if name in self.localObjects.keys(): - return True - else: + def is_object(self, value: Any) -> bool: + """Checks whether a value is an existing object name + + Parameters + ---------- + value 
: Any
+            Value to check
+
+        Returns
+        -------
+        bool
+            Returns true if the value is an existing buffer name
+
+        """
+        return isinstance(value, str) and (self.is_local(value) or self.is_global(value))
+
+    def is_buffer(self, value: Any) -> bool:
+        """Checks whether a value is an existing buffer name
+
+        Parameters
+        ----------
+        value : Any
+            Value to check
+
+        Returns
+        -------
+        bool
+            Returns true if the value is an existing buffer name
+
+        """
+        if not self.is_object(value):
             return False
+        obj = self.lookup(value)
+        return isinstance(obj, VariableBuffer)
 
     def hoistTransientBuffer(self, name: str, size: int) -> str:
         """Registers a new TransientBuffer in the local context
@@ -901,55 +901,57 @@ def hoistConstantAndReference(self, constBuf: ConstantBuffer, pointerType: Type[
             name of the registered _ReferenceBuffer
 
         """
-
-        name = constBuf.name
         constBuf._type = pointerType
-        self.add(constBuf, "global")
+        constBuf._instance = constBuf._type(constBuf.name, self)
+        ref = self.hoistReference(constBuf.name + "_ref", constBuf)
+        return ref.name
 
-        constBuf._instance = constBuf._type(name, self)
-
-        refName = name + "_ref"
-        reference = self.hoistReference(name, refName)
-
-        return refName
-
-    def hoistReference(self, _reference: str, name: str) -> str:
-        """Helper function to register a _ReferenceBuffer to preexisting VariableBuffer
+    def hoistReference(self,
+                       name: str,
+                       reference: VariableBuffer,
+                       shape: Tuple[int, ...] = (1,),
+                       offset: Union[int, str, VariableBuffer] = 0,
+                       override_type: Optional[Type[BaseType]] = None) -> _ReferenceBuffer:
+        """Helper function to register a _ReferenceBuffer to a preexisting VariableBuffer
 
         Parameters
         ----------
-        _reference : str
-            Name of the VariableBuffer that should be referenced
         name : str
-            Name of the _ReferenceBuffer that should be registered
+            Name of the _ReferenceBuffer to register
+        reference : VariableBuffer
+            Referenced VariableBuffer
+        shape: Tuple[int, ...]
+ Shape of the _ReferenceBuffer + offset: Union[int, str, VariableBuffer] + Offset from the reference + override_type: Optional[Type[BaseType]] + Optional argument to override the reference type Returns ------- - str - Returns the name of the newly registered _ReferenceBuffer + _ReferenceBuffer + Returns the newly registered _ReferenceBuffer """ + ref = _ReferenceBuffer(name, reference, shape, offset) + if override_type is not None: + ref._type = PointerClass(override_type) + else: + ref._type = reference._type + self.add(ref, 'local') + ref._instance = ref._type(name, ctxt = self) + return ref - assert _reference != name, f"Reference name {_reference} cannot be the same as {name}" - assert not self.is_local(name), f"{name} is already in context!" - - _object = self.lookup(_reference) - - referenceBuffer = _ReferenceBuffer(name, reference = _object) - referenceBuffer._type = _object._type - - self.add(referenceBuffer, 'local') - referenceBuffer._instance = _object._type(name, ctxt = self) - - return name - - def hoistConstant(self, node: gs.Node, name: str = '', _type: Optional[Type[Pointer]] = None) -> str: - """Register a ConstantBuffer extracted directly from a graphsurgeon Node + def hoistConstant(self, + constant: gs.Constant, + name: Optional[str] = None, + _type: Optional[Type[Pointer]] = None) -> str: + """Register a ConstantBuffer extracted directly from a graphsurgeon Constant Parameters ---------- - node : gs.Node + constant : gs.Constant graphsurgeon.Node containing a single constant output name : str Name of the ConstantBuffer to be registered @@ -962,21 +964,18 @@ def hoistConstant(self, node: gs.Node, name: str = '', _type: Optional[Type[Poin Returns the name of the newly registed ConstantBuffer """ + assert len(constant.outputs) <= 1, f"Constant {constant.name} has more than one output" - assert len(node.outputs) <= 1, f"Constant {node.name} has more than one output" + name = name if name is not None else constant.name - if name == "": - name = 
node.name + # LMACAN: The shape needs to be copied into a tuple for pickling to work. Don't ask me why.. + buffer = self.ConstantBuffer(name, tuple(constant.shape), constant.values) + self.add(buffer, 'global') - # SCHEREMO: This is currently heuristic, but should be annotated in ONNX - localBuffer = self.VariableBuffer.fromNode(node = node) - globalBuffer = self.ConstantBuffer.fromVariableBuffer(localBuffer, values = node.values) - globalBuffer.name = name - globalBuffer._type = _type + if _type is not None: + self.annotateType(name, _type) - self.add(globalBuffer, 'global') - - return globalBuffer.name + return name def addUser(self, name: str, node: gs.Node): """Adds an operator's name to the _user list of a VariableBuffer in the context @@ -1375,6 +1374,11 @@ def typeCheck(self, ctxt: NetworkContext, node: gs.Node, self.annotateDict(newCtxt, node, operatorRepresentation) return (newCtxt, True) + def signature(self) -> str: + input_types_str = ", ".join([_type.referencedType.typeName for _type in self.input_types]) + output_types_str = ", ".join([_type.referencedType.typeName for _type in self.output_types]) + return f"({input_types_str}) -> {output_types_str}" + class ExecutionBlock(): """Deeploy abstraction to represent a operator whose kernel has been determined. 
Mostly used to apply various code transformations, and, finally, generate C Code @@ -1470,17 +1474,16 @@ def hoisting(self, ctxt: NetworkContext, **kwargs) -> Tuple[NetworkContext, List return newCtxt, transientBuffers + contextBuffers @staticmethod - def _mangleNodeRep(ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> OperatorRepresentation: - parseDict = {} + def _mangleOpRepr(ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> OperatorRepresentation: + mangledOpRepr = {} for key, value in operatorRepresentation.items(): - if type(value) == str and (ctxt.is_local(value) or - ctxt.is_global(value)) and not isinstance(ctxt.lookup(value), GlobalDefinition): - parseDict[key] = ctxt._mangle(value) + if ctxt.is_buffer(value): + mangledOpRepr[key] = ctxt._mangle(value) else: - parseDict[key] = value + mangledOpRepr[key] = value - return parseDict + return mangledOpRepr def generate(self, ctxt: NetworkContext, **kwargs) -> str: """Generates the code for all registered NodeTemplates and joins it to construct a single snippet @@ -1499,7 +1502,7 @@ def generate(self, ctxt: NetworkContext, **kwargs) -> str: return ("\n").join([ codeSnippet.template.generate( - ExecutionBlock._mangleNodeRep(ctxt, { + ExecutionBlock._mangleOpRepr(ctxt, { **codeSnippet.operatorRepresentation, **kwargs })) for codeSnippet in self.codeSnippets @@ -1520,6 +1523,9 @@ def __init__(self, typeChecker: NodeTypeChecker, template: NodeTemplate, codeTra self.buffers: List[VariableBuffer] = [] self.codeTransformer: CodeTransformation = codeTransformer + def __repr__(self): + return f"{self.template.__class__.__name__}{self._typeChecker.signature()}" + @property def typeChecker(self): """Read-only wrapper around the encapsulated type checker @@ -1581,9 +1587,13 @@ def typeCheck(self, ctxt: NetworkContext, node: gs.Node, matches the node """ + newCtxt, ret = self.typeChecker.typeCheck(ctxt.copy(), node, operatorRepresentation) if ret: + log.debug(f" {SUCCESS_MARK} 
Type check passed for {self}") return newCtxt, True + else: + log.debug(f" {FAILURE_MARK} Type check failed for {self}") return ctxt, False @@ -1660,6 +1670,10 @@ def __init__(self, parser: NodeParser, bindings: List[NodeBinding]): self.discardedBindings = set() #: Set[NodeBinding]: Set of all bindings which have been tried unsuccessfully. + def __repr__(self): + bindings_str = "\n ".join([repr(binding) for binding in self.bindings]) + return f"{self.parser.__class__.__name__} [\n {bindings_str}\n]" + # Don't override this. Parses the networks with the correct data type def _parse(self, ctxt: NetworkContext, @@ -1669,7 +1683,10 @@ def _parse(self, newCtxt, ret = self.parser.parse(ctxt.copy(), node, default_channels_first, ioParse) if ret: + log.debug(f" {SUCCESS_MARK} Parser {self.parser.__class__.__name__} succeeded") return newCtxt, True + else: + log.debug(f" {FAILURE_MARK} Parser {self.parser.__class__.__name__} failed") return ctxt, False @@ -1679,6 +1696,10 @@ def _parseCtxt(self, default_channels_first: bool = True) -> Tuple[NetworkContext, bool]: newCtxt, ret = self.parser.parseNodeCtxt(ctxt.copy(), node, default_channels_first) + if ret: + log.debug(f" {SUCCESS_MARK} Context parsing succeeded with {self.parser.__class__.__name__}") + else: + log.debug(f" {FAILURE_MARK} Context parsing failed with {self.parser.__class__.__name__}") return (newCtxt, ret) def bindingsExhausted(self) -> bool: @@ -1723,7 +1744,8 @@ def typeCheck(self, ctxt: NetworkContext, node: gs.Graph) -> Tuple[NetworkContex failure """ - for binder in self.bindings: + + for idx, binder in enumerate(self.bindings): if binder in self.discardedBindings: continue @@ -1737,6 +1759,7 @@ def typeCheck(self, ctxt: NetworkContext, node: gs.Graph) -> Tuple[NetworkContex self.binder = binder return newCtxt, True + log.debug(f" ‼ All {len(self.bindings)} bindings exhausted for {self.parser.__class__.__name__}") return ctxt, False # Don't override this. 
This should annotate the output node with the correct data type @@ -1805,6 +1828,10 @@ def __init__(self, maps: List[NodeMapper]): ) #: Set[NodeMapper]: Set of all NodeMappers which cannot be used to represent this layer self.node: gs.Node = None #: gs.Node: The represented operator + def __repr__(self): + maps_str = "\n ".join([repr(mapper) for mapper in self.maps]) + return f"{self.__class__.__name__}(maps=[\n {maps_str}\n])" + def computeOps(self): """Returns the number of operations (1 MAC = 2 Ops) of this operator """ @@ -1917,7 +1944,7 @@ def discardCurrentMapper(self): """Discard the current Mapper """ - self.dicardedMappers.add(self.mapper) + self.discardedMappers.add(self.mapper) self.mapper = None def resetDiscardedMappers(self): @@ -1946,12 +1973,14 @@ def parse(self, ctxt: NetworkContext, default_channels_first: bool) -> Tuple[Net """ + ioParse = True # iterate through all possible mappings and return the first that works for idx, mapper in enumerate(self.maps): if mapper in self.discardedMappers: + log.debug(f" ⏭️ Skipping mapper {idx}: {mapper.parser.__class__.__name__} (previously discarded)") continue newCtxt = ctxt.copy() @@ -1966,11 +1995,13 @@ def parse(self, ctxt: NetworkContext, default_channels_first: bool) -> Tuple[Net self.mapper = mapper + # Perform broadcasting self.broadcast(newCtxt, default_channels_first) newCtxt, ret = mapper._parseCtxt(newCtxt, self.node, default_channels_first) if not ret: + log.debug(f" {FAILURE_MARK} Context parsing failed for {mapper.parser.__class__.__name__}") self.discardedMappers.add(mapper) continue @@ -1979,6 +2010,7 @@ def parse(self, ctxt: NetworkContext, default_channels_first: bool) -> Tuple[Net return newCtxt, True + log.debug(f" All {len(self.maps)} mappers exhausted for '{self.node.name}'") return ctxt, False def _broadcastToNpType(self, ty: Type[BaseType]): @@ -2017,6 +2049,9 @@ def typeCheck(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: failure """ + if not hasattr(self, 'mapper') or 
self.mapper is None: + log.debug(f" {FAILURE_MARK} ONNXLayer.typeCheck() - No mapper selected for '{self.node.name}'") + return ctxt, False newCtxt = ctxt.copy() newCtxt, ret = self.mapper.typeCheck(newCtxt, self.node) @@ -2069,7 +2104,7 @@ def bind(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: self.mapper.parser.operatorRepresentation['nodeOps'] = int(self.computeOps()) return newCtxt, True - self.discardedMappers.append(self.mapper) + self.discardedMappers.add(self.mapper) return ctxt, False def codeTransform(self, ctxt: NetworkContext, verbose: CodeGenVerbosity = _NoVerbosity) -> NetworkContext: @@ -2142,7 +2177,8 @@ class TopologyOptimizer(): """ - def __init__(self, passes: List[TopologyOptimizationPass]): + def __init__(self, passes: List[TopologyOptimizationPass], name: str = "TopologyOptimizer"): + self.name = name self.passes = passes def optimize(self, graph: gs.Graph) -> Tuple[gs.Graph]: @@ -2160,8 +2196,11 @@ def optimize(self, graph: gs.Graph) -> Tuple[gs.Graph]: """ for _pass in self.passes: + start_time = time.perf_counter() graph = _pass.apply(graph) graph.cleanup().toposort() + end_time = time.perf_counter() + log.debug(f" - Applied {_pass.__class__.__name__} ({(end_time - start_time)*1E3:.3f} ms)") return graph @@ -2331,6 +2370,9 @@ def canExecute(self, node: gs.Node) -> bool: """ return node.op in self.Mapping + def __repr__(self): + return f"{self.__class__.__name__}(name='{self.name}', mappings={list(self.Mapping.keys())})" + class DeploymentPlatform(): """Deeploy abstraction for a complete system, including at least a host core capable of memory allocation @@ -2367,6 +2409,16 @@ def __init__(self, engines: List[DeploymentEngine], variableBuffer: Type[Variabl self.StructBuffer = structBuffer self.TransientBuffer = transientBuffer + def __repr__(self) -> str: + retStr = f"{self.__class__.__name__}(" + retStr += f"engines={[e.name for e in self.engines]}, " + retStr += f"variableBuffer={self.VariableBuffer.__name__}, " + retStr += 
f"constantBuffer={self.ConstantBuffer.__name__}, " + retStr += f"structBuffer={self.StructBuffer.__name__}, " + retStr += f"transientBuffer={self.TransientBuffer.__name__}" + retStr += ")" + return retStr + class NetworkContainer(): """Deeploy abstraction for containing the information needed to describe a complete neural network to be deployed @@ -2410,6 +2462,7 @@ def __init__(self, self.ctxt.hoistConstant(x.attrs['value'], x.outputs[0].name, None) self.inputTypes = inputTypes + self.name = name self.ctxt = NetworkContext(variableBuffer = self.Platform.VariableBuffer, constantBuffer = self.Platform.ConstantBuffer, @@ -2421,6 +2474,9 @@ def __init__(self, self.bound = False self.transformed = False + def __repr__(self) -> str: + return f"{self.__class__.__name__}(name='{self.name}', platform={self.Platform.__class__.__name__}, inputTypes={ [v.typeName for k, v in self.inputTypes.items()]}, scheduler={self.scheduler.__name__})" + # Don't override this def _createIOBindings(self, ctxt: NetworkContext, graph: gs.Graph): @@ -2455,15 +2511,12 @@ def inputs(self) -> List[VariableBuffer]: """ inputs = [] - graphInputs = [tensor.name for tensor in self.graph.inputs] + for tensor in self.graph.inputs: + if self.ctxt.is_global(tensor.name): + buffer = self.ctxt.lookup(tensor.name) + if isinstance(buffer, self.ctxt.VariableBuffer) and len(buffer._users) > 0: + inputs.append(buffer) - for key, value in self.ctxt.globalObjects.items(): - if not isinstance(value, self.ctxt.VariableBuffer) or value._users == []: - continue - if key not in graphInputs: - continue - - inputs += [value] return inputs def outputs(self) -> List[VariableBuffer]: @@ -2477,16 +2530,12 @@ def outputs(self) -> List[VariableBuffer]: """ outputs = [] - graphOutputs = [tensor.name for tensor in self.graph.outputs] + for tensor in self.graph.outputs: + if self.ctxt.is_global(tensor.name): + buffer = self.ctxt.lookup(tensor.name) + if isinstance(buffer, self.ctxt.VariableBuffer): + outputs.append(buffer) - for 
key, value in self.ctxt.globalObjects.items(): - - if not isinstance(value, self.ctxt.VariableBuffer): - continue - if key not in graphOutputs: - continue - - outputs += [value] return outputs def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): @@ -2513,10 +2562,10 @@ def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): self.ctxt = layer.codeTransform(self.ctxt, verbose) self.transformed = True - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: for engine in self.Platform.engines: if node.op in engine.Mapping: - return engine.Mapping[node.op](node) + return engine raise RuntimeError(f"No mapping found for node {node.name} with op type {node.op}") def _bindLayers(self): @@ -2533,19 +2582,23 @@ def _bindLayers(self): flatSchedule += subGraph for node in flatSchedule: - layer = self._mapNode(node) + engine = self._selectEngine(node) + layer = engine.Mapping[node.op](node) if isinstance(layer, ONNXLayer): + log.debug(f" {SUCCESS_MARK} Bind {node.name} to layer {layer.__class__.__name__}") self.layerBinding[layer.node.name] = layer def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, default_channels_first: bool) -> Tuple[NetworkContext, bool]: - newCtxt, parsePass = node.parse(ctxt.copy(), default_channels_first) if not parsePass: return ctxt, False - newCtxt, LayerBindSuccess = node.typeCheck(newCtxt) + return newCtxt, True + + def _typeCheckNode(self, node: ONNXLayer, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: + newCtxt, LayerBindSuccess = node.typeCheck(ctxt) if not LayerBindSuccess: return ctxt, False @@ -2580,8 +2633,10 @@ def parse(self, default_channels_first: bool = True) -> bool: structBuffer = self.Platform.StructBuffer, transientBuffer = self.Platform.TransientBuffer) + log.debug(" - Create IO Bindings") self.ctxt = self._createIOBindings(self.ctxt, self.graph) + log.debug(" - Bind Nodes to Layers") self._bindLayers() ctxt = 
self.ctxt.copy() @@ -2592,15 +2647,33 @@ def parse(self, default_channels_first: bool = True) -> bool: deepestIdx = 0 + log.debug(" - Parse and Type Check Network") + start_time = time.perf_counter() + + iteration_main = 0 + iteration_sub = 0 + iteration_tot = 0 while (idx < len(scheduledLayerList)): currentLayer = scheduledLayerList[idx] + # Log current exploration state + if idx == 0: + iteration_main += 1 + iteration_tot += 1 + iteration_sub = 0 + log.debug(31 * "-" + f" MAIN ITERATION {iteration_main:<2} " + 31 * "-") + + log.debug(f"[Layer {idx}] Trying '{currentLayer.node.name}' (op: {currentLayer.node.op})") + stCtxt = copy.deepcopy(ctxt) newCtxt, parseSuccess = self._parseNode(currentLayer, ctxt, default_channels_first) + typeCheckSuccess = False if parseSuccess: + newCtxt, typeCheckSuccess = self._typeCheckNode(currentLayer, newCtxt) + if parseSuccess and typeCheckSuccess: # SCHEREMO: Continue depth-first exploration ctxtStack.append(stCtxt) ctxt = newCtxt @@ -2610,15 +2683,21 @@ def parse(self, default_channels_first: bool = True) -> bool: deepestCtxt = stCtxt else: - # SCHEREMO: Rollback one step - # SCHEREMO: If we can't find a mapping for the root, we must exit if idx == 0: deepestLayer = scheduledLayerList[deepestIdx] deepestNodeName = deepestLayer.node.name + log.debug("-" * 80) + log.error("💥 PARSING FAILED - Backtracking exhausted at root!") + log.error("=" * 80) + log.error(f"🔍 Diagnosis:") + log.error(f" - Deepest successful exploration: Layer {deepestIdx} '{deepestNodeName}'") + log.error( + f" - Deepest layer available mappers: {[type(x.parser).__name__ for x in deepestLayer.maps]}") + log.error("=" * 80) raise RuntimeError( - f'Did not find adequate mapping for graph! Explored until layer {deepestLayer} of node {deepestNodeName} Candidates: {[type(x.parser).__name__ for x in deepestLayer.maps]}. Exhausted backtracking.' - ) + f'Did not find adequate mapping for graph! 
Explored until layer {deepestLayer.__class__.__name__} of node {deepestNodeName}' + f'Candidates: {[type(x.parser).__name__ for x in deepestLayer.maps]}. Exhausted backtracking.') previousLayer = scheduledLayerList[idx - 1] ctxt = ctxtStack.pop() @@ -2632,8 +2711,17 @@ def parse(self, default_channels_first: bool = True) -> bool: else: previousLayer.mapper.discardCurrentBinder() + # SCHEREMO: Rollback one step idx = idx - 1 - + if idx != 0: + iteration_sub += 1 + iteration_tot += 1 + log.debug(31 * "-" + f" SUB ITERATION {iteration_main}.{iteration_sub:<2} " + 31 * "-") + + end_time = time.perf_counter() + log.info( + f" {SUCCESS_MARK} Parsed network with {len(self.layerBinding)} layers after {iteration_tot} iterations in {(end_time-start_time)*1E3:.3f} ms" + ) self.ctxt = ctxt self.parsed = True return True @@ -2660,6 +2748,7 @@ def bind(self) -> bool: newCtxt = self.ctxt.copy() NetworkBindSuccess = True + log.info("- Map Layers to Bindings") for name, layer in self.layerBinding.items(): newCtxt, LayerBindSuccess = layer.bind(newCtxt) @@ -2668,6 +2757,8 @@ def bind(self) -> bool: if not NetworkBindSuccess: raise RuntimeError(f'Could not find a valid binding for the graph') + log.debug(f" {SUCCESS_MARK} Mapped {layer.node.name} to {layer.mapper.binder}") + self.bound = True self.ctxt = newCtxt @@ -2821,8 +2912,7 @@ def generateIOBufferInitializationCode(self) -> str: def worstCaseBufferSize(self): """Return the worst-case buffer size occupied by the network implementaiton """ - # WIESEP: There is no reasonable value for a worst case buffer size without tiling - raise NotImplementedError("Worst case buffer size is not known or not implemented!") + return self.ctxt._maxDynamicSize # Don't override this def generateBufferInitializationCode(self) -> str: @@ -2972,54 +3062,6 @@ def generateEngineInitializationCode(self) -> str: """ return ("\n").join([engine.initCode for engine in self.Platform.engines]) - # Don't override this - Returns parameter size in bytes - def 
getParameterSize(self) -> int: - """Return the BYTE size of all static network parameters (weights, biases, parameters,...) - - Returns - ------- - int - Size of all network parameters - - Raises - ------ - RuntimeError - Raises a RuntimeError if network is not parsed and bound - - - """ - if not self.parsed or not self.bound: - raise RuntimeError('You need to parse and bind the network before getting RAM Size!') - - size = 0 - for _buffer in self.ctxt.globalObjects.values(): - # We do not count structs for now, since they are not properly modeled - if isinstance(_buffer, ConstantBuffer) and _buffer._deploy: - size += int((np.prod(_buffer.shape) * _buffer._type.typeWidth // 8)) - - return size - - # Don't override this - Returns worst case layer and buffering size in bytes - def getTotalSize(self) -> int: - """Returns total size of the network, consisting of all parameters and intermediate buffer size - - Returns - ------- - int - Total network size - - Raises - ------ - RuntimeError - Raises a RuntimeError if network is not parsed and bound - - - """ - if not self.parsed or not self.bound: - raise RuntimeError('You need to parse and bind the network before getting RAM Size!') - - return self.getParameterSize() + self.worstCaseBufferSize - def numberOfOps(self, verbose: bool) -> int: """Returns the total number of operations per network inference @@ -3048,7 +3090,8 @@ def numberOfOps(self, verbose: bool) -> int: nodeOps = i.mapper.parser.operatorRepresentation['nodeOps'] totalSum += nodeOps if verbose: - print("Layer " + str(i.node.name) + str("\nNumber of operations: \t\t") + str("%12s\n" % nodeOps)) + log.info(f"Layer '{i.node.name}'") + log.info(f" Number of Operations : {nodeOps}") return totalSum # Don't override this @@ -3061,6 +3104,12 @@ def _exportGraph(self, folderPath, fileName): if not os.path.isabs(absoluteOnnxPath) or not os.path.isabs(absoluteDataPath): raise OSError(f"Error exporting the context to: {absoluteOnnxPath}") + # VJUNG: ONNX-Graphsurgeon 
needs tensors to be in their export types + constTensors = [tensor for tensor in self.graph.tensors().values() if isinstance(tensor, gs.Constant)] + for tensor in constTensors: + if tensor.dtype != tensor.export_dtype: + tensor.values = tensor.values.astype(tensor.export_dtype) + model = gs.export_onnx(self.graph) # Annotate additional information in doc_string of tensors @@ -3127,7 +3176,7 @@ def importDeeployState(self, folderPath: str, fileName: str): """ self.graph = NetworkDeployer._importONNXGraph(folderPath, f"{fileName}") - self.ctxt = NetworkContext.importNetworkCtxt(folderPath, f"{fileName}") + self.ctxt = NetworkContext.importNetworkContext(folderPath, f"{fileName}") class NetworkDeployer(NetworkContainer): @@ -3178,6 +3227,10 @@ def __init__(self, self.prepared = False + def __repr__(self): + return super().__repr__( + ) + f" (loweringOptimizer: {self.loweringOptimizer.name}, default_channels_first: {self.default_channels_first})" + # Don't override this def lower(self, graph: gs.Graph) -> gs.Graph: """Apply the lowering optimize @@ -3303,43 +3356,89 @@ def _mangleTensorNames(self): for tensor in self.graph.tensors().values(): tensor.name = f"{tensor.name}_tensor" + # Don't override this + def _mangleNodeNames(self): + """Mangle node names only if duplicates exist. 
Unique names are preserved.""" + # Count occurrences of each original name + counts: Dict[str, int] = {} + for node in self.graph.nodes: + counts[node.name] = counts.get(node.name, 0) + 1 + + # For any name that appears more than once, append a counter suffix + seen: Dict[str, int] = {} + for node in self.graph.nodes: + orig = node.name + if counts[orig] > 1: + idx = seen.get(orig, 0) + node.name = f"{orig}_{idx}" + seen[orig] = idx + 1 + # else: unique name, leave it unchanged + # Don't override this def _removeIdentityNodes(self): for node in filter(lambda x: x.op == "Identity", self.graph.nodes): self.graph.deleteNode(node) + def _assertTensorsHaveShape(self) -> None: + missingShapes = [name for name, tensor in self.graph.tensors().items() if tensor.shape is None] + assert len(missingShapes) == 0, \ + f"Shape inference is not supported.\nFound tensors with missing shape annotation: {missingShapes}" + def frontEnd(self): """API hook to prepare the graph to be deployed and build the initial NetworkContext """ + + log.info(80 * "=") + log.info("Deeploy FrontEnd") + log.info(80 * "=") + + log.info("- Apply Preprocessing") + + log.debug(" - Remove Identity Nodes") self._removeIdentityNodes() + log.debug(" - Mangle Tensor Names") self._mangleTensorNames() + log.debug(" - Mangle Node Names") + self._mangleNodeNames() + # Rename graph inputs and outputs: for idx, inputNode in enumerate(self.graph.inputs): inputNode.name = "input_" + str(idx) for idx, outputNode in enumerate(self.graph.outputs): outputNode.name = "output_" + str(idx) + log.debug(" - Sanitize Graph Names") self._sanitizeGraphNames(self.graph) + log.debug(" - Remove Empty Inputs") self._removeEmptyInputs(self.graph) + log.debug(" - Duplicate Constants") self._duplicateConstants(self.graph) + log.debug(" - Constant Folding") self._foldConstants(self.graph) + log.info(f"> Export State to {_middlewarePreLoweringFilename}[.onnx|.pkl]") self.exportDeeployState(self.deeployStateDir, 
_middlewarePreLoweringFilename) + log.info("- Perform Graph Lowering") self.graph = self.lower(self.graph) # This lowers the graph to a deployable format + log.info(f"> Export State {_middlewarePostLoweringFilename}[.onnx|.pkl]") self.exportDeeployState(self.deeployStateDir, _middlewarePostLoweringFilename) + log.info(" - Assert all tensors have a shape annotation") + self._assertTensorsHaveShape() + + log.info("- Perform Graph Parsing") try: self.parse(self.default_channels_first) # This reparses the lowered graph except Exception as e: - print("Error during parsing! Exporting deeploy state!") + log.error(f"Error during parsing! Exporting deeploy state {_backendPostBindingFilename}[.onnx|.pkl]!") self.exportDeeployState(self.deeployStateDir, _backendPostBindingFilename) raise e @@ -3347,13 +3446,19 @@ def frontEnd(self): def midEnd(self): """API hook to be used after finalizing kernel selection; hoist transient buffers, and perform low-level code optimizations (e.g. tiling and static memory allocation) """ + log.info(80 * "=") + log.info("Deeploy MidEnd") + log.info(80 * "=") try: self.bind() except Exception as e: - print("Error during binding! Exporting deeploy state!") + log.error("Error during binding! Exporting deeploy state!") self.exportDeeployState(self.deeployStateDir, _backendPostBindingFilename) raise e + log.info(f"> Export State {_backendPostParsingFilename}[.onnx|.pkl]") + self.exportDeeployState(self.deeployStateDir, _backendPostParsingFilename) + # Don't override this unless you know what you are doin def backEnd(self, verbose: CodeGenVerbosity = _NoVerbosity): """API hook to generate code once kernel implementations are picked and tiling, memory allocation, and other low-level optimizations have been done. 
@@ -3364,11 +3469,14 @@ def backEnd(self, verbose: CodeGenVerbosity = _NoVerbosity): Control verbosity of generated code """ + log.info(80 * "=") + log.info("Deeploy BackEnd") + log.info(80 * "=") - self.exportDeeployState(self.deeployStateDir, _backendPostParsingFilename) - + log.info("- Performing code transformations and optimization...") self.codeTransform(verbose) + log.info(f"> Export State {_backendPostBindingFilename}[.onnx|.pkl]") self.exportDeeployState(self.deeployStateDir, _backendPostBindingFilename) # Don't override this @@ -3382,15 +3490,66 @@ def prepare(self, verbose: CodeGenVerbosity = _NoVerbosity): """ self.frontEnd() + self.midEnd() + self.backEnd(verbose = verbose) self.prepared = True + def _printInputOutputSummary(self): + log.info("Input:") + for buf in self.inputs(): + log.info(f" - '{buf.name}': Type: {buf._type.referencedType.typeName}") + + log.info('Output:') + for buf in self.outputs(): + log.info(f" - '{buf.name}': Type: {buf._type.referencedType.typeName}") + + def _printMemorySummary(self): + log.info("") + log.info("Memory Usage Report:") + log.info(f" Level Total (bytes) (Static + Dynamic) ") + log.info(" " + "-" * 60) + + _worstCaseBufferSize = self.worstCaseBufferSize + if len(_worstCaseBufferSize) == 0: + _worstCaseBufferSize = {"None": 0} + + for level, dynamicSize in _worstCaseBufferSize.items(): + staticSize = 0 + for _buffer in self.ctxt.globalObjects.values(): + # We do not count structs for now, since they are not properly modeled + if isinstance(_buffer, ConstantBuffer) or (isinstance(_buffer, VariableBuffer) and _buffer._deploy): + # SCHEREMO: We only + if (hasattr(_buffer, "_memoryLevel") and _buffer._memoryLevel == level) or level == "None": + staticSize += int((np.prod(_buffer.shape) * _buffer._type.referencedType.typeWidth // 8)) + else: + log.warning(f"Buffer {_buffer.name} does not have a valid memory level") + + total = staticSize + dynamicSize + + log.info(f" {level:<22} {total:8,d} " + f"({staticSize:6,d} + 
{dynamicSize:7,d}) ") + def generateFunction(self, verbose: CodeGenVerbosity = _NoVerbosity) -> str: """Helper function to prepare deployment and return generated function code """ + if not self.prepared: self.prepare(verbose = verbose) + log.info("=" * 80) + log.info("Deeploy Code Generation") + log.info("=" * 80) + + self._printInputOutputSummary() + + num_ops = self.numberOfOps(verbose = True) + log.info("-" * 80) + + log.info(f"Number of Ops. : {num_ops}") + + self._printMemorySummary() + return self.generateInferenceCode() diff --git a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py index d08978f5e0..570363b9a2 100644 --- a/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py +++ b/Deeploy/EngineExtension/NetworkDeployers/EngineColoringDeployer.py @@ -1,35 +1,14 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: EngineColoringDeployer.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from typing import Any, Callable, Dict, Type, Union +from typing import Callable, Dict, Type import onnx_graphsurgeon as gs from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper -from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, ONNXLayer, Schedule, TopologyOptimizer +from Deeploy.DeeployTypes import DeploymentEngine, DeploymentPlatform, NetworkDeployer, Schedule, TopologyOptimizer from Deeploy.EngineExtension.OptimizationPasses.TopologyOptimizationPasses.EngineColoringPasses import \ EngineColoringPass, EngineMapper @@ -62,18 +41,21 @@ def _initEngineColoringDeployer(self, engineMapperCls: Type[EngineMapper]): def lower(self, graph: gs.Graph) -> gs.Graph: graph = super().lower(graph) - uncoloredNodes = [node.name for node in graph.nodes if "engine" not in node.attrs] - assert len(uncoloredNodes) == 0, f"Missing engine color for nodes {uncoloredNodes}" + uncoloredNodes = [node for node in graph.nodes if "engine" not in node.attrs] + uncoloredOperations = set(node.op for node in uncoloredNodes) + assert len( + uncoloredNodes + ) == 0, f"Missing engine color for nodes {[node.name for node in uncoloredNodes]} with operations {uncoloredOperations}" return graph - def _mapNode(self, node: gs.Node) -> Union[ONNXLayer, Any]: + def _selectEngine(self, node: gs.Node) -> DeploymentEngine: assert "engine" in node.attrs, f"Node {node.name} doesn't have an engine color." engineName = node.attrs["engine"] assert isinstance(engineName, str) and engineName in self.engineDict, \ f"Node {node.name} has an invalid engine {engineName} assigned." 
engine = self.engineDict[engineName] assert node.op in engine.Mapping, f"No mapping found for {node.op} in engine {engine.name}" - return engine.Mapping[node.op](node) + return engine class EngineColoringDeployerWrapper(EngineColoringDeployer, NetworkDeployerWrapper): diff --git a/Deeploy/EngineExtension/NetworkDeployers/__init__.py b/Deeploy/EngineExtension/NetworkDeployers/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/EngineExtension/NetworkDeployers/__init__.py +++ b/Deeploy/EngineExtension/NetworkDeployers/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py index 4c3bc1f164..82b7d1fde4 100644 --- a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py +++ b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: EngineColoringPasses.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, Optional, Tuple diff --git a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/__init__.py b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/__init__.py +++ b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/EngineExtension/OptimizationPasses/__init__.py b/Deeploy/EngineExtension/OptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/EngineExtension/OptimizationPasses/__init__.py +++ b/Deeploy/EngineExtension/OptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/EngineExtension/__init__.py b/Deeploy/EngineExtension/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/EngineExtension/__init__.py +++ b/Deeploy/EngineExtension/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/FutureExtension/Bindings/AutoFutureBinding.py b/Deeploy/FutureExtension/Bindings/AutoFutureBinding.py index 6e9d295b0a..eb7a7b791f 100644 --- a/Deeploy/FutureExtension/Bindings/AutoFutureBinding.py +++ b/Deeploy/FutureExtension/Bindings/AutoFutureBinding.py @@ -1,33 +1,13 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPDMAFutureBinding.py -# -# Last edited: 08.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Optional from Deeploy.DeeployTypes import CodeTransformation, NetworkContext, NodeTemplate, NodeTypeChecker from Deeploy.FutureExtension.Bindings.FutureBinding import FutureBinding from Deeploy.FutureExtension.Future import Future +from Deeploy.Logging import DEFAULT_LOGGER as log class AutoFutureBinding(FutureBinding): @@ -42,7 +22,7 @@ def __init__(self, futureOutputs = [idx for idx, output in enumerate(self.typeChecker.output_types) if issubclass(output, Future)] if len(futureOutputs) > 1: - raise Exception(f"{self} assigns more than one future output!") + raise ValueError(f"{self} assigns more than one future output!") if len(futureOutputs) == 1: self.stateReferenceType = self.typeChecker.output_types[futureOutputs[0]].stateReferenceType @@ -52,7 +32,7 @@ def __init__(self, def assignStateReferenceElement(self, ctxt) -> NetworkContext: if len(self.futureOutputs) > 1: - raise Exception(f"{self} assigns more than one future output!") + raise ValueError(f"{self} assigns more than one future output!") if len(self.futureOutputs) == 0: return ctxt @@ -69,7 +49,7 @@ def assignStateReferenceElement(self, ctxt) -> NetworkContext: stateElementCandidates.append(reference) if len(stateElementCandidates) == 1: - print(f"WARNING: Automagically assigning state Element of {self}") + log.warning(f"Automagically assigning state Element of {self}") for key, value in operatorRepresentation.items(): if type(value) == str and (ctxt.is_local(value) or ctxt.is_global(value)): reference = ctxt.lookup(value) @@ -77,6 +57,6 @@ def assignStateReferenceElement(self, ctxt) -> NetworkContext: reference._instance.assignStateReference(stateElementCandidates[0], ctxt) else: - raise Exception(f"Can't assign a unique state element to {self} automagically!") + raise ValueError(f"Can't assign a unique state element to {self} automagically!") return ctxt diff --git a/Deeploy/FutureExtension/Bindings/FutureBinding.py b/Deeploy/FutureExtension/Bindings/FutureBinding.py index 
1a8b2214a2..ce1d927fe2 100644 --- a/Deeploy/FutureExtension/Bindings/FutureBinding.py +++ b/Deeploy/FutureExtension/Bindings/FutureBinding.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: DMABinding.py -# -# Last edited: 08.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Optional diff --git a/Deeploy/FutureExtension/Bindings/__init__.py b/Deeploy/FutureExtension/Bindings/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/FutureExtension/Bindings/__init__.py +++ b/Deeploy/FutureExtension/Bindings/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/FutureExtension/CodeTransformationPasses/FutureCodeTransformation.py b/Deeploy/FutureExtension/CodeTransformationPasses/FutureCodeTransformation.py index 0bce6bcdd1..19b4b7d422 100644 --- a/Deeploy/FutureExtension/CodeTransformationPasses/FutureCodeTransformation.py +++ b/Deeploy/FutureExtension/CodeTransformationPasses/FutureCodeTransformation.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: Future.py -# -# Last edited: 12.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import List, Tuple diff --git a/Deeploy/FutureExtension/CodeTransformationPasses/__init__.py b/Deeploy/FutureExtension/CodeTransformationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/FutureExtension/CodeTransformationPasses/__init__.py +++ b/Deeploy/FutureExtension/CodeTransformationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/FutureExtension/Future.py b/Deeploy/FutureExtension/Future.py index 7e4969de42..eff83a917c 100644 --- a/Deeploy/FutureExtension/Future.py +++ b/Deeploy/FutureExtension/Future.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: Future.py -# -# Last edited: 07.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Optional, Type diff --git a/Deeploy/FutureExtension/__init__.py b/Deeploy/FutureExtension/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/FutureExtension/__init__.py +++ b/Deeploy/FutureExtension/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Logging.py b/Deeploy/Logging.py new file mode 100644 index 0000000000..2220e0351c --- /dev/null +++ b/Deeploy/Logging.py @@ -0,0 +1,62 @@ +# ---------------------------------------------------------------------- +# +# File: Logging.py +# +# Last edited: 22.08.2025 +# +# Copyright (C) 2025, ETH Zurich and University of Bologna. +# +# Author: +# - Philip Wiese, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Setup logging +import logging +from enum import Enum + +import coloredlogs + + +class AnsiColorCode(Enum): + LigthBlue = "\033[94m" + Green = "\033[92m" + Yellow = "\033[93m" + Red = "\033[91m" + Magenta = "\033[95m" + Reset = "\033[0m" + + def __str__(self) -> str: + return self.value + + +def color(msg: str, color: AnsiColorCode) -> str: + return f"{color}{msg}{AnsiColorCode.Reset}" + + +SUCCESS_MARK = color("✔", AnsiColorCode.Green) +FAILURE_MARK = color("✘", AnsiColorCode.Red) + +CONSOLE_LOG_FORMAT = "[%(name)s] %(message)s" +FILE_LOG_FORMAT = "[%(name)s] [%(module)-15s] %(message)s" +DETAILED_FILE_LOG_FORMAT = "[%(levelname)s] [%(name)s] [%(pathname)s:%(lineno)d] %(message)s" + +DEFAULT_LOGGER = logging.getLogger("Deeploy") +DEFAULT_FMT = CONSOLE_LOG_FORMAT + +# Install default logging if not already installed +if not DEFAULT_LOGGER.handlers: + coloredlogs.install(level = 'INFO', logger = DEFAULT_LOGGER, fmt = DEFAULT_FMT) diff --git a/Deeploy/MemoryLevelExtension/MemoryLevels.py b/Deeploy/MemoryLevelExtension/MemoryLevels.py index 06b9d4725f..5cf4abcf13 100644 --- a/Deeploy/MemoryLevelExtension/MemoryLevels.py +++ b/Deeploy/MemoryLevelExtension/MemoryLevels.py @@ -1,34 +1,8 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MemoryLevel.py -# -# Last edited: 04.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Dict, List, Optional, Sequence, Tuple - -import onnx_graphsurgeon as gs - -from Deeploy.DeeployTypes import CodeTransformation, NetworkContext, NodeBinding, NodeTemplate, NodeTypeChecker, \ - OperatorRepresentation +from typing import Dict, List, Optional class MemoryLevel(): @@ -130,58 +104,3 @@ def getDefaultMemoryLevel(self): if self._defaultMemoryLevel is None: raise ValueError('defaultMemoryLevel level not set!') return self._defaultMemoryLevel - - -class NodeMemoryLevelChecker(): - - def __init__(self, inputMemoryLevels: Sequence[Optional[str]], outputMemoryLevels: Sequence[Optional[str]]): - self.inputMemoryLevels = inputMemoryLevels - self.outputMemoryLevels = outputMemoryLevels - - def _memEq(self, memoryLevel: str, annotatedMemoryLevel: str) -> bool: - if memoryLevel is None: - return True - else: - return memoryLevel == annotatedMemoryLevel - - def _checkMemoryLevels(self, ctxt: NetworkContext, memoryLevels: Sequence[str], - tensors: Sequence[gs.Tensor]) -> bool: - buffers = [ctxt.lookup(tensor.name) for tensor in tensors] - if not all(hasattr(buffer, "_memoryLevel") for buffer in buffers): - return False - - annotatedMemoryLevels = [buffer._memoryLevel for buffer in buffers] - if all( - self._memEq(memoryLevel, annotatedMemoryLevel) - for memoryLevel, annotatedMemoryLevel in zip(memoryLevels, annotatedMemoryLevels)): - return True - else: - return False - - def check(self, ctxt: NetworkContext, node: gs.Node, operatorRepresentation) -> Tuple[NetworkContext, bool]: - if self._checkMemoryLevels(ctxt, self.inputMemoryLevels, node.inputs) 
and self._checkMemoryLevels( - ctxt, self.outputMemoryLevels, node.outputs): - return ctxt, True - else: - return ctxt, False - - -class MemoryAwareNodeBinding(NodeBinding): - - def __init__(self, typeChecker: NodeTypeChecker, memoryLevelChecker: NodeMemoryLevelChecker, template: NodeTemplate, - codeTransformer: CodeTransformation): - super().__init__(typeChecker, template, codeTransformer) - self.memoryLevelChecker = memoryLevelChecker - - def typeCheck(self, ctxt: NetworkContext, node: gs.Node, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, bool]: - newCtxt, ret = self.memoryLevelChecker.check(ctxt, node, operatorRepresentation) - if ret: - return super().typeCheck(newCtxt, node, operatorRepresentation) - - return ctxt, False - - -def memoryAwareNodeBindingExtension(binding: NodeBinding, - memoryLevelChecker: NodeMemoryLevelChecker) -> MemoryAwareNodeBinding: - return MemoryAwareNodeBinding(binding.typeChecker, memoryLevelChecker, binding.template, binding.codeTransformer) diff --git a/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py b/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py index 3d1e12f68f..2599f9e819 100644 --- a/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py +++ b/Deeploy/MemoryLevelExtension/NetworkDeployers/MemoryLevelDeployer.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MemoryLevelAnnotation.py -# -# Last edited: 04.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# Moritz Scherer, ETH Zurich -# Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from types import MappingProxyType from typing import Any, Callable, Dict, List, Tuple, Type, Union @@ -34,8 +11,9 @@ from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper from Deeploy.CommonExtensions.NetworkDeployers.SignPropDeployer import SignPropDeployer from Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, DeploymentEngine, DeploymentPlatform, \ - NetworkContext, NetworkDeployer, NetworkOptimizationPass, NetworkOptimizer, ONNXLayer, Schedule, StructBuffer, \ + NetworkContext, NetworkDeployer, NetworkOptimizationPass, NetworkOptimizer, Schedule, StructBuffer, \ TopologyOptimizer, TransientBuffer, VariableBuffer, _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel @@ -97,7 +75,37 @@ def lookup(self, nodeName: str, tensorName: str) -> str: return self._mapping[nodeName, tensorName] -class MemoryLevelAwareDeployer(NetworkDeployer): +class MemorySummaryMixin: + + def _printMemorySummary(self): + log.info("") + log.info("Memory Usage Report:") + log.info(f" {'Level':<14} {'Capacity (bytes)':>10} {'Total':>10} ( Static + Dynamic ) (Usage )") + log.info(" " + "-" * 78) + + for level, dynamicSize in self.worstCaseBufferSize.items(): + staticSize = 0 + for _buffer in self.ctxt.globalObjects.values(): + # We do not count structs for now, since they are not properly modeled + if isinstance(_buffer, 
ConstantBuffer) and getattr(_buffer, "_deploy", False): + if (hasattr(_buffer, "_memoryLevel") and _buffer._memoryLevel == level) or level in ("None", None): + staticSize += _buffer.sizeInBytes() + + total = staticSize + dynamicSize + memLevels = self.Platform.memoryHierarchy.memoryLevels + memLevel = memLevels.get(level, None) + if memLevel is None or getattr(memLevel, "size", None) is None: + log.info(f" {str(level):<20} {'N/A':>10} {total:10,d} " + f"({staticSize:10,d} + {dynamicSize:10,d}) " + f"({'N/A':>5})") + else: + capacity = memLevel.size + log.info(f" {str(level):<20} {capacity:10,} {total:10,d} " + f"({staticSize:10,d} + {dynamicSize:10,d}) " + f"({total / capacity * 100:5.1f}%)") + + +class MemoryLevelAwareDeployer(NetworkDeployer, MemorySummaryMixin): def __init__(self, graph: gs.Graph, @@ -120,23 +128,11 @@ def getTargetMemoryLevelMapping(self) -> TargetMemoryLevelMapping: f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}" return TargetMemoryLevelMapping(self.graph, self.Platform, self.ctxt) - def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, - default_channels_first: bool) -> Tuple[NetworkContext, bool]: - - newCtxt, parsePass = node.parse(ctxt.copy(), default_channels_first) - - if not parsePass: - return ctxt, False - - newCtxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(newCtxt, self.graph) - newCtxt, LayerBindSuccess = node.typeCheck(newCtxt) - - if not LayerBindSuccess: - return ctxt, False - - return newCtxt, True - def bind(self): + log.info("- Perform Memory Level Annotation") + # LMACAN: Annotate before bind because during binding (specifically alignToContext) templates + # may expect the memoryLevel annotation already. 
+ self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) ret = super().bind() if not ret: @@ -152,7 +148,7 @@ def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): super().codeTransform(verbose) -class MemoryLevelAwareSignPropDeployer(SignPropDeployer): +class MemoryLevelAwareSignPropDeployer(SignPropDeployer, MemorySummaryMixin): def __init__(self, graph: gs.Graph, @@ -176,23 +172,11 @@ def getTargetMemoryLevelMapping(self) -> TargetMemoryLevelMapping: f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}" return TargetMemoryLevelMapping(self.graph, self.Platform, self.ctxt) - def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, - default_channels_first: bool) -> Tuple[NetworkContext, bool]: - - newCtxt, parsePass = node.parse(ctxt.copy(), default_channels_first) - - if not parsePass: - return ctxt, False - - newCtxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(newCtxt, self.graph) - newCtxt, LayerBindSuccess = node.typeCheck(newCtxt) - - if not LayerBindSuccess: - return ctxt, False - - return newCtxt, True - def bind(self): + log.info("- Perform Memory Level Annotation") + # LMACAN: Annotate before bind because during binding (specifically alignToContext) templates + # may expect the memoryLevel annotation already. 
+ self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) ret = super().bind() if not ret: @@ -208,7 +192,7 @@ def codeTransform(self, verbose: CodeGenVerbosity = _NoVerbosity): super().codeTransform(verbose) -class MemoryDeployerWrapper(NetworkDeployerWrapper): +class MemoryDeployerWrapper(NetworkDeployerWrapper, MemorySummaryMixin): def __init__(self, deployer: NetworkDeployer, memoryLevelAnnotationPasses: List[NetworkOptimizationPass] = []): super().__init__(deployer) @@ -223,23 +207,11 @@ def getTargetMemoryLevelMapping(self) -> TargetMemoryLevelMapping: f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}" return TargetMemoryLevelMapping(self.graph, self.Platform, self.ctxt) - def _parseNode(self, node: ONNXLayer, ctxt: NetworkContext, - default_channels_first: bool) -> Tuple[NetworkContext, bool]: - - newCtxt, parsePass = node.parse(ctxt.copy(), default_channels_first) - - if not parsePass: - return ctxt, False - - newCtxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(newCtxt, self.graph) - newCtxt, LayerBindSuccess = node.typeCheck(newCtxt) - - if not LayerBindSuccess: - return ctxt, False - - return newCtxt, True - def bind(self): + log.info("- Perform Memory Level Annotation") + # LMACAN: Annotate before bind because during binding (specifically alignToContext) templates + # may expect the memoryLevel annotation already. 
+ self.ctxt, self.graph = self.memoryLevelAnnotationOptimizer.optimize(self.ctxt, self.graph) ret = super().bind() if not ret: diff --git a/Deeploy/MemoryLevelExtension/NetworkDeployers/__init__.py b/Deeploy/MemoryLevelExtension/NetworkDeployers/__init__.py index 65ec809815..be436b64a3 100644 --- a/Deeploy/MemoryLevelExtension/NetworkDeployers/__init__.py +++ b/Deeploy/MemoryLevelExtension/NetworkDeployers/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py b/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py index 3069262b26..775f5cbfc5 100644 --- a/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py +++ b/Deeploy/MemoryLevelExtension/OptimizationPasses/MemoryLevelAnnotationPasses.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MemoryLevelAnnotationPasses.py -# -# Last edited: 10.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import List, Tuple diff --git a/Deeploy/MemoryLevelExtension/OptimizationPasses/__init__.py b/Deeploy/MemoryLevelExtension/OptimizationPasses/__init__.py index 65ec809815..be436b64a3 100644 --- a/Deeploy/MemoryLevelExtension/OptimizationPasses/__init__.py +++ b/Deeploy/MemoryLevelExtension/OptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/MemoryLevelExtension/__init__.py b/Deeploy/MemoryLevelExtension/__init__.py index 65ec809815..be436b64a3 100644 --- a/Deeploy/MemoryLevelExtension/__init__.py +++ b/Deeploy/MemoryLevelExtension/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/Chimera/Deployer.py b/Deeploy/Targets/Chimera/Deployer.py index 8f45d636ef..ba28279b66 100644 --- a/Deeploy/Targets/Chimera/Deployer.py +++ b/Deeploy/Targets/Chimera/Deployer.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: Deployer.py -# -# Last edited: 16.06.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Callable, Dict, Type diff --git a/Deeploy/Targets/Chimera/Platform.py b/Deeploy/Targets/Chimera/Platform.py index 8c98a649cc..0906ddfae0 100644 --- a/Deeploy/Targets/Chimera/Platform.py +++ b/Deeploy/Targets/Chimera/Platform.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # -# File: Platform.py -# -# Last edited: 16.06.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import List @@ -68,9 +47,11 @@ class ChimeraStructBuffer(StructBuffer): deallocTemplate = NodeTemplate("") -ChimeraOptimizer = TopologyOptimizer([ - # JUNGVI: Nothing for now -]) +ChimeraOptimizer = TopologyOptimizer( + [ + # JUNGVI: Nothing for now + ], + name = "ChimeraOptimizer") _includeList = [ "uart.h", diff --git a/Deeploy/Targets/Chimera/Templates/AllocateTemplate.py b/Deeploy/Targets/Chimera/Templates/AllocateTemplate.py index dc2c774a2b..60a095bc47 100644 --- a/Deeploy/Targets/Chimera/Templates/AllocateTemplate.py +++ b/Deeploy/Targets/Chimera/Templates/AllocateTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: AllocateTemplate.py -# -# Last edited: 16.06.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/CortexM/Bindings.py b/Deeploy/Targets/CortexM/Bindings.py index 3b7b7ff11d..bfe7b05afc 100644 --- a/Deeploy/Targets/CortexM/Bindings.py +++ b/Deeploy/Targets/CortexM/Bindings.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: CMSISBindings.py -# -# Last edited: 17.12.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ diff --git a/Deeploy/Targets/CortexM/DataTypes.py b/Deeploy/Targets/CortexM/DataTypes.py index 9951cb357b..66f031d3cd 100644 --- a/Deeploy/Targets/CortexM/DataTypes.py +++ b/Deeploy/Targets/CortexM/DataTypes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: CMSISDataTypes.py -# -# Last edited: 01.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.AbstractDataTypes import PointerClass, Struct, VoidType from Deeploy.CommonExtensions.DataTypes import int32_t diff --git a/Deeploy/Targets/CortexM/Deployer.py b/Deeploy/Targets/CortexM/Deployer.py index 55f8987304..bef8fdcf36 100644 --- a/Deeploy/Targets/CortexM/Deployer.py +++ b/Deeploy/Targets/CortexM/Deployer.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: NetworkDeployer.py -# -# Last edited: 26.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Callable, Dict, Type diff --git a/Deeploy/Targets/CortexM/Layers.py b/Deeploy/Targets/CortexM/Layers.py index ba60a2e214..e64fe6a6ca 100644 --- a/Deeploy/Targets/CortexM/Layers.py +++ b/Deeploy/Targets/CortexM/Layers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: CMSISLayers.py -# -# Last edited: 22.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import List, Tuple diff --git a/Deeploy/Targets/CortexM/Parsers.py b/Deeploy/Targets/CortexM/Parsers.py index e81caf0077..0fc1efef9e 100644 --- a/Deeploy/Targets/CortexM/Parsers.py +++ b/Deeploy/Targets/CortexM/Parsers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: CMSISParsers.py -# -# Last edited: 17.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import math from typing import Tuple diff --git a/Deeploy/Targets/CortexM/Platform.py b/Deeploy/Targets/CortexM/Platform.py index c65bb86766..25caeed60f 100644 --- a/Deeploy/Targets/CortexM/Platform.py +++ b/Deeploy/Targets/CortexM/Platform.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: CMSISPlatform.py -# -# Last edited: 17.12.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. 
-# -# Author: -# - Moritz Scherer, ETH Zurich -# - Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer @@ -146,18 +123,20 @@ class CMSISStructBuffer(StructBuffer): # ExtractPaddingFromConvPass(),ExtractPaddingFromPoolPass(), -CMSISOptimizer = TopologyOptimizer([ - IntegerDivRequantMergePass(), - iGELURequantMergePass(), - LinearAttentionAlignmentPass(), - MHSAAlignmentPass(), - MergeConstAddAndRequantPass(), - ConvRequantMergePass(), - GEMMRequantMergePass(), - MatMulRequantMergePass(), - # DebugPass("Conv", position='before'), - # DebugPass("Pad", position='after'), -]) +CMSISOptimizer = TopologyOptimizer( + [ + IntegerDivRequantMergePass(), + iGELURequantMergePass(), + LinearAttentionAlignmentPass(), + MHSAAlignmentPass(), + MergeConstAddAndRequantPass(), + ConvRequantMergePass(), + GEMMRequantMergePass(), + MatMulRequantMergePass(), + # DebugPass("Conv", position='before'), + # DebugPass("Pad", position='after'), + ], + name = "CMSISOptimizer") includeList = ["arm_nnfunctions.h", "DeeployMath.h"] diff --git a/Deeploy/Targets/CortexM/Templates/AddTemplate.py b/Deeploy/Targets/CortexM/Templates/AddTemplate.py index b477193b9f..664bb2e65c 100644 --- 
a/Deeploy/Targets/CortexM/Templates/AddTemplate.py +++ b/Deeploy/Targets/CortexM/Templates/AddTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: AddTemplate.py -# -# Last edited: 18.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/CortexM/Templates/CLCATemplate.py b/Deeploy/Targets/CortexM/Templates/CLCATemplate.py index 3eeb3556ce..470621dd33 100644 --- a/Deeploy/Targets/CortexM/Templates/CLCATemplate.py +++ b/Deeploy/Targets/CortexM/Templates/CLCATemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: CLCATemplate.py -# -# Last edited: 26.08.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import copy from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/CortexM/Templates/CMSISUtils.py b/Deeploy/Targets/CortexM/Templates/CMSISUtils.py index d8f03597af..ea8abbbc74 100644 --- a/Deeploy/Targets/CortexM/Templates/CMSISUtils.py +++ b/Deeploy/Targets/CortexM/Templates/CMSISUtils.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: CMSISUtils.py -# -# Last edited: 10.01.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import numpy as np @@ -90,8 +69,8 @@ def bindConvParams(ctxt, name, repName, batch, operatorRepresentation): operatorRepresentation[f'{repName}_conv_params'] = ctxt.lookup(f'{name}_conv_params').name convQuantDict = { - 'multiplier': ctxt._mangle(operatorRepresentation['mul']), - 'shift': ctxt._mangle(operatorRepresentation['shift']), + 'multiplier': operatorRepresentation['mul'], + 'shift': operatorRepresentation['shift'], } nameList += [ctxt.hoistStruct(convQuantDict, f'{name}_quant_params', cmsis_nn_per_channel_quant_params)] operatorRepresentation[f'{repName}_quant_params'] = ctxt.lookup(f'{name}_quant_params').name diff --git a/Deeploy/Targets/CortexM/Templates/ConvTemplate.py b/Deeploy/Targets/CortexM/Templates/ConvTemplate.py index 5743fc9845..d5e05c8343 100644 --- a/Deeploy/Targets/CortexM/Templates/ConvTemplate.py +++ b/Deeploy/Targets/CortexM/Templates/ConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: ConvTemplate.py -# -# Last edited: 17.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple, Union diff --git a/Deeploy/Targets/CortexM/Templates/DWConvTemplate.py b/Deeploy/Targets/CortexM/Templates/DWConvTemplate.py index 056ef4d059..3733cfc4c6 100644 --- a/Deeploy/Targets/CortexM/Templates/DWConvTemplate.py +++ b/Deeploy/Targets/CortexM/Templates/DWConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: DWConvTemplate.py -# -# Last edited: 04.01.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/CortexM/Templates/GEMMTemplate.py b/Deeploy/Targets/CortexM/Templates/GEMMTemplate.py index 237f5eec7c..d82704cdcf 100644 --- a/Deeploy/Targets/CortexM/Templates/GEMMTemplate.py +++ b/Deeploy/Targets/CortexM/Templates/GEMMTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: GEMMTemplate.py -# -# Last edited: 20.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/CortexM/Templates/LinearAttentionTemplate.py b/Deeploy/Targets/CortexM/Templates/LinearAttentionTemplate.py index 97fd3b6efd..2220d2974a 100644 --- a/Deeploy/Targets/CortexM/Templates/LinearAttentionTemplate.py +++ b/Deeploy/Targets/CortexM/Templates/LinearAttentionTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: LinearAttentionTemplate.py -# -# Last edited: 05.06.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, Tuple diff --git a/Deeploy/Targets/CortexM/Templates/MHSATemplate.py b/Deeploy/Targets/CortexM/Templates/MHSATemplate.py index 0ae237ccef..158ef9c97c 100644 --- a/Deeploy/Targets/CortexM/Templates/MHSATemplate.py +++ b/Deeploy/Targets/CortexM/Templates/MHSATemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MHSATemplate.py -# -# Last edited: 01.01.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/CortexM/Templates/MaxPool2DTemplate.py b/Deeploy/Targets/CortexM/Templates/MaxPool2DTemplate.py index 37143747ec..415a098d01 100644 --- a/Deeploy/Targets/CortexM/Templates/MaxPool2DTemplate.py +++ b/Deeploy/Targets/CortexM/Templates/MaxPool2DTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: MaxPool2DTemplate.py -# -# Last edited: 27.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/CortexM/Templates/__init__.py b/Deeploy/Targets/CortexM/Templates/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/CortexM/Templates/__init__.py +++ b/Deeploy/Targets/CortexM/Templates/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/CortexM/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/CortexM/TopologyOptimizationPasses/Passes.py index dcc169ee45..7f9b7ef079 100644 --- a/Deeploy/Targets/CortexM/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/CortexM/TopologyOptimizationPasses/Passes.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: CMSISPasses.py -# -# Last edited: 17.12.2022 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: -# - Moritz Scherer, ETH Zurich -# - Georg Rutishauser, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import numpy as np import onnx_graphsurgeon as gs diff --git a/Deeploy/Targets/CortexM/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/CortexM/TopologyOptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/CortexM/TopologyOptimizationPasses/__init__.py +++ b/Deeploy/Targets/CortexM/TopologyOptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/CortexM/TypeCheckers.py b/Deeploy/Targets/CortexM/TypeCheckers.py index c5f58a9f47..b58ab91ddd 100644 --- a/Deeploy/Targets/CortexM/TypeCheckers.py +++ b/Deeploy/Targets/CortexM/TypeCheckers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: CMSISCheckers.py -# -# Last edited: 18.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import List, Sequence, Type diff --git a/Deeploy/Targets/CortexM/__init__.py b/Deeploy/Targets/CortexM/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/CortexM/__init__.py +++ b/Deeploy/Targets/CortexM/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index ec9c32a6d2..ec2ed6270f 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -1,30 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: BasicBindings.py -# -# Last edited: 05.05.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. 
-# -# Authors: -# - Moritz Scherer, ETH Zurich -# - Philip Wiese, ETH Zurich -# - Calin Diaconu, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import itertools @@ -35,19 +11,20 @@ int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, ConvTemplate, DebugPrintTemplate, \ - DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, FloatConvTemplate, FloatDivTemplate, \ - FloatDWConvTemplate, FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, \ - FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, FloatReduceMeanTemplate, FloatReluTemplate, \ - FloatSoftmaxTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, \ - MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ +from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, ConcatTemplate, ConvTemplate, \ + ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, \ + FloatConvTemplate, FloatDivTemplate, FloatDWConvTemplate, FloatGELUTemplate, 
FloatGemmTemplate, \ + FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, \ + FloatPowTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, FloatSqrtTemplate, \ + GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \ + MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \ iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DebugPrintChecker, \ - DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, LayerNormChecker, \ - MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, ReduceSumChecker, \ - ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, SoftmaxChecker, \ - TransposeChecker +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ + DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \ + LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \ + ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \ + SoftmaxChecker, TransposeChecker BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) @@ -77,8 +54,14 @@ FloatAddTemplate.referenceTemplate, BasicTransformer) ] -BasicConv1DBinding = NodeBinding(ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), - ConvTemplate.reference1DTemplate, BasicTransformer) +BasicConv1DBindings = [ + NodeBinding(ConvChecker( + [PointerClass(type), 
PointerClass(type), PointerClass(type)], [PointerClass(type)]), + FloatConvTemplate.reference1DTemplate, BasicTransformer) for type in FloatDataTypes +] + [ + NodeBinding(ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), + ConvTemplate.reference1DTemplate, BasicTransformer) +] BasicDWConv1DBinding = NodeBinding(ConvChecker([PointerClass(int8_t), PointerClass(int8_t)], [PointerClass(int32_t)]), DWConvTemplate.reference1DTemplate, BasicTransformer) @@ -135,6 +118,16 @@ BasicTransformer) ] +BasicPowBindings = [ + NodeBinding(DummyChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatPowTemplate.referenceTemplate, BasicTransformer), +] + +BasicSqrtBindings = [ + NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatSqrtTemplate.referenceTemplate, + BasicTransformer), +] + BasicDivBindings = [ NodeBinding(DivChecker([PointerClass(int32_t), PointerClass(int32_t)], [PointerClass(int32_t)]), IntegerDivTemplate.referenceTemplate, BasicTransformer) @@ -171,6 +164,11 @@ FloatMatMulTemplate.referenceTemplate, BasicTransformer) ] +BasicMaxPool1DBindings = [ + NodeBinding(MaxPoolChecker([PointerClass(type)], [PointerClass(type)]), FloatMaxPoolTemplate.reference1DTemplate, + BasicTransformer) for type in FloatDataTypes +] + BasicMaxPool2DBindings = [ NodeBinding(MaxPoolChecker([PointerClass(int8_t)], [PointerClass(int8_t)]), MaxPoolTemplate.referenceTemplate, BasicTransformer) @@ -191,7 +189,11 @@ BasicPad1DBindings = [ NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), PadTemplate.reference1DTemplate, BasicTransformer) for type in SignedIntegerDataTypes +] + [ + NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), FloatPadTemplate.reference1DTemplate, + BasicTransformer) for type in FloatDataTypes ] + BasicPad2DBindings = [ NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), PadTemplate.reference2DTemplate, BasicTransformer) 
for type in SignedIntegerDataTypes @@ -206,10 +208,15 @@ NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate, BasicTransformer) for type in SignedIntegerDataTypes ] + [ + # ONNX OPSET < 18 NodeBinding(ReduceMeanChecker([PointerClass(float_type), PointerClass(integer_type)], [PointerClass(float_type)]), FloatReduceMeanTemplate.referenceTemplate, BasicTransformer) for integer_type in SignedIntegerDataTypes for float_type in FloatDataTypes +] + [ + # ONNX OPSET >= 18 + NodeBinding(ReduceMeanChecker([PointerClass(float_type)], [PointerClass(float_type)]), + FloatReduceMeanTemplate.referenceTemplate, BasicTransformer) for float_type in FloatDataTypes ] BasicReduceSumBindings = [ @@ -290,3 +297,30 @@ NodeBinding(DequantChecker([PointerClass(int32_t)], [PointerClass(float32_t)]), DequantTemplate.referenceTemplate, BasicTransformer), ] + +BasicBatchNormBindings = [ + NodeBinding( + BatchNormChecker( + [PointerClass(type), + PointerClass(type), + PointerClass(type), + PointerClass(type), + PointerClass(type)], [PointerClass(type)]), BatchNormalizationTemplate.referenceTemplate, BasicTransformer) + for type in FloatDataTypes +] + +BasicConvTransposeBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(type), PointerClass(type), PointerClass(type)], # input, weight, bias + [PointerClass(type)]), + ConvTransposeTemplate.referenceTemplate, + BasicTransformer) for type in FloatDataTypes +] + [ + NodeBinding( + ConvChecker( + [PointerClass(type), PointerClass(type)], # input, weight + [PointerClass(type)]), + ConvTransposeTemplate.referenceTemplate, + BasicTransformer) for type in FloatDataTypes +] diff --git a/Deeploy/Targets/Generic/Deployer.py b/Deeploy/Targets/Generic/Deployer.py index 8dc216d7e7..3cef57a2ea 100644 --- a/Deeploy/Targets/Generic/Deployer.py +++ b/Deeploy/Targets/Generic/Deployer.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# 
SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: GenericDeployer.py -# -# Last edited: 04.01.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Callable, Dict, Type diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index 17e4e6ea42..cc733937cc 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: BasicLayers.py -# -# Last edited: 17.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Authors: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import copy from typing import List, Tuple @@ -81,6 +58,18 @@ def computeOps(self): return mul1 + neg + exp + add + div + mul2 +class GELUGradLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + size = self.mapper.parser.operatorRepresentation['size'] + ops_per_element = 9 + gelu_grad_ops = size * ops_per_element + return gelu_grad_ops + + class iHardswishLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): @@ -250,6 +239,18 @@ def computeOps(self): return matmul + rqs +class PowLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + +class SqrtLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + class DivLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): @@ -461,6 +462,12 @@ def computeOps(self): return compAverage + compNormalize + compSqr + compSum + compSqrt + compDiv +class LayerNormGradLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + class TransposeLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): @@ -641,3 +648,64 @@ class DequantLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + + +class BatchNormalizationLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeOps(self): + # 5 operations per element: sub, mul, add, sqrt, div + B = self.mapper.parser.operatorRepresentation['batch_size'] + C = self.mapper.parser.operatorRepresentation['channel_size'] + W 
= self.mapper.parser.operatorRepresentation['window_size'] + return B * C * W * 5 + + +class ConvTransposeLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + """ + Infers output shapes for ConvTranspose using only static info. + - inputShapes[0]: input tensor shape (e.g., [N, C_in, W] for 1D, [N, C_in, H, W] for 2D) + - inputShapes[1]: weight tensor shape (e.g., [C_in, C_out // group, kW] for 1D) + - outputShapes[0]: output tensor shape (to be updated) + """ + newInputShapes = list(inputShapes) + newOutputShapes = list(outputShapes) + group = operatorRepresentation.get('group', 1) + weight_shape = inputShapes[1] + + if newOutputShapes and len(newOutputShapes[0]) >= 2: + # For 1D: weight_shape = [C_in, C_out // group, kW] + # For 2D: weight_shape = [C_in, C_out // group, kH, kW] + ch_out = weight_shape[1] * group + if channels_first: + newOutputShapes[0][1] = ch_out + else: + newOutputShapes[0][-1] = ch_out + + return newInputShapes, newOutputShapes + + def computeOps(self): + opRep = self.mapper.parser.operatorRepresentation + + groups = opRep.get('group', 1) + kernel_shape = np.prod(opRep['kernel_shape']) # es. 
[3, 3] -> 9 + ch_in = opRep['ch_im_in'] + ch_out = opRep['ch_im_out'] + + opsPerPx = int(kernel_shape * ch_in * ch_out / groups) * 2 + + # ConvTranspose upscales spatial dims, quindi num pixel viene da output + if 'dim_im_out_y' in opRep: + numPx = opRep['dim_im_out_x'] * opRep['dim_im_out_y'] + else: + numPx = opRep['dim_im_out_x'] + + return numPx * opsPerPx diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index 148d1b0e32..cf1ba776bd 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -1,30 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: BasicParsers.py -# -# Last edited: 12.05.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Authors: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# - Calin Diaconu, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import math from typing import Tuple @@ -32,7 +8,7 @@ import numpy as np import onnx_graphsurgeon as gs -from Deeploy.DeeployTypes import NetworkContext, NodeParser, VariableBuffer +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer class ConcatParser(NodeParser): @@ -245,6 +221,48 @@ def parseNodeCtxt(self, return ctxt, True +class MaxPool1DParser(MaxPoolParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + ret = super().parseNode(node) + wellFormed = False + if ret: + pads = self.operatorRepresentation['pads'] + kernel_shape = self.operatorRepresentation['kernel_shape'] + strides = self.operatorRepresentation['strides'] + # 1D: pads should be length 2, kernel_shape length 1, strides length 1 + if len(pads) == 2 and len(kernel_shape) == 1 and len(strides) == 1: + wellFormed = True + self.operatorRepresentation['padding_y'] = int(pads[0]) + self.operatorRepresentation['padding_y_right'] = int(pads[1]) + self.operatorRepresentation['stride_y'] = int(strides[0]) + self.operatorRepresentation['dim_kernel_y'] = int(kernel_shape[0]) + return wellFormed + + def parseNodeCtxt(self, ctxt, node, channels_first = True): + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + if ret: + data_in = newCtxt.lookup(self.operatorRepresentation['data_in']) + data_out = newCtxt.lookup(self.operatorRepresentation['data_out']) + self.operatorRepresentation['batch'] = data_in.shape[0] + if channels_first: + self.operatorRepresentation['ch_im_in'] = data_in.shape[1] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[2] + self.operatorRepresentation['ch_im_out'] = data_out.shape[1] + self.operatorRepresentation['dim_im_out_y'] = data_out.shape[2] + else: + self.operatorRepresentation['ch_im_in'] = data_in.shape[2] + self.operatorRepresentation['dim_im_in_y'] = data_in.shape[1] + self.operatorRepresentation['ch_im_out'] = data_out.shape[2] + 
self.operatorRepresentation['dim_im_out_y'] = data_out.shape[1] + if len(data_in.shape) == 3 and len(data_out.shape) == 3: + return newCtxt, True + return ctxt, False + + class MaxPool2DParser(MaxPoolParser): def __init__(self): @@ -322,7 +340,12 @@ def parseNode(self, node: gs.Node) -> bool: if ret: self.operatorRepresentation['mode'] = node.attrs['mode'] - self.operatorRepresentation['pads'] = node.attrs['pads'] + + try: + self.operatorRepresentation['pads'] = [int(p) for p in node.attrs['pads']] + except Exception as e: + self.operatorRepresentation['pads'] = node.attrs['pads'] + self.operatorRepresentation['value'] = node.attrs['value'] return ret @@ -506,9 +529,14 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - if len(node.inputs) == 2: - # Float node, requiring 2 inputs (ONNX opset version >= 18) - wellFormed = all(['keepdims' in node.attrs, len(node.inputs) == 2, len(node.outputs) == 1]) + if 1 <= len(node.inputs) and ("axes" not in node.attrs): + # Float node, requiring 1 or 2 inputs (ONNX opset version >= 18). + # "axes" input is optional. + # If axes is not provided, then reduction will happen over all dimensions. 
+ # + # WARNING: noop_with_empty_axes attribute not handled + + wellFormed = all(['keepdims' in node.attrs, 1 <= len(node.inputs) <= 2, len(node.outputs) == 1]) if wellFormed: self.operatorRepresentation['keepdims'] = int(node.attrs['keepdims']) @@ -523,23 +551,47 @@ def parseNodeCtxt(self, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - if len(node.inputs) == 2: + if 1 <= len(node.inputs) and ("axes" not in node.attrs): + # Extract context information for Float ReduceMean node (ONNX opset version >= 18) data_in = ctxt.lookup(node.inputs[0].name) data_out = ctxt.lookup(node.outputs[0].name) - axes = ctxt.lookup(node.inputs[1].name) + # Extract axes as numpy sorted array + # If not provided, according to ONNX specification, reduction will happen over all dimensions + if len(node.inputs) == 2: + axes = ctxt.lookup(node.inputs[1].name) + + # Mark the axes variable to be excluded from the context, since only used in the template, as part of the operator representation + axes._live = False + axes._deploy = False + # Sort axes + axes = axes.values + axes.sort() + else: + axes = np.array(list(range(len(data_in.shape)))) + + # Remove axes reduced over singleton dimensions + # Keep first axis if only singleton dimensions are reduced + nonSingletonAxes = [] + for axis in axes: + if data_in.shape[axis] != 1: + nonSingletonAxes.append(axis) + if len(nonSingletonAxes) == 0: + nonSingletonAxes.append(axes[0]) + axes = np.array(nonSingletonAxes) + + # Update operator representation self.operatorRepresentation['data_in'] = data_in.name self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['data_in_shape'] = data_in.shape self.operatorRepresentation['data_out_shape'] = data_out.shape + self.operatorRepresentation['size'] = np.prod(data_in.shape) - self.operatorRepresentation['axisLength'] = data_in.shape[axes.values[0]] - self.operatorRepresentation['axes'] = axes.values - # Mark the axes variable to be excluded from the 
context, since only used in the template, as part of the operator representation - axes._live = False - axes._deploy = False + self.operatorRepresentation['axes'] = axes + self.operatorRepresentation['axisLength'] = data_in.shape[axes[0]] return ctxt, True else: @@ -747,6 +799,33 @@ def parseNodeCtxt(self, return ctxt, True +class GELUGradParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + + ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + upstream_grad = ctxt.lookup(node.inputs[0].name) + gelu_input = ctxt.lookup(node.inputs[1].name) + gelu_grad = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['grad_in'] = upstream_grad.name + self.operatorRepresentation['data_in'] = gelu_input.name + self.operatorRepresentation['grad_out'] = gelu_grad.name + self.operatorRepresentation['size'] = np.prod(upstream_grad.shape) + + return ctxt, True + + class RQSiGELUParser(GELUParser): def __init__(self): @@ -964,10 +1043,19 @@ def __init__(self): def parseNode(self, node: gs.Node) -> (bool): - ret = all(['axes' in node.attrs, len(node.inputs) == 1, len(node.outputs) == 1]) + # ONNX v11: 'axes' is a node attribute + if 'axes' in node.attrs: + ret = all(['axes' in node.attrs, len(node.inputs) == 1, len(node.outputs) == 1]) + # ONNX v13+: 'axes' becomes an input with the data + # Source: https://onnx.ai/onnx/operators/onnx__Unsqueeze.html + else: + ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - if ret: - self.operatorRepresentation['axes'] = node.attrs['axes'] + if ret and 'axes' in node.attrs: + axes_attr = node.attrs['axes'] + self.operatorRepresentation['axes'] = [int(axes_attr)] if isinstance(axes_attr, int) \ + else [int(a) for a in axes_attr] + # For opset 13+, axes will be extracted from the second input in parseNodeCtxt return 
ret @@ -976,13 +1064,26 @@ def parseNodeCtxt(self, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - inputs = ['data_in'] outputs = ['data_out'] - - for idx, inputNode in enumerate(node.inputs): - self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name - for idx, outputNode in enumerate(node.outputs): - self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name + if len(node.inputs) == 1: + inputs = ['data_in'] + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name + else: + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + # axes must be a constant; extract values + axes_buf = ctxt.lookup(node.inputs[1].name) + assert hasattr(axes_buf, 'values'), "Unsqueeze: expected constant 'axes' input for opset 13+" + axes_vals = np.array(axes_buf.values).astype(int).flatten().tolist() + self.operatorRepresentation['axes'] = axes_vals + # Do not deploy the axes tensor + axes_buf._live = False + axes_buf._deploy = False return ctxt, True @@ -1014,44 +1115,18 @@ def parseNodeCtxt(self, class ReshapeParser(NodeParser): - def __init__(self): - super().__init__() - def parseNode(self, node: gs.Node) -> (bool): - ret = all([len(node.inputs) == 2, len(node.outputs) == 1]) - return ret def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True) -> Tuple[NetworkContext, bool]: - - # Define names of node inputs and outputs, according to the ONNX standard - inputs = ['data_in', 'shape'] - outputs = ['data_out'] - - # Map inputs and outputs to their corresponding names in the operator representation - for idx, inputNode in 
enumerate(node.inputs): - self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name - for idx, outputNode in enumerate(node.outputs): - self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name - - # Update alias_of parameter for the output node - output_node = ctxt.lookup(node.outputs[outputs.index("data_out")].name) - input_node = ctxt.lookup(node.inputs[inputs.index("data_in")].name) - - # Prepare new aliases - new_output_node_aliases = input_node.get_aliases_of() - new_output_node_aliases.append(input_node.name) - - # Add new aliases - output_node.add_aliases(aliases_to_add = new_output_node_aliases) - - # Compute data size - self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) - + for tensor, symName in zip(node.inputs, ['data_in', 'shape']): + self.operatorRepresentation[symName] = ctxt.lookup(tensor.name).name + for tensor, symName in zip(node.outputs, ['data_out']): + self.operatorRepresentation[symName] = ctxt.lookup(tensor.name).name return ctxt, True @@ -1327,6 +1402,8 @@ def parseNodeCtxt(self, self.operatorRepresentation['batch'] = data_in.shape[0] self.operatorRepresentation['dim_im_in_x'] = 1 + + # Necessary, since we use the same Convlayer for all convolutions self.operatorRepresentation['dim_im_out_x'] = 1 if channels_first: @@ -1340,6 +1417,11 @@ def parseNodeCtxt(self, self.operatorRepresentation['ch_im_out'] = data_out.shape[2] self.operatorRepresentation['dim_im_out_y'] = data_out.shape[1] + self.operatorRepresentation[ + 'batchOffsetIn'] = self.operatorRepresentation['ch_im_in'] * self.operatorRepresentation['dim_im_in_y'] + self.operatorRepresentation['batchOffsetOut'] = self.operatorRepresentation[ + 'ch_im_out'] * self.operatorRepresentation['dim_im_out_y'] + if len(data_in.shape) == 3 and len(weight.shape) == 3: return newCtxt, True @@ -1621,6 +1703,36 @@ def parseNodeCtxt(self, return ctxt, True +class LayerNormGradParser(iLayerNormParser): + + def parseNode(self, 
node: gs.Node) -> (bool): + + ret = all(['epsilon' in node.attrs, len(node.inputs) == 4, len(node.outputs) == 1]) + + if ret: + self.operatorRepresentation['epsilon'] = node.attrs['epsilon'] + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + inputs = ['grad_in', 'data_in', 'weight', 'bias'] + outputs = ['grad_out'] + + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name + + self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) + self.operatorRepresentation['lastDimLength'] = ctxt.lookup(node.inputs[0].name).shape[-1] + + return ctxt, True + + class MatMulParser(NodeParser): def __init__(self, noBiasHoisting = True): @@ -1656,14 +1768,6 @@ def parseNodeCtxt(self, for idx, outputNode in enumerate(node.outputs): self.operatorRepresentation[outputs[idx]] = ctxt.lookup(outputNode.name).name - # Create fake C node for GEMM-compatibility and hoist it - if not self.noBiasHoisting: - values = np.zeros(ctxt.lookup(node.inputs[0].name).shape, dtype = inputNode.dtype) - zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values) - ctxt.hoistConstant(zeroTensor, _type = ctxt.lookup(inputNode.name)._type) - node.inputs.append(zeroTensor) - self.operatorRepresentation['C'] = f'{node.name}_C_Tensor' - # Store the input and output shapes in the operator representation self.operatorRepresentation['size'] = np.prod(ctxt.lookup(node.inputs[0].name).shape) self.operatorRepresentation['A_shape'] = ctxt.lookup(node.inputs[0].name).shape @@ -1746,8 +1850,7 @@ def parseNodeCtxt(self, class GEMMParser(MatMulParser): def __init__(self, noBiasHoisting = True): - self.noBiasHoisting = noBiasHoisting - super().__init__() + super().__init__(noBiasHoisting) def 
parseNode(self, node: gs.Node) -> (bool): @@ -1779,6 +1882,10 @@ def parseNode(self, node: gs.Node) -> (bool): else: self.operatorRepresentation['transB'] = 0 + if len(node.inputs) == 2 and not self.noBiasHoisting: + C = gs.Constant(f"{node.name}_C", np.zeros((1,))) + node.inputs.append(C) + return True # This might be a matmul node -> Cast up else: @@ -1810,18 +1917,6 @@ def parseNodeCtxt(self, # Create flag for same dimension between bias matrix and the final batch dimension self.operatorRepresentation['C_batched'] = (self.operatorRepresentation['batch'] == np.prod( newCtxt.lookup(node.inputs[2].name).shape[:-2])) - elif not self.noBiasHoisting: - # Create mock bias matrix if not present in the inputs - values = np.zeros((1)) - zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values) - newCtxt.hoistConstant(zeroTensor) - - # Store it in the operator representation - self.operatorRepresentation['C'] = f'{node.name}_C_Tensor' - self.operatorRepresentation['C_shape'] = (0,) - - # Create flag for same dimension between bias matrix and the final batch dimension - self.operatorRepresentation['C_batched'] = False self.operatorRepresentation['size'] = np.prod(newCtxt.lookup(node.inputs[0].name).shape) @@ -1955,6 +2050,32 @@ def parseNodeCtxt(self, return ctxt, True +class PowParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + return node.op == 'Pow' and len(node.inputs) == 2 and len(node.outputs) == 1 + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + # Lookup both inputs (data and exponent) + data_in = ctxt.lookup(node.inputs[0].name) + exponent_tensor = ctxt.lookup(node.inputs[1].name) + data_out = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['exponent'] = exponent_tensor.name + self.operatorRepresentation['data_out'] = data_out.name + 
self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + + return ctxt, True + + class DivParser(NodeParser): def __init__(self): @@ -2138,7 +2259,20 @@ def parseNodeCtxt(self, if ret: inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) > 2: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = 1 + else: + self.operatorRepresentation["has_bias"] = 0 + self.operatorRepresentation["bias"] = "NULL" + for idx, inputNode in enumerate(node.inputs): + if idx >= len(inputs): + raise IndexError( + f"Index {idx} out of range for inputs of length {len(inputs)} in node {inputNode.name}") + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name return newCtxt, True @@ -2285,7 +2419,7 @@ def parseNodeCtxt(self, class GenericGEMMParser(GEMMParser): - def __init__(self, noBiasHoisting = True): + def __init__(self, noBiasHoisting = False): super().__init__(noBiasHoisting) def parseNode(self, node: gs.Node) -> (bool): @@ -2557,3 +2691,194 @@ def parseNodeCtxt(self, self.operatorRepresentation['lr'] = node.attrs['lr'] return ctxt, True + + +class BatchNormParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + # Verify the attributes (epsilon is mandatory, momentum and training_mode are optional) + if 'epsilon' not in node.attrs: + return False + # Common Inputs: 5 (X, scale, B, mean, var) + if len(node.inputs) < 5: + return False + + # Save the attributes, default values are provided if not present + self.operatorRepresentation['epsilon'] = node.attrs.get('epsilon', 1e-5) + self.operatorRepresentation['momentum'] = node.attrs.get('momentum', 0.9) + self.operatorRepresentation['training_mode'] = node.attrs.get('training_mode', 0) + + return True + + def parseNodeCtxt(self, ctxt, node: gs.Node, channels_first: bool = True): + inputs = ['data_in', 'scale', 'bias', 'mean', 'variance'] + outputs = ['data_out'] + + for idx, inputNode in 
enumerate(node.inputs[:5]): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + # Output (Y) + self.operatorRepresentation[outputs[0]] = ctxt.lookup(node.outputs[0].name).name + + input_shape = ctxt.lookup(node.inputs[0].name).shape + # Save input shape information + self.operatorRepresentation['batch_size'] = input_shape[0] + self.operatorRepresentation['channel_size'] = input_shape[1] + self.operatorRepresentation['window_size'] = input_shape[2] + + return ctxt, True + + +class ConvTransposeParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + # Extract ONNX attributes with defaults + strides = node.attrs.get('strides', [1]) + + pads = node.attrs.get('pads', [0, 0]) + kernel_shape = node.attrs.get('kernel_shape', None) + dilations = node.attrs.get('dilations', [1]) + group = node.attrs.get('group', 1) + + # Check for required attributes + wellFormed = (kernel_shape is not None and len(node.outputs) == 1) + if wellFormed: + self.operatorRepresentation['strides'] = strides + self.operatorRepresentation['pads'] = pads + self.operatorRepresentation['kernel_shape'] = kernel_shape + self.operatorRepresentation['dilations'] = dilations + self.operatorRepresentation['group'] = group + self.operatorRepresentation['nodeName'] = node.name + self.operatorRepresentation['nodeOp'] = node.op + return wellFormed + + def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: bool = True): + # Register buffer names for codegen + self.operatorRepresentation['data_in'] = node.inputs[0].name + self.operatorRepresentation['weight'] = node.inputs[1].name + self.operatorRepresentation['data_out'] = node.outputs[0].name + if len(node.inputs) == 3: + self.operatorRepresentation['bias'] = node.inputs[2].name + self.operatorRepresentation['has_bias'] = "true" + else: + self.operatorRepresentation['has_bias'] = "false" + # Get output shape from context + data_out = 
ctxt.lookup(node.outputs[0].name) + out_shape = data_out.shape + if len(out_shape) == 3: + self.operatorRepresentation['dim_im_out_x'] = out_shape[2] + elif len(out_shape) == 4: + self.operatorRepresentation['dim_im_out_x'] = out_shape[2] + self.operatorRepresentation['dim_im_out_y'] = out_shape[3] + + stride_x, stride_y = 1, 1 + if "strides" in node.attrs: + stride_y = node.attrs["strides"][0] + stride_x = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_y + self.operatorRepresentation["stride_y"] = stride_y + self.operatorRepresentation["stride_x"] = stride_x + + if "kernel_shape" in node.attrs: + kernel_shape = node.attrs["kernel_shape"] + kernel_shape_x = kernel_shape[0] + # For 2D, kernel_shape may have two elements + kernel_shape_y = kernel_shape[1] if len(kernel_shape) > 1 else kernel_shape_x + else: + kernel_shape_x = 1 + kernel_shape_y = 1 + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + in_shape = data_in.shape + out_shape = data_out.shape + + self.operatorRepresentation['ch_im_in'] = in_shape[1] + self.operatorRepresentation['dim_im_in_y'] = in_shape[2] + self.operatorRepresentation['ch_im_out'] = out_shape[1] + self.operatorRepresentation['dim_im_out_y'] = out_shape[2] + + self.operatorRepresentation[ + 'batchOffsetIn'] = self.operatorRepresentation['ch_im_in'] * self.operatorRepresentation['dim_im_in_y'] + self.operatorRepresentation[ + 'batchOffsetOut'] = self.operatorRepresentation['ch_im_out'] * self.operatorRepresentation['dim_im_out_y'] + return ctxt, True + + +class ConvTranspose1DParser(ConvTransposeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + # 1D ConvTranspose expects 3D input/output and 3D weight + wellFormed = super().parseNode(node) + ret = False + if wellFormed: + ret = all([ + # Make sure strides are 2D + len(node.attrs['strides']) == 1, + len(node.attrs['pads']) == 2, + len(node.attrs['dilations']) == 1, + ]) + 
if ret: + + self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape'] + self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][0]) + self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][0]) + self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][0]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + data_in = newCtxt.lookup(node.inputs[0].name) + data_out = newCtxt.lookup(node.outputs[0].name) + in_shape = data_in.shape + out_shape = data_out.shape + self.operatorRepresentation['batch'] = in_shape[0] + self.operatorRepresentation['ch_im_in'] = in_shape[1] + self.operatorRepresentation['dim_im_in_y'] = in_shape[2] + self.operatorRepresentation['ch_im_out'] = out_shape[1] + self.operatorRepresentation['dim_im_out_y'] = out_shape[2] + self.operatorRepresentation[ + "batchOffsetIn"] = self.operatorRepresentation["ch_im_in"] * self.operatorRepresentation["dim_im_in_y"] + self.operatorRepresentation["batchOffsetOut"] = self.operatorRepresentation[ + "ch_im_out"] * self.operatorRepresentation["dim_im_out_y"] + return newCtxt, True + return ctxt, False + + +class SqrtParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + return node.op == 'Sqrt' and len(node.inputs) == 1 and len(node.outputs) == 1 + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name 
+ self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + + return ctxt, True diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py index 7a62a10346..7a842baba3 100644 --- a/Deeploy/Targets/Generic/Platform.py +++ b/Deeploy/Targets/Generic/Platform.py @@ -1,59 +1,39 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: GenericPlatform.py -# -# Last edited: 05.05.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: -# - Moritz Scherer, ETH Zurich -# - Philip Wiese, ETH Zurich -# - Calin Diaconu, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ - RemoveEmptyConvBiasPass + RemoveEmptyConvBiasPass, RemoveOnlySingletonReduceMeanPass from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer -from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConcatBindings, BasicConv1DBinding, \ - BasicConv2DBindings, BasicDebugPrintBindings, BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, \ - BasicDWConv2DBindings, BasicGatherBindings, BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, \ - BasicITASoftmaxBinding, BasicLayerNormBindings, BasicMatMulBindings, BasicMaxPool2DBindings, BasicMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicQuantBindings, BasicReduceMeanBindings, BasicReduceSumBindings, \ - BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, \ - BasicSliceBindings, BasicSoftmaxBindings, BasicTransposeBindings, DummyBinding -from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, DebugPrintLayer, DequantLayer, DivLayer, \ - GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, \ - QuantLayer, ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, \ - RQSiGELULayer, SliceLayer, SoftmaxLayer, TransposeLayer -from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DebugParser, DequantParser, DivParser, \ - DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, GenericConv2DParser, \ - GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, IntegerDivParser, \ - ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MulParser, Pad1DParser, Pad2DParser, \ - QuantParser, 
ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, \ - RQSiGELUParser, SliceParser, SoftmaxParser, TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser +from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicBatchNormBindings, BasicConcatBindings, \ + BasicConv1DBindings, BasicConv2DBindings, BasicConvTransposeBindings, BasicDebugPrintBindings, \ + BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \ + BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ + BasicLayerNormBindings, BasicMatMulBindings, BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, \ + BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, \ + BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \ + BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, \ + DummyBinding +from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \ + ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, \ + LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \ + ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \ + SoftmaxLayer, SqrtLayer, TransposeLayer +from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \ + DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \ + GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \ + IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, 
MaxPool1DParser, MulParser, \ + Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \ + RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \ + TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ iGELURequantMergePass AddMapper = NodeMapper(AddParser(), BasicAddBindings) -Conv1DMapper = NodeMapper(GenericConv1DParser(), [BasicConv1DBinding]) +Conv1DMapper = NodeMapper(GenericConv1DParser(), BasicConv1DBindings) Conv2DMapper = NodeMapper(GenericConv2DParser(), BasicConv2DBindings) ConcatMapper = NodeMapper(ConcatParser(), BasicConcatBindings) DebugMapper = NodeMapper(DebugParser(), BasicDebugPrintBindings) @@ -71,7 +51,10 @@ ITAPartialMaxMapper = NodeMapper(ITAPartialMaxParser(), [BasicITAPartialSoftmaxBinding]) MatMulMapper = NodeMapper(MatMulParser(), BasicMatMulBindings) MaxPoolMapper = NodeMapper(GenericMaxPool2DParser(), BasicMaxPool2DBindings) +MaxPool1DMapper = NodeMapper(MaxPool1DParser(), BasicMaxPool1DBindings) MulMapper = NodeMapper(MulParser(), BasicMulBindings) +PowMapper = NodeMapper(PowParser(), BasicPowBindings) +SqrtMapper = NodeMapper(SqrtParser(), BasicSqrtBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) ReduceMeanMapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings) @@ -87,7 +70,8 @@ UnsqueezeMapper = NodeMapper(UnsqueezeParser(), BasicReshapeBindings) QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) - +BatchNormalizationMapper = NodeMapper(BatchNormParser(), 
BasicBatchNormBindings) +ConvTransposeMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTransposeBindings) SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings) # Dummy nodes are intended for development purposes only! @@ -115,8 +99,10 @@ 'ITAPartialMax': ITAMaxLayer([ITAPartialMaxMapper]), 'MatMul': GEMMLayer([MatMulMapper]), 'MatMulInteger': MatMulLayer([MatMulMapper]), - 'MaxPool': MaxPoolLayer([MaxPoolMapper]), + 'MaxPool': MaxPoolLayer([MaxPool1DMapper, MaxPoolMapper]), 'Mul': MulLayer([MulMapper]), + 'Pow': PowLayer([PowMapper]), + 'Sqrt': SqrtLayer([SqrtMapper]), 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), 'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]), 'ReduceSum': ReduceSumLayer([ReduceSumMapper]), @@ -130,7 +116,9 @@ 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), 'Slice': SliceLayer([SliceMapper]), 'Quant': QuantLayer([QuantMapper]), - 'Dequant': DequantLayer([DequantMapper]) + 'Dequant': DequantLayer([DequantMapper]), + 'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]), + 'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper]) # # For example, you can use the DummpyMapper, in case you want to test # # deployment or optimizations with GlobalAveragePool nodes but did not yet # # implement the corresponding kernel @@ -166,17 +154,20 @@ class GenericStructBuffer(StructBuffer): deallocTemplate = NodeTemplate("") -GenericOptimizer = TopologyOptimizer([ - QuantPatternPass(), - DequantPatternPass(), - iGELURequantMergePass(), - MatMulAddMergePass(), - MergeConstAddAndRequantPass(), - ExtractPaddingFromConvPass(), - ExtractPaddingFromPoolPass(), - RemoveEmptyConvBiasPass(), - # DebugPrintPass(r'.*[Mm]at[Mm]ul.*', position = 'after'), -]) +GenericOptimizer = TopologyOptimizer( + [ + QuantPatternPass(), + DequantPatternPass(), + iGELURequantMergePass(), + MatMulAddMergePass(), + MergeConstAddAndRequantPass(), + ExtractPaddingFromConvPass(), + ExtractPaddingFromPoolPass(), + RemoveEmptyConvBiasPass(), + 
RemoveOnlySingletonReduceMeanPass(), + # DebugPrintPass(r'.*[Mm]at[Mm]ul.*', position = 'after'), + ], + name = "GenericOptimizer") includeList = ["DeeployBasicMath.h"] diff --git a/Deeploy/Targets/Generic/Templates/AddTemplate.py b/Deeploy/Targets/Generic/Templates/AddTemplate.py index 4eec289669..75c16ac429 100644 --- a/Deeploy/Targets/Generic/Templates/AddTemplate.py +++ b/Deeploy/Targets/Generic/Templates/AddTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: AddTemplate.py -# -# Last edited: 15.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/AllocateTemplate.py b/Deeploy/Targets/Generic/Templates/AllocateTemplate.py index b3638bcee6..9d004c059d 100644 --- a/Deeploy/Targets/Generic/Templates/AllocateTemplate.py +++ b/Deeploy/Targets/Generic/Templates/AllocateTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: AllocateTemplate.py -# -# Last edited: 15.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py b/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py new file mode 100644 index 0000000000..5377c91ca0 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// BatchNorm (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + BatchNorm_fp32( + ${data_in}, ${scale}, ${bias}, ${mean}, ${variance}, + ${data_out}, ${batch_size}, ${channel_size}, ${window_size} + ); +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/Generic/Templates/ClosureTemplate.py b/Deeploy/Targets/Generic/Templates/ClosureTemplate.py index 4398f635e6..50f667e7c7 100644 --- a/Deeploy/Targets/Generic/Templates/ClosureTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ClosureTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ClosureTemplate.py -# -# Last edited: 15.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate, OperatorRepresentation diff --git a/Deeploy/Targets/Generic/Templates/ConcatTemplate.py b/Deeploy/Targets/Generic/Templates/ConcatTemplate.py index e233e93726..17a66b91d3 100644 --- a/Deeploy/Targets/Generic/Templates/ConcatTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ConcatTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: ConcatTemplate.py -# -# Last edited: 19.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, Tuple diff --git a/Deeploy/Targets/Generic/Templates/ConvTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTemplate.py index c65f7ee259..51f292dcae 100644 --- a/Deeploy/Targets/Generic/Templates/ConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ConvTemplate.py -# -# Last edited: 04.01.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py new file mode 100644 index 0000000000..9bf864c91f --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +<% +batchOffsetIn = ch_im_in * dim_im_in_y +batchOffsetOut = ch_im_out * dim_im_out_y +%> + +// 1D Transposed Conv (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for (uint32_t n=0; n<${batch}; ++n) { + ConvTranspose1d_fp32( + ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, + ${weight}, ${ch_im_out}, ${dim_kernel_y}, + ${stride_y}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, ${dim_im_out_y} + ); + + ref_${data_out}_${data_in} += ${batchOffsetIn}; + ref_${data_out}_${data_out} += ${batchOffsetOut}; + } +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/Generic/Templates/DWConvTemplate.py b/Deeploy/Targets/Generic/Templates/DWConvTemplate.py index e4c8513931..aeeb1ac523 100644 --- a/Deeploy/Targets/Generic/Templates/DWConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/DWConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: DWConvTemplate.py -# -# Last edited: 05.01.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/DebugPrintTemplate.py b/Deeploy/Targets/Generic/Templates/DebugPrintTemplate.py index fce7d0bdeb..d67af90b40 100644 --- a/Deeploy/Targets/Generic/Templates/DebugPrintTemplate.py +++ b/Deeploy/Targets/Generic/Templates/DebugPrintTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: DebugPrintTemplate.py -# -# Last edited: 14.12.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple @@ -44,7 +23,7 @@ def alignToContext(self, ctxt: NetworkContext, operatorRepresentation['data_in_signed'] = data_in._signed operatorRepresentation['offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) - operatorRepresentation['output_name'] = ctxt._mangle(data_out.name) + operatorRepresentation['output_name'] = data_out.name return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/Generic/Templates/DequantTemplate.py b/Deeploy/Targets/Generic/Templates/DequantTemplate.py index 99eeecf3eb..80f34fa66c 100644 --- a/Deeploy/Targets/Generic/Templates/DequantTemplate.py +++ b/Deeploy/Targets/Generic/Templates/DequantTemplate.py @@ -1,28 +1,7 @@ -# ---------------------------------------------------------------------- - -# File: DequantTemplate.py - -# Last edited: 17.03.2025 - -# Copyright (C) 2025, ETH Zurich and University of Bologna. - -# Author: Federico Brancasi, ETH Zurich - -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# # SPDX-License-Identifier: Apache-2.0 -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at - -# www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/DummyTemplate.py b/Deeploy/Targets/Generic/Templates/DummyTemplate.py index 6ae2d03a77..b5123f9db1 100644 --- a/Deeploy/Targets/Generic/Templates/DummyTemplate.py +++ b/Deeploy/Targets/Generic/Templates/DummyTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: DummyTemplate.py -# -# Last edited: 15.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/FloatAddTemplate.py b/Deeploy/Targets/Generic/Templates/FloatAddTemplate.py index 51ff681cb6..ec680e1a6b 100644 --- a/Deeploy/Targets/Generic/Templates/FloatAddTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatAddTemplate.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FloatAddTemplate.py -# -# Last edited: 13.11.2024 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Authors: -# - Francesco Conti, UNIBO -# - Alberto Dequino, UNIBO -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/FloatConvTemplate.py b/Deeploy/Targets/Generic/Templates/FloatConvTemplate.py index faaead90ce..7519d33a21 100644 --- a/Deeploy/Targets/Generic/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatConvTemplate.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FLoatConvTemplate.py -# -# Last edited: 12.05.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Authors: -# - Run Wang, ETH Zurich -# - Calin Diaconu, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate @@ -52,3 +29,29 @@ } END_SINGLE_CORE """) + +reference1DTemplate = NodeTemplate(""" +<% +batchOffsetIn = ch_im_in * dim_im_in_y +batchOffsetOut = ch_im_out * dim_im_out_y +%> + // 1D FP Conv (Name: ${nodeName}, Op: ${nodeOp}) + BEGIN_SINGLE_CORE + ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + for (uint32_t n=0; n<${batch}; ++n) { + Conv1d_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, + ${weight}, ${ch_im_out}, ${dim_kernel_y}, + ${stride_y}, + ${bias}, + ${has_bias}, + ref_${data_out}_${data_out}, + ${dim_im_out_y} + ); + + ref_${data_out}_${data_in} += ${batchOffsetIn}; + ref_${data_out}_${data_out} += ${batchOffsetOut}; + } + END_SINGLE_CORE + """) \ No newline at end of file diff --git a/Deeploy/Targets/Generic/Templates/FloatDWConvTemplate.py b/Deeploy/Targets/Generic/Templates/FloatDWConvTemplate.py index 80ae4cb482..0e0fee7a86 100644 --- a/Deeploy/Targets/Generic/Templates/FloatDWConvTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatDWConvTemplate.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # -# File: FLoatDWConvTemplate.py -# -# Last edited: 12.05.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: -# - Calin Diaconu, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/FloatDivTemplate.py b/Deeploy/Targets/Generic/Templates/FloatDivTemplate.py index be713b3fda..34236311a0 100644 --- a/Deeploy/Targets/Generic/Templates/FloatDivTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatDivTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FloatDivTemplate.py -# -# Last edited: 23.01.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py index 7b011d76d5..fb74d01258 100644 --- a/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FloatGELUTemplate.py -# -# Last edited: 28.03.2025 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py index bc490f5abc..69bea8484e 100644 --- a/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatGemmTemplate.py @@ -1,27 +1,7 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: GemmTemplate.py.py -# -# Last edited: 27.01.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the Licens + from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" @@ -48,15 +28,15 @@ % if A_batched: ref_${data_out}_${A} += ${M} * ${N}; % endif - + % if B_batched: ref_${data_out}_${B} += ${N} * ${O}; % endif - + % if C_batched: ref_${data_out}_${C} += ${M} * ${O}; % endif - + ref_${data_out}_${data_out} += ${M} * ${O}; } END_SINGLE_CORE diff --git a/Deeploy/Targets/Generic/Templates/FloatLayernormTemplate.py b/Deeploy/Targets/Generic/Templates/FloatLayernormTemplate.py index f21c538541..cfb35b60c6 100644 --- a/Deeploy/Targets/Generic/Templates/FloatLayernormTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatLayernormTemplate.py @@ -1,32 +1,11 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FloatConvTemplate.py -# -# Last edited: 23.01.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" // FloatLayernorm (Name: ${nodeName}, Op: ${nodeOp}) - + SINGLE_CORE Layernorm_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${weight}, ${bias}, ${epsilon}, ${size}, ${lastDimLength}); """) \ No newline at end of file diff --git a/Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py index 1d6121b131..d8a9f5b4b2 100644 --- a/Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatMatMulTemplate.py @@ -1,27 +1,7 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MatMul.py.py -# -# Last edited: 27.01.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the Licens + from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" diff --git a/Deeploy/Targets/Generic/Templates/FloatMaxPoolTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMaxPoolTemplate.py index 8458330fb0..1eef5e0f4f 100644 --- a/Deeploy/Targets/Generic/Templates/FloatMaxPoolTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatMaxPoolTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MaxPoolTemplate.py -# -# Last edited: 24.01.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate @@ -37,7 +16,28 @@ ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, ref_${data_out}_${data_out} ); - + } END_SINGLE_CORE """) + +reference1DTemplate = NodeTemplate(""" +<% +batchOffsetIn = ch_im_in * dim_im_in_y +batchOffsetOut = ch_im_out * dim_im_out_y +%> + // 1D Float MaxPool (Name: ${nodeName}, Op: ${nodeOp}) + BEGIN_SINGLE_CORE + ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + for (uint32_t n=0; n<${batch}; ++n) { + MaxPool1d_fp32_fp32( + ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, + ${dim_kernel_y}, ${stride_y}, + ref_${data_out}_${data_out} + ); + ref_${data_out}_${data_in} += ${batchOffsetIn}; + ref_${data_out}_${data_out} += ${batchOffsetOut}; + } + END_SINGLE_CORE +""") \ No newline at end of file diff --git a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py index 908c26ef59..3c8c2da501 100644 --- a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MulTemplate.py -# -# Last edited: 02.09.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/FloatPadTemplate.py b/Deeploy/Targets/Generic/Templates/FloatPadTemplate.py index 4788c844ac..ad528910b7 100644 --- a/Deeploy/Targets/Generic/Templates/FloatPadTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatPadTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: PadTemplate.py -# -# Last edited: 27.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate @@ -29,7 +8,7 @@ <% y_offset_out = dim_im_out_ch*(pad_y*dim_im_out_y) x_offset_out = dim_im_out_ch*(pad_x) - width = dim_im_in_ch*dim_im_in_y + width = dim_im_in_ch*dim_im_in_y addoffsetOut = dim_im_out_ch * dim_im_out_y addoffsetIn = dim_im_in_ch * dim_im_in_y @@ -73,3 +52,42 @@ %endif END_SINGLE_CORE """) + +reference1DTemplate = NodeTemplate(""" +<% + x_offset_out = dim_im_out_ch*(pad_y) + width = dim_im_in_ch*dim_im_in_y + + startPosX = x_offset_out + +batchOffsetOut = dim_im_out_ch * dim_im_out_y +%> + +// 1D Float Pad (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + for (uint32_t i = 0; i < ${data_out_size}; i++) { + ${data_out}[i] = ${value}; + } + uint32_t xoffset_${data_out}_${data_in}; + uint32_t offset_in_${data_out}_${data_in} = 0; + + % if channels_first: + // NCHW Layout + for(uint32_t n=0; n<${batch}; n++){ + xoffset_${data_out}_${data_in} = n*${batchOffsetOut} +${pad_y}; + for (uint32_t c=0; c<${dim_im_in_ch}; ++c) { + memcpy(${data_out} + xoffset_${data_out}_${data_in}, ${data_in}+offset_in_${data_out}_${data_in}, ${dim_im_in_y}*sizeof(${data_out_type.referencedType.typeName})); + xoffset_${data_out}_${data_in} += ${dim_im_out_y}; + offset_in_${data_out}_${data_in} += ${dim_im_in_y}; + } + } + % else: + // NHWC Layout + for(uint32_t n=0; n<${batch}; n++){ + xoffset_${data_out}_${data_in} = n*${batchOffsetOut} + ${startPosX}; + memcpy(${data_out}+xoffset_${data_out}_${data_in}, ${data_in}+offset_in_${data_out}_${data_in}, ${width}*sizeof(${data_out_type.referencedType.typeName})); + offset_in_${data_out}_${data_in} += ${width}; + } + %endif +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/Generic/Templates/FloatPowTemplate.py b/Deeploy/Targets/Generic/Templates/FloatPowTemplate.py new file mode 100644 index 0000000000..83d177cc39 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/FloatPowTemplate.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# 
SPDX-License-Identifier: Apache-2.0 +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _PowTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Get input and output tensors + data_in = ctxt.lookup(operatorRepresentation['data_in']) + exponent = ctxt.lookup(operatorRepresentation['exponent']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + # Get data type (fp32) + data_type = data_in._type.typeName + operatorRepresentation['data_type'] = data_type + + # Get type width dynamically (e.g., 32, 64) + type_width = data_in._type.referencedType.typeWidth + operatorRepresentation['type_width'] = type_width + + # Calculate size + input_size = int(np.prod(data_in.shape)) + exponent_size = int(np.prod(exponent.shape)) + operatorRepresentation['size'] = input_size + + # Check if exponent is scalar (broadcasting) + if exponent_size == 1: + operatorRepresentation['is_scalar'] = True + # Get the full variable name with prefix + exponent_name = operatorRepresentation['exponent'] + operatorRepresentation['exponent_scalar'] = f"DeeployNetwork_{exponent_name}[0]" + else: + # Since currently the kernel only supports equally sized base-exponent data, + # for non-scalar, let's add a size check here (length of data_in should be equal to exponent length). 
+ if input_size != exponent_size: + raise ValueError(f"Pow operator mismatch: input size ({input_size}) " + f"must equal exponent size ({exponent_size}) for non-scalar exponents.") + + operatorRepresentation['is_scalar'] = False + operatorRepresentation['exponent_scalar'] = "NULL" + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _PowTemplate(""" +// Pow (Name: ${nodeName}, Op: ${nodeOp}) +% if is_scalar: +Pow_fp${type_width}_scalar_fp${type_width}(${data_in}, ${exponent_scalar}, ${data_out}, ${size}); +% else: +Pow_fp${type_width}_fp${type_width}_fp${type_width}(${data_in}, ${exponent}, ${data_out}, ${size}); +% endif +""") diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py index 719a06be6f..88ffc32de8 100644 --- a/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatReduceMeanTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: ReduceMeanTemplate.py -# -# Last edited: 04.06.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: Calin Diaconu, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple @@ -38,66 +17,78 @@ def alignToContext(self, ctxt: NetworkContext, data_in = ctxt.lookup(operatorRepresentation['data_in']) data_out = ctxt.lookup(operatorRepresentation['data_out']) + operatorRepresentation['input_offset'] = 0 if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"): operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + operatorRepresentation['output_offset'] = 0 if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): - operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_in.nLevels / 2) + operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels / 2) return ctxt, operatorRepresentation, [] referenceTemplate = _FloatReduceMeanTemplate(""" -// FloatReduceMean (Name: ${nodeName}, Op: ${nodeOp}) -BEGIN_SINGLE_CORE -${data_out_type.referencedType.typeName} ${data_out}_accumulator = 0; +## =============== Compute required variables =============== <% - +## Compute the total number of elements being reduced in one axis reduceLength = 1 + for i, axis in enumerate(axes): if axis < 0: axes[i] += len(data_in_shape) reduceLength = reduceLength * data_in_shape[axis] -%> -<% - shapeStr = '' - accessStr = '' -%> -% for idx, i in enumerate(data_in_shape[1:]): -<% + +## Compute the remaining dimensions after reduction +restDims = set(list(range(len(data_in_shape)))).difference(set(axes)) + +## =============== Prepare shape and access strings =============== +## shapeStr is going to have the [d1][d2]... format +## accessStr is going to have the [i_0][i_1]... 
format +shapeStr = '' +accessStr = '' + +for idx, i in enumerate(data_in_shape[1:]): shapeStr += '['+str(i)+']' -%> -% endfor -% for j in range(len(data_in_shape)): -<% + +for j in range(len(data_in_shape)): accessStr += '[i_'+str(j)+']' %> -% endfor + +## =============== Start of the actual template =============== +## Prepare variables +// ReduceMean (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + +${data_out_type.referencedType.typeName} ${data_out}_accumulator = 0; ${data_out_type.typeName} dummy_${data_out} = ${data_out}; -<% -restDims = set(list(range(len(data_in_shape)))).difference(set(axes)) -%> +## Iterate through non-reduced dimensions % for i in list(restDims): -for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++){ +for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++) { % endfor +## Initialize accumulator ${data_out}_accumulator = ${input_offset}*${reduceLength}; + +## Iterate through reduced dimensions and accumulate % for i in list(axes): -for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++){ +for(uint32_t i_${i} = 0; i_${i}<${data_in_shape[i]}; i_${i}++) { % endfor ${data_out}_accumulator += ((${data_in_type.referencedType.typeName} (*)${shapeStr})${data_in})${accessStr}; - % for i in range(len(axes)): } % endfor + +## Write back the mean value % if keepdims: -*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) ((${data_out}_accumulator / ${reduceLength} + ${output_offset}); +*dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) (${data_out}_accumulator / ${reduceLength} + ${output_offset}); % else: *dummy_${data_out}++ = (${data_out_type.referencedType.typeName}) (${data_out}_accumulator / ${reduceLength}); % endif % for i in range(len(restDims)): } % endfor + END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceSumTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceSumTemplate.py index d9579c3030..e9a9ff1742 100644 --- 
a/Deeploy/Targets/Generic/Templates/FloatReduceSumTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatReduceSumTemplate.py @@ -1,31 +1,11 @@ -# ---------------------------------------------------------------------- # -# File: ReduceSumTemplateFloat.py +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# Last edited: March 14, 2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# Modified for float support -# ---------------------------------------------------------------------- # # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" -// Float ReduceSum (Name: ${nodeName}, Op: ${nodeOp}) +// Float ReduceSum (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE float32_t ${data_out}_accumulator = 0.0f; <% reduceLength = 1 diff --git a/Deeploy/Targets/Generic/Templates/FloatReluTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReluTemplate.py index 7bb71487ee..c40d888d06 100644 --- a/Deeploy/Targets/Generic/Templates/FloatReluTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatReluTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FloatReluTemplate.py -# -# Last edited: 23.01.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/Generic/Templates/FloatSoftmaxTemplate.py index d0784f1378..118543c0b0 100644 --- a/Deeploy/Targets/Generic/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatSoftmaxTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FloatSoftmaxTemplate.py -# -# Last edited: 23.1.2025 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/FloatSqrtTemplate.py b/Deeploy/Targets/Generic/Templates/FloatSqrtTemplate.py new file mode 100644 index 0000000000..99d7ba0475 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/FloatSqrtTemplate.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _SqrtTemplate(NodeTemplate): + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + # Get input and output tensors + data_in = ctxt.lookup(operatorRepresentation['data_in']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + # Get data type (fp32) + data_type = data_in._type.typeName + operatorRepresentation['data_type'] = data_type + + type_width = data_in._type.referencedType.typeWidth + operatorRepresentation['type_width'] = type_width + + # Calculate size + operatorRepresentation['size'] = int(np.prod(data_in.shape)) + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _SqrtTemplate(""" +// Sqrt (Name: ${nodeName}, Op: ${nodeOp}) +Sqrt_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size}); +""") diff --git a/Deeploy/Targets/Generic/Templates/FreeTemplate.py b/Deeploy/Targets/Generic/Templates/FreeTemplate.py index 84e13d0583..4e8b74bc8c 100644 --- a/Deeploy/Targets/Generic/Templates/FreeTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FreeTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FreeTemplate.py -# -# Last edited: 
15.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/GatherTemplate.py b/Deeploy/Targets/Generic/Templates/GatherTemplate.py index 918125606f..dd5e534fa4 100644 --- a/Deeploy/Targets/Generic/Templates/GatherTemplate.py +++ b/Deeploy/Targets/Generic/Templates/GatherTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: GatherTemplate.py -# -# Last edited: 16.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/GemmTemplate.py b/Deeploy/Targets/Generic/Templates/GemmTemplate.py index 5bc780d223..62d760d15c 100644 --- a/Deeploy/Targets/Generic/Templates/GemmTemplate.py +++ b/Deeploy/Targets/Generic/Templates/GemmTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: GemmTemplate.py.py -# -# Last edited: 05.01.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/ITAMaxTemplate.py b/Deeploy/Targets/Generic/Templates/ITAMaxTemplate.py index 67d9b0f319..c943f5cf2c 100644 --- a/Deeploy/Targets/Generic/Templates/ITAMaxTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ITAMaxTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ITAMaxTemplate.py -# -# Last edited: 27.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple @@ -43,7 +22,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, size = operatorRepresentation['lastDimLength'] name = operatorRepresentation['nodeName'] + f"_buffer" ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer'] = ctxt._mangle(name) + operatorRepresentation['ctxtBuffer'] = name operatorRepresentation['ctxtBufferSize'] = size return ctxt, operatorRepresentation, [name] diff --git a/Deeploy/Targets/Generic/Templates/ITAPartialMaxTemplate.py b/Deeploy/Targets/Generic/Templates/ITAPartialMaxTemplate.py index f813dca776..c5b3675acd 100644 --- a/Deeploy/Targets/Generic/Templates/ITAPartialMaxTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ITAPartialMaxTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: ITAPartialMaxTemplate.py -# -# Last edited: 08.01.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, Tuple diff --git a/Deeploy/Targets/Generic/Templates/IntegerDivTemplate.py b/Deeploy/Targets/Generic/Templates/IntegerDivTemplate.py index 6946b60214..d6495a152a 100644 --- a/Deeploy/Targets/Generic/Templates/IntegerDivTemplate.py +++ b/Deeploy/Targets/Generic/Templates/IntegerDivTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: IntegerDivTemplate.py -# -# Last edited: 02.09.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/MatMulTemplate.py b/Deeploy/Targets/Generic/Templates/MatMulTemplate.py index 038be0c634..d1b25c1b0d 100644 --- a/Deeploy/Targets/Generic/Templates/MatMulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/MatMulTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MatMulTemplate.py -# -# Last edited: 02.09.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py b/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py index c66358e45d..1a1b3060bc 100644 --- a/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py +++ b/Deeploy/Targets/Generic/Templates/MaxPoolTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MaxPoolTemplate.py -# -# Last edited: 04.01.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/MulTemplate.py b/Deeploy/Targets/Generic/Templates/MulTemplate.py index f96b549090..5709eef4bf 100644 --- a/Deeploy/Targets/Generic/Templates/MulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/MulTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MulTemplate.py -# -# Last edited: 02.09.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/PadTemplate.py b/Deeploy/Targets/Generic/Templates/PadTemplate.py index dfcced9249..f0d4462648 100644 --- a/Deeploy/Targets/Generic/Templates/PadTemplate.py +++ b/Deeploy/Targets/Generic/Templates/PadTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: PadTemplate.py -# -# Last edited: 27.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/QuantTemplate.py b/Deeploy/Targets/Generic/Templates/QuantTemplate.py index 817f90a04d..79860652c3 100644 --- a/Deeploy/Targets/Generic/Templates/QuantTemplate.py +++ b/Deeploy/Targets/Generic/Templates/QuantTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # -# File: QuantTemplate.py -# -# Last edited: 12.03.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. 
-# -# Author: Federico Brancasi, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate @@ -35,23 +14,23 @@ def __init__(self, templateStr): referenceTemplate = _QuantTemplate(""" // Quantization (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE - + for (uint32_t i=0; i<${size}; i++) { // quantization formula float32_t input_val = ${data_in}[i]; float32_t scaled_val = input_val * ${scale}; // Multiply instead of divide float32_t shifted_val = scaled_val + ${zero_point}; - + // Round to nearest integer int32_t quantized = (int32_t)(shifted_val + 0.5f * (shifted_val >= 0 ? 
1 : -1)); - + // Clamp the value if (quantized < ${min_val}) quantized = ${min_val}; if (quantized > ${max_val}) quantized = ${max_val}; - + // Assign directly with explicit cast ${data_out}[i] = (${data_out_type.referencedType.typeName})quantized; - + } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/RQAddTemplate.py b/Deeploy/Targets/Generic/Templates/RQAddTemplate.py index dacc9ac627..35593ad133 100644 --- a/Deeploy/Targets/Generic/Templates/RQAddTemplate.py +++ b/Deeploy/Targets/Generic/Templates/RQAddTemplate.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: RQAddTemplate.py -# -# Last edited: 11.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/RQIntegerDivTemplate.py b/Deeploy/Targets/Generic/Templates/RQIntegerDivTemplate.py index 1f2e01c1ce..6ce4cba12f 100644 --- a/Deeploy/Targets/Generic/Templates/RQIntegerDivTemplate.py +++ b/Deeploy/Targets/Generic/Templates/RQIntegerDivTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: RQIntegerDivTemplate.py -# -# Last edited: 02.09.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/RQSiGELUTemplate.py b/Deeploy/Targets/Generic/Templates/RQSiGELUTemplate.py index 7783ade729..2d6a967936 100644 --- a/Deeploy/Targets/Generic/Templates/RQSiGELUTemplate.py +++ b/Deeploy/Targets/Generic/Templates/RQSiGELUTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: RQSiGELUTemplate.py -# -# Last edited: 13.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/RQSiHardswishTemplate.py b/Deeploy/Targets/Generic/Templates/RQSiHardswishTemplate.py index 4067ef0cf9..da2b8a78da 100644 --- a/Deeploy/Targets/Generic/Templates/RQSiHardswishTemplate.py +++ b/Deeploy/Targets/Generic/Templates/RQSiHardswishTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: RQSiHardswishTemplate.py -# -# Last edited: 23.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py b/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py index d43d6f7456..93d884eb87 100644 --- a/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ReduceMeanTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: ReduceMeanTemplate.py -# -# Last edited: 05.06.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/ReduceSumTemplate.py b/Deeploy/Targets/Generic/Templates/ReduceSumTemplate.py index 952efb4ce8..9fc4a301e5 100644 --- a/Deeploy/Targets/Generic/Templates/ReduceSumTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ReduceSumTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ReduceSumTemplate.py -# -# Last edited: 27.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, Tuple diff --git a/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py b/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py index 9041296c98..2fca2e0eb1 100644 --- a/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py +++ b/Deeploy/Targets/Generic/Templates/RequantShiftTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: RequantShiftTemplate.py -# -# Last edited: 14.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/ReshapeTemplate.py b/Deeploy/Targets/Generic/Templates/ReshapeTemplate.py index 221d9909d6..15b7d64bef 100644 --- a/Deeploy/Targets/Generic/Templates/ReshapeTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ReshapeTemplate.py @@ -1,31 +1,10 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: ReshapeTemplate.py -# -# Last edited: 16.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple -from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer class _ReshapeTemplate(NodeTemplate): @@ -46,9 +25,14 @@ def alignToContext(self, ctxt: NetworkContext, ctxt.globalObjects[operatorRepresentation["shape"]]._deploy = False ctxt.globalObjects[operatorRepresentation["shape"]]._live = False - inBuffer = ctxt.lookup(operatorRepresentation['data_in']) - outBuffer = ctxt.lookup(operatorRepresentation['data_out']) - outBuffer._alias = inBuffer.name + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # Link aliases to each buffer + bufferIn.aliases.add(bufferOut.name) + bufferOut.aliases.add(bufferIn.name) return ctxt, operatorRepresentation, [] diff --git a/Deeploy/Targets/Generic/Templates/SkipTemplate.py b/Deeploy/Targets/Generic/Templates/SkipTemplate.py index dee5b80c12..e8453ed2fd 100644 --- a/Deeploy/Targets/Generic/Templates/SkipTemplate.py +++ b/Deeploy/Targets/Generic/Templates/SkipTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: SkipTemplate.py -# -# Last edited: 16.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/SliceTemplate.py b/Deeploy/Targets/Generic/Templates/SliceTemplate.py index fa475e22dc..3ffaa46219 100644 --- a/Deeploy/Targets/Generic/Templates/SliceTemplate.py +++ b/Deeploy/Targets/Generic/Templates/SliceTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: SliceTemplate.py -# -# Last edited: 01.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple @@ -67,7 +46,7 @@ def alignToContext(self, ctxt: NetworkContext, for dim in data_in_shape[1:]: dimSteps.append(dimSteps[-1]//dim) %> -<% +<% transferSize = dimSteps[int(axes[-1])] %> <% diff --git a/Deeploy/Targets/Generic/Templates/TransposeTemplate.py b/Deeploy/Targets/Generic/Templates/TransposeTemplate.py index 0dfceacb8c..bbc50bcadc 100644 --- a/Deeploy/Targets/Generic/Templates/TransposeTemplate.py +++ b/Deeploy/Targets/Generic/Templates/TransposeTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: TransposeTemplate.py -# -# Last edited: 28.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/__init__.py b/Deeploy/Targets/Generic/Templates/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/Generic/Templates/__init__.py +++ b/Deeploy/Targets/Generic/Templates/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/Generic/Templates/iGELUTemplate.py b/Deeploy/Targets/Generic/Templates/iGELUTemplate.py index 0b3e1b8fc8..9ff716782c 100644 --- a/Deeploy/Targets/Generic/Templates/iGELUTemplate.py +++ b/Deeploy/Targets/Generic/Templates/iGELUTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: iGELUTemplate.py -# -# Last edited: 13.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/iHardswishTemplate.py b/Deeploy/Targets/Generic/Templates/iHardswishTemplate.py index 0dd7f65aef..da10fd50e3 100644 --- a/Deeploy/Targets/Generic/Templates/iHardswishTemplate.py +++ b/Deeploy/Targets/Generic/Templates/iHardswishTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iHardswishTemplate.py -# -# Last edited: 22.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/iLayernormTemplate.py b/Deeploy/Targets/Generic/Templates/iLayernormTemplate.py index 75a1a9b5c0..bd14213ee7 100644 --- a/Deeploy/Targets/Generic/Templates/iLayernormTemplate.py +++ b/Deeploy/Targets/Generic/Templates/iLayernormTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: ILayernormTemplate.py -# -# Last edited: 31.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, Tuple diff --git a/Deeploy/Targets/Generic/Templates/iNoNormTemplate.py b/Deeploy/Targets/Generic/Templates/iNoNormTemplate.py index 242962e306..562b3168a9 100644 --- a/Deeploy/Targets/Generic/Templates/iNoNormTemplate.py +++ b/Deeploy/Targets/Generic/Templates/iNoNormTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iNoNormTemplate.py -# -# Last edited: 22.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Generic/Templates/iRMSNormTemplate.py b/Deeploy/Targets/Generic/Templates/iRMSNormTemplate.py index 2f8859e026..0fe1e1338b 100644 --- a/Deeploy/Targets/Generic/Templates/iRMSNormTemplate.py +++ b/Deeploy/Targets/Generic/Templates/iRMSNormTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iRMSNormTemplate.py -# -# Last edited: 20.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/Templates/iSoftmaxPreAllocatedBuffTemplate.py b/Deeploy/Targets/Generic/Templates/iSoftmaxPreAllocatedBuffTemplate.py index 45b80a7bc2..9cf609deea 100644 --- a/Deeploy/Targets/Generic/Templates/iSoftmaxPreAllocatedBuffTemplate.py +++ b/Deeploy/Targets/Generic/Templates/iSoftmaxPreAllocatedBuffTemplate.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iSoftmaxPreAllocatedBuffTemplate.py -# -# Last edited: 09.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple, Union diff --git a/Deeploy/Targets/Generic/Templates/iSoftmaxTemplate.py b/Deeploy/Targets/Generic/Templates/iSoftmaxTemplate.py index be5c7f1e6c..81aca29330 100644 --- a/Deeploy/Targets/Generic/Templates/iSoftmaxTemplate.py +++ b/Deeploy/Targets/Generic/Templates/iSoftmaxTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: iSoftmaxTemplate.py -# -# Last edited: 30.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py index 0e932a89c6..e87f9abb62 100644 --- a/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/AddTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: AddTileConstraint.py -# -# Last edited: 05.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from .BOPTileConstraint import BOPTileConstraint diff --git a/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py index d5b77f9b8f..e1f6f0e71c 100644 --- a/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/BOPTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: BOPTileConstraint.py -# -# Last edited: 05.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/TileConstraints/ConcatTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/ConcatTileConstraint.py index 6e546ab1ea..1fc8967c4c 100644 --- a/Deeploy/Targets/Generic/TileConstraints/ConcatTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/ConcatTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: ConcatTileConstraint.py -# -# Last edited: 19.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import copy from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py index f2a794ffea..9f71012ffe 100644 --- a/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/MulTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: MulTileConstraint.py -# -# Last edited: 22.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from .BOPTileConstraint import BOPTileConstraint diff --git a/Deeploy/Targets/Generic/TileConstraints/NOPTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/NOPTileConstraint.py index 73293fba9c..d24abb4ba5 100644 --- a/Deeploy/Targets/Generic/TileConstraints/NOPTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/NOPTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FlattenTileConstraint.py -# -# Last edited: 02.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List diff --git a/Deeploy/Targets/Generic/TileConstraints/RQSiGELUTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/RQSiGELUTileConstraint.py index 5bea47e35e..6c78c96734 100644 --- a/Deeploy/Targets/Generic/TileConstraints/RQSiGELUTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/RQSiGELUTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: RQSiGELUTileConstraint.py -# -# Last edited: 22.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from .UnaryTileConstraint import UnaryTileConstraint diff --git a/Deeploy/Targets/Generic/TileConstraints/RQSiHardswishTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/RQSiHardswishTileConstraint.py index 98e3fd78a9..fd81d5d9a3 100644 --- a/Deeploy/Targets/Generic/TileConstraints/RQSiHardswishTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/RQSiHardswishTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: RQiHardswishTileConstraint.py -# -# Last edited: 23.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from .UnaryTileConstraint import UnaryTileConstraint diff --git a/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py index f9d53f8d82..c83d8b1e2a 100644 --- a/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py @@ -1,40 +1,18 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: TransposeTileConstraint.py -# -# Last edited: 01.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import uint16_t from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ - _invertPermutation, _permuteList + _invertPermutation, _permuteHyperRectangle from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import TilerModel -from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ - VariableReplacementScheme +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme class TransposeTileConstraint(TileConstraint): @@ -68,8 +46,6 @@ def serializeTilingSolution( inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) - inputInCubes = [] - replacementTypes = {} replacements: Dict[str, List[int]] = {} @@ -79,28 +55,16 @@ def serializeTilingSolution( replacementTypes[f"dimLen_{dim}"] = PointerClass(uint16_t) replacements[f"dimLen_{dim}"] = [] - perm = operatorRepresentation['perm'] - invPerm = _invertPermutation(perm) - - for cube in outputCubes: - - inCubeDims = _permuteList(cube.dims, invPerm) - - InCube = HyperRectangle(_permuteList(cube.offset, invPerm), inCubeDims) - inputInCubes.append(InCube) - - for dim in range(numDims): - replacements[f"dimLen_{dim}"].append(inCubeDims[dim]) - - inputLoadSchedule = [] - outputLoadSchedule = [] - - for a in inputInCubes: - inputLoadSchedule.append({"data_in": a}) - - for out in outputCubes: - outputLoadSchedule.append({"data_out": out}) + invPerm = _invertPermutation(operatorRepresentation['perm']) + inputCubes = [] + for outCube in outputCubes: + inCube = 
_permuteHyperRectangle(outCube, invPerm) + inputCubes.append(inCube) + for i, dim in enumerate(inCube.dims): + replacements[f"dimLen_{i}"].append(dim) + inputLoadSchedule = [{"data_in": cube} for cube in inputCubes] + outputLoadSchedule = [{"data_out": cube} for cube in outputCubes] tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) diff --git a/Deeploy/Targets/Generic/TileConstraints/UnaryTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/UnaryTileConstraint.py index 590ff87956..91e180dd2c 100644 --- a/Deeploy/Targets/Generic/TileConstraints/UnaryTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/UnaryTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: UnaryTileConstraint.py -# -# Last edited: 05.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py index d9fef0eb9c..091cb55a41 100644 --- a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: UntiledTileConstraint.py -# -# Last edited: 03.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple, Union diff --git a/Deeploy/Targets/Generic/TileConstraints/__init__.py b/Deeploy/Targets/Generic/TileConstraints/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/Generic/TileConstraints/__init__.py +++ b/Deeploy/Targets/Generic/TileConstraints/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/Generic/TileConstraints/iHardswishTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/iHardswishTileConstraint.py index 96abf977bb..d3d2e72520 100644 --- a/Deeploy/Targets/Generic/TileConstraints/iHardswishTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/iHardswishTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iHardswishTileConstraint.py -# -# Last edited: 22.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Generic/TileConstraints/iRMSNormTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/iRMSNormTileConstraint.py index 4cff06d064..b503fb5e91 100644 --- a/Deeploy/Targets/Generic/TileConstraints/iRMSNormTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/iRMSNormTileConstraint.py @@ -1,29 +1,7 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iRMSNormTileConstraint.py -# -# Last edited: 21.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import copy from typing import Dict, List, Tuple import numpy as np @@ -34,7 +12,8 @@ from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import TilerModel -from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme class iRMSNormTileConstraint(TileConstraint): @@ -75,7 +54,6 @@ def serializeTilingSolution( addrNames = ['data_in', 'weight', 'data_out'] inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) - replacements = {"size": []} replacementTypes = {"size": PointerClass(uint16_t)} @@ -87,9 +65,7 @@ def serializeTilingSolution( outputLoadSchedule = [] for cube in outputCubes: - - weightCube = copy.deepcopy(cube) - weightCube.dims = (cube.dims[-1],) + weightCube = HyperRectangle((cube.offset[-1],), (cube.dims[-1],)) inputLoadSchedule.append({"data_in": cube, "weight": weightCube}) for out in outputCubes: diff --git a/Deeploy/Targets/Generic/Tiler.py b/Deeploy/Targets/Generic/Tiler.py index 35df320a4f..9e55c706ca 100644 --- a/Deeploy/Targets/Generic/Tiler.py +++ b/Deeploy/Targets/Generic/Tiler.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: BasicTiler.py -# -# Last edited: 01.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConcatBindings, BasicReshapeBindings, \ BasicTransposeBindings diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py index ac6933bbbd..146bcf699e 100644 --- a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: BasicPasses.py -# -# Last edited: 28.04.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. import copy from collections import OrderedDict @@ -697,8 +676,8 @@ def _split_transposes_fun(graph: gs.Graph, match: Match, name: str): inputNode.outputs = [postSplitOutput] for node in originalNode.outputs.copy(): - nodeName = node.name + f"_transpose_in" - varName = node.name + f"_transpose_in_var" + nodeName = f"{t1.name}_{node.name}_transpose_in" + varName = f"{t1.name}_{node.name}_transpose_in_var" newOutput = gs.Variable(name = varName, dtype = np.float32, shape = t1.outputs[0].shape) transposeNode = gs.Node(name = nodeName, diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/Generic/TopologyOptimizationPasses/__init__.py +++ b/Deeploy/Targets/Generic/TopologyOptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index 8b62e41e76..c2c8d436f8 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: BasicCheckers.py -# -# Last edited: 16.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Authors: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import List, Optional, Sequence, Type @@ -619,3 +596,17 @@ def _inferNumLevels(self, inputs: List[VariableBuffer], def _inferSignedness(self, inputs: List[VariableBuffer], operatorRepresentation: OperatorRepresentation) -> Optional[List[bool]]: return [True] + + +class BatchNormChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [True] diff --git a/Deeploy/Targets/Generic/__init__.py b/Deeploy/Targets/Generic/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/Generic/__init__.py +++ b/Deeploy/Targets/Generic/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/MemPool/Bindings.py b/Deeploy/Targets/MemPool/Bindings.py index 38157c6997..cea42f2d03 100644 --- a/Deeploy/Targets/MemPool/Bindings.py +++ b/Deeploy/Targets/MemPool/Bindings.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MemPoolBindings.py -# -# Last edited: 13.11.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ diff --git a/Deeploy/Targets/MemPool/DataTypes.py b/Deeploy/Targets/MemPool/DataTypes.py index 11ed173617..fe790d28a8 100644 --- a/Deeploy/Targets/MemPool/DataTypes.py +++ b/Deeploy/Targets/MemPool/DataTypes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: MemPoolDataTypes.py -# -# Last edited: 08.01.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from dataclasses import dataclass diff --git a/Deeploy/Targets/MemPool/Deployer.py b/Deeploy/Targets/MemPool/Deployer.py index 788fe8db59..5431320978 100644 --- a/Deeploy/Targets/MemPool/Deployer.py +++ b/Deeploy/Targets/MemPool/Deployer.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MemPoolDeployer.py -# -# Last edited: 13.11.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Callable, Dict, Type diff --git a/Deeploy/Targets/MemPool/Layers.py b/Deeploy/Targets/MemPool/Layers.py index 6f0ef4b543..679b47459e 100644 --- a/Deeploy/Targets/MemPool/Layers.py +++ b/Deeploy/Targets/MemPool/Layers.py @@ -1,24 +1,3 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: MemPoolLayers.py -# -# Last edited: 13.11.2022 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/Deeploy/Targets/MemPool/Parsers.py b/Deeploy/Targets/MemPool/Parsers.py index 6166d1cdaf..a3081c5a3c 100644 --- a/Deeploy/Targets/MemPool/Parsers.py +++ b/Deeploy/Targets/MemPool/Parsers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MemPoolParsers.py -# -# Last edited: 13.11.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Tuple diff --git a/Deeploy/Targets/MemPool/Platform.py b/Deeploy/Targets/MemPool/Platform.py index 02ac39c5d7..48599736f4 100644 --- a/Deeploy/Targets/MemPool/Platform.py +++ b/Deeploy/Targets/MemPool/Platform.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: MemPoolPlatform.py -# -# Last edited: 05.05.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Authors: -# - Philip Wiese, ETH Zurich -# - Calin Diaconu, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict @@ -31,7 +8,7 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer -from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConv1DBinding, BasicConv2DBindings, \ +from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicConv1DBindings, BasicConv2DBindings, \ BasicDebugPrintBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \ BasicGELUBindings, BasicLayerNormBindings, BasicMulBindings, BasicPad1DBindings, BasicPad2DBindings, \ BasicReduceMeanBindings, BasicReduceSumBindings, BasicReshapeBindings, BasicRQIntegerDivBinding, \ @@ -60,7 +37,7 @@ # Fallback bindings from the generic platform # (they support a wider range of attribute values) -GenericConv1D_Mapper = NodeMapper(GenericConv1DParser(), [BasicConv1DBinding]) +GenericConv1D_Mapper = NodeMapper(GenericConv1DParser(), BasicConv1DBindings) GenericDWConv1D_Mapper = NodeMapper(GenericDWConv1DParser(), [BasicDWConv1DBinding]) GenericConv2D_Mapper = NodeMapper(GenericConv2DParser(), BasicConv2DBindings) GenericDWConv2D_Mapper = NodeMapper(GenericDWConv2DParser(), BasicDWConv2DBindings) @@ -186,22 +163,24 @@ class MemPoolStructBuffer(StructBuffer): deallocTemplate = NodeTemplate("") -MemPoolOptimizer = TopologyOptimizer([ - MemPoolFuseMHSAPass(H = 8, bias = False, preSoftMaxRQ = True, integerDiv = False), - MemPoolFuseMHSAPass(H = 1, bias = False, preSoftMaxRQ = True, integerDiv = False), - MemPoolFuseMHSAPass(H = -1, bias = False, preSoftMaxRQ = True, integerDiv = False), - MemPoolFuseMHSAPass(H = -1, bias = True, preSoftMaxRQ = True, integerDiv = False), - MemPoolSplitMHSAPass(), - iGELURequantMergePass(), - MatMulAddMergePass(), - SplitAddPass(), - MergeConstAddAndRequantPass(), - MemPoolMatMulRequantMergePass(), - MemPoolGEMMRequantMergePass(), - ExtractPaddingFromConvPass(), - 
ExtractPaddingFromPoolPass(), - # DebugPrintPass(r'.*[Mm]at[Mm]ul.*', position = 'after'), -]) +MemPoolOptimizer = TopologyOptimizer( + [ + MemPoolFuseMHSAPass(H = 8, bias = False, preSoftMaxRQ = True, integerDiv = False), + MemPoolFuseMHSAPass(H = 1, bias = False, preSoftMaxRQ = True, integerDiv = False), + MemPoolFuseMHSAPass(H = -1, bias = False, preSoftMaxRQ = True, integerDiv = False), + MemPoolFuseMHSAPass(H = -1, bias = True, preSoftMaxRQ = True, integerDiv = False), + MemPoolSplitMHSAPass(), + iGELURequantMergePass(), + MatMulAddMergePass(), + SplitAddPass(), + MergeConstAddAndRequantPass(), + MemPoolMatMulRequantMergePass(), + MemPoolGEMMRequantMergePass(), + ExtractPaddingFromConvPass(), + ExtractPaddingFromPoolPass(), + # DebugPrintPass(r'.*[Mm]at[Mm]ul.*', position = 'after'), + ], + name = "MemPoolOptimizer") includeList = ["DeeployMath.h", "runtime.h", "synchronization.h"] diff --git a/Deeploy/Targets/MemPool/Templates/AllocateTemplate.py b/Deeploy/Targets/MemPool/Templates/AllocateTemplate.py index 0e78db4008..40e03a3773 100644 --- a/Deeploy/Targets/MemPool/Templates/AllocateTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/AllocateTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: AllocateTemplate.py -# -# Last edited: 15.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/MemPool/Templates/ConvTemplate.py b/Deeploy/Targets/MemPool/Templates/ConvTemplate.py index 0eeddf086c..7539eebbf4 100644 --- a/Deeploy/Targets/MemPool/Templates/ConvTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/ConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: ConvTemplate.py -# -# Last edited: 02.12.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/MemPool/Templates/DWConvTemplate.py b/Deeploy/Targets/MemPool/Templates/DWConvTemplate.py index 1a5c4f8041..27252cb74b 100644 --- a/Deeploy/Targets/MemPool/Templates/DWConvTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/DWConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: DWConvTemplate.py -# -# Last edited: 09.01.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/MemPool/Templates/FreeTemplate.py b/Deeploy/Targets/MemPool/Templates/FreeTemplate.py index 899afa9896..10cffb0b18 100644 --- a/Deeploy/Targets/MemPool/Templates/FreeTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/FreeTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FreeTemplate.py -# -# Last edited: 15.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/MemPool/Templates/GemmTemplate.py b/Deeploy/Targets/MemPool/Templates/GemmTemplate.py index d4852ba00f..e5d53bd255 100644 --- a/Deeploy/Targets/MemPool/Templates/GemmTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/GemmTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: GemmTemplate.py -# -# Last edited: 16.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple @@ -62,9 +41,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, name = operatorRepresentation['nodeName'] + f"_buffer_A" operatorRepresentation['ctxtBuffer_A_size'] = size if isinstance(A, ConstantBuffer): - names += [name] - ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer_A'] = ctxt._mangle(name) + bufferName = ctxt.hoistTransientBuffer(name, size) + names += [bufferName] + operatorRepresentation['ctxtBuffer_A'] = bufferName else: operatorRepresentation['ctxtBuffer_A'] = operatorRepresentation['A'] @@ -72,9 +51,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, name = operatorRepresentation['nodeName'] + f"_buffer_B" operatorRepresentation['ctxtBuffer_B_size'] = size if isinstance(B, ConstantBuffer): - names += [name] - ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer_B'] = ctxt._mangle(name) + bufferName = ctxt.hoistTransientBuffer(name, size) + names += [bufferName] + operatorRepresentation['ctxtBuffer_B'] = bufferName else: operatorRepresentation['ctxtBuffer_B'] = operatorRepresentation['B'] @@ -82,9 +61,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, name = operatorRepresentation['nodeName'] + f"_buffer_C" operatorRepresentation['ctxtBuffer_C_size'] = size if isinstance(C, ConstantBuffer): - names += [name] - ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer_C'] = ctxt._mangle(name) + bufferName = ctxt.hoistTransientBuffer(name, size) + names += [bufferName] + operatorRepresentation['ctxtBuffer_C'] = bufferName else: operatorRepresentation['ctxtBuffer_C'] = operatorRepresentation['C'] diff --git a/Deeploy/Targets/MemPool/Templates/ITAMaxTemplate.py b/Deeploy/Targets/MemPool/Templates/ITAMaxTemplate.py index 81d3cb1c95..6bc8596a51 100644 --- a/Deeploy/Targets/MemPool/Templates/ITAMaxTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/ITAMaxTemplate.py @@ -1,27 +1,6 @@ -# 
---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ITAMaxTemplate.py -# -# Last edited: 27.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple @@ -44,7 +23,7 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, size = operatorRepresentation['lastDimLength'] * 192 name = operatorRepresentation['nodeName'] + f"_buffer" ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer'] = ctxt._mangle(name) + operatorRepresentation['ctxtBuffer'] = name operatorRepresentation['ctxtBufferSize'] = size return ctxt, operatorRepresentation, [name] diff --git a/Deeploy/Targets/MemPool/Templates/ITATemplate.py b/Deeploy/Targets/MemPool/Templates/ITATemplate.py index e5cf5ea76e..2905210048 100644 --- a/Deeploy/Targets/MemPool/Templates/ITATemplate.py +++ b/Deeploy/Targets/MemPool/Templates/ITATemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ITATemplate.py -# -# Last edited: 16.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, Tuple @@ -322,9 +301,8 @@ def alignToContext(self, ctxt: NetworkContext, if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels // 2) - operatorRepresentation['data_in_array'] = ctxt._mangle(operatorRepresentation['nodeName'] + f"_data_in_array") - operatorRepresentation['quant_params_array'] = ctxt._mangle(operatorRepresentation['nodeName'] + - f"_quant_params_array") + operatorRepresentation['data_in_array'] = f"{nodeName}_data_in_array" + operatorRepresentation['quant_params_array'] = f"{nodeName}_quant_params_array" return ctxt, operatorRepresentation, nameList diff --git a/Deeploy/Targets/MemPool/Templates/MHSATemplate.py b/Deeploy/Targets/MemPool/Templates/MHSATemplate.py index ed208d311c..c5e8322dea 100644 --- a/Deeploy/Targets/MemPool/Templates/MHSATemplate.py +++ b/Deeploy/Targets/MemPool/Templates/MHSATemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MHSATemplate.py -# -# Last edited: 30.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, Tuple diff --git a/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py b/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py index 400d556dc3..062ba3cc9e 100644 --- a/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/MatMulTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MatMulTemplate.py -# -# Last edited: 13.11.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py b/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py index 92adc98624..c57bbade77 100644 --- a/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/MaxPoolTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MaxPoolTemplate.py -# -# Last edited: 13.12.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py b/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py index b336af20ba..e6a42768e8 100644 --- a/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/RQGemmTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: RQGemmTemplate.py -# -# Last edited: 17.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple @@ -72,9 +51,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, name = operatorRepresentation['nodeName'] + f"_buffer_A" operatorRepresentation['ctxtBuffer_A_size'] = size if isinstance(A, ConstantBuffer): - names += [name] - ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer_A'] = ctxt._mangle(name) + bufferName = ctxt.hoistTransientBuffer(name, size) + names += [bufferName] + operatorRepresentation['ctxtBuffer_A'] = bufferName else: operatorRepresentation['ctxtBuffer_A'] = operatorRepresentation['A'] @@ -82,9 +61,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, name = operatorRepresentation['nodeName'] + f"_buffer_B" operatorRepresentation['ctxtBuffer_B_size'] = size if isinstance(B, ConstantBuffer): - names += [name] - ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer_B'] = ctxt._mangle(name) + bufferName = ctxt.hoistTransientBuffer(name, size) + names += [bufferName] + operatorRepresentation['ctxtBuffer_B'] = bufferName else: operatorRepresentation['ctxtBuffer_B'] = operatorRepresentation['B'] @@ -92,9 +71,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, name = operatorRepresentation['nodeName'] + f"_buffer_C" operatorRepresentation['ctxtBuffer_C_size'] = size if 
isinstance(C, ConstantBuffer): - names += [name] - ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer_C'] = ctxt._mangle(name) + bufferName = ctxt.hoistTransientBuffer(name, size) + names += [bufferName] + operatorRepresentation['ctxtBuffer_C'] = bufferName else: operatorRepresentation['ctxtBuffer_C'] = operatorRepresentation['C'] diff --git a/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py b/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py index d8165c6e51..76ad029fb4 100644 --- a/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/RQMatMulTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: RQMatMulTemplate.py -# -# Last edited: 02.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple @@ -77,9 +56,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, name = operatorRepresentation['nodeName'] + f"_buffer_A" operatorRepresentation['ctxtBuffer_A_size'] = size if isinstance(A, ConstantBuffer): - names += [name] - ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer_A'] = ctxt._mangle(name) + bufferName = ctxt.hoistTransientBuffer(name, size) + names += [bufferName] + operatorRepresentation['ctxtBuffer_A'] = bufferName else: operatorRepresentation['ctxtBuffer_A'] = operatorRepresentation['A'] @@ -87,9 +66,9 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, name = operatorRepresentation['nodeName'] + f"_buffer_B" operatorRepresentation['ctxtBuffer_B_size'] = size if isinstance(B, ConstantBuffer): - names += [name] - ctxt.hoistTransientBuffer(name, size) - operatorRepresentation['ctxtBuffer_B'] = ctxt._mangle(name) + bufferName = ctxt.hoistTransientBuffer(name, size) + names += [bufferName] + operatorRepresentation['ctxtBuffer_B'] = bufferName else: operatorRepresentation['ctxtBuffer_B'] = operatorRepresentation['B'] diff --git a/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py b/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py index bc1ca14404..7898790af0 100644 --- a/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py +++ b/Deeploy/Targets/MemPool/Templates/RequantShiftTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: RequantShiftTemplate.py -# -# Last edited: 24.04.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, Tuple diff --git a/Deeploy/Targets/MemPool/Templates/__init__.py b/Deeploy/Targets/MemPool/Templates/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/MemPool/Templates/__init__.py +++ b/Deeploy/Targets/MemPool/Templates/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py index 56b2683f9a..49f317caa4 100644 --- a/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/MemPool/TopologyOptimizationPasses/Passes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: MemPoolPasses.py -# -# Last edited: 13.11.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from functools import partial from typing import Dict, Union diff --git a/Deeploy/Targets/MemPool/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/MemPool/TopologyOptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/MemPool/TopologyOptimizationPasses/__init__.py +++ b/Deeploy/Targets/MemPool/TopologyOptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/MemPool/__init__.py b/Deeploy/Targets/MemPool/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/MemPool/__init__.py +++ b/Deeploy/Targets/MemPool/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/Neureka/Bindings.py b/Deeploy/Targets/Neureka/Bindings.py index 2a62cd58e5..4e73df784f 100644 --- a/Deeploy/Targets/Neureka/Bindings.py +++ b/Deeploy/Targets/Neureka/Bindings.py @@ -1,34 +1,10 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: NeurekaBindings.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# Luka Macan, University of Bologna -# Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import int8_t, int32_t, uint8_t from Deeploy.DeeployTypes import NodeBinding -from Deeploy.MemoryLevelExtension.MemoryLevels import NodeMemoryLevelChecker, memoryAwareNodeBindingExtension from Deeploy.Targets.Generic.TypeCheckers import ConvChecker from Deeploy.Targets.Neureka.Templates.ConvTemplate import NeurekaDenseConv2D_Template, NeurekaDWConv2D_Template, \ NeurekaPWConv2D_Template, NeurekaRqntDenseConv2D_Template, NeurekaRqntDWConv2D_Template, \ @@ -56,15 +32,6 @@ for weight_type in [uint8_t, int8_t] ] -NeurekaWmemRQSPWConv2DBindings = [ - memoryAwareNodeBindingExtension(binding, NodeMemoryLevelChecker([None, "WeightMemory_SRAM", None, None], [None])) - for binding in NeurekaRQSPWConv2DBindings -] -NeurekaWmemPWConv2DBindings = [ - memoryAwareNodeBindingExtension(binding, NodeMemoryLevelChecker([None, "WeightMemory_SRAM"], [None])) - for binding in NeurekaPWConv2DBindings -] - NeurekaRQSDWConv2DBindings = [ NodeBinding( PULPConvChecker( @@ -85,15 +52,6 @@ for weight_type in [uint8_t, int8_t] ] -NeurekaWmemRQSDWConv2DBindings = [ - memoryAwareNodeBindingExtension(binding, NodeMemoryLevelChecker([None, "WeightMemory_SRAM", None, None], [None])) - for binding in NeurekaRQSDWConv2DBindings -] -NeurekaWmemDWConv2DBindings = [ - memoryAwareNodeBindingExtension(binding, NodeMemoryLevelChecker([None, "WeightMemory_SRAM"], [None])) - for binding in NeurekaDWConv2DBindings -] - NeurekaRQSDenseConv2DBindings = [ NodeBinding( PULPConvChecker( @@ -114,12 +72,3 @@ for data_in_type in [uint8_t, int8_t] for weight_type in [uint8_t, int8_t] ] - -NeurekaWmemRQSDenseConv2DBindings = [ - memoryAwareNodeBindingExtension(binding, NodeMemoryLevelChecker([None, "WeightMemory_SRAM", None, None], [None])) - for binding in NeurekaRQSDenseConv2DBindings -] -NeurekaWmemDenseConv2DBindings = [ - memoryAwareNodeBindingExtension(binding, NodeMemoryLevelChecker([None, "WeightMemory_SRAM"], [None])) - 
for binding in NeurekaDenseConv2DBindings -] diff --git a/Deeploy/Targets/Neureka/Deployer.py b/Deeploy/Targets/Neureka/Deployer.py index c14d1abec1..be34e1f4d3 100644 --- a/Deeploy/Targets/Neureka/Deployer.py +++ b/Deeploy/Targets/Neureka/Deployer.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: Deployer.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Callable, Dict, Type @@ -29,7 +8,7 @@ from Deeploy.AbstractDataTypes import Pointer from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ - NeurekaNCHWtoNHWCPass, PULPNCHWtoNHWCPass + NCHWtoNHWCPass, PULPNCHWtoNHWCPass from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer from Deeploy.Targets.Neureka.TopologyOptimizationPasses.Passes import ConvEngineDiscolorationPass, \ NeurekaOptimizationPass @@ -54,7 +33,7 @@ def __init__(self, if self.Platform.engines[0].enable3x3: for idx in range(len(self.loweringOptimizer.passes)): if isinstance(self.loweringOptimizer.passes[idx], PULPNCHWtoNHWCPass): - self.loweringOptimizer.passes[idx] = NeurekaNCHWtoNHWCPass(self.default_channels_first) + self.loweringOptimizer.passes[idx] = NCHWtoNHWCPass(self.default_channels_first) self.loweringOptimizer.passes += [ ConvEngineDiscolorationPass(), diff --git a/Deeploy/Targets/Neureka/Engine.py b/Deeploy/Targets/Neureka/Engine.py index 7775f415fb..2585b1a688 100644 --- a/Deeploy/Targets/Neureka/Engine.py +++ b/Deeploy/Targets/Neureka/Engine.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: Engine.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from typing import List @@ -33,27 +12,17 @@ NeurekaRQSDenseConv2DParser, NeurekaRQSDWConv2DParser, NeurekaRQSPWConv2DParser from Deeploy.Targets.Neureka.Tiler import NeurekaDenseConv2DTilingReadyBindings, NeurekaDWConv2DTilingReadyBindings, \ NeurekaPWConv2DTilingReadyBindings, NeurekaRQSDenseConv2DTilingReadyBindings, \ - NeurekaRQSDWConv2DTilingReadyBindings, NeurekaRQSPWConv2DTilingReadyBindings, \ - NeurekaWmemDenseConv2DTilingReadyBindings, NeurekaWmemDWConv2DTilingReadyBindings, \ - NeurekaWmemPWConv2DTilingReadyBindings, NeurekaWmemRQSDenseConv2DTilingReadyBindings, \ - NeurekaWmemRQSDWConv2DTilingReadyBindings, NeurekaWmemRQSPWConv2DTilingReadyBindings + NeurekaRQSDWConv2DTilingReadyBindings, NeurekaRQSPWConv2DTilingReadyBindings from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer -NeurekaRqntPWConv2DMapper = NodeMapper( - NeurekaRQSPWConv2DParser(), NeurekaWmemRQSPWConv2DTilingReadyBindings + NeurekaRQSPWConv2DTilingReadyBindings) -NeurekaPWConv2DMapper = NodeMapper(NeurekaPWConv2DParser(), - NeurekaWmemPWConv2DTilingReadyBindings + NeurekaPWConv2DTilingReadyBindings) - -NeurekaRqntDWConv2DMapper = NodeMapper( - NeurekaRQSDWConv2DParser(), NeurekaWmemRQSDWConv2DTilingReadyBindings + NeurekaRQSDWConv2DTilingReadyBindings) -NeurekaDWConv2DMapper = NodeMapper(NeurekaDWConv2DParser(), - NeurekaWmemDWConv2DTilingReadyBindings + NeurekaDWConv2DTilingReadyBindings) - -NeurekaRqntDenseConv2DMapper = NodeMapper( - NeurekaRQSDenseConv2DParser(), - NeurekaWmemRQSDenseConv2DTilingReadyBindings + NeurekaRQSDenseConv2DTilingReadyBindings) -NeurekaDenseConv2DMapper = NodeMapper(NeurekaDenseConv2DParser(), - NeurekaWmemDenseConv2DTilingReadyBindings + NeurekaDenseConv2DTilingReadyBindings) +NeurekaRqntPWConv2DMapper = NodeMapper(NeurekaRQSPWConv2DParser(), NeurekaRQSPWConv2DTilingReadyBindings) +NeurekaPWConv2DMapper = NodeMapper(NeurekaPWConv2DParser(), 
NeurekaPWConv2DTilingReadyBindings) + +NeurekaRqntDWConv2DMapper = NodeMapper(NeurekaRQSDWConv2DParser(), NeurekaRQSDWConv2DTilingReadyBindings) +NeurekaDWConv2DMapper = NodeMapper(NeurekaDWConv2DParser(), NeurekaDWConv2DTilingReadyBindings) + +NeurekaRqntDenseConv2DMapper = NodeMapper(NeurekaRQSDenseConv2DParser(), NeurekaRQSDenseConv2DTilingReadyBindings) +NeurekaDenseConv2DMapper = NodeMapper(NeurekaDenseConv2DParser(), NeurekaDenseConv2DTilingReadyBindings) NeurekaMapping = { 'RequantizedConv': diff --git a/Deeploy/Targets/Neureka/Parsers.py b/Deeploy/Targets/Neureka/Parsers.py index caa1881e17..3c564c10b2 100644 --- a/Deeploy/Targets/Neureka/Parsers.py +++ b/Deeploy/Targets/Neureka/Parsers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: Parsers.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Tuple diff --git a/Deeploy/Targets/Neureka/Platform.py b/Deeploy/Targets/Neureka/Platform.py index b618cabdad..e83f7f20f4 100644 --- a/Deeploy/Targets/Neureka/Platform.py +++ b/Deeploy/Targets/Neureka/Platform.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: Platform.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Optional @@ -39,7 +18,7 @@ NeurekaOptimizer = TopologyOptimizer([ *PULPOptimizer.passes, RequantizedGemmToPwPass(), -]) +], name = "NeurekaOptimizer") class NeurekaConstantBuffer(ConstantBuffer): diff --git a/Deeploy/Targets/Neureka/Templates/AllocateTemplate.py b/Deeploy/Targets/Neureka/Templates/AllocateTemplate.py index 5a39360113..ef8897e84c 100644 --- a/Deeploy/Targets/Neureka/Templates/AllocateTemplate.py +++ b/Deeploy/Targets/Neureka/Templates/AllocateTemplate.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: AllocateTemplate.py -# -# Last edited: 09.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# Luka Macan, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Neureka/Templates/ConvTemplate.py b/Deeploy/Targets/Neureka/Templates/ConvTemplate.py index 2d658cc1ca..97253d6e12 100644 --- a/Deeploy/Targets/Neureka/Templates/ConvTemplate.py +++ b/Deeploy/Targets/Neureka/Templates/ConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: ConvTemplate.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from abc import abstractmethod from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Neureka/Templates/__init__.py b/Deeploy/Targets/Neureka/Templates/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/Neureka/Templates/__init__.py +++ b/Deeploy/Targets/Neureka/Templates/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py index 8457c17e88..814024a877 100644 --- a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py +++ b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDenseConstraint.py @@ -1,48 +1,27 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: NeurekaDenseConstraint.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t, uint32_t -from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer from Deeploy.Targets.Neureka.Templates.ConvTemplate import Neureka2DDenseConvTemplate, getInputAddrOffset, \ ioStridesFromDimensions +from Deeploy.Targets.Neureka.TileConstraints.RequantHelpers import requantAddGeometricalConstraint, requantLoadSchedule from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ - VariableReplacementScheme, calculateRectangleOffset + VariableReplacementScheme, calculateFlatOffsetInBytes class NeurekaDenseConv2DTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - # Get to-be-tiled tensor's buffers inputBufferName = parseDict['data_in'] weightBufferName = parseDict['weight'] outputBufferName = parseDict['data_out'] @@ -51,7 +30,6 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw padding = parseDict["pads"] dilation = parseDict["dilations"] - # Add I/O dimensions to the model as variables for bufferName in [inputBufferName, weightBufferName, outputBufferName]: tilerModel.addTensorDimToModel(ctxt, bufferName) @@ -69,9 +47,15 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) outputWidthVar = 
tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + # Map output dims to inputs dims - tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch - tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + + weightBuffer = ctxt.lookup(weightBufferName) + if hasattr(weightBuffer, "_memoryLevel") and weightBuffer._memoryLevel == "WeightMemory_SRAM": + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max()) + else: + tilerModel.addConstraint(weightOutChannelVar == outputChannelVar) inputBuffer = ctxt.lookup(inputBufferName) @@ -85,22 +69,11 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw @staticmethod def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - # Get to-be-tiled tensor's buffers - inputBuffer = ctxt.lookup(name = parseDict['data_in']) - weightBuffer = ctxt.lookup(name = parseDict['weight']) - outputBuffer = ctxt.lookup(name = parseDict['data_out']) - - inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) - inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) - inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) - - outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) - outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = parseDict['data_in'], dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = parseDict['data_in'], dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = parseDict['data_in'], dimIdx = 
3) strides = parseDict["strides"] - padding = parseDict["pads"] tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) @@ -122,7 +95,7 @@ def serializeTilingSolution( operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - addrNames = ['data_in', 'weight', 'data_out'] + addrNames = ['data_in', 'data_out'] inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) @@ -130,7 +103,6 @@ def serializeTilingSolution( varOut = operatorRepresentation['data_out'] inputInCubes = [] - inputWeightCubes = [] replacements: Dict[str, List[int]] = { "padding_y_top": [], "padding_y_bottom": [], @@ -182,272 +154,15 @@ def serializeTilingSolution( pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] - for cube in outputCubes: - (BatchOffset, HOffset, WOffset, COffset) = cube.offset - (BatchSize, HSize, WSize, CSize) = cube.dims - - InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) - padding_left, padding_right, padding_top, padding_bottom = padding_tuple - - replacements['padding_y_top'].append(padding_top) - replacements['padding_y_bottom'].append(padding_bottom) - replacements['padding_x_left'].append(padding_left) - replacements['padding_x_right'].append(padding_right) - - inBSize, inHSize, inWSize, inCSize = InCube.dims - - dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(inWSize, inCSize, - operatorRepresentation["input_bits"]) - replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) - replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) - dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, - operatorRepresentation["output_bits"]) - 
replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) - replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) - - replacements['input_addr_offset'].append( - getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) - - nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DDenseConvTemplate.getCounters( - inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) - - replacements["nKo"].append(nKo) - replacements["nKi"].append(nKi) - replacements["nHo"].append(nHo) - replacements["nWo"].append(nWo) - replacements["bKo"].append(bKo) - replacements["bKi"].append(bKi) - replacements["bHo"].append(bHo) - replacements["bWo"].append(bWo) - replacements["bHi"].append(bHi) - replacements["bWi"].append(bWi) - - inputInCubes.append(InCube) - - weightShape = ctxt.lookup(varWeight).shape - WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) - - inputWeightCubes.append(WeightCube) - - inputLoadSchedule = [] - outputLoadSchedule = [] - - for a, b in zip(inputInCubes, inputWeightCubes): - inputLoadSchedule.append({"data_in": a, "weight": b}) + outputBuffer = ctxt.lookup(varOut) + assert isinstance(outputBuffer, VariableBuffer) - for out in outputCubes: - outputLoadSchedule.append({"data_out": out}) - - tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) - variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) - - return variableReplacementSchedule, tilingSchedule - - -class NeurekaRQSDenseConv2DTileConstraint(NeurekaDenseConv2DTileConstraint): - - @staticmethod - def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - tilerModel = NeurekaDenseConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) - - outputBufferName = parseDict['data_out'] - mulBufferName = parseDict['mul'] - addBufferName = parseDict['add'] - - # 
Add I/O dimensions to the model as variables - for bufferName in [mulBufferName, addBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) - - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) - - tilerModel.addConstraint(outputChannelVar == addChannelVar) - tilerModel.addConstraint(outputChannelVar == mulChannelVar) - - return tilerModel - - @classmethod - def serializeTilingSolution( - cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], - targetMemLevel: str, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( - tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) - - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - - addrNames = ['mul', 'add'] - inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, - addrNames) - newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} - - inputRequantCubes = [] for cube in outputCubes: - (_, _, _, COffset) = cube.offset - (_, _, _, CSize) = cube.dims - inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) - newInputLoadSchedule = [{ - **schedule, "add": requant, - "mul": requant - } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] - - newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, - tilingSchedule.outputLoadSchedule) - - return variableReplacementSchedule, newTilingSchedule - - -class NeurekaWmemDenseConv2DTileConstraint(TileConstraint): - - @staticmethod - def 
addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - inputBufferName = parseDict['data_in'] - weightBufferName = parseDict['weight'] - outputBufferName = parseDict['data_out'] - - strides = parseDict["strides"] - padding = parseDict["pads"] - dilation = parseDict["dilations"] - - for bufferName in [inputBufferName, weightBufferName, outputBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) - - inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) - inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) - inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) - inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) - - outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) - outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) - outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - - # Map output dims to inputs dims - tilerModel.addConstraint(outputBatchVar == inputBatchVar) - - tilerModel.addConstraint(inputHeightVar >= 3) - tilerModel.addConstraint(inputWidthVar >= 3) - - inputBuffer = ctxt.lookup(inputBufferName) - - effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) - effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) - - tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (3 - 1) - 1) // strides[0] + 1)) - tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (3 - 1) - 1) // strides[1] + 1)) - - return tilerModel - - @staticmethod - def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - # Get to-be-tiled tensor's 
buffers - inputBuffer = ctxt.lookup(name = parseDict['data_in']) - weightBuffer = ctxt.lookup(name = parseDict['weight']) - outputBuffer = ctxt.lookup(name = parseDict['data_out']) - - inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 0) - inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) - inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) - inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) - - outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 0) - outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) - outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) - - strides = parseDict["strides"] - padding = parseDict["pads"] - - tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) - tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) - - tilerModel.addConstraint(inputChannelVar == inputChannelVar.Max()) - - tilerModel.addConstraint(inputHeightVar == inputHeightVar.Max(), strategy = PerformanceHint(1)) - tilerModel.addConstraint(inputWidthVar == inputWidthVar.Max(), strategy = PerformanceHint(1)) - - return tilerModel - - @classmethod - def serializeTilingSolution( - cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], - targetMemLevel: str, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - - addrNames = ['data_in', 'data_out'] - inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, - operatorRepresentation, addrNames) - - varWeight = operatorRepresentation['weight'] - varOut = 
operatorRepresentation['data_out'] - - inputInCubes = [] - replacements: Dict[str, List[int]] = { - "padding_y_top": [], - "padding_y_bottom": [], - "padding_x_left": [], - "padding_x_right": [], - "weight_addr_offset": [], - "dim_im_in_x_stride": [], - "dim_im_in_y_stride": [], - "dim_im_out_x_stride": [], - "dim_im_out_y_stride": [], - "input_addr_offset": [], - "nKo": [], - "nKi": [], - "nHo": [], - "nWo": [], - "bKo": [], - "bKi": [], - "bHo": [], - "bWo": [], - "bHi": [], - "bWi": [], - } - - replacementTypes = { - "padding_y_top": PointerClass(uint8_t), - "padding_y_bottom": PointerClass(uint8_t), - "padding_x_left": PointerClass(uint8_t), - "padding_x_right": PointerClass(uint8_t), - "weight_addr_offset": PointerClass(uint32_t), - "dim_im_in_x_stride": PointerClass(uint32_t), - "dim_im_in_y_stride": PointerClass(uint32_t), - "dim_im_out_x_stride": PointerClass(uint32_t), - "dim_im_out_y_stride": PointerClass(uint32_t), - "input_addr_offset": PointerClass(uint32_t), - "nKo": PointerClass(uint16_t), - "nKi": PointerClass(uint16_t), - "nHo": PointerClass(uint16_t), - "nWo": PointerClass(uint16_t), - "bKo": PointerClass(uint16_t), - "bKi": PointerClass(uint16_t), - "bHo": PointerClass(uint16_t), - "bWo": PointerClass(uint16_t), - "bHi": PointerClass(uint16_t), - "bWi": PointerClass(uint16_t), - } - - weightH = operatorRepresentation['dim_kernel_y'] - weightW = operatorRepresentation['dim_kernel_x'] - weightC = operatorRepresentation['ch_im_in'] - - pads = operatorRepresentation['pads'] - strides = operatorRepresentation['strides'] - - for absoluteCube in absoluteOutputCubes: - cube = absoluteCube.rectangle (BatchOffset, HOffset, WOffset, COffset) = cube.offset (BatchSize, HSize, WSize, CSize) = cube.dims InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) + cube, outputBuffer.shape) padding_left, padding_right, padding_top, padding_bottom = padding_tuple 
replacements['padding_y_top'].append(padding_top) @@ -485,11 +200,6 @@ def serializeTilingSolution( inputInCubes.append(InCube) - _, _, _, absoluteCOffset = absoluteCube.absoluteOffset - weightShape = ctxt.lookup(varWeight).shape - WeightCube = HyperRectangle((absoluteCOffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) - replacements['weight_addr_offset'].append(calculateRectangleOffset(WeightCube, ctxt.lookup(varWeight))) - inputLoadSchedule = [] outputLoadSchedule = [] @@ -499,34 +209,39 @@ def serializeTilingSolution( for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + weightBuffer = ctxt.lookup(varWeight) + assert isinstance(weightBuffer, VariableBuffer) + weightShape = weightBuffer.shape + + if hasattr(weightBuffer, "_memoryLevel") and weightBuffer._memoryLevel == "WeightMemory_SRAM": + replacements['weight_addr_offset'] = [] + replacementTypes['weight_addr_offset'] = PointerClass(uint32_t) + for absoluteCube in absoluteOutputCubes: + COffset, CSize = absoluteCube.absoluteOffset[-1], absoluteCube.rectangle.dims[-1] + WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + replacements['weight_addr_offset'].append(calculateFlatOffsetInBytes(WeightCube, weightBuffer)) + else: + inputWeightBaseOffsets, outputWeightBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, ['weight']) + inputBaseOffsets.update(inputWeightBaseOffsets) + outputBaseOffsets.update(outputWeightBaseOffsets) + + for cube, load in zip(outputCubes, inputLoadSchedule): + COffset, CSize = cube.offset[-1], cube.dims[-1] + load['weight'] = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, tilingSchedule -class 
NeurekaWmemRQSDenseConv2DTileConstraint(NeurekaWmemDenseConv2DTileConstraint): +class NeurekaRQSDenseConv2DTileConstraint(NeurekaDenseConv2DTileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - tilerModel = NeurekaWmemDenseConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) - - outputBufferName = parseDict['data_out'] - mulBufferName = parseDict['mul'] - addBufferName = parseDict['add'] - - # Add I/O dimensions to the model as variables - for bufferName in [mulBufferName, addBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) - - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) - - tilerModel.addConstraint(outputChannelVar == addChannelVar) - tilerModel.addConstraint(outputChannelVar == mulChannelVar) - - return tilerModel + tilerModel = NeurekaDenseConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + return requantAddGeometricalConstraint(tilerModel, parseDict, ctxt) @classmethod def serializeTilingSolution( @@ -536,22 +251,16 @@ def serializeTilingSolution( variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - addrNames = ['mul', 'add'] inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} - inputRequantCubes = [] - for cube in outputCubes: - (_, _, _, COffset) = cube.offset - (_, _, _, CSize) = cube.dims - inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + requantSchedule = 
requantLoadSchedule(absoluteOutputCubes, ctxt, operatorRepresentation) newInputLoadSchedule = [{ - **schedule, "add": requant, - "mul": requant - } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] + **load, + **rqLoad + } for load, rqLoad in zip(tilingSchedule.inputLoadSchedule, requantSchedule)] newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, tilingSchedule.outputLoadSchedule) diff --git a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py index 6364afcdf7..fd5d791119 100644 --- a/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py +++ b/Deeploy/Targets/Neureka/TileConstraints/NeurekaDepthwiseConstraint.py @@ -1,48 +1,27 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: NeurekaDepthwiseConstraint.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t, uint32_t -from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer from Deeploy.Targets.Neureka.Templates.ConvTemplate import Neureka2DDWConvTemplate, getInputAddrOffset, \ ioStridesFromDimensions +from Deeploy.Targets.Neureka.TileConstraints.RequantHelpers import requantAddGeometricalConstraint, requantLoadSchedule from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ - VariableReplacementScheme, calculateRectangleOffset + VariableReplacementScheme, calculateFlatOffsetInBytes class NeurekaDWConv2DTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - # Get to-be-tiled tensor's buffers inputBufferName = parseDict['data_in'] weightBufferName = parseDict['weight'] outputBufferName = parseDict['data_out'] @@ -51,7 +30,6 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw padding = parseDict["pads"] dilation = parseDict["dilations"] - # Add I/O dimensions to the model as variables for bufferName in [inputBufferName, weightBufferName, outputBufferName]: tilerModel.addTensorDimToModel(ctxt, bufferName) @@ -61,263 +39,6 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) weightOutChannelVar = 
tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) - weightInChannelMajorVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) - weightBitsVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) - weightBandwidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) - - outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) - outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) - outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - # Map output dims to inputs dims - tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch - tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel - tilerModel.addConstraint(outputChannelVar == inputChannelVar) # Output Channel - - tilerModel.addConstraint(inputHeightVar >= 3) - tilerModel.addConstraint(inputWidthVar >= 3) - - inputBuffer = ctxt.lookup(inputBufferName) - - effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) - effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) - - tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (3 - 1) - 1) // strides[0] + 1)) - tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (3 - 1) - 1) // strides[1] + 1)) - - return tilerModel - - @staticmethod - def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - # Get to-be-tiled tensor's buffers - inputBuffer = ctxt.lookup(name = parseDict['data_in']) - weightBuffer = ctxt.lookup(name = parseDict['weight']) - outputBuffer = ctxt.lookup(name = parseDict['data_out']) - - inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) - inputWidthVar 
= tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) - inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) - - outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) - outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) - - strides = parseDict["strides"] - padding = parseDict["pads"] - - tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) - tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) - - tilerModel.addConstraint(inputHeightVar == inputHeightVar.Max(), strategy = PerformanceHint(1)) - tilerModel.addConstraint(inputWidthVar == inputWidthVar.Max(), strategy = PerformanceHint(1)) - - return tilerModel - - @classmethod - def serializeTilingSolution( - cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], - targetMemLevel: str, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - - addrNames = ['data_in', 'weight', 'data_out'] - inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, - operatorRepresentation, addrNames) - - varWeight = operatorRepresentation['weight'] - varOut = operatorRepresentation['data_out'] - - inputInCubes = [] - inputWeightCubes = [] - replacements: Dict[str, List[int]] = { - "padding_y_top": [], - "padding_y_bottom": [], - "padding_x_left": [], - "padding_x_right": [], - "dim_im_in_x_stride": [], - "dim_im_in_y_stride": [], - "dim_im_out_x_stride": [], - "dim_im_out_y_stride": [], - "input_addr_offset": [], - "nKo": [], - "nKi": [], - "nHo": [], - "nWo": [], - "bKo": [], - "bKi": [], - "bHo": [], - "bWo": [], - "bHi": [], - "bWi": [], - } - - replacementTypes = { - "padding_y_top": 
PointerClass(uint8_t), - "padding_y_bottom": PointerClass(uint8_t), - "padding_x_left": PointerClass(uint8_t), - "padding_x_right": PointerClass(uint8_t), - "dim_im_in_x_stride": PointerClass(uint32_t), - "dim_im_in_y_stride": PointerClass(uint32_t), - "dim_im_out_x_stride": PointerClass(uint32_t), - "dim_im_out_y_stride": PointerClass(uint32_t), - "input_addr_offset": PointerClass(uint32_t), - "nKo": PointerClass(uint16_t), - "nKi": PointerClass(uint16_t), - "nHo": PointerClass(uint16_t), - "nWo": PointerClass(uint16_t), - "bKo": PointerClass(uint16_t), - "bKi": PointerClass(uint16_t), - "bHo": PointerClass(uint16_t), - "bWo": PointerClass(uint16_t), - "bHi": PointerClass(uint16_t), - "bWi": PointerClass(uint16_t), - } - - weightH = operatorRepresentation['dim_kernel_y'] - weightW = operatorRepresentation['dim_kernel_x'] - weightC = operatorRepresentation['ch_im_in'] - - pads = operatorRepresentation['pads'] - strides = operatorRepresentation['strides'] - - for cube in outputCubes: - (BatchOffset, HOffset, WOffset, COffset) = cube.offset - (BatchSize, HSize, WSize, CSize) = cube.dims - - InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) - padding_left, padding_right, padding_top, padding_bottom = padding_tuple - - replacements['padding_y_top'].append(padding_top) - replacements['padding_y_bottom'].append(padding_bottom) - replacements['padding_x_left'].append(padding_left) - replacements['padding_x_right'].append(padding_right) - - inBSize, inHSize, inWSize, inCSize = InCube.dims - - dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(inWSize, inCSize, - operatorRepresentation["input_bits"]) - replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) - replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) - dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, - operatorRepresentation["output_bits"]) - 
replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) - replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) - - replacements['input_addr_offset'].append( - getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) - - nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DDWConvTemplate.getCounters( - inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) - - replacements["nKo"].append(nKo) - replacements["nKi"].append(nKi) - replacements["nHo"].append(nHo) - replacements["nWo"].append(nWo) - replacements["bKo"].append(bKo) - replacements["bKi"].append(bKi) - replacements["bHo"].append(bHo) - replacements["bWo"].append(bWo) - replacements["bHi"].append(bHi) - replacements["bWi"].append(bWi) - - inputInCubes.append(InCube) - - weightShape = ctxt.lookup(varWeight).shape - WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) - - inputWeightCubes.append(WeightCube) - - inputLoadSchedule = [] - outputLoadSchedule = [] - - for a, b in zip(inputInCubes, inputWeightCubes): - inputLoadSchedule.append({"data_in": a, "weight": b}) - - for out in outputCubes: - outputLoadSchedule.append({"data_out": out}) - - tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) - variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) - - return variableReplacementSchedule, tilingSchedule - - -class NeurekaRQSDWConv2DTileConstraint(NeurekaDWConv2DTileConstraint): - - @staticmethod - def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - tilerModel = NeurekaDWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) - - outputBufferName = parseDict['data_out'] - mulBufferName = parseDict['mul'] - addBufferName = parseDict['add'] - - # Add I/O dimensions to the model as variables - for bufferName in [mulBufferName, addBufferName]: - 
tilerModel.addTensorDimToModel(ctxt, bufferName) - - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) - - tilerModel.addConstraint(outputChannelVar == addChannelVar) - tilerModel.addConstraint(outputChannelVar == mulChannelVar) - - return tilerModel - - @classmethod - def serializeTilingSolution( - cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], - targetMemLevel: str, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( - tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) - - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - - addrNames = ['mul', 'add'] - inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, - addrNames) - newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} - - inputRequantCubes = [] - for cube in outputCubes: - (_, _, _, COffset) = cube.offset - (_, _, _, CSize) = cube.dims - inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) - newInputLoadSchedule = [{ - **schedule, "add": requant, - "mul": requant - } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] - - newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, - tilingSchedule.outputLoadSchedule) - - return variableReplacementSchedule, newTilingSchedule - - -class NeurekaWmemDWConv2DTileConstraint(TileConstraint): - - @staticmethod - def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - inputBufferName = 
parseDict['data_in'] - weightBufferName = parseDict['weight'] - outputBufferName = parseDict['data_out'] - - strides = parseDict["strides"] - padding = parseDict["pads"] - dilation = parseDict["dilations"] - - for bufferName in [inputBufferName, weightBufferName, outputBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) - - inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) - inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) - inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) - inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) @@ -328,6 +49,12 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw tilerModel.addConstraint(outputBatchVar == inputBatchVar) tilerModel.addConstraint(outputChannelVar == inputChannelVar) + weightBuffer = ctxt.lookup(weightBufferName) + if hasattr(weightBuffer, "_memoryLevel") and weightBuffer._memoryLevel == "WeightMemory_SRAM": + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max()) + else: + tilerModel.addConstraint(weightOutChannelVar == outputChannelVar) + tilerModel.addConstraint(inputHeightVar >= 3) tilerModel.addConstraint(inputWidthVar >= 3) @@ -343,24 +70,10 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw @staticmethod def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - # Get to-be-tiled tensor's buffers - inputBuffer = ctxt.lookup(name = parseDict['data_in']) - weightBuffer = ctxt.lookup(name = parseDict['weight']) - outputBuffer = ctxt.lookup(name = parseDict['data_out']) - - inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 0) - 
inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) - inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) - inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) - - outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 0) - outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 1) - outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = parseDict['data_in'], dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = parseDict['data_in'], dimIdx = 2) strides = parseDict["strides"] - padding = parseDict["pads"] tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) @@ -390,7 +103,6 @@ def serializeTilingSolution( "padding_y_bottom": [], "padding_x_left": [], "padding_x_right": [], - "weight_addr_offset": [], "dim_im_in_x_stride": [], "dim_im_in_y_stride": [], "dim_im_out_x_stride": [], @@ -413,7 +125,6 @@ def serializeTilingSolution( "padding_y_bottom": PointerClass(uint8_t), "padding_x_left": PointerClass(uint8_t), "padding_x_right": PointerClass(uint8_t), - "weight_addr_offset": PointerClass(uint32_t), "dim_im_in_x_stride": PointerClass(uint32_t), "dim_im_in_y_stride": PointerClass(uint32_t), "dim_im_out_x_stride": PointerClass(uint32_t), @@ -438,8 +149,10 @@ def serializeTilingSolution( pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] - for absoluteCube in absoluteOutputCubes: - cube = absoluteCube.rectangle + outputBuffer = ctxt.lookup(varOut) + assert isinstance(outputBuffer, VariableBuffer) + + for cube in outputCubes: (BatchOffset, HOffset, WOffset, COffset) = cube.offset (BatchSize, HSize, WSize, CSize) = 
cube.dims @@ -483,11 +196,6 @@ def serializeTilingSolution( inputInCubes.append(InCube) - _, _, _, absoluteCOffset = absoluteCube.absoluteOffset - weightShape = ctxt.lookup(varWeight).shape - WeightCube = HyperRectangle((absoluteCOffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) - replacements['weight_addr_offset'].append(calculateRectangleOffset(WeightCube, ctxt.lookup(varWeight))) - inputLoadSchedule = [] outputLoadSchedule = [] @@ -497,34 +205,39 @@ def serializeTilingSolution( for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + weightBuffer = ctxt.lookup(varWeight) + assert isinstance(weightBuffer, VariableBuffer) + weightShape = weightBuffer.shape + + if hasattr(weightBuffer, "_memoryLevel") and weightBuffer._memoryLevel == "WeightMemory_SRAM": + replacements['weight_addr_offset'] = [] + replacementTypes['weight_addr_offset'] = PointerClass(uint32_t) + for absoluteCube in absoluteOutputCubes: + COffset, CSize = absoluteCube.absoluteOffset[-1], absoluteCube.rectangle.dims[-1] + WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + replacements['weight_addr_offset'].append(calculateFlatOffsetInBytes(WeightCube, weightBuffer)) + else: + inputWeightBaseOffsets, outputWeightBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, ['weight']) + inputBaseOffsets.update(inputWeightBaseOffsets) + outputBaseOffsets.update(outputWeightBaseOffsets) + + for cube, load in zip(outputCubes, inputLoadSchedule): + COffset, CSize = cube.offset[-1], cube.dims[-1] + load['weight'] = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, tilingSchedule -class NeurekaWmemRQSDWConv2DTileConstraint(NeurekaWmemDWConv2DTileConstraint): 
+class NeurekaRQSDWConv2DTileConstraint(NeurekaDWConv2DTileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - tilerModel = NeurekaWmemDWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) - - outputBufferName = parseDict['data_out'] - mulBufferName = parseDict['mul'] - addBufferName = parseDict['add'] - - # Add I/O dimensions to the model as variables - for bufferName in [mulBufferName, addBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) - - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) - - tilerModel.addConstraint(outputChannelVar == addChannelVar) - tilerModel.addConstraint(outputChannelVar == mulChannelVar) - - return tilerModel + tilerModel = NeurekaDWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + return requantAddGeometricalConstraint(tilerModel, parseDict, ctxt) @classmethod def serializeTilingSolution( @@ -534,22 +247,16 @@ def serializeTilingSolution( variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - addrNames = ['mul', 'add'] inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} - inputRequantCubes = [] - for cube in outputCubes: - (_, _, _, COffset) = cube.offset - (_, _, _, CSize) = cube.dims - inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + requantSchedule = requantLoadSchedule(absoluteOutputCubes, ctxt, operatorRepresentation) newInputLoadSchedule = [{ - 
**schedule, "add": requant, - "mul": requant - } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] + **load, + **rqLoad + } for load, rqLoad in zip(tilingSchedule.inputLoadSchedule, requantSchedule)] newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, tilingSchedule.outputLoadSchedule) diff --git a/Deeploy/Targets/Neureka/TileConstraints/NeurekaPointwiseConstraint.py b/Deeploy/Targets/Neureka/TileConstraints/NeurekaPointwiseConstraint.py index 303cc6a4e7..61a5b8756a 100644 --- a/Deeploy/Targets/Neureka/TileConstraints/NeurekaPointwiseConstraint.py +++ b/Deeploy/Targets/Neureka/TileConstraints/NeurekaPointwiseConstraint.py @@ -1,356 +1,55 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: NeurekaPointwiseConstraint.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t, uint32_t -from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer from Deeploy.Targets.Neureka.Templates.ConvTemplate import Neureka2DPWConvTemplate, getInputAddrOffset, \ ioStridesFromDimensions +from Deeploy.Targets.Neureka.TileConstraints.RequantHelpers import requantAddGeometricalConstraint, requantLoadSchedule from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ - VariableReplacementScheme, calculateRectangleOffset + VariableReplacementScheme, calculateFlatOffsetInBytes class NeurekaPWConv2DTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - # Get to-be-tiled tensor's buffers inputBufferName = parseDict['data_in'] weightBufferName = parseDict['weight'] outputBufferName = parseDict['data_out'] - # Add I/O dimensions to the model as variables for bufferName in [inputBufferName, weightBufferName, outputBufferName]: tilerModel.addTensorDimToModel(ctxt, bufferName) inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) - inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) weightOutChannelVar = 
tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) - weightInChannelMajorVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) - weightBandwidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - # Map output dims to inputs dims - tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch - tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel - tilerModel.addConstraint(outputHeightVar == inputHeightVar) - tilerModel.addConstraint(outputWidthVar == inputWidthVar) - - tilerModel.addConstraint(inputHeightVar >= 1) - tilerModel.addConstraint(inputWidthVar >= 1) - - return tilerModel - - @staticmethod - def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - # Get to-be-tiled tensor's buffers - inputBuffer = ctxt.lookup(name = parseDict['data_in']) - weightBuffer = ctxt.lookup(name = parseDict['weight']) - outputBuffer = ctxt.lookup(name = parseDict['data_out']) - - inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) - inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) - inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) - - weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) - weightInChannelMajorVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) - weightBandwidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2) - - outputHeightVar = tilerModel.getTensorDimVar(tensorName = 
outputBuffer.name, dimIdx = 1) - outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 2) - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = 3) - - strides = parseDict["strides"] - padding = parseDict["pads"] - - # LMACAN: Force full input channel to avoid partial results - tilerModel.addConstraint(inputChannelVar == inputChannelVar.Max()) - tilerModel.addConstraint(weightInChannelMajorVar == weightInChannelMajorVar.Max()) - tilerModel.addConstraint(weightBandwidthVar == weightBandwidthVar.Max()) - - tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) - tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) - - # N-EUREKA tile constraints to align with N-EUREKA's hardware subtiling - if parseDict["dim_im_out_x"] > 6: - tilerModel.addTileSizeDivisibleConstraint(parseDict, - "dim_im_out_x", - outputHeightVar, - 6, - strategy = PerformanceHint(priority = 3)) - else: - tilerModel.addConstraint(outputHeightVar == outputHeightVar.Max(), strategy = PerformanceHint(priority = 3)) - - if parseDict["dim_im_out_y"] > 6: - tilerModel.addTileSizeDivisibleConstraint(parseDict, - "dim_im_out_y", - outputWidthVar, - 6, - strategy = PerformanceHint(priority = 2)) - else: - tilerModel.addConstraint(outputWidthVar == outputWidthVar.Max(), strategy = PerformanceHint(priority = 2)) - - if parseDict["ch_im_out"] > 32: - tilerModel.addTileSizeDivisibleConstraint(parseDict, - "ch_im_out", - outputChannelVar, - 32, - strategy = PerformanceHint(priority = 1)) - else: - tilerModel.addConstraint(outputChannelVar == outputChannelVar.Max(), - strategy = PerformanceHint(priority = 1)) - - return tilerModel - - @classmethod - def serializeTilingSolution( - cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], - targetMemLevel: str, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = 
[cube.rectangle for cube in absoluteOutputCubes] - - addrNames = ['data_in', 'weight', 'data_out'] - inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, - operatorRepresentation, addrNames) - - varWeight = operatorRepresentation['weight'] - varOut = operatorRepresentation['data_out'] - - inputInCubes = [] - inputWeightCubes = [] - replacements: Dict[str, List[int]] = { - "padding_y_top": [], - "padding_y_bottom": [], - "padding_x_left": [], - "padding_x_right": [], - "dim_im_in_x_stride": [], - "dim_im_in_y_stride": [], - "dim_im_out_x_stride": [], - "dim_im_out_y_stride": [], - "input_addr_offset": [], - "nKo": [], - "nKi": [], - "nHo": [], - "nWo": [], - "bKo": [], - "bKi": [], - "bHo": [], - "bWo": [], - "bHi": [], - "bWi": [], - } - - replacementTypes = { - "padding_y_top": PointerClass(uint8_t), - "padding_y_bottom": PointerClass(uint8_t), - "padding_x_left": PointerClass(uint8_t), - "padding_x_right": PointerClass(uint8_t), - "dim_im_in_x_stride": PointerClass(uint32_t), - "dim_im_in_y_stride": PointerClass(uint32_t), - "dim_im_out_x_stride": PointerClass(uint32_t), - "dim_im_out_y_stride": PointerClass(uint32_t), - "input_addr_offset": PointerClass(uint32_t), - "nKo": PointerClass(uint16_t), - "nKi": PointerClass(uint16_t), - "nHo": PointerClass(uint16_t), - "nWo": PointerClass(uint16_t), - "bKo": PointerClass(uint16_t), - "bKi": PointerClass(uint16_t), - "bHo": PointerClass(uint16_t), - "bWo": PointerClass(uint16_t), - "bHi": PointerClass(uint16_t), - "bWi": PointerClass(uint16_t), - } - - weightH = operatorRepresentation['dim_kernel_y'] - weightW = operatorRepresentation['dim_kernel_x'] - weightC = operatorRepresentation['ch_im_in'] - - pads = operatorRepresentation['pads'] - strides = operatorRepresentation['strides'] - - for cube in outputCubes: - (BatchOffset, HOffset, WOffset, COffset) = cube.offset - (BatchSize, HSize, WSize, CSize) = cube.dims - - InCube, padding_tuple = 
Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) - padding_left, padding_right, padding_top, padding_bottom = padding_tuple - - replacements['padding_y_top'].append(padding_top) - replacements['padding_y_bottom'].append(padding_bottom) - replacements['padding_x_left'].append(padding_left) - replacements['padding_x_right'].append(padding_right) - - inBSize, inHSize, inWSize, inCSize = InCube.dims - - dim_im_in_x_stride, dim_im_in_y_stride = ioStridesFromDimensions(inWSize, inCSize, - operatorRepresentation["input_bits"]) - replacements['dim_im_in_x_stride'].append(dim_im_in_x_stride) - replacements['dim_im_in_y_stride'].append(dim_im_in_y_stride) - dim_im_out_x_stride, dim_im_out_y_stride = ioStridesFromDimensions(WSize, CSize, - operatorRepresentation["output_bits"]) - replacements['dim_im_out_x_stride'].append(dim_im_out_x_stride) - replacements['dim_im_out_y_stride'].append(dim_im_out_y_stride) - - replacements['input_addr_offset'].append( - getInputAddrOffset(inWSize, dim_im_in_y_stride, padding_top, padding_left)) - - nKo, nKi, nHo, nWo, bKo, bKi, bHo, bWo, bHi, bWi = Neureka2DPWConvTemplate.getCounters( - inCSize, HSize, WSize, CSize, padding_bottom, padding_right, operatorRepresentation) - - replacements["nKo"].append(nKo) - replacements["nKi"].append(nKi) - replacements["nHo"].append(nHo) - replacements["nWo"].append(nWo) - replacements["bKo"].append(bKo) - replacements["bKi"].append(bKi) - replacements["bHo"].append(bHo) - replacements["bWo"].append(bWo) - replacements["bHi"].append(bHi) - replacements["bWi"].append(bWi) - - inputInCubes.append(InCube) - - weightShape = ctxt.lookup(varWeight).shape - WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) - - inputWeightCubes.append(WeightCube) - - inputLoadSchedule = [] - outputLoadSchedule = [] - - for a, b in zip(inputInCubes, inputWeightCubes): - inputLoadSchedule.append({"data_in": a, "weight": b}) - - 
for out in outputCubes: - outputLoadSchedule.append({"data_out": out}) - - tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) - variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) - - return variableReplacementSchedule, tilingSchedule - - -class NeurekaRQSPWConv2DTileConstraint(NeurekaPWConv2DTileConstraint): - - @staticmethod - def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - tilerModel = NeurekaPWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) - - outputBufferName = parseDict['data_out'] - mulBufferName = parseDict['mul'] - addBufferName = parseDict['add'] - - # Add I/O dimensions to the model as variables - for bufferName in [mulBufferName, addBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) - - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) - - tilerModel.addConstraint(outputChannelVar == addChannelVar) - tilerModel.addConstraint(outputChannelVar == mulChannelVar) - - return tilerModel - - @classmethod - def serializeTilingSolution( - cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], - targetMemLevel: str, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( - tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) - - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - - addrNames = ['mul', 'add'] - inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, - addrNames) - 
newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} - - inputRequantCubes = [] - for cube in outputCubes: - (_, _, _, COffset) = cube.offset - (_, _, _, CSize) = cube.dims - inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) - newInputLoadSchedule = [{ - **schedule, "add": requant, - "mul": requant - } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] - - newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, - tilingSchedule.outputLoadSchedule) - - return variableReplacementSchedule, newTilingSchedule - - -class NeurekaWmemPWConv2DTileConstraint(TileConstraint): - - @staticmethod - def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - inputBufferName = parseDict['data_in'] - weightBufferName = parseDict['weight'] - outputBufferName = parseDict['data_out'] - - for bufferName in [inputBufferName, weightBufferName, outputBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) - - inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) - inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) - inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) - - weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) - - outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) - outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) - outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) - # Map output dims to inputs dims tilerModel.addConstraint(outputBatchVar == inputBatchVar) tilerModel.addConstraint(outputHeightVar == inputHeightVar) tilerModel.addConstraint(outputWidthVar == inputWidthVar) - # Don't tile weights in weight memory - 
tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max()) + weightBuffer = ctxt.lookup(weightBufferName) + if hasattr(weightBuffer, "_memoryLevel") and weightBuffer._memoryLevel == "WeightMemory_SRAM": + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max()) + else: + tilerModel.addConstraint(weightOutChannelVar == outputChannelVar) tilerModel.addConstraint(inputHeightVar >= 1) tilerModel.addConstraint(inputWidthVar >= 1) @@ -439,7 +138,6 @@ def serializeTilingSolution( "padding_y_bottom": [], "padding_x_left": [], "padding_x_right": [], - "weight_addr_offset": [], "dim_im_in_x_stride": [], "dim_im_in_y_stride": [], "dim_im_out_x_stride": [], @@ -462,7 +160,6 @@ def serializeTilingSolution( "padding_y_bottom": PointerClass(uint8_t), "padding_x_left": PointerClass(uint8_t), "padding_x_right": PointerClass(uint8_t), - "weight_addr_offset": PointerClass(uint32_t), "dim_im_in_x_stride": PointerClass(uint32_t), "dim_im_in_y_stride": PointerClass(uint32_t), "dim_im_out_x_stride": PointerClass(uint32_t), @@ -487,14 +184,15 @@ def serializeTilingSolution( pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] - for absoluteCube in absoluteOutputCubes: - cube = absoluteCube.rectangle + outputBuffer = ctxt.lookup(varOut) + assert isinstance(outputBuffer, VariableBuffer) + + for cube in outputCubes: (BatchOffset, HOffset, WOffset, COffset) = cube.offset (BatchSize, HSize, WSize, CSize) = cube.dims InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) + cube, outputBuffer.shape) padding_left, padding_right, padding_top, padding_bottom = padding_tuple replacements['padding_y_top'].append(padding_top) @@ -532,11 +230,6 @@ def serializeTilingSolution( inputInCubes.append(InCube) - _, _, _, absoluteCOffset = absoluteCube.absoluteOffset - weightShape = ctxt.lookup(varWeight).shape - WeightCube = HyperRectangle((absoluteCOffset, 0, 
0), (CSize, weightShape[-2], weightShape[-1])) - replacements['weight_addr_offset'].append(calculateRectangleOffset(WeightCube, ctxt.lookup(varWeight))) - inputLoadSchedule = [] outputLoadSchedule = [] @@ -546,34 +239,39 @@ def serializeTilingSolution( for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + weightBuffer = ctxt.lookup(varWeight) + assert isinstance(weightBuffer, VariableBuffer) + weightShape = weightBuffer.shape + + if hasattr(weightBuffer, "_memoryLevel") and weightBuffer._memoryLevel == "WeightMemory_SRAM": + replacements['weight_addr_offset'] = [] + replacementTypes['weight_addr_offset'] = PointerClass(uint32_t) + for absoluteCube in absoluteOutputCubes: + COffset, CSize = absoluteCube.absoluteOffset[-1], absoluteCube.rectangle.dims[-1] + WeightCube = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + replacements['weight_addr_offset'].append(calculateFlatOffsetInBytes(WeightCube, weightBuffer)) + else: + inputWeightBaseOffsets, outputWeightBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, ['weight']) + inputBaseOffsets.update(inputWeightBaseOffsets) + outputBaseOffsets.update(outputWeightBaseOffsets) + + for cube, load in zip(outputCubes, inputLoadSchedule): + COffset, CSize = cube.offset[-1], cube.dims[-1] + load['weight'] = HyperRectangle((COffset, 0, 0), (CSize, weightShape[-2], weightShape[-1])) + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, tilingSchedule -class NeurekaWmemRQSPWConv2DTileConstraint(NeurekaWmemPWConv2DTileConstraint): +class NeurekaRQSPWConv2DTileConstraint(NeurekaPWConv2DTileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - tilerModel = 
NeurekaWmemPWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) - - outputBufferName = parseDict['data_out'] - mulBufferName = parseDict['mul'] - addBufferName = parseDict['add'] - - # Add I/O dimensions to the model as variables - for bufferName in [mulBufferName, addBufferName]: - tilerModel.addTensorDimToModel(ctxt, bufferName) - - outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = 0) - mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = 0) - - tilerModel.addConstraint(outputChannelVar == addChannelVar) - tilerModel.addConstraint(outputChannelVar == mulChannelVar) - - return tilerModel + tilerModel = NeurekaPWConv2DTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) + return requantAddGeometricalConstraint(tilerModel, parseDict, ctxt) @classmethod def serializeTilingSolution( @@ -583,22 +281,16 @@ def serializeTilingSolution( variableReplacementSchedule, tilingSchedule = super().serializeTilingSolution( tilingSolution, absoluteOutputCubes, targetMemLevel, ctxt, operatorRepresentation) - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - addrNames = ['mul', 'add'] inputRequantBaseOffsets, _ = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) newInputBaseOffsets = {**tilingSchedule.inputBaseOffsets, **inputRequantBaseOffsets} - inputRequantCubes = [] - for cube in outputCubes: - (_, _, _, COffset) = cube.offset - (_, _, _, CSize) = cube.dims - inputRequantCubes.append(HyperRectangle((COffset,), (CSize,))) + requantSchedule = requantLoadSchedule(absoluteOutputCubes, ctxt, operatorRepresentation) newInputLoadSchedule = [{ - **schedule, "add": requant, - "mul": requant - } for schedule, requant in zip(tilingSchedule.inputLoadSchedule, inputRequantCubes)] + **load, + **rqLoad + } for load, rqLoad in 
zip(tilingSchedule.inputLoadSchedule, requantSchedule)] newTilingSchedule = TilingSchedule(newInputBaseOffsets, tilingSchedule.outputBaseOffsets, newInputLoadSchedule, tilingSchedule.outputLoadSchedule) diff --git a/Deeploy/Targets/Neureka/TileConstraints/RequantHelpers.py b/Deeploy/Targets/Neureka/TileConstraints/RequantHelpers.py new file mode 100644 index 0000000000..e1e4b16aea --- /dev/null +++ b/Deeploy/Targets/Neureka/TileConstraints/RequantHelpers.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle + + +def requantAddGeometricalConstraint(tilerModel: TilerModel, operatorRepresentation: OperatorRepresentation, + ctxt: NetworkContext) -> TilerModel: + outputBufferName = operatorRepresentation['data_out'] + mulBufferName = operatorRepresentation['mul'] + addBufferName = operatorRepresentation['add'] + + # Add I/O dimensions to the model as variables + for bufferName in [mulBufferName, addBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + addBuffer = ctxt.lookup(addBufferName) + addChannelVar = tilerModel.getTensorDimVar(tensorName = addBufferName, dimIdx = len(addBuffer.shape) - 1) + mulBuffer = ctxt.lookup(mulBufferName) + mulChannelVar = tilerModel.getTensorDimVar(tensorName = mulBufferName, dimIdx = len(mulBuffer.shape) - 1) + + tilerModel.addConstraint(outputChannelVar == addChannelVar) + tilerModel.addConstraint(outputChannelVar == mulChannelVar) + + return tilerModel + + +def requantLoadSchedule( + absoluteOutputCubes: List[AbsoluteHyperRectangle], + ctxt: NetworkContext, + operatorRepresentation: 
OperatorRepresentation, +) -> List[Dict[str, HyperRectangle]]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + shapeMul = ctxt.lookup(operatorRepresentation["mul"]).shape + shapeAdd = ctxt.lookup(operatorRepresentation["add"]).shape + + schedule = [] + for cube in outputCubes: + (_, _, _, COffset) = cube.offset + (_, _, _, CSize) = cube.dims + MulCube = HyperRectangle((0,) * (len(shapeMul) - 1) + (COffset,), (1,) * (len(shapeMul) - 1) + (CSize,)) + AddCube = HyperRectangle((0,) * (len(shapeAdd) - 1) + (COffset,), (1,) * (len(shapeAdd) - 1) + (CSize,)) + schedule.append({"mul": MulCube, "add": AddCube}) + + return schedule diff --git a/Deeploy/Targets/Neureka/TileConstraints/__init__.py b/Deeploy/Targets/Neureka/TileConstraints/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/Neureka/TileConstraints/__init__.py +++ b/Deeploy/Targets/Neureka/TileConstraints/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/Neureka/Tiler.py b/Deeploy/Targets/Neureka/Tiler.py index e4f07c1dda..dda00930c0 100644 --- a/Deeploy/Targets/Neureka/Tiler.py +++ b/Deeploy/Targets/Neureka/Tiler.py @@ -1,40 +1,16 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: Tiler.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.Targets.Neureka.Bindings import NeurekaDenseConv2DBindings, NeurekaDWConv2DBindings, \ - NeurekaPWConv2DBindings, NeurekaRQSDenseConv2DBindings, NeurekaRQSDWConv2DBindings, NeurekaRQSPWConv2DBindings, \ - NeurekaWmemDenseConv2DBindings, NeurekaWmemDWConv2DBindings, NeurekaWmemPWConv2DBindings, \ - NeurekaWmemRQSDenseConv2DBindings, NeurekaWmemRQSDWConv2DBindings, NeurekaWmemRQSPWConv2DBindings + NeurekaPWConv2DBindings, NeurekaRQSDenseConv2DBindings, NeurekaRQSDWConv2DBindings, NeurekaRQSPWConv2DBindings from Deeploy.Targets.Neureka.TileConstraints.NeurekaDenseConstraint import NeurekaDenseConv2DTileConstraint, \ - NeurekaRQSDenseConv2DTileConstraint, NeurekaWmemDenseConv2DTileConstraint, \ - NeurekaWmemRQSDenseConv2DTileConstraint + NeurekaRQSDenseConv2DTileConstraint from Deeploy.Targets.Neureka.TileConstraints.NeurekaDepthwiseConstraint import NeurekaDWConv2DTileConstraint, \ - NeurekaRQSDWConv2DTileConstraint, NeurekaWmemDWConv2DTileConstraint, NeurekaWmemRQSDWConv2DTileConstraint + NeurekaRQSDWConv2DTileConstraint from Deeploy.Targets.Neureka.TileConstraints.NeurekaPointwiseConstraint import NeurekaPWConv2DTileConstraint, \ - NeurekaRQSPWConv2DTileConstraint, NeurekaWmemPWConv2DTileConstraint, NeurekaWmemRQSPWConv2DTileConstraint + NeurekaRQSPWConv2DTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings NeurekaRQSPWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaRQSPWConv2DBindings, @@ -42,27 +18,12 @@ NeurekaPWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaPWConv2DBindings, tileConstraint = NeurekaPWConv2DTileConstraint()) -NeurekaWmemRQSPWConv2DTilingReadyBindings = TilingReadyNodeBindings( - nodeBindings = NeurekaWmemRQSPWConv2DBindings, tileConstraint = NeurekaWmemRQSPWConv2DTileConstraint()) -NeurekaWmemPWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaWmemPWConv2DBindings, - tileConstraint = NeurekaWmemPWConv2DTileConstraint()) - 
NeurekaRQSDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaRQSDWConv2DBindings, tileConstraint = NeurekaRQSDWConv2DTileConstraint()) NeurekaDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaDWConv2DBindings, tileConstraint = NeurekaDWConv2DTileConstraint()) -NeurekaWmemRQSDWConv2DTilingReadyBindings = TilingReadyNodeBindings( - nodeBindings = NeurekaWmemRQSDWConv2DBindings, tileConstraint = NeurekaWmemRQSDWConv2DTileConstraint()) -NeurekaWmemDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaWmemDWConv2DBindings, - tileConstraint = NeurekaWmemDWConv2DTileConstraint()) - NeurekaRQSDenseConv2DTilingReadyBindings = TilingReadyNodeBindings( nodeBindings = NeurekaRQSDenseConv2DBindings, tileConstraint = NeurekaRQSDenseConv2DTileConstraint()) NeurekaDenseConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = NeurekaDenseConv2DBindings, tileConstraint = NeurekaDenseConv2DTileConstraint()) - -NeurekaWmemRQSDenseConv2DTilingReadyBindings = TilingReadyNodeBindings( - nodeBindings = NeurekaWmemRQSDenseConv2DBindings, tileConstraint = NeurekaWmemRQSDenseConv2DTileConstraint()) -NeurekaWmemDenseConv2DTilingReadyBindings = TilingReadyNodeBindings( - nodeBindings = NeurekaWmemDenseConv2DBindings, tileConstraint = NeurekaWmemDenseConv2DTileConstraint()) diff --git a/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py index f3fdeafcb0..84e0565b97 100644 --- a/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/Passes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: Passes.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import itertools import math diff --git a/Deeploy/Targets/Neureka/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/Neureka/TopologyOptimizationPasses/__init__.py +++ b/Deeploy/Targets/Neureka/TopologyOptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/Neureka/__init__.py b/Deeploy/Targets/Neureka/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/Neureka/__init__.py +++ b/Deeploy/Targets/Neureka/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 547a29af10..e1a9ed5932 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPBindings.py -# -# Last edited: 10.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Authors: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurichs -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import itertools from functools import partial @@ -31,32 +8,35 @@ from Deeploy.AbstractDataTypes import PointerClass from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureGeneration, MemoryAwareClosureGeneration from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ - MemoryManagementGeneration -from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes, SignedIntegerDataTypes, float32_t, int8_t, int32_t, \ - uint8_t + MemoryManagementGeneration, MemoryPassthroughGeneration +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, SignedIntegerDataTypes, float32_t, \ + int8_t, int32_t, int64_t, uint8_t from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ - GatherTemplate, QuantTemplate, RQSiGELUTemplate, iHardswishTemplate + GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate from 
Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ - QuantChecker, ReduceMeanChecker, ReluChecker, RQAddChecker, RQHardswishChecker, SGDChecker, SliceChecker, \ - SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker + QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ + SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, \ - FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, \ - FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, \ - ReduceMeanTemplate, RequantShiftTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SliceTemplate, \ - SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ - iRMSNormTemplate, iSoftmaxTemplate +from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack +from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ + FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ + FloatMulTemplate, 
FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ + MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \ + RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \ + TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker -from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement +from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ + TilingVariableReplacementUpdate _clusterEntryClosureCallTemplate = NodeTemplate(""" // ${closureName} CLOSURE CALL @@ -73,6 +53,12 @@ pi_cl_team_fork(NUM_CORES, (void*)${closureName}, &${closureStructArgName}); """) +SkipTransformer = CodeTransformation( + [ArgumentStructGeneration(), + MemoryPassthroughGeneration("L.*"), + MemoryPassthroughGeneration(), + FutureGeneration()]) + FunctionCallClosure = partial(ClosureGeneration, closureSuffix = "_closure") ClusterClosure = partial(ClosureGeneration, closureSuffix = "_cluster_entry", @@ -115,29 +101,31 @@ TilingCallClosure(writeback = False), PULPSynchCoresPass(), ForkClosure(writeback = False, generateStruct = True), - PULPClusterTiling("L1"), + TilingVariableReplacementUpdate("L1"), + PULPClusterTiling("L2", "L1", MchanDma()), ArgumentStructGeneration(), MemoryManagementGeneration("L1"), - MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), TilingVariableReplacement("L2"), - PULPL3Tiling("L2"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + PULPL3Tiling("L3", "L2", l3DmaHack), PULPProfileUntiled(), ArgumentStructGeneration(), L3MemoryAwareFunctionCallClosure(writeback = False), - MemoryManagementGeneration("L3.*"), 
MemoryManagementGeneration("L2"), + MemoryManagementGeneration("L3.*"), MemoryManagementGeneration(), ]) ClusterTransformer = CodeTransformation([ TilingVariableReplacement("L1"), TilingCallClosure(writeback = False, generateStruct = True), - PULPClusterTiling("L1"), + TilingVariableReplacementUpdate("L1"), + PULPClusterTiling("L2", "L1", MchanDma()), ArgumentStructGeneration(), MemoryManagementGeneration("L1"), - MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), TilingVariableReplacement("L2"), - PULPL3Tiling("L2"), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + PULPL3Tiling("L3", "L2", l3DmaHack), PULPProfileUntiled(), ArgumentStructGeneration(), L3MemoryAwareFunctionCallClosure(writeback = False), @@ -160,10 +148,30 @@ PointerClass(uint8_t), PointerClass(uint8_t), PointerClass(uint8_t) - ], [PULPDMAFuture(underlyingType = type)]), SliceTemplate.referenceTemplate, MemoryAwareForkTransformer) + ], [PULPDMAFuture(underlyingType = type)]), DMASliceTemplate.referenceTemplate, MemoryAwareForkTransformer) for type in IntegerDataTypes ] +PULPSliceBindings = [ + NodeBinding( + SliceChecker( + [ + PointerClass(float_type), # data_in + PointerClass(int_type), # starts + PointerClass(int_type), # ends + PointerClass(int_type), # axes + PointerClass(int_type) # steps + ], + [PointerClass(float_type)]), + SliceTemplate.referenceTemplate, + ForkTransformer) for float_type in FloatDataTypes for int_type in IntegerDataTypes +] + +PULPReshapeBindings = [ + NodeBinding(ReshapeChecker([PointerClass(type), PointerClass(int64_t)], [PointerClass(type)]), + ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes +] + PULPRQAddBindings = [ NodeBinding(RQAddChecker([PointerClass(_type), PointerClass(_type2)], [PointerClass(_type3)]), RQAddTemplate.referenceTemplate, ForkTransformer) @@ -229,6 +237,14 @@ ForkTransformer) ] +PULPFloatDWConv2DBindings = [ + NodeBinding( + ConvChecker( + 
[PointerClass(float_type), PointerClass(float_type), + PointerClass(float_type)], [PointerClass(float_type)]), FloatConvTemplate.referenceDW2DIm2ColTemplate, + ForkTransformer) for float_type in FloatDataTypes +] + PULPRQSMatrixVecBindings = [ NodeBinding( PULPLinearChecker([PointerClass(type1), @@ -280,6 +296,11 @@ PULPReduceMeanBindings = [ NodeBinding(ReduceMeanChecker([PointerClass(type)], [PointerClass(type)]), ReduceMeanTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ReduceMeanChecker([PointerClass(float_type), PointerClass(integer_type)], [PointerClass(float_type)]), + FloatReduceMeanTemplate.referenceTemplate, ForkTransformer) + for integer_type in SignedIntegerDataTypes + for float_type in FloatDataTypes ] PULPReduceSumBindings = [ @@ -394,10 +415,22 @@ PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceTemplate, ForkTransformer) +PULPLayernormGradBinding = NodeBinding( + LayerNormChecker( + [PointerClass(float32_t), + PointerClass(float32_t), + PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), FloatLayernormTemplate.referenceGradTemplate, + ForkTransformer) + PULPFloatGELUBinding = NodeBinding( GELUChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), FloatGELUTemplate.referenceTemplate, ForkTransformer) +PULPFloatGELUGradBinding = NodeBinding( + GELUChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatGELUTemplate.referenceGradTemplate, ForkTransformer) + PULPGatherBindings = [ NodeBinding(GatherChecker([PointerClass(float32_t), PointerClass(type)], [PointerClass(float32_t)]), GatherTemplate.referenceTemplate, ForkTransformer) for type in IntegerDataTypes diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/AutoTransposeUtils.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/AutoTransposeUtils.py index 47d19cb850..6bc9d0ac24 100644 --- 
a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/AutoTransposeUtils.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/AutoTransposeUtils.py @@ -1,58 +1,28 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: AutoTransposeUtils.py -# -# Last edited: 11.12.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import copy from typing import Dict, List, Literal, Tuple from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ - _invertPermutation, _permuteList + _invertPermutation, _permute, _permuteHyperRectangle from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes -from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, minimizeRectangleDims +from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, minimizeRectangle -def _transposedDMAStrides(ctxt: NetworkContext, rectangle: HyperRectangle, direction: Literal["ToL1", "FromL1"], +def _transposedDMAStrides(ctxt: NetworkContext, rect: HyperRectangle, direction: Literal["ToL1", "FromL1"], perm: List[int], L1Name: str, L2Name: str) -> Tuple[HyperRectangle, List[int], List[int]]: _invPerm = _invertPermutation(perm) - rectangle = HyperRectangle(_permuteList(rectangle.offset, _invPerm), _permuteList(rectangle.dims, _invPerm)) - - contiguousDims = [permIdx == rangeIdx for permIdx, rangeIdx in zip(perm, range(len(perm)))] - workList = [] + inRect = _permuteHyperRectangle(rect, _invPerm) - for idx, dim in enumerate(contiguousDims): - if dim: - workList.append(rectangle.dims[idx]) - else: - workList.append(1) - - maxTransferRect = copy.copy(rectangle) - maxTransferRect.dims = tuple(workList) + maxTransferDims = tuple(inRect.dims[idx] if idx == permIdx else 1 for idx, permIdx in enumerate(perm)) + maxTransferRect = HyperRectangle(inRect.offset, maxTransferDims) referenceBuffer = copy.copy(ctxt.lookup(L2Name)) - referenceBuffer.shape = _permuteList(referenceBuffer.shape, _invPerm) - minRect, referenceRect = minimizeRectangleDims(maxTransferRect, referenceBuffer) + referenceBuffer.shape = _permute(referenceBuffer.shape, _invPerm) + minRect, referenceShape = minimizeRectangle(maxTransferRect, referenceBuffer.shape) droppedIdx = [ idx for idx in range(len(perm)) @@ -70,7 +40,7 @@ 
def _transposedDMAStrides(ctxt: NetworkContext, rectangle: HyperRectangle, direc newPerm.append(p - sub) strides = [1] - for dim in reversed(referenceRect.dims[1:]): + for dim in reversed(referenceShape[1:]): strides.insert(0, strides[0] * dim) permStrides = [strides[idx] for idx in newPerm] diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterSynch.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterSynch.py index 275c8baa9c..3d49501ea7 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterSynch.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterSynch.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPClusterSynch.py -# -# Last edited: 30.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Tuple diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py index 3f15f04680..3c0bba3107 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTiling.py @@ -1,43 +1,40 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: PULPClusterTiling.py -# -# Last edited: 19.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Tuple from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity +from Deeploy.TilingExtension.AsyncDma import AsyncDma +from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \ + DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn +from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \ + ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration + + +class PULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration): + pass + + +class ProfilingPULPClusterTilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn): + pass + + +class PULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration): + pass + -from .PULPClusterTilingDB import ProfilingPULPClusterTilingGenerationDB, PULPClusterTilingGenerationDB -from .PULPClusterTilingSB import ProfilingPULPClusterTilingGenerationSB, PULPClusterTilingGenerationSB +class ProfilingPULPClusterTilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn): + pass class PULPClusterTiling(CodeTransformationPass): - def __init__(self, targetMemLevel: str): - self.SB = PULPClusterTilingGenerationSB(targetMemLevel) - self.profilingSB = ProfilingPULPClusterTilingGenerationSB(targetMemLevel) - self.DB = PULPClusterTilingGenerationDB(targetMemLevel) - self.profilingDB = ProfilingPULPClusterTilingGenerationDB(targetMemLevel) + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): + self.SB = PULPClusterTilingGenerationSB(externalMemory, localMemory, dma) + self.profilingSB = ProfilingPULPClusterTilingGenerationSB(externalMemory, localMemory, dma) + self.DB = PULPClusterTilingGenerationDB(externalMemory, localMemory, dma) + self.profilingDB = ProfilingPULPClusterTilingGenerationDB(externalMemory, localMemory, dma) def apply(self, ctxt: 
NetworkContext, diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingDB.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingDB.py deleted file mode 100644 index e13c1bbad0..0000000000 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingDB.py +++ /dev/null @@ -1,359 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: PULPClusterTilingDB.py -# -# Last edited: 25.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -from typing import Dict, List, Tuple - -from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation -from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTilingSB import PULPClusterTilingSB, _DMAUpdate -from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes -from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import DoubleBufferingTilingMixIn, \ - ProfilingDoubleBufferingTilingMixIn, TilingMetaInfo -from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint -from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme - -_moveTileInTemplate = NodeTemplate(""" - -// IMPORT TILE ${innerTilePtr} from ${outerTilePtr} -if (${tileNum} < ${numTiles}[*${tileIdxPtr}+1]){ -dory_dma_memcpy_mindims_async(&${stateReference}); -} - -""") - -_moveTileOutTemplate = NodeTemplate(""" - -// EXPORT TILE ${innerTilePtr} to ${outerTilePtr} -if((${tileNum}) % 2 == 0){ -dory_dma_memcpy_mindims_async(&${stateReference}); -} else { -dory_dma_memcpy_mindims_async(&${_stateReference}); -} -""") - -_blockTileOutTemplate = NodeTemplate(""" - -// BLOCKING EXPORT TILE ${innerTilePtr} -if((${tileNum}) > 1){ -if((${tileNum}) % 2 == 0){ -dory_dma_barrier(&${stateReference}); -} else { -dory_dma_barrier(&${_stateReference}); -} -} - -""") - -_finalBlockTileOutTemplate = NodeTemplate(""" - -// BLOCKING EXPORT TILE ${innerTilePtr} -dory_dma_barrier(&${stateReference}); -dory_dma_barrier(&${_stateReference}); -""") - -_updateDMATransferStructTemplate = NodeTemplate(""" - -// UPDATE DMA STRUCT ${stateReference}, ${_stateReference} -${stateReference}.ext = (((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]); -${stateReference}.mchan_cmd = ${mchanCmdPtr}[${tileNum}]; -${stateReference}.length_1d_copy = ${length1dPtr}[${tileNum}]; 
-${stateReference}.number_of_1d_copies = ${number1dPtr}[${tileNum}]; -${stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}]; -${stateReference}.loc = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]); -${locPtr} = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}-1]); -""") - -_outUpdateDMATransferStructTemplate = NodeTemplate(""" - -if ((${tileNum}) % 2 == 0){ -// UPDATE DMA STRUCT ${stateReference} -${stateReference}.ext = ((char*)${extPtr} + ${extOffsetPtr}[${tileNum}]); -${stateReference}.mchan_cmd = ${mchanCmdPtr}[${tileNum}]; -${stateReference}.length_1d_copy = ${length1dPtr}[${tileNum}]; -${stateReference}.number_of_1d_copies = ${number1dPtr}[${tileNum}]; -${stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}]; -${stateReference}.loc = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]); -} else { -${_stateReference}.ext = ((char*)${extPtr} + ${extOffsetPtr}[${tileNum}]); -${_stateReference}.mchan_cmd = ${mchanCmdPtr}[${tileNum}]; -${_stateReference}.length_1d_copy = ${length1dPtr}[${tileNum}]; -${_stateReference}.number_of_1d_copies = ${number1dPtr}[${tileNum}]; -${_stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}]; -${_stateReference}.loc = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]); -} -${locPtr} = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]); - -""") - - -class PULPClusterTilingDB(PULPClusterTilingSB): - - _blockTileOutTemplate = _blockTileOutTemplate - _updateDMATransferStructTemplate = _updateDMATransferStructTemplate - _moveTileOutTemplate = _moveTileOutTemplate - _moveTileInTemplate = _moveTileInTemplate - - def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate], - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: - nodeName = operatorRepresentation['nodeName'] - - operatorRepresentation = operatorRepresentation.copy() - - dmaName = self._DMAStructName(tensorName, nodeName) - # 
operatorRepresentation['stateReference'] = dmaName - # operatorRepresentation['tileNum'] = "TILING_I" - operatorRepresentation['locPtr'] = ctxt.lookup(operatorRepresentation[tensorName]).name - operatorRepresentation['baseLocPtr'] = ctxt.hoistReference(operatorRepresentation['locPtr'], - operatorRepresentation['locPtr'] + "_ref") - operatorRepresentation['_stateReference'] = self._DMAStructName(tensorName, nodeName) + "_1" - ctxt.lookup(operatorRepresentation['baseLocPtr'])._memoryLevel = self.targetMemLevel - - namePrefix = self.prefix + f"{nodeName}_{tensorName}" - - ctxt, operatorRepresentation = super()._hoistDMAUpdates(ctxt, tensorName, updateList, operatorRepresentation) - - locOffsetList = [] - locBaseOffset = updateList[0].locOffset - for update in updateList: - locOffsetList.append(int(update.locOffset) - locBaseOffset) - - name = namePrefix + "_locOffset" - cb = ctxt.ConstantBuffer(name, [len(updateList)], locOffsetList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'locOffsetPtr') - - return ctxt, operatorRepresentation - - def _generateEgressPointerUpdates( - self, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: - - updates = [] - newCtxt = ctxt.copy() - - updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, - nodeMemoryConstraint, tilingSchedule) - - for key, updateList in updateDict.items(): - - newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) - updates.append(CodeSnippet(_outUpdateDMATransferStructTemplate, newNodeRep)) - - return newCtxt, updates - - def _generateEgressDMACode( - self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> 
Tuple[List[CodeSnippet], List[CodeSnippet]]: - - egressDMATransferCalls = [] - egressDMAWaitStatements = [] - - exportLoadStep = tilingSchedule.outputLoadSchedule[0] - for key, rectangle in exportLoadStep.items(): - externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) - internalPtr = ctxt.lookup(operatorRepresentation[key]) - - tensorName = key - nodeName = operatorRepresentation['nodeName'] - dmaName = self._DMAStructName(tensorName, nodeName) - - finalMemoryLevel = TilingCodeGeneration.isFinalMemoryLevel(nodeMemoryConstraint, internalPtr) - struct = self._rectToDMAStruct(ctxt, rectangle, "FromL1", internalPtr.name, externalPtr.name, - finalMemoryLevel) - _ = ctxt.hoistStruct(struct, dmaName, PULPStructDataTypes.DMA_copy) - ctxt.lookup(dmaName)._users += [operatorRepresentation['nodeName']] - - tensorName = key + "_1" - nodeName = operatorRepresentation['nodeName'] - _dmaName = self._DMAStructName(tensorName, nodeName) - - struct = self._rectToDMAStruct(ctxt, rectangle, "FromL1", internalPtr.name, externalPtr.name, - finalMemoryLevel) - _ = ctxt.hoistStruct(struct, _dmaName, PULPStructDataTypes.DMA_copy) - ctxt.lookup(_dmaName)._users += [operatorRepresentation['nodeName']] - - egressDMATransferCalls.append( - CodeSnippet( - self._moveTileOutTemplate, { - 'innerTilePtr': str(internalPtr._instance), - "outerTilePtr": str(externalPtr._instance), - "stateReference": dmaName, - "_stateReference": _dmaName - })) - - egressDMAWaitStatements.append( - CodeSnippet( - self._blockTileOutTemplate, { - 'innerTilePtr': str(internalPtr._instance), - "outerTilePtr": str(externalPtr._instance), - "stateReference": dmaName, - "_stateReference": _dmaName - })) - - return egressDMATransferCalls, egressDMAWaitStatements - - def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, - nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, - variableReplacement: VariableReplacementScheme, - operatorRepresentation: 
OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation) - - ingressDMATransferCalls, ingressDMAWaitStatements = self._generateIngressDMACode( - tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) - - egressDMATransferCalls, egressDMAWaitStatements = self._generateEgressDMACode( - tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) - - ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt, - operatorRepresentation) - ctxt, egressDMAUpdates = self._generateEgressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt, - operatorRepresentation) - - variableUpdates = self._generateVariableUpdates(tilingSchedule, variableReplacement, ctxt, - operatorRepresentation) - - for transaction in ingressDMATransferCalls: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I+1" - _operatorRepresentation["numTiles"] = operatorRepresentation['numTiles'] - _operatorRepresentation["tileIdxPtr"] = tileIdxPtr - - for transaction in ingressDMAUpdates: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I+1" - - for transaction in egressDMATransferCalls: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I" - - for transaction in egressDMAWaitStatements: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation['tileNum'] = "TILING_I" - - for transaction in egressDMAUpdates: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I" - - for transaction in variableUpdates: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I" - - openLoopStatement = [ - CodeSnippet(self._openTileLoopTemplate, { - "numTiles": 
operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - closeLoopStatement = [ - CodeSnippet(self._closeTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - setupStatements = [] - teardownStatements = [] - - teardownStatements += [ - CodeSnippet(self._releaseDMATemplate, - {"stateReference": ingressDMAUpdates[0].operatorRepresentation["stateReference"]}) - ] - - setupStatements += [CodeSnippet(self._initDMATemplate, {"channelName": "dma_channel"})] - setupStatements += [ - CodeSnippet(self._setDMAChannelTemplate, { - **transaction.operatorRepresentation, "channelName": "dma_channel" - }) for transaction in ingressDMAUpdates - ] - - for transaction in egressDMAUpdates: - _operatorRepresentation = transaction.operatorRepresentation.copy() - _operatorRepresentation["channelName"] = "dma_channel" - setupStatements.append(CodeSnippet(self._setDMAChannelTemplate, _operatorRepresentation.copy())) - _operatorRepresentation["channelName"] = "dma_channel" - _operatorRepresentation["stateReference"] = _operatorRepresentation["_stateReference"] - setupStatements.append(CodeSnippet(self._setDMAChannelTemplate, _operatorRepresentation.copy())) - - for transaction in ingressDMATransferCalls: - _operatorRepresentation = transaction.operatorRepresentation.copy() - _operatorRepresentation["tileNum"] = 0 - _operatorRepresentation["numTiles"] = operatorRepresentation['numTiles'] - _operatorRepresentation["tileIdxPtr"] = tileIdxPtr - setupStatements.append(CodeSnippet(transaction.template, _operatorRepresentation)) - - for transaction in egressDMAWaitStatements: - _operatorRepresentation = transaction.operatorRepresentation.copy() - _operatorRepresentation['tileNum'] = ctxt.lookup(operatorRepresentation["numTiles"]).values[-1] - teardownStatements.append(CodeSnippet(_finalBlockTileOutTemplate, _operatorRepresentation)) - - metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + "_L2", - 
nodeOps = operatorRepresentation['nodeOps'], - numTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxVar = "TILING_I", - kernelLevelTiling = True) - - newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls, - ingressDMAWaitStatements[-1:], ingressDMAUpdates, - egressDMATransferCalls, egressDMAWaitStatements[-1:], - egressDMAUpdates, variableUpdates, openLoopStatement, - closeLoopStatement, setupStatements, teardownStatements) - - return ctxt, newExecutionBlock, True - - def generateTilingLoop( - self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - flatTilingSchedule = copy.copy(tilingSchedules[0]) - for tilingSchedule in tilingSchedules[1:]: - flatTilingSchedule += tilingSchedule - - offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) - - if len(offsetLists) == 0: - return ctxt, executionBlock, False - - for offsetList in offsetLists: - if not len(offsetList) == 2: - return ctxt, executionBlock, False - - allNumTiles = [len(schedule.outputLoadSchedule) for schedule in tilingSchedules] - operatorRepresentation["numTiles"] = self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'], - tilingSchedules) - - return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, - operatorRepresentation) - - -class PULPClusterTilingGenerationDB(PULPClusterTilingDB, DoubleBufferingTilingMixIn): - pass - - -class ProfilingPULPClusterTilingGenerationDB(PULPClusterTilingDB, ProfilingDoubleBufferingTilingMixIn): - pass diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingSB.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingSB.py deleted file mode 
100644 index 90dc3b2b2b..0000000000 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPClusterTilingSB.py +++ /dev/null @@ -1,673 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: PULPClusterTiling.py -# -# Last edited: 17.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -from collections import namedtuple -from typing import Dict, List, Literal, Optional, Tuple, Type - -import numpy as np - -import Deeploy.CommonExtensions.DataTypes as BasicDataTypes -from Deeploy.AbstractDataTypes import Immediate, PointerClass -from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ - _invertPermutation, _permuteList -from Deeploy.DeeployTypes import CodeSnippet, ConstantBuffer, ExecutionBlock, NetworkContext, NodeTemplate, \ - OperatorRepresentation -from Deeploy.Targets.PULPOpen.CodeTransformationPasses import AutoTransposeUtils -from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes -from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingSingleBufferingTilingMixIn, \ - SingleBufferingTilingMixIn, TilingMetaInfo -from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint -from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme, \ - calculateRectangleOffset, minimizeRectangleDims - -_openTileLoopTemplate = NodeTemplate(""" - -// TILING LOOP -for (int TILING_I=${numTiles}[*${tileIdxPtr}]; TILING_I<${numTiles}[(*${tileIdxPtr})+1]; TILING_I++){ -""") - -_closeTileLoopTemplate = NodeTemplate(""" - -// CLOSE TILING LOOP -} -*${tileIdxPtr} += 1; - -""") - -_moveTileInTemplate = NodeTemplate(""" - -// IMPORT TILE ${innerTilePtr} from ${outerTilePtr} -dory_dma_memcpy_mindims_async(&${stateReference}); - -""") - -_iteratedMoveTileInTemplate = NodeTemplate(""" - -// IMPORT TILE ${innerTilePtr} from ${outerTilePtr} -// ITERATED - -<% -_extStrides = [stride * stateStruct.value['length_1d_copy'].value for stride in remainderStrides] -_locStride = f"{stateReference}.length_1d_copy * {stateReference}.number_of_1d_copies * {stateReference}.number_of_2d_copies" - 
-stateStruct.value['ext'] = str(stateReference) + ".ext" -stateStruct.value['loc'] = str(stateReference) + ".loc" -stateStruct.value['tid'] = str(stateReference) + ".tid" -stateStruct.value['stride_2d'] = str(stateReference) + ".stride_2d" -stateStruct.value['stride_1d'] = str(stateReference) + ".stride_1d" -stateStruct.value['number_of_2d_copies'] = str(stateReference) + ".number_of_2d_copies" -stateStruct.value['number_of_1d_copies'] = str(stateReference) + ".number_of_1d_copies" -stateStruct.value['length_1d_copy'] = str(stateReference) + ".length_1d_copy" -%> - -int8_t * bu_${stateReference}_loc = ${stateReference}.loc; -int8_t * bu_${stateReference}_ext = ${stateReference}.ext; - -% for idx, dimLen in enumerate(dimLens): -uint16_t ${nodeName}_${tensorName}_dimLen_${idx} = ${dimLen}[${tileNum}]; -for(int i_${idx} = 0; i_${idx} < ${nodeName}_${tensorName}_dimLen_${idx}; i_${idx}++){ -%endfor -${stateStruct.typeName} trans_${stateReference} = (${stateStruct.typeName}) ${str(stateStruct)}; -dory_dma_memcpy_mindims_async(&trans_${stateReference}); -${stateStruct.value['loc']} = (((int8_t*) ${stateStruct.value['loc']}) + ${_locStride}); -% for idx, _ in enumerate(dimLens): -${stateStruct.value['ext']} = (((int8_t*) ${stateStruct.value['ext']}) + (${_extStrides[idx]})); -} -${stateStruct.value['ext']} = (((int8_t*) ${stateStruct.value['ext']}) - ${nodeName}_${tensorName}_dimLen_${len(dimLens) -1 - idx} * ${_extStrides[idx]}); -%endfor - -${stateStruct.value['loc']} = bu_${stateReference}_loc; -${stateStruct.value['ext']} = bu_${stateReference}_ext; - -""") - -_blockTileInTemplate = NodeTemplate(""" - -// BLOCKING IMPORT TILE ${innerTilePtr} -dory_dma_barrier(&${stateReference}); - -""") - -_moveTileOutTemplate = NodeTemplate(""" - -// EXPORT TILE ${innerTilePtr} to ${outerTilePtr} -dory_dma_memcpy_mindims_async(&${stateReference}); - -""") - -_blockTileOutTemplate = NodeTemplate(""" - -// BLOCKING EXPORT TILE ${innerTilePtr} -dory_dma_barrier(&${stateReference}); - 
-""") - -_updateDMATransferStructTemplate = NodeTemplate(""" - -// UPDATE DMA STRUCT ${stateReference} -${stateReference}.ext = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]; -${stateReference}.length_1d_copy = ${length1dPtr}[${tileNum}]; -${stateReference}.number_of_1d_copies = ${number1dPtr}[${tileNum}]; -${stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}]; - -${stateReference}.stride_1d = ${stride1dPtr}[${tileNum}]; -${stateReference}.stride_2d = ${stride2dPtr}[${tileNum}]; - -${stateReference}.mchan_cmd = ${mchanCmdPtr}[${tileNum}]; -""") - -_updateReferenceTemplate = NodeTemplate(""" - -// UPDATE VARIABLE ${reference} -*${reference} = ${baseReference}[${tileNum}]; -""") - -_initDMATemplate = NodeTemplate(""" -int32_t ${channelName} = dory_dma_allocate(); -""") - -_setDMAChannelTemplate = NodeTemplate(""" -${stateReference}.tid = ${channelName}; -""") - -_releaseDMATemplate = NodeTemplate(""" -dory_dma_free(&${stateReference}); -""") - -# ADD NUM TRANSFERS VARIABLE - -_DMAUpdate = namedtuple( - "_DMAUpdate", - "extOffset locOffset length_1d_copy number_of_1d_copies number_of_2d_copies stride_1d stride_2d mchan_cmd") - - -class PULPClusterTilingSB(TilingCodeGeneration): - - _prefix = "TILING_REPLACED_" - - _openTileLoopTemplate = _openTileLoopTemplate - _closeTileLoopTemplate = _closeTileLoopTemplate - - _moveTileInTemplate = _moveTileInTemplate - _iteratedMoveTileInTemplate = _iteratedMoveTileInTemplate - _blockTileInTemplate = _blockTileInTemplate - - _moveTileOutTemplate = _moveTileOutTemplate - _blockTileOutTemplate = _blockTileOutTemplate - - _updateDMATransferStructTemplate = _updateDMATransferStructTemplate - _updateReferenceTemplate = _updateReferenceTemplate - - _initDMATemplate = _initDMATemplate - _setDMAChannelTemplate = _setDMAChannelTemplate - _releaseDMATemplate = _releaseDMATemplate - - @property - def prefix(self): - return self._prefix + self.targetMemLevel + "_" - - def _DMAStructName(self, tensorName: str, nodeName: str) -> 
str: - return f"{self.prefix}_DMA_{nodeName}_{tensorName}" - - @classmethod - def _generatePointerUpdates(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, - loadSchedule: List[Dict[str, - HyperRectangle]], nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedule: TilingSchedule) -> Dict[str, _DMAUpdate]: - updateDict = {} - deltaOffsets = {} - - for idx, loadStep in enumerate(loadSchedule): - for stepIdx, (key, rect) in enumerate(loadStep.items()): - - if key in tilingSchedule.outputBaseOffsets.keys(): - baseOffsets = tilingSchedule.outputBaseOffsets[key] - direction = "FromL1" - else: - baseOffsets = tilingSchedule.inputBaseOffsets[key] - direction = "ToL1" - - if key not in updateDict.keys(): - updateDict[key] = [] - if key not in deltaOffsets.keys(): - deltaOffsets[key] = 0 - - referenceBuffer = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) - l1Buffer = ctxt.lookup(operatorRepresentation[key]) - - finalMemoryLevel = TilingCodeGeneration.isFinalMemoryLevel(nodeMemoryConstraint, l1Buffer) - - if (f"in{stepIdx}_perm" in operatorRepresentation - and key in tilingSchedule.inputBaseOffsets.keys()) and (finalMemoryLevel == False): - perm = operatorRepresentation[f"in{stepIdx}_perm"] - struct, _, _ = AutoTransposeUtils.generateTransposedDMAStruct(ctxt, rect, direction, perm, - l1Buffer.name, - l1Buffer._referenceName) - - _invPerm = _invertPermutation(perm) - _rect = copy.copy(rect) - _referenceBuffer = copy.copy(referenceBuffer) - _rect.offset = _permuteList(rect.offset, _invPerm) - _rect.dims = _permuteList(rect.dims, _invPerm) - _referenceBuffer.shape = _permuteList(referenceBuffer.shape, _invPerm) - - accOffset = calculateRectangleOffset(_rect, _referenceBuffer) - - else: - struct = cls._rectToDMAStruct(ctxt, rect, direction, l1Buffer.name, l1Buffer._referenceName, - finalMemoryLevel) - accOffset = calculateRectangleOffset(rect, referenceBuffer) - - length_1d_copy = struct.value['length_1d_copy'].value - 
number_of_1d_copies = struct.value['number_of_1d_copies'].value - number_of_2d_copies = struct.value['number_of_2d_copies'].value - stride_1d = struct.value['stride_1d'].value - stride_2d = struct.value['stride_2d'].value - mchan_cmd = struct.value['mchan_cmd'].value - - lIdx = idx % len(baseOffsets) - - sol = _DMAUpdate(accOffset, baseOffsets[lIdx], length_1d_copy, number_of_1d_copies, number_of_2d_copies, - stride_1d, stride_2d, mchan_cmd) - - deltaOffsets[key] = accOffset - updateDict[key].append(sol) - - return updateDict - - @classmethod - def _rectToDMAStruct(cls, ctxt: NetworkContext, rectangle: HyperRectangle, direction: Literal["ToL1", "FromL1"], - L1Name: str, L2Name: str, finalMemoryLevel: bool) -> PULPStructDataTypes.DMA_copy: - - referenceBuffer = ctxt.lookup(L2Name) - - rect, referenceRect = minimizeRectangleDims(rectangle, referenceBuffer) - assert len(rect.dims) <= 3, "PULP: Only 2D transfers are supported!" - - if direction == "ToL1": - _dir = 1 - else: - _dir = 0 - - length_1d_copy = rect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - - number_of_1d_copies = 1 - stride_1d = 0 - - if len(rect.dims) > 1: - number_of_1d_copies = rect.dims[-2] - stride_1d = referenceRect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - - if not finalMemoryLevel: - stride_1d = length_1d_copy - - number_of_2d_copies = 1 - stride_2d = 0 - - if len(rect.dims) > 2: - number_of_2d_copies = rect.dims[-3] - stride_2d = referenceRect.dims[-2] * stride_1d - - length_2d_copy = number_of_1d_copies * length_1d_copy - mchan_flags = _dir + 0x2 + 0x8 - if number_of_1d_copies > 1 or number_of_2d_copies > 1: - mchan_flags += 0x4 - mchan_cmd = length_2d_copy + (mchan_flags << 17) - - assert length_2d_copy <= 2**17, f"The DMA transfer size for mchan should be representable with 17 bits, current number of bits required is {np.ceil(np.log2(length_2d_copy))}" - - struct = PULPStructDataTypes.DMA_copy( - { - "ext": referenceBuffer.name, - "loc": L1Name, 
- "hwc_to_chw": 0, - "stride_2d": stride_2d, - "number_of_2d_copies": number_of_2d_copies, - "stride_1d": stride_1d, - "number_of_1d_copies": number_of_1d_copies, - "length_1d_copy": length_1d_copy, - "mchan_cmd": mchan_cmd, - "dir": _dir, - "tid": 0 - }, ctxt) - - return struct - - def _hoistConstantAndReference(self, - ctxt: NetworkContext, - constBuf: ConstantBuffer, - operatorRepresentation: OperatorRepresentation, - nodeName: str, - operatorRepresentationName: str, - immediateType: Optional[Type[Immediate]] = None) -> Tuple[NetworkContext, Dict]: - - if immediateType is None: - _type = PointerClass(BasicDataTypes.int32_t) - else: - _type = PointerClass(immediateType) - - name = constBuf.name - - ctxt.add(constBuf, "global") - constBuf._type = _type - constBuf._instance = constBuf._type(name, ctxt) - constBuf._users = [nodeName] - constBuf._memoryLevel = self.targetMemLevel - - refName = name + "_ref" - reference = ctxt.hoistReference(name, refName) - ctxt.lookup(reference)._memoryLevel = self.targetMemLevel - - operatorRepresentation[operatorRepresentationName] = refName - - return ctxt, operatorRepresentation - - def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate], - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: - - operatorRepresentation = operatorRepresentation.copy() - - nodeName = operatorRepresentation['nodeName'] - - offsetList = [] - mchanCmdList = [] - len1dList = [] - num1dList = [] - num2dList = [] - stride1dList = [] - stride2dList = [] - for update in updateList: - offsetList.append(int(update.extOffset)) - mchanCmdList.append(int(update.mchan_cmd)) - len1dList.append(int(update.length_1d_copy)) - num1dList.append(int(update.number_of_1d_copies)) - num2dList.append(int(update.number_of_2d_copies)) - stride1dList.append(int(update.stride_1d)) - stride2dList.append(int(update.stride_2d)) - - dmaName = self._DMAStructName(tensorName, nodeName) - 
operatorRepresentation['stateReference'] = dmaName - operatorRepresentation['tileNum'] = "TILING_I" - operatorRepresentation['extPtr'] = ctxt.lookup(operatorRepresentation[tensorName])._referenceName - - namePrefix = self.prefix + f"{nodeName}_{tensorName}" - - name = namePrefix + "_offset" - cb = ctxt.ConstantBuffer(name, [len(updateList)], offsetList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'extOffsetPtr') - - name = namePrefix + "_mchan_cmd" - cb = ctxt.ConstantBuffer(name, [len(updateList)], mchanCmdList) - ctxt, operatorRepresentation = self._hoistConstantAndReference( - ctxt, cb, operatorRepresentation, nodeName, 'mchanCmdPtr', - PULPStructDataTypes.DMA_copy.structTypeDict['mchan_cmd']) - - name = namePrefix + "_length_1d_copy" - cb = ctxt.ConstantBuffer(name, [len(updateList)], len1dList) - ctxt, operatorRepresentation = self._hoistConstantAndReference( - ctxt, cb, operatorRepresentation, nodeName, 'length1dPtr', - PULPStructDataTypes.DMA_copy.structTypeDict['length_1d_copy']) - - name = namePrefix + "_number_of_1d_copies" - cb = ctxt.ConstantBuffer(name, [len(updateList)], num1dList) - ctxt, operatorRepresentation = self._hoistConstantAndReference( - ctxt, cb, operatorRepresentation, nodeName, 'number1dPtr', - PULPStructDataTypes.DMA_copy.structTypeDict['number_of_1d_copies']) - - name = namePrefix + "_number_of_2d_copies" - cb = ctxt.ConstantBuffer(name, [len(updateList)], num2dList) - ctxt, operatorRepresentation = self._hoistConstantAndReference( - ctxt, cb, operatorRepresentation, nodeName, 'number2dPtr', - PULPStructDataTypes.DMA_copy.structTypeDict['number_of_2d_copies']) - - name = namePrefix + "_stride_1d" - cb = ctxt.ConstantBuffer(name, [len(updateList)], stride1dList) - ctxt, operatorRepresentation = self._hoistConstantAndReference( - ctxt, cb, operatorRepresentation, nodeName, 'stride1dPtr', - PULPStructDataTypes.DMA_copy.structTypeDict['stride_1d']) - - name = namePrefix + 
"_stride_2d" - cb = ctxt.ConstantBuffer(name, [len(updateList)], stride2dList) - ctxt, operatorRepresentation = self._hoistConstantAndReference( - ctxt, cb, operatorRepresentation, nodeName, 'stride2dPtr', - PULPStructDataTypes.DMA_copy.structTypeDict['stride_2d']) - - return ctxt, operatorRepresentation - - def _generateEgressPointerUpdates( - self, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: - - updates = [] - newCtxt = ctxt.copy() - - updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, - nodeMemoryConstraint, tilingSchedule) - - for key, updateList in updateDict.items(): - - newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) - updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) - - return newCtxt, updates - - def _generateIngressPointerUpdates( - self, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: - - updates = [] - newCtxt = ctxt.copy() - - updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.inputLoadSchedule, - nodeMemoryConstraint, tilingSchedule) - - for key, updateList in updateDict.items(): - - newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) - updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) - - return newCtxt, updates - - def _generateVariableUpdates(self, tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, - ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> List[CodeSnippet]: - - updates = [] - - for key in variableReplacement.perTileReplacements.keys(): - - buf = 
ctxt.lookup(operatorRepresentation[key]) - reference = str(buf._instance) - - updates.append( - CodeSnippet(self._updateReferenceTemplate, { - "reference": reference, - "tileNum": "TILING_I", - "baseReference": buf._referenceName - })) - - return updates - - def _generateDMACode(self, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation, loadSchedule: List[Dict[str, HyperRectangle]], - direction: Literal["ToL1", "FromL1"]) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - DMATransferCalls = [] - DMAWaitStatements = [] - - allNumTransfers = AutoTransposeUtils.allNumTransfers(ctxt, operatorRepresentation, loadSchedule, direction) - - transferNodeRep = {} - - if allNumTransfers != []: - - dimLens = [] - - for dim in range(len(allNumTransfers[0])): - dimVec = [transfer[dim] for transfer in allNumTransfers] - namePrefix = operatorRepresentation["nodeName"] + "_" - vecName = f"dimLen_{dim}" - - cb = ctxt.ConstantBuffer(namePrefix + vecName, [len(dimVec)], dimVec) - ctxt, transferNodeRep = self._hoistConstantAndReference(ctxt, cb, transferNodeRep, - operatorRepresentation['nodeName'], vecName) - - dimLens.append(str(cb._instance)) - - transferNodeRep['nodeName'] = operatorRepresentation['nodeName'] - transferNodeRep['dimLens'] = dimLens - transferNodeRep['tileNum'] = "TILING_I" - - loadStep = loadSchedule[0] - - for idx, (key, rectangle) in enumerate(loadStep.items()): - - permName = f"in{idx}_perm" - - externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) - internalPtr = ctxt.lookup(operatorRepresentation[key]) - - tensorName = key - nodeName = operatorRepresentation['nodeName'] - dmaName = self._DMAStructName(tensorName, nodeName) - - transferNodeRep = { - **transferNodeRep, - **{ - 'innerTilePtr': str(internalPtr._instance), - "outerTilePtr": str(externalPtr._instance), - "stateReference": dmaName - } - } - - if permName in operatorRepresentation and direction == "ToL1": - perm 
= operatorRepresentation[permName] - struct, remainderStrides, numTransfers = AutoTransposeUtils.generateTransposedDMAStruct( - ctxt, rectangle, direction, perm, internalPtr.name, externalPtr.name) - locStride = np.prod( - rectangle.dims) // np.prod(numTransfers) * (externalPtr._type.referencedType.typeWidth // 8) - - transferNodeRep['tensorName'] = operatorRepresentation[key] - - transferNodeRep = {**transferNodeRep, **{"remainderStrides": remainderStrides, "locStride": locStride}} - - else: - finalMemoryLevel = TilingCodeGeneration.isFinalMemoryLevel(nodeMemoryConstraint, internalPtr) - - struct = self._rectToDMAStruct(ctxt, rectangle, direction, internalPtr.name, externalPtr.name, - finalMemoryLevel) - - transferNodeRep["stateStruct"] = struct - _ = ctxt.hoistStruct(struct, dmaName, PULPStructDataTypes.DMA_copy) - ctxt.lookup(dmaName)._users += [operatorRepresentation['nodeName']] - - if permName in operatorRepresentation and direction == "ToL1": - - DMATransferCalls.append(CodeSnippet(self._iteratedMoveTileInTemplate, transferNodeRep)) - else: - DMATransferCalls.append(CodeSnippet(self._moveTileInTemplate, transferNodeRep)) - - DMAWaitStatements.append(CodeSnippet(self._blockTileInTemplate, transferNodeRep)) - - return DMATransferCalls, DMAWaitStatements - - def _generateIngressDMACode( - self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - importLoadStep = tilingSchedule.inputLoadSchedule - ingressDMATransferCalls, ingressDMAWaitStatements = self._generateDMACode(nodeMemoryConstraint, ctxt, - operatorRepresentation, - importLoadStep, "ToL1") - return ingressDMATransferCalls, ingressDMAWaitStatements - - def _generateEgressDMACode( - self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> 
Tuple[List[CodeSnippet], List[CodeSnippet]]: - - exportLoadStep = tilingSchedule.outputLoadSchedule - egressDMATransferCalls, egressDMAWaitStatements = self._generateDMACode(nodeMemoryConstraint, ctxt, - operatorRepresentation, exportLoadStep, - "FromL1") - - return egressDMATransferCalls, egressDMAWaitStatements - - def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, - nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, - variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation) - - ingressDMATransferCalls, ingressDMAWaitStatements = self._generateIngressDMACode( - tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) - - egressDMATransferCalls, egressDMAWaitStatements = self._generateEgressDMACode( - tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) - - ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt, - operatorRepresentation) - ctxt, egressDMAUpdates = self._generateEgressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt, - operatorRepresentation) - - openLoopStatement = [ - CodeSnippet(self._openTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - closeLoopStatement = [ - CodeSnippet(self._closeTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - setupStatements = [CodeSnippet(self._initDMATemplate, {"channelName": "dma_channel"})] - setupStatements += [ - CodeSnippet(self._setDMAChannelTemplate, { - **transaction.operatorRepresentation, "channelName": "dma_channel" - }) for transaction in ingressDMAUpdates + egressDMAUpdates - ] - - teardownStatements = [ - CodeSnippet(self._releaseDMATemplate, - {"stateReference": 
ingressDMAUpdates[0].operatorRepresentation["stateReference"]}) - ] - - variableUpdates = self._generateVariableUpdates(tilingSchedule, variableReplacement, ctxt, - operatorRepresentation) - - metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + "_L2", - nodeOps = operatorRepresentation['nodeOps'], - numTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxVar = "TILING_I", - kernelLevelTiling = True) - - newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls, - ingressDMAWaitStatements, ingressDMAUpdates, - egressDMATransferCalls, egressDMAWaitStatements, - egressDMAUpdates, variableUpdates, openLoopStatement, - closeLoopStatement, setupStatements, teardownStatements) - - return ctxt, newExecutionBlock, True - - def generateTilingLoop( - self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - flatTilingSchedule = copy.copy(tilingSchedules[0]) - for tilingSchedule in tilingSchedules[1:]: - flatTilingSchedule += tilingSchedule - - # SCHEREMO: hoist numTiles - - offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) - - if len(offsetLists) == 0: - return ctxt, executionBlock, False - - for offsetList in offsetLists: - if not len(offsetList) == 1: - return ctxt, executionBlock, False - - operatorRepresentation["numTiles"] = self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'], - tilingSchedules) - - return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, - operatorRepresentation) - - -class PULPClusterTilingGenerationSB(PULPClusterTilingSB, SingleBufferingTilingMixIn): - pass - - -class ProfilingPULPClusterTilingGenerationSB(PULPClusterTilingSB, 
ProfilingSingleBufferingTilingMixIn): - pass diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py index af744f8672..9df0d88479 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py @@ -1,43 +1,40 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: PULPL3Tiling.py -# -# Last edited: 19.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Tuple from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity +from Deeploy.TilingExtension.AsyncDma import AsyncDma +from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \ + DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn +from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \ + ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration + + +class PULPL3TilingGenerationSB(SingleBufferingTilingCodeGeneration): + pass + + +class ProfilingPULPL3TilingGenerationSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn): + pass + + +class PULPL3TilingGenerationDB(DoubleBufferingTilingCodeGeneration): + pass + -from .PULPL3TilingDB import ProfilingPULPL3TilingGenerationDB, PULPL3TilingGenerationDB -from .PULPL3TilingSB import ProfilingPULPL3TilingGenerationSB, PULPL3TilingGenerationSB +class ProfilingPULPL3TilingGenerationDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn): + pass class PULPL3Tiling(CodeTransformationPass): - def __init__(self, targetMemLevel: str): - self.SB = PULPL3TilingGenerationSB(targetMemLevel) - self.profilingSB = ProfilingPULPL3TilingGenerationSB(targetMemLevel) - self.DB = PULPL3TilingGenerationDB(targetMemLevel) - self.profilingDB = ProfilingPULPL3TilingGenerationDB(targetMemLevel) + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): + self.SB = PULPL3TilingGenerationSB(externalMemory, localMemory, dma) + self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dma) + self.profilingSB = ProfilingPULPL3TilingGenerationSB(externalMemory, localMemory, dma) + self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dma) def apply(self, ctxt: NetworkContext, diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingDB.py 
b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingDB.py deleted file mode 100644 index 6a3f80bd28..0000000000 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingDB.py +++ /dev/null @@ -1,329 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: PULPClusterTiling.py -# -# Last edited: 17.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -from typing import Dict, List, Tuple - -from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation -from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3TilingSB import PULPL3TilingSB, _DMAUpdate -from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import DoubleBufferingTilingMixIn, \ - ProfilingDoubleBufferingTilingMixIn, TilingMetaInfo -from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint -from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme - -_moveTileInTemplate = NodeTemplate(""" - -// IMPORT TILE ${innerTilePtr} from ${outerTilePtr} -if (${tileNum} < ${numTiles}[*${tileIdxPtr}+1]){ -pi_cl_ram_copy_2d(get_ram_ptr(), ${stateReference}.pi_ram_addr, ${stateReference}.addr, ${stateReference}.size, ${stateReference}.stride, ${stateReference}.length, ${stateReference}.ext2loc, &${stateReference}); -} - -""") - -_moveTileOutTemplate = NodeTemplate(""" - -// EXPORT TILE ${innerTilePtr} to ${outerTilePtr} -if((${tileNum}) % 2 == 0){ -pi_cl_ram_copy_2d(get_ram_ptr(), ${stateReference}.pi_ram_addr, ${stateReference}.addr, ${stateReference}.size, ${stateReference}.stride, ${stateReference}.length, ${stateReference}.ext2loc, &${stateReference}); -} else { -pi_cl_ram_copy_2d(get_ram_ptr(), ${_stateReference}.pi_ram_addr, ${_stateReference}.addr, ${_stateReference}.size, ${_stateReference}.stride, ${_stateReference}.length, ${_stateReference}.ext2loc, &${_stateReference}); -} - -""") - -_blockTileOutTemplate = NodeTemplate(""" - -// BLOCKING EXPORT TILE ${innerTilePtr} -if((${tileNum}) > 1){ -if((${tileNum}) % 2 == 0){ -pi_cl_ram_copy_wait(&${stateReference}); -} else { -pi_cl_ram_copy_wait(&${_stateReference}); -} -} - -""") - -_finalBlockTileOutTemplate = NodeTemplate(""" - -// BLOCKING EXPORT TILE ${innerTilePtr} 
-pi_cl_ram_copy_wait(&${stateReference}); -% if numTiles > 1: -pi_cl_ram_copy_wait(&${_stateReference}); -% endif -""") - -_updateDMATransferStructTemplate = NodeTemplate(""" - -// UPDATE DMA STRUCT ${stateReference} -${stateReference}.pi_ram_addr = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]; -${stateReference}.size = ${length1dPtr}[${tileNum}]; -${stateReference}.length = ${number1dPtr}[${tileNum}]; -${stateReference}.addr = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]); -${locPtr} = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}-1]); - -""") - -_outUpdateDMATransferStructTemplate = NodeTemplate(""" - -if ((${tileNum}) % 2 == 0){ -// UPDATE DMA STRUCT ${stateReference} -${stateReference}.pi_ram_addr = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]; -${stateReference}.size = ${length1dPtr}[${tileNum}]; -${stateReference}.length = ${number1dPtr}[${tileNum}]; -${stateReference}.addr = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]); -} else { -${_stateReference}.pi_ram_addr = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]; -${_stateReference}.size = ${length1dPtr}[${tileNum}]; -${_stateReference}.length = ${number1dPtr}[${tileNum}]; -${_stateReference}.addr = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]); -} -${locPtr} = (((char*)${baseLocPtr}) + ${locOffsetPtr}[${tileNum}]); - -""") - - -class PULPL3TilingDB(PULPL3TilingSB): - - _prefix = "TILING_REPLACED_" - _blockTileOutTemplate = _blockTileOutTemplate - _updateDMATransferStructTemplate = _updateDMATransferStructTemplate - _moveTileOutTemplate = _moveTileOutTemplate - _moveTileInTemplate = _moveTileInTemplate - - def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate], - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: - - nodeName = operatorRepresentation['nodeName'] - - operatorRepresentation = operatorRepresentation.copy() - - dmaName = self._DMAStructName(tensorName, nodeName) - # 
operatorRepresentation['stateReference'] = dmaName - # operatorRepresentation['tileNum'] = "TILING_I" - operatorRepresentation['locPtr'] = ctxt.lookup(operatorRepresentation[tensorName]).name - operatorRepresentation['baseLocPtr'] = ctxt.hoistReference(operatorRepresentation['locPtr'], - operatorRepresentation['locPtr'] + "_ref") - operatorRepresentation['_stateReference'] = self._DMAStructName(tensorName, nodeName) + "_1" - ctxt.lookup(operatorRepresentation['baseLocPtr'])._memoryLevel = self.targetMemLevel - - namePrefix = self.prefix + f"{nodeName}_{tensorName}" - - ctxt, operatorRepresentation = super()._hoistDMAUpdates(ctxt, tensorName, updateList, operatorRepresentation) - - locOffsetList = [] - locBaseOffset = updateList[0].locOffset - for update in updateList: - locOffsetList.append(int(update.locOffset) - locBaseOffset) - - name = namePrefix + "_locOffset" - cb = ctxt.ConstantBuffer(name, [len(updateList)], locOffsetList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'locOffsetPtr') - - return ctxt, operatorRepresentation - - def _generateEgressPointerUpdates( - self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: - - updates = [] - newCtxt = ctxt.copy() - - updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, - tilingSchedule) - - for key, updateList in updateDict.items(): - - newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) - updates.append(CodeSnippet(_outUpdateDMATransferStructTemplate, newNodeRep)) - - return newCtxt, updates - - def _generateEgressDMACode( - self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - egressDMATransferCalls = [] - 
egressDMAWaitStatements = [] - exportLoadStep = tilingSchedule.outputLoadSchedule[0] - - for key, rectangle in exportLoadStep.items(): - externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) - internalPtr = ctxt.lookup(operatorRepresentation[key]) - - tensorName = key - nodeName = operatorRepresentation['nodeName'] - dmaName = self._DMAStructName(tensorName, nodeName) - - struct = self._rectToDMAStruct(ctxt, rectangle, "FromL2", internalPtr.name, externalPtr.name) - _ = ctxt.hoistStruct(struct, dmaName, PULPStructDataTypes.pi_cl_ram_req_t) - ctxt.lookup(dmaName)._users += [operatorRepresentation['nodeName']] - - tensorName = key + "_1" - nodeName = operatorRepresentation['nodeName'] - _dmaName = self._DMAStructName(tensorName, nodeName) - - struct = self._rectToDMAStruct(ctxt, rectangle, "FromL2", internalPtr.name, externalPtr.name) - _ = ctxt.hoistStruct(struct, _dmaName, PULPStructDataTypes.pi_cl_ram_req_t) - ctxt.lookup(_dmaName)._users += [operatorRepresentation['nodeName']] - - egressDMATransferCalls.append( - CodeSnippet( - self._moveTileOutTemplate, { - 'innerTilePtr': str(internalPtr._instance), - "outerTilePtr": str(externalPtr._instance), - "stateReference": dmaName, - "_stateReference": _dmaName - })) - - egressDMAWaitStatements.append( - CodeSnippet( - self._blockTileOutTemplate, { - 'innerTilePtr': str(internalPtr._instance), - "outerTilePtr": str(externalPtr._instance), - "stateReference": dmaName, - "_stateReference": _dmaName - })) - - return egressDMATransferCalls, egressDMAWaitStatements - - def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, - nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, - variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation) - - ingressDMATransferCalls, ingressDMAWaitStatements = 
self._generateIngressDMACode( - tilingSchedule, ctxt, operatorRepresentation) - - egressDMATransferCalls, egressDMAWaitStatements = self._generateEgressDMACode( - tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) - - ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(tilingSchedule, ctxt, operatorRepresentation) - ctxt, egressDMAUpdates = self._generateEgressPointerUpdates(tilingSchedule, ctxt, operatorRepresentation) - - variableUpdates = [] - - for transaction in ingressDMATransferCalls: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I+1" - _operatorRepresentation["numTiles"] = operatorRepresentation['numTiles'] - _operatorRepresentation["tileIdxPtr"] = tileIdxPtr - - for transaction in ingressDMAUpdates: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I+1" - - for transaction in egressDMATransferCalls: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I" - - for transaction in egressDMAWaitStatements: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation['tileNum'] = "TILING_I" - - for transaction in egressDMAUpdates: - _operatorRepresentation = transaction.operatorRepresentation - _operatorRepresentation["tileNum"] = "TILING_I" - - openLoopStatement = [ - CodeSnippet(self._openTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - closeLoopStatement = [ - CodeSnippet(self._closeTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - setupStatements = [] - teardownStatements = [] - - for transaction in ingressDMATransferCalls: - _operatorRepresentation = transaction.operatorRepresentation.copy() - _operatorRepresentation["tileNum"] = 0 - _operatorRepresentation["numTiles"] = operatorRepresentation['numTiles'] - 
_operatorRepresentation["tileIdxPtr"] = tileIdxPtr - setupStatements.append(CodeSnippet(transaction.template, _operatorRepresentation)) - - for transaction in egressDMAWaitStatements: - _operatorRepresentation = transaction.operatorRepresentation.copy() - _operatorRepresentation['tileNum'] = ctxt.lookup(operatorRepresentation["numTiles"]).values[-1] - _operatorRepresentation['numTiles'] = len(tilingSchedule.outputLoadSchedule) - teardownStatements.append(CodeSnippet(_finalBlockTileOutTemplate, _operatorRepresentation)) - - metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + "_L3", - nodeOps = operatorRepresentation['nodeOps'], - numTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxVar = "TILING_I", - kernelLevelTiling = False) - - newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls, - ingressDMAWaitStatements, ingressDMAUpdates, - egressDMATransferCalls, egressDMAWaitStatements, - egressDMAUpdates, variableUpdates, openLoopStatement, - closeLoopStatement, setupStatements, teardownStatements) - - return ctxt, newExecutionBlock, True - - def generateTilingLoop( - self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - flatTilingSchedule = copy.copy(tilingSchedules[0]) - for tilingSchedule in tilingSchedules[1:]: - flatTilingSchedule += tilingSchedule - - offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) - - if len(offsetLists) == 0: - return ctxt, executionBlock, False - - for offsetList in offsetLists: - if not len(offsetList) == 2: - return ctxt, executionBlock, False - - allNumTiles = [len(schedule.outputLoadSchedule) for schedule in tilingSchedules] - operatorRepresentation["numTiles"] = 
self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'], - tilingSchedules) - - return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, - operatorRepresentation) - - -class PULPL3TilingGenerationDB(PULPL3TilingDB, DoubleBufferingTilingMixIn): - pass - - -class ProfilingPULPL3TilingGenerationDB(PULPL3TilingDB, ProfilingDoubleBufferingTilingMixIn): - pass diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingSB.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingSB.py deleted file mode 100644 index 8079516720..0000000000 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3TilingSB.py +++ /dev/null @@ -1,468 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: PULPL3TilingSB.py -# -# Last edited: 19.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import copy -from collections import namedtuple -from typing import Dict, List, Literal, Optional, Tuple, Type - -import Deeploy.CommonExtensions.DataTypes as BasicDataTypes -from Deeploy.AbstractDataTypes import Immediate, PointerClass -from Deeploy.DeeployTypes import CodeSnippet, ConstantBuffer, ExecutionBlock, NetworkContext, NodeTemplate, \ - OperatorRepresentation -from Deeploy.Targets.PULPOpen.CodeTransformationPasses import AutoTransposeUtils -from Deeploy.Targets.PULPOpen.DataTypes import PULPStructDataTypes -from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingSingleBufferingTilingMixIn, \ - SingleBufferingTilingMixIn, TilingMetaInfo -from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint -from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme, \ - calculateRectangleOffset, minimizeRectangleDims - -_openTileLoopTemplate = NodeTemplate(""" - -// TILING LOOP -// for (int TILING_I=0; TILING_I<${numTiles}; TILING_I++){ -for (int TILING_I=${numTiles}[*${tileIdxPtr}]; TILING_I<${numTiles}[(*${tileIdxPtr})+1]; TILING_I++){ -""") - -_closeTileLoopTemplate = NodeTemplate(""" - -// CLOSE TILING LOOP -} -*${tileIdxPtr} += 1; - -""") - -_moveTileInTemplate = NodeTemplate(""" - -// IMPORT TILE ${innerTilePtr} from ${outerTilePtr} -pi_cl_ram_copy_2d(get_ram_ptr(), ${stateReference}.pi_ram_addr, ${stateReference}.addr, ${stateReference}.size, ${stateReference}.stride, ${stateReference}.length, ${stateReference}.ext2loc, &${stateReference}); -// L3 TRANSFERS CANNOT BE CONCURRENT WITH CURRENT DRIVER -pi_cl_ram_copy_wait(&${stateReference}); - -""") - -_blockTileInTemplate = NodeTemplate(""" - -// BLOCKING IMPORT TILE ${innerTilePtr} -pi_cl_ram_copy_wait(&${stateReference}); - -""") - -_moveTileOutTemplate = NodeTemplate(""" - -// EXPORT TILE ${innerTilePtr} to 
${outerTilePtr} -pi_cl_ram_copy_2d(get_ram_ptr(), ${stateReference}.pi_ram_addr, ${stateReference}.addr, ${stateReference}.size, ${stateReference}.stride, ${stateReference}.length, ${stateReference}.ext2loc, &${stateReference}); -// L3 TRANSFERS CANNOT BE CONCURRENT WITH CURRENT DRIVER -pi_cl_ram_copy_wait(&${stateReference}); - -""") - -_blockTileOutTemplate = NodeTemplate(""" - -// BLOCKING EXPORT TILE ${innerTilePtr} -pi_cl_ram_copy_wait(&${stateReference}); - -""") - -_updateDMATransferStructTemplate = NodeTemplate(""" - -// UPDATE DMA STRUCT ${stateReference} -${stateReference}.pi_ram_addr = ((char*)${extPtr}) + ${extOffsetPtr}[${tileNum}]; -${stateReference}.size = ${length1dPtr}[${tileNum}]; -${stateReference}.length = ${number1dPtr}[${tileNum}]; - -""") - -# ${stateReference}.number_of_2d_copies = ${number2dPtr}[${tileNum}]; - -_updateReferenceTemplate = NodeTemplate(""" - -// UPDATE VARIABLE ${reference} -*${reference} = ${baseReference}[${tileNum}]; -""") - -# ADD NUM TRANSFERS VARIABLE - -_DMAUpdate = namedtuple("_DMAUpdate", "extOffset locOffset length_1d_copy number_of_1d_copies number_of_2d_copies") - - -class PULPL3TilingSB(TilingCodeGeneration): - - _prefix = "TILING_REPLACED_" - - _openTileLoopTemplate = _openTileLoopTemplate - _closeTileLoopTemplate = _closeTileLoopTemplate - - _moveTileInTemplate = _moveTileInTemplate - _blockTileInTemplate = _blockTileInTemplate - - _moveTileOutTemplate = _moveTileOutTemplate - _blockTileOutTemplate = _blockTileOutTemplate - - _updateDMATransferStructTemplate = _updateDMATransferStructTemplate - _updateReferenceTemplate = _updateReferenceTemplate - - @property - def prefix(self): - return self._prefix + self.targetMemLevel + "_" - - def _DMAStructName(self, tensorName: str, nodeName: str) -> str: - return f"{self.prefix}_DMA_{nodeName}_{tensorName}" - - @classmethod - def _generatePointerUpdates(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, - loadSchedule: List[Dict[str, 
HyperRectangle]], - tilingSchedule: TilingSchedule) -> Dict[str, _DMAUpdate]: - updateDict = {} - deltaOffsets = {} - - for idx, loadStep in enumerate(loadSchedule): - for stepIdx, (key, rect) in enumerate(loadStep.items()): - - if key in tilingSchedule.outputBaseOffsets.keys(): - baseOffsets = tilingSchedule.outputBaseOffsets[key] - direction = "FromL2" - else: - baseOffsets = tilingSchedule.inputBaseOffsets[key] - direction = "ToL2" - - if key not in updateDict.keys(): - updateDict[key] = [] - if key not in deltaOffsets.keys(): - deltaOffsets[key] = 0 - - referenceBuffer = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) - l1Buffer = ctxt.lookup(operatorRepresentation[key]) - - struct = cls._rectToDMAStruct(ctxt, rect, direction, l1Buffer.name, l1Buffer._referenceName) - accOffset = calculateRectangleOffset(rect, referenceBuffer) - - length_1d_copy = struct.value['size'].value - number_of_1d_copies = struct.value['length'].value - - lIdx = idx % len(baseOffsets) - - sol = _DMAUpdate(accOffset, baseOffsets[lIdx], length_1d_copy, number_of_1d_copies, 0) - - deltaOffsets[key] = accOffset - updateDict[key].append(sol) - - return updateDict - - @classmethod - def _rectToDMAStruct(cls, ctxt: NetworkContext, rectangle: HyperRectangle, direction: Literal["ToL2", "FromL2"], - L1Name: str, L2Name: str) -> PULPStructDataTypes.pi_cl_ram_req_t: - - referenceBuffer = ctxt.lookup(L2Name) - - rect, referenceRect = minimizeRectangleDims(rectangle, referenceBuffer) - assert len(rect.dims) <= 2, "PULP: Only 2D transfers are supported!" 
- - if direction == "ToL2": - _dir = 1 - else: - _dir = 0 - - length_1d_copy = rect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - - if len(rect.dims) > 1: - number_of_1d_copies = rect.dims[-2] - stride_1d = referenceRect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - else: - number_of_1d_copies = 1 - stride_1d = 0 - - struct = PULPStructDataTypes.pi_cl_ram_req_t( - { - "pi_ram_addr": referenceBuffer.name, - "addr": L1Name, - "stride": stride_1d, - "length": length_1d_copy, - "size": number_of_1d_copies * length_1d_copy, - "ext2loc": _dir, - "is_2d": 1 - }, ctxt) - - return struct - - def _hoistConstantAndReference(self, - ctxt: NetworkContext, - constBuf: ConstantBuffer, - operatorRepresentation: OperatorRepresentation, - nodeName: str, - operatorRepresentationName: str, - immediateType: Optional[Type[Immediate]] = None) -> Tuple[NetworkContext, Dict]: - if immediateType is None: - _type = PointerClass(BasicDataTypes.int32_t) - else: - _type = PointerClass(immediateType) - - constBuf._users = [nodeName] - constBuf._memoryLevel = self.targetMemLevel - - refName = ctxt.hoistConstantAndReference(constBuf, _type) - - operatorRepresentation[operatorRepresentationName] = refName - - return ctxt, operatorRepresentation - - def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate], - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: - - operatorRepresentation = operatorRepresentation.copy() - - nodeName = operatorRepresentation['nodeName'] - - offsetList = [] - len1dList = [] - num1dList = [] - num2dList = [] - for update in updateList: - offsetList.append(int(update.extOffset)) - len1dList.append(int(update.length_1d_copy)) - num1dList.append(int(update.number_of_1d_copies)) - num2dList.append(int(update.number_of_2d_copies)) - - dmaName = self._DMAStructName(tensorName, nodeName) - operatorRepresentation['stateReference'] = dmaName - 
operatorRepresentation['tileNum'] = "TILING_I" - operatorRepresentation['extPtr'] = ctxt.lookup(operatorRepresentation[tensorName])._referenceName - - namePrefix = self.prefix + f"{nodeName}_{tensorName}" - - name = namePrefix + "_offset" - cb = ctxt.ConstantBuffer(name, [len(updateList)], offsetList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'extOffsetPtr') - - name = namePrefix + "_length_1d_copy" - cb = ctxt.ConstantBuffer(name, [len(updateList)], len1dList) - ctxt, operatorRepresentation = self._hoistConstantAndReference( - ctxt, cb, operatorRepresentation, nodeName, 'length1dPtr', - PULPStructDataTypes.pi_cl_ram_req_t.structTypeDict['size']) - - name = namePrefix + "_number_of_1d_copies" - cb = ctxt.ConstantBuffer(name, [len(updateList)], num1dList) - ctxt, operatorRepresentation = self._hoistConstantAndReference( - ctxt, cb, operatorRepresentation, nodeName, 'number1dPtr', - PULPStructDataTypes.pi_cl_ram_req_t.structTypeDict['length']) - - return ctxt, operatorRepresentation - - def _generateEgressPointerUpdates( - self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: - - updates = [] - newCtxt = ctxt.copy() - - updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, - tilingSchedule) - - for key, updateList in updateDict.items(): - - newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) - updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) - - return newCtxt, updates - - def _generateIngressPointerUpdates( - self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: - - updates = [] - newCtxt = ctxt.copy() - - updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, 
tilingSchedule.inputLoadSchedule, - tilingSchedule) - - for key, updateList in updateDict.items(): - - newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) - updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) - - return newCtxt, updates - - def _generateVariableUpdates(self, tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, - ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> List[CodeSnippet]: - - updates = [] - - for key in variableReplacement.perTileReplacements.keys(): - - buf = ctxt.lookup(operatorRepresentation[key]) - reference = str(buf._instance) - - updates.append( - CodeSnippet(self._updateReferenceTemplate, { - "reference": reference, - "tileNum": "TILING_I", - "baseReference": buf._referenceName - })) - - return updates - - def _generateDMACode(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, - loadSchedule: List[Dict[str, HyperRectangle]], - direction: Literal["ToL2", "FromL2"]) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - DMATransferCalls = [] - DMAWaitStatements = [] - - allNumTransfers = AutoTransposeUtils.allNumTransfers(ctxt, operatorRepresentation, loadSchedule, direction) - - transferNodeRep = {} - - loadStep = loadSchedule[0] - - for idx, (key, rectangle) in enumerate(loadStep.items()): - - externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) - internalPtr = ctxt.lookup(operatorRepresentation[key]) - - tensorName = key - nodeName = operatorRepresentation['nodeName'] - dmaName = self._DMAStructName(tensorName, nodeName) - - transferNodeRep = { - **transferNodeRep, - **{ - 'innerTilePtr': str(internalPtr._instance), - "outerTilePtr": str(externalPtr._instance), - "stateReference": dmaName - } - } - - struct = self._rectToDMAStruct(ctxt, rectangle, direction, internalPtr.name, externalPtr.name) - transferNodeRep["stateStruct"] = struct - _ = 
ctxt.hoistStruct(struct, dmaName, PULPStructDataTypes.pi_cl_ram_req_t) - ctxt.lookup(dmaName)._users += [operatorRepresentation['nodeName']] - - DMATransferCalls.append(CodeSnippet(self._moveTileInTemplate, transferNodeRep)) - - DMAWaitStatements.append(CodeSnippet(self._blockTileInTemplate, transferNodeRep)) - - return DMATransferCalls, DMAWaitStatements - - def _generateIngressDMACode( - self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - importLoadStep = tilingSchedule.inputLoadSchedule - ingressDMATransferCalls, ingressDMAWaitStatements = self._generateDMACode(ctxt, operatorRepresentation, - importLoadStep, "ToL2") - return ingressDMATransferCalls, ingressDMAWaitStatements - - def _generateEgressDMACode( - self, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - exportLoadStep = tilingSchedule.outputLoadSchedule - egressDMATransferCalls, egressDMAWaitStatements = self._generateDMACode(ctxt, operatorRepresentation, - exportLoadStep, "FromL2") - - return egressDMATransferCalls, egressDMAWaitStatements - - def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, - nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, - variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation) - - ingressDMATransferCalls, ingressDMAWaitStatements = self._generateIngressDMACode( - tilingSchedule, ctxt, operatorRepresentation) - - egressDMATransferCalls, egressDMAWaitStatements = self._generateEgressDMACode( - tilingSchedule, ctxt, operatorRepresentation) - - ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(tilingSchedule, ctxt, operatorRepresentation) - ctxt, 
egressDMAUpdates = self._generateEgressPointerUpdates(tilingSchedule, ctxt, operatorRepresentation) - - setupStatements: List[CodeSnippet] = [] - teardownStatements: List[CodeSnippet] = [] - variableUpdates: List[CodeSnippet] = [] - - openLoopStatement = [ - CodeSnippet(self._openTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - closeLoopStatement = [ - CodeSnippet(self._closeTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + "_L3", - nodeOps = operatorRepresentation['nodeOps'], - numTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxVar = "TILING_I", - kernelLevelTiling = False) - - newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls, - ingressDMAWaitStatements, ingressDMAUpdates, - egressDMATransferCalls, egressDMAWaitStatements, - egressDMAUpdates, variableUpdates, openLoopStatement, - closeLoopStatement, setupStatements, teardownStatements) - - return ctxt, newExecutionBlock, True - - def generateTilingLoop( - self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - flatTilingSchedule = copy.copy(tilingSchedules[0]) - for tilingSchedule in tilingSchedules[1:]: - flatTilingSchedule += tilingSchedule - - offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) - - if len(offsetLists) == 0: - return ctxt, executionBlock, False - - for offsetList in offsetLists: - if not len(offsetList) == 1: - return ctxt, executionBlock, False - - operatorRepresentation["numTiles"] = self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'], - 
tilingSchedules) - - return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, - operatorRepresentation) - - -class PULPL3TilingGenerationSB(PULPL3TilingSB, SingleBufferingTilingMixIn): - pass - - -class ProfilingPULPL3TilingGenerationSB(PULPL3TilingSB, ProfilingSingleBufferingTilingMixIn): - pass diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPProfileUntiled.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPProfileUntiled.py index c8c9b06fb9..69fe52bcb2 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPProfileUntiled.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPProfileUntiled.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: PULPClusterTiling.py -# -# Last edited: 19.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Tuple diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/__init__.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/__init__.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py new file mode 100644 index 0000000000..6c2aa30811 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, BlockingDmaFromAsyncDmaAdapter, DmaDirection, Future, \ + PerTensorWaitingStrategy + + +class L3DmaFuture(Future): + + _initTemplate = NodeTemplate("pi_cl_ram_req_t ${name} = {0};") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("") + + _waitTemplate = NodeTemplate(""" + if (${name}.size != 0) { + pi_cl_ram_copy_wait(&${name}); + }""") + + +class L3Dma(AsyncDma): + + _transferTemplates = { + 2: + NodeTemplate( + "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + ) + } + _waitingStrategy = PerTensorWaitingStrategy(L3DmaFuture) + + def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: + super().__init__(transferTemplates) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + assert strideExt[-1] == 1, \ + "Mchan supports only contigous transfers of the innermost dimension for external memory" + assert strideLoc[0] == shape[1] and strideLoc[1] == 1, \ + f"Mchan supports only contigous transfers for local memory. 
Received local shape: {shape}, stride: {strideLoc}" + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + operatorRepresentation = super().transferOpRepr(externalBuffer, localBuffer, shape, strideExt, strideLoc, + direction, future) + operatorRepresentation.update({ + "ext2loc": 1 if direction == "ExternalToLocal" else 0, + "transfer_size": math.prod(shape), + "length": shape[1], + "stride": strideExt[0], + }) + return operatorRepresentation + + +# LMACAN: It's a hack because the driver is now working correctly +l3DmaHack = BlockingDmaFromAsyncDmaAdapter(L3Dma()) diff --git a/Deeploy/Targets/PULPOpen/DMA/MchanDma.py b/Deeploy/Targets/PULPOpen/DMA/MchanDma.py new file mode 100644 index 0000000000..93bf699dc6 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/DMA/MchanDma.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DirectionWaitingStrategy, DmaDirection, Future + + +class MchanChannelFuture(Future): + + _initTemplate = NodeTemplate("uint32_t ${name} = (uint32_t) -1;") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("${name} = mchan_channel_alloc();") + + _waitTemplate = NodeTemplate(""" +if (${name} <= MCHAN_CHANNEL_ID_MAX) { + mchan_channel_wait(${name}); + mchan_channel_free(${name}); +} +""") + + +class MchanDma(AsyncDma): + + _transferTemplates = { + 1: NodeTemplate("mchan_transfer_1d(${cmd}, ${loc}, ${ext});"), + 2: NodeTemplate("mchan_transfer_2d_ext_strided(${cmd}, ${loc}, ${ext}, ${size_1d}, ${stride_2d});"), + } + _waitingStrategy = 
DirectionWaitingStrategy(MchanChannelFuture, "channel") + + def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: + super().__init__(transferTemplates) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + + transferRank = len(shape) + assert strideExt[ + -1] == 1, "Mchan supports only contigous transfers of the innermost dimension for external memory" + if transferRank == 1: + assert strideLoc[0] == 1, "Mchan supports only contigous transfers for local memory" + else: + assert strideLoc[0] == shape[1] and strideLoc[ + 1] == 1, "Mchan supports only contigous transfers for local memory" + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + operatorRepresentation = super().transferOpRepr(externalBuffer, localBuffer, shape, strideExt, strideLoc, + direction, future) + + transferRank = len(shape) + + mchanFlags = 0 + mchanFlags += (1 << 0) if direction == "ExternalToLocal" else 0 # direction + mchanFlags += (1 << 1) # increment addresses + mchanFlags += (1 << 2) if transferRank == 2 else 0 # 2d transfer + mchanFlags += (1 << 3) # event enable + + mchanTransferSize = math.prod(shape) + mchanTransferSizeBits = math.ceil(math.log2(mchanTransferSize)) + assert mchanTransferSizeBits <= 17, ( + "The transfer size is not representable with 17 bits. 
" + f"Received transfer size {mchanTransferSize} that requires {mchanTransferSizeBits}") + + operatorRepresentation["cmd"] = (mchanFlags << 17) + mchanTransferSize + + if transferRank == 2: + operatorRepresentation["size_1d"] = shape[1] + operatorRepresentation["stride_2d"] = strideExt[0] + + return operatorRepresentation diff --git a/Deeploy/Targets/PULPOpen/DataTypes.py b/Deeploy/Targets/PULPOpen/DataTypes.py index 8b53cbc439..43e60a81ce 100644 --- a/Deeploy/Targets/PULPOpen/DataTypes.py +++ b/Deeploy/Targets/PULPOpen/DataTypes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPDataTypes.py -# -# Last edited: 01.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from dataclasses import dataclass from functools import partial diff --git a/Deeploy/Targets/PULPOpen/Deployer.py b/Deeploy/Targets/PULPOpen/Deployer.py index d501863cb4..bceea01f4d 100644 --- a/Deeploy/Targets/PULPOpen/Deployer.py +++ b/Deeploy/Targets/PULPOpen/Deployer.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPDeployer.py -# -# Last edited: 08.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Callable, Dict, List, Type @@ -36,6 +15,7 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NodeTemplate, TopologyOptimizer, VariableBuffer from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import ReshapeConstOptPass, TransposeConstOptPass, \ TransposeMergePass, TransposeNoPermOptPass, TransposeSplitPass +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import RQAddTransposeSquashPass _L3AllocTemplate = NodeTemplate(""" @@ -84,7 +64,15 @@ def __init__(self, self.extNameCount = 0 - def bind(self): + def annotateNCores(self) -> None: + for layer in self.layerBinding.values(): + node = layer.node + engine = self._selectEngine(node) + opRepr = layer.mapper.parser.operatorRepresentation + if isinstance(engine, PULPClusterEngine): + opRepr["n_cores"] = engine.n_cores + + def bind(self) -> bool: # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. # SCHEREMO: The BindingOptimizationPass system is fairly fragile; # it was designed this way because implementing further topology optimizations after @@ -92,11 +80,16 @@ def bind(self): # but if there is only very few cases, this solution is okay. autoTransposePass = AutoTransposeMergePass() #self.ctxt, self.layerBinding = autoTransposePass.apply(self.ctxt, self.graph, self.layerBinding) + + # LMACAN: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. + self.annotateNCores() + # SCHEREMO: THIS IS A STOP GAP SOLUTION. DONT REUSE. I MEAN IT. I WILL FIND YOU. 
- ret = super().bind() - if ret: - self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") - return ret + if not super().bind(): + return False + + self.ctxt.hoistGlobalDefinition("cluster_dev", "extern struct pi_device cluster_dev;") + return True def _l3ConstBuffer(self) -> List[VariableBuffer]: return [ diff --git a/Deeploy/Targets/PULPOpen/Layers.py b/Deeploy/Targets/PULPOpen/Layers.py index d291078c88..69ce2fa958 100644 --- a/Deeploy/Targets/PULPOpen/Layers.py +++ b/Deeploy/Targets/PULPOpen/Layers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: CMSISLayers.py -# -# Last edited: 22.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index 4d5359db95..5c5951eaba 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPParsers.py -# -# Last edited: 10.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import math from typing import Tuple @@ -29,8 +8,8 @@ import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import NetworkContext -from Deeploy.Targets.Generic.Parsers import Conv2DParser, GEMMParser, RQSConv1DParser, RQSConv2DParser, \ - RQSParserInterface +from Deeploy.Targets.Generic.Parsers import Conv2DParser, GEMMParser, ReduceMeanParser, RQSConv1DParser, \ + RQSConv2DParser, RQSParserInterface class PULPConv2DParser(RQSConv2DParser): @@ -93,24 +72,24 @@ def parseNode(self, node: gs.Node) -> (bool): wellFormed = super().parseNode(node) if wellFormed: ret = all([ - # Make sure padding is square + # Current PULP kernel only supports grouping of 1 self.operatorRepresentation['group'] == 1, + + # Make sure padding is square self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], - len(node.inputs) == 2 + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], ]) - self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) - self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) - self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0]) - self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1]) + # Extract additional attributes self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) - self.operatorRepresentation['stride_x'] = 
int(self.operatorRepresentation['strides'][0]) - self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) return ret return False @@ -123,11 +102,86 @@ def parseNodeCtxt(self, newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) if ret: + # Set inputs names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + return newCtxt, True return ctxt, False +class PULPFPDWConv2DParser(Conv2DParser): + + def __init__(self, noBiasHoisting = True): + super().__init__(noBiasHoisting) + + def parseNode(self, node: gs.Node) -> (bool): + # Parse root conv 2D information + wellFormed = super().parseNode(node) + + if wellFormed: + # Check if the node is a depthwise convolution + ret = all([ + # Make sure padding is square + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][2], + self.operatorRepresentation['pads'][1] == self.operatorRepresentation['pads'][3], + self.operatorRepresentation['pads'][0] == self.operatorRepresentation['pads'][1], + + # Check number of inputs + # 2 inputs if no bias, 3 if layer has bias + len(node.inputs) in [2, 3], + ]) + + # Extract additional attributes + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return ret + return False + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: 
bool = True) -> Tuple[NetworkContext, bool]: + # Parse node context for 2D conv + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + # Define input names + inputs = ['data_in', 'weight'] + + # Handle bias, if present + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + inputs.append("bias") + self.operatorRepresentation["has_bias"] = "true" + + # Map input nodes to operator representation + for idx, inputNode in enumerate(node.inputs): + self.operatorRepresentation[inputs[idx]] = ctxt.lookup(inputNode.name).name + + # Check if DW + if self.operatorRepresentation['group'] == self.operatorRepresentation['ch_im_in']: + return newCtxt, True + + return ctxt, False + + class PULPDWConv1DParser(RQSConv1DParser): def __init__(self, noBiasHoisting = True): @@ -408,3 +462,26 @@ def parseNodeCtxt(self, return ctxt, False return newCtxt, True + + +class PULPReduceMeanParser(ReduceMeanParser): + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + # Inherit the generic ReduceMean parsing + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + # Add to operator representation the non-reduced dimensions for tiling purposes + originalInputShape = newCtxt.lookup(self.operatorRepresentation['data_in']).shape + reducedAxes = self.operatorRepresentation['axes'] + + for ax in range(len(originalInputShape)): + if ax not in reducedAxes: + self.operatorRepresentation['dim_in_' + str(ax)] = originalInputShape[ax] + + return newCtxt, True + else: + return ctxt, False diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index 93e42b77d0..d45dc00f9c 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -1,48 +1,27 @@ -# ---------------------------------------------------------------------- +# 
SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPPlatform.py -# -# Last edited: 07.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Authors: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import numpy as np import onnx_graphsurgeon as gs +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + RemoveEmptyConvBiasPass, RemoveOnlySingletonReduceMeanPass from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NetworkContext, NodeMapper, \ NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELULayer, GEMMLayer, \ - LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, \ - ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, 
SGDLayer, \ - SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \ - TransposeLayer, iHardswishLayer, iRMSNormLayer +from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELUGradLayer, GELULayer, \ + GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \ + ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ + RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ + SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ - GELUParser, GEMMParser, LayerNormParser, MatMulParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, \ - QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, \ - RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ + GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool2DParser, \ + MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, \ + RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, SGDParser, SliceParser, \ SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, SoftmaxParser, \ TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate @@ -50,20 +29,23 @@ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ SkipEmptyConcatPass, SkipUnityRequantPass, iGELURequantMergePass, iHardswishRequantMergePass from 
Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPConv1DBinding, \ - PULPDMASliceBindings, PULPDWConv1DBinding, PULPReduceMeanBindings + PULPDMASliceBindings, PULPDWConv1DBinding from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ - PULPDWConv2DParser, PULPFPConv2DParser, PULPGEMMParser, PULPMatrixVecParser, PULPTallGEMMParser + PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ + PULPReduceMeanParser, PULPTallGEMMParser from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ - PULPConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUTilingReadyBindings, \ - PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, \ - PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, PULPLayernormTilingReadyBindings, \ + PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ + PULPFPGELUGradTilingReadyBindings, PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, \ + PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \ + PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, \ PULPMatMulTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, \ - PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, 
PULPSGDTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ + PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, \ + PULPRQAddTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, \ + PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, \ + PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, \ + PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ @@ -73,6 +55,7 @@ AddMapper = NodeMapper(AddParser(), PULPAddTilingReadyBindings) FlattenMapper = NodeMapper(FlattenParser(), PULPFlattenTilingReadyBindings) GELUMapper = NodeMapper(GELUParser(), PULPFPGELUTilingReadyBindings) +GELUGradMapper = NodeMapper(GELUGradParser(), PULPFPGELUGradTilingReadyBindings) GatherMapper = NodeMapper(GatherParser(), PULPGatherTilingReadyBindings) MulMapper = NodeMapper(MulParser(), PULPMulTilingReadyBindings) Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) @@ -84,7 +67,7 @@ RequantShiftMapper = NodeMapper(RequantShiftParser(), PULPRQSTilingReadyBindings) UniformRequantShiftMapper = NodeMapper(UniformRequantShiftParser(), PULPUniformRQSTilingReadyBindings) -ReduceMeanMapper = NodeMapper(ReduceMeanParser(), PULPReduceMeanBindings) +ReduceMeanMapper = NodeMapper(PULPReduceMeanParser(), PULPReduceMeanTilingReadyBindings) ReduceSumMapper = NodeMapper(ReduceSumParser(), PULPReduceSumTilingReadyBindings) MatMulMapper = NodeMapper(MatMulParser(), PULPMatMulTilingReadyBindings) RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) @@ -94,6 +77,7 @@ DWConv1DMapper = 
NodeMapper(PULPDWConv1DParser(), [PULPDWConv1DBinding]) FPConv2DMapper = NodeMapper(PULPFPConv2DParser(), PULPConv2DTilingReadyBindings) Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings) +FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), PULPDWConv2DTilingReadyBindings) DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings) GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings) FloatGEMMMapper = NodeMapper(GEMMParser(), PULPFPGEMMTilingReadyBindings) @@ -101,6 +85,7 @@ TallGEMMMapper = NodeMapper(PULPTallGEMMParser(), PULPRQSTallGEMMTilingReadyBindings) MaxPool2DMapper = NodeMapper(MaxPool2DParser(), PULPMaxPool2DTilingReadyBindings) LayerNormMapper = NodeMapper(LayerNormParser(), PULPLayernormTilingReadyBindings) +LayerNormGradMapper = NodeMapper(LayerNormGradParser(), PULPLayernormGradTilingReadyBindings) ReluMapper = NodeMapper(ReluParser(), PULPReluTilingReadyBindings) SoftmaxMapper = NodeMapper(SoftmaxParser(), PULPSoftmaxTilingReadyBindings) SoftmaxGradMapper = NodeMapper(SoftmaxGradParser(), PULPSoftmaxGradTilingReadyBindings) @@ -108,7 +93,9 @@ ConcatMapper = NodeMapper(ConcatParser(), PULPConcatTilingReadyBindings) -SliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) +DMASliceMapper = NodeMapper(SliceParser(), PULPDMASliceBindings) + +SliceMapper = NodeMapper(SliceParser(), PULPSliceTilingReadyBindings) iRMSNormMapper = NodeMapper(iRMSNormParser(), PULPiRMSNormTilingReadyBindings) @@ -122,12 +109,14 @@ DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) PULPMapping = { - 'Conv': ConvLayer([FPConv2DMapper]), + 'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]), 'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), 'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]), 'Gemm': GEMMLayer([FloatGEMMMapper, 
GEMMDequantMapper]), 'Gelu': GELULayer([GELUMapper]), + 'GeluGrad': GELUGradLayer([GELUGradMapper]), 'LayerNormalization': LayerNormLayer([LayerNormMapper]), + 'LayerNormalizationGrad': LayerNormGradLayer([LayerNormGradMapper]), 'MaxPool': MaxPoolLayer([MaxPool2DMapper]), 'RequantizediGELU': RQSiGELULayer([RQGELU_int8_Mapper]), 'RQIntegerDiv': RQIntegerDivLayer([RQIntegerDivMapper]), @@ -148,7 +137,7 @@ 'Squeeze': ReshapeLayer([UnsqueezeMapper]), 'Transpose': TransposeLayer([TransposeMapper]), 'Unsqueeze': ReshapeLayer([UnsqueezeMapper]), - 'Slice': SliceLayer([SliceMapper]), + 'Slice': SliceLayer([SliceMapper, DMASliceMapper]), 'RequantizedAdd': AddLayer([RQAddMapper]), 'Concat': ConcatLayer([ConcatMapper]), 'iRMSNorm': iRMSNormLayer([iRMSNormMapper]), @@ -248,20 +237,28 @@ class PULPStructBuffer(StructBuffer): MergeConstAddAndRequantPass(), PULPGEMMRequantMergePass(), PULPMatMulRequantMergePass(), - PULPAddRequantMergePass() -]) + PULPAddRequantMergePass(), + RemoveEmptyConvBiasPass(), + RemoveOnlySingletonReduceMeanPass(), +], + name = "PULPOptimizer") # SCHEREMO: stdint is included before pulp_nn_kernels.h because it is supposed to be included in there, but isn't... 
_includeList = [ - "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployBasicMath.h", "DeeployPULPMath.h", "dory_dma.h", "dory_mem.h", - "bsp/ram.h", "pulp_core.h" + "pmsis.h", "stdint.h", "pulp_nn_kernels.h", "DeeployPULPMath.h", "mchan_siracusa.h", "dory_mem.h", "bsp/ram.h" ] class PULPClusterEngine(DeploymentEngine): - def __init__(self, name: str, Mapping = PULPMapping, initCode = "", includeList = _includeList) -> None: + def __init__(self, + name: str, + Mapping = PULPMapping, + initCode = "", + includeList = _includeList, + n_cores: int = 8) -> None: super().__init__(name, Mapping, initCode, includeList) + self.n_cores = n_cores class PULPPlatform(DeploymentPlatform): diff --git a/Deeploy/Targets/PULPOpen/Templates/AllocateTemplate.py b/Deeploy/Targets/PULPOpen/Templates/AllocateTemplate.py index d6617264e6..468a12dd73 100644 --- a/Deeploy/Targets/PULPOpen/Templates/AllocateTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/AllocateTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: AllocateTemplate.py -# -# Last edited: 09.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py index 7ceec56b13..ebc614f479 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ConvTemplate.py -# -# Last edited: 10.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple, Union diff --git a/Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py b/Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py similarity index 87% rename from Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py rename to Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py index 07b6afcbaa..ae400ad216 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SliceTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/DMASliceTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: SliceTemplate.py -# -# Last edited: 01.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py index 850de69e55..200ad1b9ea 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py @@ -1,41 +1,19 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FloatAddTemplate.py -# -# Last edited: 13.11.2024 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Authors: -# - Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" // Add Parallel with 1x6 unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t ${nodeName}_log2Core = log2(NUM_CORES); -int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); -int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); -int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); +uint8_t ${nodeName}_core_id = (uint8_t) pi_core_id(); +uint8_t ${nodeName}_log2Core = (uint8_t) log2(NUM_CORES); +uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +uint32_t ${nodeName}_chunk_start = (uint32_t) MIN(${nodeName}_chunk*${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_chunk_stop = (uint32_t) MIN(${nodeName}_chunk_start + ${nodeName}_chunk, (uint32_t) ${size}); uint32_t i = ${nodeName}_chunk_start; -for (; i+5 < ${nodeName}_chunk_stop; i+=6) { +for (; i + 5 < ${nodeName}_chunk_stop; i += 6) { ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; ${data_out}[i+1] = ${data_in_1}[i+1] + ${data_in_2}[i+1]; ${data_out}[i+2] = ${data_in_1}[i+2] + ${data_in_2}[i+2]; diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index 561ba49952..bfa893db94 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FLoatConvTemplate.py -# -# Last edited: 23.01.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple, Union @@ -39,9 +18,13 @@ def __init__(self, templateStr): def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: - im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * - operatorRepresentation['dim_kernel_y']) + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. 
+ im2col_dim = (operatorRepresentation["weight_type"].typeWidth // + 8) * operatorRepresentation["n_cores"] * operatorRepresentation[ + 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] def hoistTransientBuffers(self, ctxt: NetworkContext, @@ -55,6 +38,39 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, return ctxt, operatorRepresentation, [im2col_name] +class PULP2DFloatDWConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + + # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * operatorRepresentation[ + "n_cores"] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = PULP2DFloatDWConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + # Manually set the type of the im2col buffer to match the input type, since it defaults to void for transient buffers + ctxt.lookup(im2col_name)._type.referencedType = ctxt.lookup( + operatorRepresentation['data_in'])._type.referencedType + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + + reference2DTemplate = NodeTemplate(""" // 2D FP Conv HWC with ChannelOut 
parallelism (Name: ${nodeName}, Op: ${nodeOp}) @@ -63,15 +79,16 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, for (uint32_t n=0; n<${batch}; ++n) { PULP_Conv2d_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( - ref_${data_out}_${data_in}, + ref_${data_out}_${data_in}, ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${dim_kernel_x}, ${stride_y}, ${stride_x}, - ref_${data_out}_${data_out}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} ); - + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; @@ -84,27 +101,60 @@ def hoistTransientBuffers(self, ctxt: NetworkContext, ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; -for (uint32_t n=0; n<${batch}; ++n) { +for (uint32_t n=0; n<${batch}; ++n) { PULP_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( - ref_${data_out}_${data_in}, - ${dim_im_in_y}, - ${dim_im_in_x}, - ${ch_im_in}, - ${weight}, - ${ch_im_out}, - ${dim_kernel_y}, - ${dim_kernel_x}, - ${stride_y}, - ${stride_x}, - ref_${data_out}_${data_out}, - ${padding_y_top}, - ${padding_y_bottom}, - ${padding_x_left}, - ${padding_x_right}, - ${ctxtBuffer} + ref_${data_out}_${data_in}, + ${dim_im_in_x}, + ${dim_im_in_y}, + ${ch_im_in}, + ${weight}, + ${ch_im_out}, + ${dim_kernel_x}, + ${dim_kernel_y}, + ${stride_x}, + ${stride_y}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, + ${padding_y_top}, + ${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, + ${ctxtBuffer} + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + 
ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} +""") + +referenceDW2DIm2ColTemplate = PULP2DFloatDWConvIm2ColTemplate(""" +// 2D DW FP Conv HWC with Im2Col and ChannelOout parallelism (Name: ${nodeName}, Op: ${nodeOp}) + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_DW_Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_x}, + ${dim_im_in_y}, + ${ch_im_in}, + ${weight}, + ${ch_im_out}, + ${dim_kernel_x}, + ${dim_kernel_y}, + ${stride_x}, + ${stride_y}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, + ${padding_y_top}, + ${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, + ${ctxtBuffer} ); ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py index 1ff52a2b0b..701d102590 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py @@ -1,31 +1,20 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FloatGELUTemplate.py -# -# Last edited: 04.05.2025 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" // GELU (Name: ${nodeName}, Op: ${nodeOp}) PULP_GELU_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}); +""") + +referenceGradTemplate = NodeTemplate(""" +// GELU Parallel (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); +int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); +GELU_fp${data_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}_sigmoid_grad_chunk(${grad_in}, ${data_in}, ${grad_out}, ${nodeName}_chunk_start, ${nodeName}_chunk_stop); """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index eb017002ce..59499706e5 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -1,37 +1,43 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FloatGemmTemplate.py.py -# -# Last edited: 05.06.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the Licens -from Deeploy.DeeployTypes import NodeTemplate - -referenceTemplate = NodeTemplate(""" + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import float32_tPtr +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class PULPFloatGEMMTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None: + # No bias case - set C to NULL and provide a default type + operatorRepresentation['C'] = None + operatorRepresentation['C_type'] = float32_tPtr # Default to fp32 type + operatorRepresentation['C_batched'] = False + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = PULPFloatGEMMTemplate(""" // GEMM (Name: ${nodeName}, Op: ${nodeOp}) ${A_type.typeName} ref_${data_out}_${A} = ${A}; ${B_type.typeName} ref_${data_out}_${B} = ${B}; +% if C is not None: ${C_type.typeName} ref_${data_out}_${C} = ${C}; +% else: +${C_type.typeName} ref_${data_out}_C = NULL; +% endif ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; for(uint32_t i=0; i<${batch}; i++){ + % if C 
is not None: PULP_Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( ref_${data_out}_${A}, ref_${data_out}_${B}, @@ -43,10 +49,31 @@ ${transA}, ${transB} ); - + % else: + PULP_Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${C_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ref_${data_out}_${A}, + ref_${data_out}_${B}, + NULL, + ref_${data_out}_${data_out}, + ${M}, + ${N}, + ${O}, + ${transA}, + ${transB} + ); + % endif + % if A_batched: ref_${data_out}_${A} += ${M} * ${N}; + % endif + + % if B_batched: ref_${data_out}_${B} += ${N} * ${O}; + % endif + + % if C is not None and C_batched: ref_${data_out}_${C} += ${M} * ${O}; + % endif + ref_${data_out}_${data_out} += ${M} * ${O}; } """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py index 05898ee16d..315481741e 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py @@ -1,39 +1,52 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FloatLayernormTemplate.py -# -# Last edited: 05.06.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" // Float Layernorm (Name: ${nodeName}, Op: ${nodeOp}) PULP_Layernorm_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( - ${data_in}, - ${data_out}, - ${weight}, - ${bias}, - ${epsilon}, - ${size}, + ${data_in}, + ${data_out}, + ${weight}, + ${bias}, + ${epsilon}, + ${size}, ${lastDimLength} ); +""") + +referenceGradTemplate = NodeTemplate(""" +// FloatLayernormGrad Parallel (Name: ${nodeName}, Op: ${nodeOp}) + +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); + +int32_t ${nodeName}_seq_length = ${size} / ${lastDimLength}; +int32_t ${nodeName}_chunk = (${nodeName}_seq_length >> ${nodeName}_log2Core) + + ((${nodeName}_seq_length & (NUM_CORES-1)) != 0); +int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${nodeName}_seq_length); +int32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${nodeName}_seq_length); + +int32_t ${nodeName}_elem_start = ${nodeName}_start * ${lastDimLength}; +int32_t ${nodeName}_elem_end = ${nodeName}_end * ${lastDimLength}; +int32_t ${nodeName}_elem_count = ${nodeName}_elem_end - ${nodeName}_elem_start; + +const float${grad_in_type.referencedType.typeWidth}_t* ${nodeName}_grad_in_ptr = ${grad_in} + ${nodeName}_elem_start; +const float${data_in_type.referencedType.typeWidth}_t* ${nodeName}_data_in_ptr = ${data_in} + ${nodeName}_elem_start; +float${grad_out_type.referencedType.typeWidth}_t* ${nodeName}_grad_out_ptr = ${grad_out} + 
${nodeName}_elem_start; + +if (${nodeName}_elem_count > 0) { + LayernormGrad_fp${grad_in_type.referencedType.typeWidth}_fp${grad_out_type.referencedType.typeWidth}( + ${nodeName}_grad_in_ptr, // Upstream gradient (dy) + ${nodeName}_data_in_ptr, // Original input (x) + ${nodeName}_grad_out_ptr, // Output gradient (dx) + ${weight}, // Input Scale parameter + ${bias}, // Input Bias parameter + ${epsilon}, // Epsilon for numerical stability + ${nodeName}_elem_count, // Number of elements to process + ${lastDimLength} // Size of the feature dimension + ); +} """) \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py index bcbcd0aefa..3cdf26097b 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py @@ -1,43 +1,33 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: F;FloatMatMul.py -# -# Last edited: 28.03.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the Licens + from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" // Matmul with row parallelism (Name: ${nodeName}, Op: ${nodeOp}) for(uint32_t b=0; b<${batch}; b++) { + % if A_batched: ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + % else: + ${A_type.typeName} batch_A = ${A}; + % endif + + % if B_batched: ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + % else: + ${B_type.typeName} batch_B = ${B}; + % endif + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; - + PULP_MatMul_fp32_fp32_fp32_unroll1x7( batch_A, - batch_B, + batch_B, batch_out, ${M}, - ${N}, + ${N}, ${O} ); } diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py index f1a01227ec..846aeae92d 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MaxPoolTemplate.py -# -# Last edited: 24.01.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate @@ -33,9 +12,9 @@ for (uint32_t n=0; n<${batch}; ++n) { PULP_MaxPool2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( - ref_${data_out}_${data_in}, + ref_${data_out}_${data_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, - ${dim_kernel_x}, ${dim_kernel_y}, + ${dim_kernel_x}, ${dim_kernel_y}, ${stride_x}, ${stride_y}, ref_${data_out}_${data_out}, ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py index bbed92ce74..ced6c3cbcf 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py @@ -1,39 +1,17 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FloatMulTemplate.py -# -# Last edited: 05.06.2025 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Authors: -# - Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" // Float Mul with parallelism and 6x unrolling (Name: ${nodeName}, Op: ${nodeOp}) -int8_t ${nodeName}_core_id = pi_core_id(); -int8_t ${nodeName}_log2Core = log2(NUM_CORES); +uint32_t ${nodeName}_core_id = pi_core_id(); +uint32_t ${nodeName}_log2Core = (uint32_t) log2(NUM_CORES); uint32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1)) != 0); -uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${size}); -uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${size}); +uint32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, (uint32_t) ${size}); +uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, (uint32_t) ${size}); if (${nodeName}_start < ${nodeName}_end) { float32_t ${nodeName}_scalar = ${B}[0]; diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatReduceMeanTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatReduceMeanTemplate.py new file mode 100644 index 0000000000..62e1110f79 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatReduceMeanTemplate.py @@ -0,0 +1,142 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class _FloatReduceMeanTemplate(NodeTemplate): + ''' + WARNING: This version of parallelization is optimized for the TinyViT ReduceMean layers + (49 elements in the reduced axis). Greater sizes of the reduced axis may benefit + from different parallelization and tiling strategies. 
+ ''' + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + data_in = ctxt.lookup(operatorRepresentation['data_in']) + data_out = ctxt.lookup(operatorRepresentation['data_out']) + + operatorRepresentation['input_offset'] = 0 + if hasattr(data_in, "_signed") and hasattr(data_in, "nLevels"): + operatorRepresentation['input_offset'] = (data_in._signed == 0) * int(data_in.nLevels / 2) + + operatorRepresentation['output_offset'] = 0 + if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"): + operatorRepresentation['output_offset'] = -(data_out._signed == 0) * int(data_out.nLevels / 2) + + for ax in range(len(operatorRepresentation['data_in_shape'])): + if ax not in operatorRepresentation['axes']: + _ = operatorRepresentation['dim_in_' + str(ax)] + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _FloatReduceMeanTemplate(""" +## =============== Perform necessary precomputations =============== +<% +# Update input shape based on tiling +new_data_in_shape = data_in_shape.copy() + +for i in range(len(new_data_in_shape)): + if i not in axes: + new_data_in_shape[i] = pageargs['dim_in_' + str(i)] + +# Compute the total number of elements being reduced in one axis +reduceLength = 1 +for i, axis in enumerate(axes): + if axis < 0: + axes[i] += len(data_in_shape) + reduceLength = reduceLength * data_in_shape[axis] + +# Compute the remaining dimensions after reduction +# Order them for more efficient parallelization +# (heuristically working on the largest non-tiled stride last, +# since it's impossible to get tiling information here) +restDims = list(set(list(range(len(data_in_shape)))).difference(set(axes))) +restDims = sorted(restDims, key=lambda x: data_in_shape[x]) + +dataSize = new_data_in_shape[restDims[-1]] + +# =============== Prepare shape and access strings =============== +# 
shapeStr is going to have the [d1][d2]... format +# accessStr is going to have the [i_0][i_1]... format +shapeStr = '' +accessStr = '' + +data_out_str = "0" +data_out_str_prod = "1" + +for idx, i in enumerate(new_data_in_shape[1:]): + if isinstance(i, str): + shapeStr += '[*' + i + ']' + else: + shapeStr += '[' + str(i) + ']' + +for j in range(len(data_in_shape)): + accessStr += '[i_' + str(j) + ']' + +for k in sorted(restDims, reverse=True): + data_out_str += ' + i_' + str(k) + '*' + str(data_out_str_prod) + if isinstance(new_data_in_shape[k], str): + data_out_str_prod += "* *(" + new_data_in_shape[k] + ")" + else: + data_out_str_prod += "* " + str(new_data_in_shape[k]) +%> + +## =============== Start of the actual template =============== +// ReduceMean (Name: ${nodeName}, Op: ${nodeOp}) +## Get core information +uint32_t core_id = pi_core_id(); +uint32_t log2Core = (uint32_t) LOG2(NUM_CORES); + +## Split into chunks for each core +% if isinstance(dataSize, str): +uint32_t chunk = (*(${dataSize}) >> log2Core) + ((*(${dataSize}) & (NUM_CORES - 1)) != 0); +uint32_t chunk_start = MIN(chunk * core_id, *(${dataSize})); +uint32_t chunk_stop = MIN(chunk_start + chunk, *(${dataSize})); +% else: +uint32_t chunk = (${dataSize}U >> log2Core) + ((${dataSize}U & (NUM_CORES - 1)) != 0); +uint32_t chunk_start = MIN(chunk * core_id, ${dataSize}U); +uint32_t chunk_stop = MIN(chunk_start + chunk, ${dataSize}U); +% endif + +## Iterate through non-reduced dimensions +## Keep the last dimension for parallelization +% for i in list(restDims[:-1]): +% if isinstance(pageargs['dim_in_' + str(i)], str): +for(uint32_t i_${i} = 0; i_${i} < *${pageargs['dim_in_' + str(i)]}; i_${i}++) { +% else: +for(uint32_t i_${i} = 0; i_${i} < ${pageargs['dim_in_' + str(i)]}; i_${i}++) { +% endif +% endfor +for(uint32_t i_${restDims[-1]} = chunk_start; i_${restDims[-1]} < chunk_stop; i_${restDims[-1]}++) { +## Initialize accumulator +uint32_t out_idx = ${data_out_str}; +${data_out}[out_idx] = 
${input_offset}*${reduceLength}; + +## Iterate through reduced dimensions and accumulate +% for i in list(axes): +for(uint32_t i_${i} = 0; i_${i} < ${data_in_shape[i]}; i_${i}++) { +% endfor +${data_out}[out_idx] += ((${data_in_type.referencedType.typeName} (*)${shapeStr})${data_in})${accessStr}; +% for i in range(len(axes)): +} +% endfor + +## Write back the mean value +% if keepdims: +${data_out}[out_idx] = (${data_out_type.referencedType.typeName}) (${data_out}[out_idx] / ${reduceLength} + ${output_offset}); +% else: +${data_out}[out_idx] = (${data_out_type.referencedType.typeName}) (${data_out}[out_idx] / ${reduceLength}); +% endif +% for i in range(len(restDims)): +} +% endfor +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py index f7aed10e0d..ab22b75bee 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FloatReluTemplate.py -# -# Last edited: 04.05.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py index 6e2905630a..b12a15c7b0 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: FloatSoftmaxTemplate.py -# -# Last edited: 23.1.2025 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/PULPOpen/Templates/FreeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FreeTemplate.py index de10f7f8aa..639f707e9f 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FreeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FreeTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: FreeTemplate.py -# -# Last edited: 09.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py b/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py index 0486dd18f5..fbe475b8df 100644 --- a/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/GEMMTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: GEMMTemplate.py -# -# Last edited: 10.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple @@ -71,7 +50,7 @@ def alignToContext(self, ctxt: NetworkContext, // LMACAN: In some edge cases sporadic errors happen if this loop is not added. // We believe this is due to missing bubbles in the pipeline that break operator forwarding. // Breaking test: -// `python testRunner_tiled_siracusa.py -t=Tests/Transformer --defaultMemLevel=L3 --doublebuffer --l1=30000` +// `python testRunner_tiled_siracusa.py -t=Tests/Models/Transformer --defaultMemLevel=L3 --doublebuffer --l1=30000` #pragma unroll 1 for(int k=0;k<3;k++){ asm volatile("nop" ::); diff --git a/Deeploy/Targets/PULPOpen/Templates/MatrixVectorTemplate.py b/Deeploy/Targets/PULPOpen/Templates/MatrixVectorTemplate.py index 359619384a..e4b8348614 100644 --- a/Deeploy/Targets/PULPOpen/Templates/MatrixVectorTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/MatrixVectorTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: MatrixVectorTemplate.py -# -# Last edited: 15.03.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Templates/MaxPool2DTemplate.py b/Deeploy/Targets/PULPOpen/Templates/MaxPool2DTemplate.py index 6b74c3d73f..d4b63a2694 100644 --- a/Deeploy/Targets/PULPOpen/Templates/MaxPool2DTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/MaxPool2DTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MaxPool2DTemplate.py -# -# Last edited: 10.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Templates/MulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/MulTemplate.py index 1b6be3ddcd..1dbefa3287 100644 --- a/Deeploy/Targets/PULPOpen/Templates/MulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/MulTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: MulTemplate.py -# -# Last edited: 15.03.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate, OperatorRepresentation diff --git a/Deeploy/Targets/PULPOpen/Templates/RQAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/RQAddTemplate.py index f88b2dbacc..1cbf9d9764 100644 --- a/Deeploy/Targets/PULPOpen/Templates/RQAddTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/RQAddTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: RQAddTemplate.py -# -# Last edited: 11.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.Targets.Generic.Templates.RQAddTemplate import RQAddTemplate diff --git a/Deeploy/Targets/PULPOpen/Templates/RQSiHardswishTemplate.py b/Deeploy/Targets/PULPOpen/Templates/RQSiHardswishTemplate.py index 462f176097..8b4f02d0b0 100644 --- a/Deeploy/Targets/PULPOpen/Templates/RQSiHardswishTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/RQSiHardswishTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: RQSiHardswishTemplate.py -# -# Last edited: 15.03.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py index 14948844ff..849f68eef3 100644 --- a/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/ReduceMeanTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: ReduceMeanTemplate.py -# -# Last edited: 05.06.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Templates/RequantShiftTemplate.py b/Deeploy/Targets/PULPOpen/Templates/RequantShiftTemplate.py index 4a7a384171..791f3661cd 100644 --- a/Deeploy/Targets/PULPOpen/Templates/RequantShiftTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/RequantShiftTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: RequantShiftTemplate.py -# -# Last edited: 14.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py new file mode 100644 index 0000000000..a795a555ed --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/ReshapeTemplate.py @@ -0,0 +1,59 @@ +# ---------------------------------------------------------------------- +# +# File: ReshapeTemplate.py +# +# Last edited: 16.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple + +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation, VariableBuffer +from Deeploy.Targets.Generic.Templates.ReshapeTemplate import _ReshapeTemplate as _GenericReshapeTemplate + + +class _ReshapeTemplate(_GenericReshapeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + def alignToContext(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + + ctxt, operatorRepresentation, _ = super().alignToContext(ctxt, operatorRepresentation) + + # Get buffers + bufferIn = ctxt.lookup(operatorRepresentation['data_in']) + assert isinstance(bufferIn, VariableBuffer) + + bufferOut = ctxt.lookup(operatorRepresentation['data_out']) + assert isinstance(bufferOut, VariableBuffer) + + # HACK: Tiling wasn't updated in the Fix aliasing PR so we have to still + # set the _alias argument + bufferOut._alias = bufferIn.name + + return ctxt, operatorRepresentation, [] + + +referenceTemplate = _ReshapeTemplate(""" +// Reshape (Name: ${nodeName}, Op: ${nodeOp}) +${data_out} = ${data_in}; +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py b/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py index 4f3308d1f3..418b41aadf 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/SGDTemplate.py @@ -1,41 +1,49 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: SGDTemplate.py -# -# Last edited: 21.03.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" -// SGD Weight Update (Name: ${nodeName}, Op: ${nodeOp}) -BEGIN_SINGLE_CORE - ${weight_type.typeName} ref_${weight} = ${weight}; - ${grad_type.typeName} ref_${grad} = ${grad}; - ${weight_type.typeName} ref_${weight_updated} = ${weight_updated}; +// SGD Weight Update with Separated Multiplication and Subtraction Unrolling +// (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int32_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); +int32_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); + +${weight_type.typeName} ref_${weight} = ${weight}; +${grad_type.typeName} ref_${grad} = ${grad}; +${weight_type.typeName} ref_${weight_updated} = ${weight_updated}; + +float32_t learning_rate = ${lr}; + +// Temporary buffer for multiplication results +float32_t temp_mul[6]; + +uint32_t i = ${nodeName}_chunk_start; +for (; i+5 < ${nodeName}_chunk_stop; i+=6) { + // Unrolled multiplication operations + temp_mul[0] = learning_rate * ref_${grad}[i]; + temp_mul[1] = learning_rate * ref_${grad}[i+1]; + temp_mul[2] = learning_rate * ref_${grad}[i+2]; + temp_mul[3] = learning_rate * ref_${grad}[i+3]; + temp_mul[4] = learning_rate * ref_${grad}[i+4]; + temp_mul[5] = learning_rate * ref_${grad}[i+5]; - float32_t learning_rate = ${lr}; + // Unrolled subtraction operations + 
ref_${weight_updated}[i] = ref_${weight}[i] - temp_mul[0]; + ref_${weight_updated}[i+1] = ref_${weight}[i+1] - temp_mul[1]; + ref_${weight_updated}[i+2] = ref_${weight}[i+2] - temp_mul[2]; + ref_${weight_updated}[i+3] = ref_${weight}[i+3] - temp_mul[3]; + ref_${weight_updated}[i+4] = ref_${weight}[i+4] - temp_mul[4]; + ref_${weight_updated}[i+5] = ref_${weight}[i+5] - temp_mul[5]; +} - for (uint32_t i=0; i<${size}; ++i) { - ref_${weight_updated}[i] = ref_${weight}[i] - learning_rate * ref_${grad}[i]; - } -END_SINGLE_CORE -""") +// Handle remaining elements +for (; i < ${nodeName}_chunk_stop; i++) { + float32_t temp_grad = learning_rate * ref_${grad}[i]; + ref_${weight_updated}[i] = ref_${weight}[i] - temp_grad; +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py b/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py index 9599aa1bf7..c1aefe01a3 100644 --- a/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/SoftmaxCrossEntropyLossTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: SoftmaxCrossEntropyTemplate.py -# -# Last edited: 09.03.2025 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate @@ -35,12 +14,12 @@ max_logit = ${logits}[i * ${num_classes} + j]; } } - + float32_t sum_exp = 0.0f; for (uint32_t j = 0; j < ${num_classes}; j++) { sum_exp += expf(${logits}[i * ${num_classes} + j] - max_logit); } - + for (uint32_t j = 0; j < ${num_classes}; j++) { // log_prob = logit - max_logit - log(sum_exp) ${log_prob}[i * ${num_classes} + j] = ${logits}[i * ${num_classes} + j] - max_logit - logf(sum_exp); @@ -63,6 +42,6 @@ } } } - + END_SINGLE_CORE """) diff --git a/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py b/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py index d0f4e220be..76fd47cfb6 100644 --- a/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/TallGEMMTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: TallGEMMTemplate.py -# -# Last edited: 21.03.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple @@ -54,15 +33,15 @@ def alignToContext(self, ctxt: NetworkContext, int16_t ${nodeName}_chunk = (${int(M)} >> ${nodeName}_log2Core) + ((${int(M)} & (NUM_CORES-1))!=0); int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${int(M)}); int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${int(M)} + 1); - + int8_t* ref_${nodeName}_${A}; int8_t* ref_${nodeName}_${B}; int8_t* ref_${nodeName}_${data_out}; - + for(int b=0; b<${batch}; b++){ for (uint32_t i=${nodeName}_chunk_start; i<${nodeName}_chunk_stop; i++){ - + int8_t* ref_${nodeName}_${A} = ${A} + (b * ${M} * ${N}) + (i * ${N}); % if W_batched: int8_t* ref_${nodeName}_${B} = ${B} + (b * ${N} * ${O}); @@ -70,7 +49,7 @@ def alignToContext(self, ctxt: NetworkContext, int8_t* ref_${nodeName}_${B} = ${B}; % endif int8_t* ref_${nodeName}_${data_out} = ${data_out} + (b * ${M} * ${O}) + (i * ${O}); - + gemv_s8_s8_plp(ref_${nodeName}_${A}, NULL, ref_${nodeName}_${data_out}, ref_${nodeName}_${B}, ${mul}, ${C}, 1, ${log2D}, ${N}, ${O}, 1, 1); } } diff --git a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py index 5a01c73bdd..65c2285e24 100644 --- a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna # -# File: TransposeTemplate.py -# -# Last edited: 28.12.2021 -# -# Copyright (C) 2021, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py b/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py index 4cfd3d6f83..d712b3a517 100644 --- a/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/UniformRequantShiftTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: UniformRequantShiftTemplate.py -# -# Last edited: 12.03.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple @@ -78,7 +57,7 @@ def alignToContext(self, ctxt: NetworkContext, inSignage = "s" if signedI else "u" outSignage = "s" if signedO else "u" mul_int_immediate = int(mul_immediate) -add_int_immediate = int(add_immediate) +add_int_immediate = int(add_immediate) %> // UniformRequantShift (Name: ${nodeName}, Op: ${nodeOp}) diff --git a/Deeploy/Targets/PULPOpen/Templates/__init__.py b/Deeploy/Targets/PULPOpen/Templates/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/PULPOpen/Templates/__init__.py +++ b/Deeploy/Targets/PULPOpen/Templates/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/PULPOpen/Templates/iRMSNormTemplate.py b/Deeploy/Targets/PULPOpen/Templates/iRMSNormTemplate.py index 63e5d0e66f..0aa91cc8f5 100644 --- a/Deeploy/Targets/PULPOpen/Templates/iRMSNormTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/iRMSNormTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iRMSNormTemplate.py -# -# Last edited: 20.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py b/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py index 804db3b7e0..af3a93a185 100644 --- a/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/iSoftmaxTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: iSoftmaxTemplate.py -# -# Last edited: 13.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple, Union diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index 8af20e3df3..f249595ac2 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -1,30 +1,8 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ConvTileConstraint.py -# -# Last edited: 09.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union from ortools.constraint_solver.pywrapcp import IntVar @@ -163,6 +141,7 @@ def serializeTilingSolution( operatorRepresentation, addrNames) varWeight = operatorRepresentation['weight'] + varIn = operatorRepresentation["data_in"] varOut = operatorRepresentation['data_out'] inputInCubes = [] @@ -204,9 +183,16 @@ def serializeTilingSolution( (BatchOffset, HOffset, WOffset, COffset) = cube.offset (BatchSize, HSize, WSize, CSize) = cube.dims - InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube( + kernelShape = (weightH, weightW), + pads = pads, + strides = strides, + inputCSize = weightC, + outputCube = cube, + inputDims = ctxt.lookup(varIn).shape, + outputDims = ctxt.lookup(varOut).shape, + ) + padding_left, padding_right, padding_top, padding_bottom = padding_tuple replacements['dim_im_in_x'].append(InCube.dims[1]) @@ -248,79 +234,154 @@ class Conv2DTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + """ + Add geometrical constraints for Conv2D tiling. - # Get to-be-tiled tensor's buffers + For spatial tiling, input tiles require extra memory for overlap regions + at tile boundaries (kernel receptive field). This method accounts for worst-case + overlap on all sides. + + Future optimization: Currently uses worst-case memory allocation (kernel_size - 1 + on all sides). A more memory-efficient approach would compute exact + per-tile memory requirements during serializeTilingSolution based on actual tile + positions, but this requires more extensive framework changes. 
+ """ + + # ===== GET NECESSARY INFORMATION ===== + # Get to-be-tiled tensor buffers inputBufferName = parseDict['data_in'] - weightBufferName = parseDict['weight'] outputBufferName = parseDict['data_out'] + weightBufferName = parseDict['weight'] + biasBufferName = parseDict['bias'] + + inputBuffer = ctxt.lookup(inputBufferName) + + # Get other information + has_bias = False if parseDict['has_bias'] == "false" else True + + pads = parseDict["pads"] strides = parseDict["strides"] - padding = parseDict["pads"] - dilation = parseDict["dilations"] + dilations = parseDict["dilations"] + group = parseDict["group"] - # Add I/O dimensions to the model as variables - for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + # ===== ADD I/O DIMS TO MODEL AS VARS ===== + buffersOfInterest = [inputBufferName, outputBufferName, weightBufferName] + if has_bias: + buffersOfInterest.append(biasBufferName) + + for bufferName in buffersOfInterest: tilerModel.addTensorDimToModel(ctxt, bufferName) + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # Input + # NHWC layout inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) - weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) - weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) - weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) - weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) - + # Output + # NHWC layout outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) 
outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) - # Map output dims to inputs dims - tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch - tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel - - inputBuffer = ctxt.lookup(inputBufferName) - - effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) - effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + # Weight + # C_out - H - W layout - C_in + # (with c_in used for grouping different than number of channels) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) - tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1)) - tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1)) + # Bias (C_out) + if has_bias: + biasDimVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + + # ===== COMPUTE EFFECTIVE INPUT HEIGHT AND WIDTH ===== + # Assume worst case scenario (data padding on all sides) when tiling on a ceratin dimension. 
+ effectiveInputHeight = inputHeightVar + ((pads[0] + pads[2]) * (inputHeightVar == inputBuffer.shape[1])) - ( + (weightHeightVar - 1) * (inputHeightVar != inputBuffer.shape[1])) + effectiveInputWidth = inputWidthVar + ((pads[1] + pads[3]) * (inputWidthVar == inputBuffer.shape[2])) - ( + (weightWidthVar - 1) * (inputWidthVar != inputBuffer.shape[2])) + + # ===== ADD CONSTRAINTS ===== + # Add constraint for batch size match between input and output + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + + # Add constraint for input width and height sizes match + # (Depends on output height and width, kernel size, padding, dilations, and strides. + # For more information on the connections, see ONNX and/or Torch Conv2D documentation). + tilerModel.addConstraint( + (outputHeightVar == (effectiveInputHeight - dilations[0] * (weightHeightVar - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint( + (outputWidthVar == (effectiveInputWidth - dilations[1] * (weightWidthVar - 1) - 1) // strides[1] + 1)) + + # Add constraint for input channel size match + # (Depends on weight output channel and conv grouping) + tilerModel.addConstraint(inputChannelVar == (weightInChannelVar * group)) + + # Add constraint for weight output channels to match + # output number of channels + tilerModel.addConstraint(weightOutChannelVar == outputChannelVar) + + # Add constraint for bias size to match number of output channels + if has_bias: + tilerModel.addConstraint(biasDimVar == outputChannelVar) return tilerModel @staticmethod def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - # Get to-be-tiled tensor's buffers + # ===== GET NECESSARY INFORMATION ===== + # Get to-be-tiled tensor buffers inputBuffer = ctxt.lookup(name = parseDict['data_in']) weightBuffer = ctxt.lookup(name = parseDict['weight']) + # Get other information + pads = parseDict["pads"] + strides = parseDict["strides"] + + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # Input 
+ # NHWC layout inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) - outputChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) + # Weight + # C_out - H - W layout - C_in + # (with c_in used for grouping different than number of channels) weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2) weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3) - strides = parseDict["strides"] - padding = parseDict["pads"] + # ===== COMPUTE EFFECTIVE INPUT HEIGHT AND WIDTH ===== + # Assume worst case scenario (data padding on all sides) when tiling on a ceratin dimension. + effectiveInputHeight = inputHeightVar + ((pads[0] + pads[2]) * (inputHeightVar == inputBuffer.shape[1])) - ( + (weightHeightVar - 1) * (inputHeightVar != inputBuffer.shape[1])) + effectiveInputWidth = inputWidthVar + ((pads[1] + pads[3]) * (inputWidthVar == inputBuffer.shape[2])) - ( + (weightWidthVar - 1) * (inputWidthVar != inputBuffer.shape[2])) - # RW: Conv only tiled on outchannel - tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x']) - tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y']) + # ===== ADD CONSTRAINTS ===== + # Keep whole input channels (required for im2col algorithm) tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + # Require minimum input spatial dimensions to be at least kernel size for proper convolution application + tilerModel.addConstraint(effectiveInputHeight >= parseDict['dim_kernel_x']) + tilerModel.addConstraint(effectiveInputWidth >= parseDict['dim_kernel_y']) + + # Ensure input tiles are compatible with stride + 
tilerModel.addConstraint((effectiveInputHeight % strides[0]) == 0) + tilerModel.addConstraint((effectiveInputWidth % strides[1]) == 0) + + # Weight should not be tiled tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) - tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) - - if (parseDict["ch_im_out"] >= 8): - tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8) + tilerModel.addConstraint(weightInChannelVar * parseDict['group'] == parseDict['ch_im_in']) return tilerModel @@ -330,29 +391,49 @@ def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, inputBuffer = ctxt.lookup(name = parseDict['data_in']) weightBuffer = ctxt.lookup(name = parseDict['weight']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) symbolicParseDict = parseDict.copy() + symbolicParseDict['dim_im_in_x'] = tilerModel.getTensorDimVar(inputBuffer.name, 1) + symbolicParseDict['dim_im_in_y'] = tilerModel.getTensorDimVar(inputBuffer.name, 2) + symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 1) symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 2) + symbolicParseDict['dim_im_out_x'] = tilerModel.getTensorDimVar(outputBuffer.name, 1) + symbolicParseDict['dim_im_out_y'] = tilerModel.getTensorDimVar(outputBuffer.name, 2) + return symbolicParseDict @staticmethod - def computeInputCube(kernelShape: Tuple[int, int], pads: Tuple[int, int, int, int], strides: Tuple[int, int], - inputCSize: int, outputCube: HyperRectangle, - outputDims: Tuple[int, int, int]) -> Tuple[HyperRectangle, Tuple[int, int, int, int]]: - - (outputBatchOffset, outputHOffset, outputWOffset, outputCOffset) = outputCube.offset - (outputBatchSize, outputHSize, outputWSize, outputCSize) = outputCube.dims - + def computeInputCube( + kernelShape: Tuple[int, int], + pads: Tuple[int, int, int, int], + strides: Tuple[int, 
int], + inputCSize: int, + outputCube: HyperRectangle, + outputDims: Tuple[int, int, int], + inputDims: Optional[Tuple[int, int, int]] = None, + outputAbsoluteOffsets: Optional[Tuple[int, int, int, int]] = None, + ) -> Tuple[HyperRectangle, Tuple[int, int, int, int]]: + + # Obtain relative and absolute information about the output tile + (outputBatchOffset, outputHOffset, outputWOffset, _) = outputCube.offset + (outputBatchSize, outputHSize, outputWSize, _) = outputCube.dims + (_, outputHAbsoluteOffset, outputWAbsoluteOffset, + _) = outputAbsoluteOffsets if outputAbsoluteOffsets is not None else outputCube.offset + + # Extract individual pads and strides padTop, padLeft, padBottom, padRight = pads strideH, strideW = strides - tilePadTop = padTop if (outputHOffset == 0) else 0 - tilePadLeft = padLeft if (outputWOffset == 0) else 0 - tilePadBottom = padBottom if (outputHOffset + outputHSize == outputDims[1]) else 0 - tilePadRight = padRight if (outputWOffset + outputWSize == outputDims[2]) else 0 + # Compute actuale tile padding, depending on tile position (keep padding only for margins situated at the edge). + # Required for the Im2Col kernel that handles 0-padding internally. 
+ tilePadTop = padTop if (outputHAbsoluteOffset == 0) else 0 + tilePadLeft = padLeft if (outputWAbsoluteOffset == 0) else 0 + tilePadBottom = padBottom if (outputHAbsoluteOffset + outputHSize == outputDims[1]) else 0 + tilePadRight = padRight if (outputWAbsoluteOffset + outputWSize == outputDims[2]) else 0 # LMACAN: Calculating the per-dimension relative tile offset without padding # The offset is relative to the upstream bigger tile, and represents the offset to @@ -360,9 +441,18 @@ def computeInputCube(kernelShape: Tuple[int, int], pads: Tuple[int, int, int, in inputHOffset = max(outputHOffset * strideH - padTop, 0) inputWOffset = max(outputWOffset * strideW - padLeft, 0) + # Compute input dimensions according to procedure described in PyTorch's Conv2D documentation + # Assuming worst case (cutting of (stride - 1) elements at the end of each dimension) inputHSize = outputHSize * strideH + (kernelShape[0] - 1) - (tilePadTop + tilePadBottom) inputWSize = outputWSize * strideW + (kernelShape[1] - 1) - (tilePadLeft + tilePadRight) + if inputDims is not None: + # Clamp to remaining input size from the current offset + # This prevents reading beyond input boundaries for edge tiles + inputHSize = min(inputHSize, inputDims[1] - inputHOffset) + inputWSize = min(inputWSize, inputDims[2] - inputWOffset) + + # Generate input tile object InCube = HyperRectangle((outputBatchOffset, inputHOffset, inputWOffset, 0), (outputBatchSize, inputHSize, inputWSize, inputCSize)) @@ -373,22 +463,42 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - addrNames = ['data_in', 'weight', 'data_out'] - inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, - operatorRepresentation, 
addrNames) + # Extract rectangle information (offsets and dimensions) from output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + # Extract required component information from operator representation + varIn = operatorRepresentation["data_in"] varWeight = operatorRepresentation['weight'] + varBias = operatorRepresentation['bias'] varOut = operatorRepresentation['data_out'] + group = operatorRepresentation["group"] + + # Prepare address names, also handling bias + if varBias != "NULL": + addrNames = ['data_in', 'weight', 'bias', 'data_out'] + else: + addrNames = ['data_in', 'weight', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare cube lists for components inputInCubes = [] inputWeightCubes = [] + inputBiasCubes = [] + + # Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function replacements: Dict[str, List[int]] = { "dim_im_in_x": [], "dim_im_in_y": [], "dim_im_out_x": [], "dim_im_out_y": [], + "ch_im_in": [], "ch_im_out": [], "padding_y_top": [], "padding_y_bottom": [], @@ -401,6 +511,7 @@ def serializeTilingSolution( "dim_im_in_y": PointerClass(uint16_t), "dim_im_out_x": PointerClass(uint16_t), "dim_im_out_y": PointerClass(uint16_t), + "ch_im_in": PointerClass(uint16_t), "ch_im_out": PointerClass(uint16_t), "padding_y_top": PointerClass(uint8_t), "padding_y_bottom": PointerClass(uint8_t), @@ -408,27 +519,41 @@ def serializeTilingSolution( "padding_x_right": PointerClass(uint8_t) } - weightH = ctxt.lookup(varWeight).shape[1] - weightW = ctxt.lookup(varWeight).shape[2] - weightC = ctxt.lookup(varWeight).shape[3] + # Obtain weight dimensions + (_, weightH, weightW, weightCin) = ctxt.lookup(varWeight).shape + # Obtain padding and 
striding information pads = operatorRepresentation['pads'] strides = operatorRepresentation['strides'] - for cube in outputCubes: - (BatchOffset, HOffset, WOffset, COffset) = cube.offset - (BatchSize, HSize, WSize, CSize) = cube.dims - - InCube, padding_tuple = Conv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, - cube, - ctxt.lookup(varOut).shape) - + # Iterate throught the cubes in which the output will be split for tiling + for idx, cube in enumerate(outputCubes): + # Obtain current cube offsets and dimensions + COffset = cube.offset[3] + (_, HSize, WSize, CSize) = cube.dims + + # Compute input cube + InCube, padding_tuple = Conv2DTileConstraint.computeInputCube( + kernelShape = (weightH, weightW), + pads = pads, + strides = strides, + inputCSize = weightCin * group, + outputCube = cube, + inputDims = ctxt.lookup(varIn).shape, + outputDims = ctxt.lookup(varOut).shape, + outputAbsoluteOffsets = absoluteOutputCubes[idx].absoluteOffset) + + # Extract individual padding padding_left, padding_right, padding_top, padding_bottom = padding_tuple + # Add element information for the operator representation replacements['dim_im_in_x'].append(InCube.dims[1]) replacements['dim_im_in_y'].append(InCube.dims[2]) + replacements['dim_im_out_x'].append(HSize) replacements['dim_im_out_y'].append(WSize) + + replacements['ch_im_in'].append(weightCin * group) replacements['ch_im_out'].append(CSize) replacements['padding_y_top'].append(padding_top) @@ -436,21 +561,37 @@ def serializeTilingSolution( replacements['padding_x_left'].append(padding_left) replacements['padding_x_right'].append(padding_right) + # Add input cube with tiling information to the corresponding list inputInCubes.append(InCube) - WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, weightC)) - + # Obtain and add weight cube with tiling information to the corresponding list + WeightCube = HyperRectangle((COffset, 0, 0, 0), (CSize, weightH, weightW, weightCin)) 
inputWeightCubes.append(WeightCube) + # Obtain and add bias cube with tiling information to the corresponding list, + # if bias exists + if varBias != "NULL": + BiasCube = HyperRectangle((COffset,), (CSize,)) + inputBiasCubes.append(BiasCube) + + # Prepare loading schedule lists inputLoadSchedule = [] outputLoadSchedule = [] - for a, b in zip(inputInCubes, inputWeightCubes): - inputLoadSchedule.append({"data_in": a, "weight": b}) + # Create input schedule lists, with bias handling + if varBias == "NULL": + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + else: + for a, b, c in zip(inputInCubes, inputWeightCubes, inputBiasCubes): + inputLoadSchedule.append({"data_in": a, "weight": b, "bias": c}) + # Create output schedule list for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator representation tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py index d4e1989061..bb0e3ed6ee 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/DWConvTileConstraint.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: ConvTileConstraint.py -# -# Last edited: 09.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple, Union @@ -39,7 +17,7 @@ VariableReplacementScheme -class DWConv2DTileConstraint(TileConstraint): +class RQDWConv2DTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -255,3 +233,24 @@ def serializeTilingSolution( variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, tilingSchedule + + +class DWConv2DTileConstraint(Conv2DTileConstraint): + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Inherit from regular Conv2D policy constraints + tilerModel = Conv2DTileConstraint.addPolicyConstraint(tilerModel, parseDict, ctxt) + + # Add constraint for relationship between in and out number of channels + # TODO: Fix DW kernel to include group info and support channel tiling + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + 
tilerModel.addConstraint((inputChannelVar == parseDict['ch_im_in'])) + tilerModel.addConstraint((outputChannelVar == parseDict['ch_im_out'])) + + return tilerModel diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py index 7f8a456265..f913b13a2e 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py @@ -1,29 +1,8 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: GEMMTileConstraint.py -# -# Last edited: 02.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+import math from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass @@ -117,9 +96,13 @@ def serializeTilingSolution( addrNames = ['A', 'B', 'mul', 'C', 'data_out'] inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) - varA = operatorRepresentation['A'] + transA = operatorRepresentation['transA'] + transB = operatorRepresentation['transB'] + + buffA = ctxt.lookup(operatorRepresentation['A']) + buffB = ctxt.lookup(operatorRepresentation['B']) - NSize = ctxt.lookup(varA).shape[-1] + NSize = buffA.shape[-1] NOffset = 0 inputACubes = [] @@ -131,33 +114,54 @@ def serializeTilingSolution( # Every output is constructed by a pair of inputs. Reconstruct this pair. for cube in outputCubes: + MOffset, OOffset = cube.offset[-2:] + MSize, OSize = cube.dims[-2:] - BSize = 1 - BOffset = 0 - BatchSize = 1 - BatchOffset = 0 + if len(cube.offset) > 2: + BatchSize = math.prod(cube.dims[:-2]) - if len(cube.offset) == 2: - (MOffset, OOffset) = cube.offset - (MSize, OSize) = cube.dims - elif len(cube.offset) == 3: - (BatchOffset, MOffset, OOffset) = cube.offset - (BatchSize, MSize, OSize) = cube.dims + if len(cube.offset) > 3: + assert all(off == 0 for off in cube.offset[:-3]), ( + f"Unsupported tiling across leading batch dims: offsets={cube.offset}. 
" + "Only the last batch dim (besides M/O) may be tiled.") else: - (BatchOffset, BOffset, MOffset, OOffset) = cube.offset - (BatchSize, BSize, MSize, OSize) = cube.dims + BatchSize = 1 replacements["M"].append(MSize) replacements["O"].append(OSize) - replacements["batch"].append(BSize) + replacements["batch"].append(BatchSize) - ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize)) - BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize)) + if transA == 0: + AMatrixOffsets = (MOffset, NOffset) + AMatrixShape = (MSize, NSize) + else: + AMatrixOffsets = (NOffset, MOffset) + AMatrixShape = (NSize, MSize) - RequantCube = HyperRectangle((OOffset,), (OSize,)) + if len(buffA.shape) > 2: + batchDimCount = len(buffA.shape) - 2 + AMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + AMatrixOffsets + AMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + AMatrixShape + ACube = HyperRectangle(AMatrixOffsets, AMatrixShape) inputACubes.append(ACube) + + if transB == 0: + BMatrixOffsets = (NOffset, OOffset) + BMatrixShape = (NSize, OSize) + else: + BMatrixOffsets = (OOffset, NOffset) + BMatrixShape = (OSize, NSize) + + if len(buffB.shape) > 2: + batchDimCount = len(buffB.shape) - 2 + BMatrixOffsets = tuple(cube.offset[:-2][-batchDimCount:]) + BMatrixOffsets + BMatrixShape = tuple(cube.dims[:-2][-batchDimCount:]) + BMatrixShape + + BCube = HyperRectangle(BMatrixOffsets, BMatrixShape) inputBCubes.append(BCube) + + RequantCube = HyperRectangle((OOffset,), (OSize,)) inputMulCubes.append(RequantCube) inputAddCubes.append(RequantCube) @@ -184,40 +188,6 @@ def serializeTilingSolution( return VariableReplacementScheme(replacements, replacementTypes), schedule -class MatrixVecTileConstraint(GEMMTileConstraint): - - @staticmethod - def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - tm = 
GEMMTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) - - return tm - - @staticmethod - def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - tm = GEMMTileConstraint.addPolicyConstraint(tilerModel, parseDict, ctxt) - - return tm - - -class TallGEMMTileConstraint(GEMMTileConstraint): - - @staticmethod - def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - tm = GEMMTileConstraint.addGeometricalConstraint(tilerModel, parseDict, ctxt) - - return tm - - @staticmethod - def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - tm = GEMMTileConstraint.addPolicyConstraint(tilerModel, parseDict, ctxt) - - return tm - - class FloatGEMMTileConstraint(TileConstraint): @staticmethod @@ -226,11 +196,19 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw # Get to-be-tiled tensor's buffers bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) - bufferC = ctxt.lookup(name = parseDict['C']) outputBuffer = ctxt.lookup(name = parseDict['data_out']) # Add I/O dimensions to the model as variables - for bufferName in [bufferA.name, bufferB.name, bufferC.name, outputBuffer.name]: + has_bias = 'C' in parseDict and parseDict['C'] is not None + bufferC = None + if has_bias: + bufferC = ctxt.lookup(name = parseDict['C']) + + buffer_names = [bufferA.name, bufferB.name, outputBuffer.name] + if has_bias: + buffer_names.append(bufferC.name) + + for bufferName in buffer_names: tilerModel.addTensorDimToModel(ctxt, bufferName) dimOffsetA = len(bufferA.shape) - 2 @@ -253,10 +231,13 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw # Add GEMM Geometrical constraints tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) - addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = 0) - addDimVar_2 = 
tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = 1) - tilerModel.addConstraint(outputFirstDimVar == addDimVar_1) - tilerModel.addConstraint(outputSecondDimVar == addDimVar_2) + # Add bias constraints only if bias is present + if has_bias: + dimOffsetC = len(bufferC.shape) - 2 + addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC) + addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1) + tilerModel.addConstraint(outputFirstDimVar == addDimVar_1) + tilerModel.addConstraint(outputSecondDimVar == addDimVar_2) return tilerModel @@ -292,9 +273,17 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - addrNames = ['A', 'B', 'C', 'data_out'] + outputCubes = [ + HyperRectangle(tuple(cube.rectangle.offset), tuple(cube.rectangle.dims)) for cube in absoluteOutputCubes + ] + + has_bias = 'C' in operatorRepresentation and operatorRepresentation['C'] is not None + + addrNames = ['A', 'B', 'data_out'] + if has_bias: + addrNames.insert(2, 'C') + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) @@ -302,7 +291,6 @@ def serializeTilingSolution( transB = operatorRepresentation['transB'] varA = operatorRepresentation['A'] - varB = operatorRepresentation['B'] if transA == 0: NSize = ctxt.lookup(varA).shape[-1] @@ -317,7 +305,6 @@ def serializeTilingSolution( replacements = {"M": [], "O": [], "batch": []} - # Every output is constructed by a pair of inputs. Reconstruct this pair. 
for cube in outputCubes: BSize = 1 @@ -349,11 +336,12 @@ def serializeTilingSolution( else: BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize)) - CCube = HyperRectangle(cube.offset, cube.dims) - inputACubes.append(ACube) inputBCubes.append(BCube) - inputAddCubes.append(CCube) + + if has_bias: + CCube = HyperRectangle(tuple(cube.offset), tuple(cube.dims)) + inputAddCubes.append(CCube) inputLoadSchedule = [] outputLoadSchedule = [] @@ -367,8 +355,12 @@ def serializeTilingSolution( "batch": PointerClass(uint8_t) } - for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes): - inputLoadSchedule.append({"A": a, "B": b, "C": c}) + if has_bias: + for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes): + inputLoadSchedule.append({"A": a, "B": b, "C": c}) + else: + for a, b in zip(inputACubes, inputBCubes): + inputLoadSchedule.append({"A": a, "B": b}) for out in outputCubes: outputLoadSchedule.append({"data_out": out}) diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/GatherTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/GatherTileConstraint.py index 036f8adf35..3526e3d8bb 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/GatherTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/GatherTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: ConcatTileConstraint.py -# -# Last edited: 19.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple, Union diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/GeluTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/GeluTileConstraint.py new file mode 100644 index 0000000000..3b7b284706 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/GeluTileConstraint.py @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.Targets.Generic.TileConstraints.BOPTileConstraint import BOPTileConstraint + + +class GeluGradTileConstraint(BOPTileConstraint): + + dataIn1Name = 'grad_in' + dataIn2Name = 'data_in' + dataOutName = 'grad_out' \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/LayernormTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/LayernormTileConstraint.py index 101a056644..c3593ee6f0 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/LayernormTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/LayernormTileConstraint.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Tuple import numpy as np @@ -74,3 +78,82 @@ def serializeTilingSolution( variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, tilingSchedule + + +class LayernormGradTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + 
grad_in_buffer_name = parseDict['grad_in'] + data_in_buffer_name = parseDict['data_in'] + weight_buffer_name = parseDict['weight'] + bias_buffer_name = parseDict['bias'] + grad_out_buffer_name = parseDict['grad_out'] + + for buffer_name in [ + grad_in_buffer_name, data_in_buffer_name, weight_buffer_name, bias_buffer_name, grad_out_buffer_name + ]: + tilerModel.addTensorDimToModel(ctxt, buffer_name) + + input_shape = ctxt.lookup(data_in_buffer_name).shape + last_dim_idx = len(input_shape) - 1 + last_dim_len = input_shape[-1] + + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = last_dim_idx) == last_dim_len) + + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = last_dim_idx) == + tilerModel.getTensorDimVar(tensorName = weight_buffer_name, dimIdx = 0)) + + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = last_dim_idx) == + tilerModel.getTensorDimVar(tensorName = bias_buffer_name, dimIdx = 0)) + + for idx, dim in enumerate(input_shape): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = idx) == + tilerModel.getTensorDimVar(tensorName = grad_in_buffer_name, dimIdx = idx)) + + for idx, dim in enumerate(input_shape): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = data_in_buffer_name, dimIdx = idx) == + tilerModel.getTensorDimVar(tensorName = grad_out_buffer_name, dimIdx = idx)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + output_cubes = [cube.rectangle for cube in absoluteOutputCubes] + addr_names = ['grad_in', 'data_in', 'weight', 'bias', 'grad_out'] + input_base_offsets, 
output_base_offsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addr_names) + + replacements = {"size": []} + replacement_types = {"size": PointerClass(uint16_t)} + + input_load_schedule = [] + output_load_schedule = [] + + for cube in output_cubes: + new_size = np.prod(cube.dims) + replacements["size"].append(new_size) + + feature_size = cube.dims[-1] + + weight_cube = HyperRectangle((0,), (feature_size,)) + bias_cube = HyperRectangle((0,), (feature_size,)) + + input_load_schedule.append({"grad_in": cube, "data_in": cube, "weight": weight_cube, "bias": bias_cube}) + + output_load_schedule.append({"grad_out": cube}) + + tiling_schedule = TilingSchedule(input_base_offsets, output_base_offsets, input_load_schedule, + output_load_schedule) + variable_replacement_schedule = VariableReplacementScheme(replacements, replacement_types) + + return variable_replacement_schedule, tiling_schedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py index 2b5d284159..ee7e448be6 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py @@ -1,29 +1,8 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MatMulTileConstraint.py -# -# Last edited: 04.07.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +import math from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass @@ -40,61 +19,73 @@ class MatMulTileConstraint(TileConstraint): @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - - # Get to-be-tiled tensor's buffers + # ===== GET NECESSARY INFORMATION ===== bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) outputBuffer = ctxt.lookup(name = parseDict['data_out']) - # Add I/O dimensions to the model as variables + tensorsShapeLenA = len(bufferA.shape) + tensorsShapeLenB = len(bufferB.shape) + tensorsShapeLenOutput = len(outputBuffer.shape) + + # ===== ADD I/O DIMS TO MODEL AS VARS ===== for _buffer in [bufferA, bufferB, outputBuffer]: tilerModel.addTensorDimToModel(ctxt, _buffer.name) - tensorsShapeLen = len(bufferA.shape) - - AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) - ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) - BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) - BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) - outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2)) - outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1)) - - # Map 
output dims to inputs dims - for idx in range(tensorsShapeLen - 2): - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = bufferA.name, dimIdx = idx)) - tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = bufferB.name, dimIdx = idx)) - - tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) - tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) - - # Add GEMM Geometrical constraints - tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + # ===== EXTRACT TENSOR DIMS AS VARS ===== + # *Checks on whether dimensions are reversed via the transA and transB flags + # A dims + AMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLenA - 2) + parseDict['transA']) + AMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLenA - 1) - parseDict['transA']) + + # B dims + BMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLenB - 2) + parseDict['transB']) + BMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLenB - 1) - parseDict['transB']) + + # Output dims + outputMatrixFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = (tensorsShapeLenOutput - 2)) + outputMatrixSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, + dimIdx = (tensorsShapeLenOutput - 1)) + + # ===== ADD CONSTRAINTS ===== + # Add batch constraints + if (bufferA.shape[:-2] == bufferB.shape[:-2]): + for idx in range(tensorsShapeLenA - 2): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx) + == tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = tensorsShapeLenA - 3 - idx)) + + for idx in 
range(tensorsShapeLenB - 2): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = tensorsShapeLenOutput - 3 - idx) + == tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = tensorsShapeLenB - 3 - idx)) + + # Add GEMM geometrical constraints + tilerModel.addConstraint(outputMatrixFirstDimVar == AMatrixFirstDimVar) + tilerModel.addConstraint(outputMatrixSecondDimVar == BMatrixSecondDimVar) + + tilerModel.addConstraint(AMatrixSecondDimVar == BMatrixFirstDimVar) return tilerModel @staticmethod def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: - + # ===== GET NECESSARY INFORMATION ===== bufferA = ctxt.lookup(name = parseDict['A']) bufferB = ctxt.lookup(name = parseDict['B']) - tensorsShapeLen = len(bufferA.shape) - + # ===== EXTRACT TENSOR DIMS AS VARS ===== ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + dimIdx = (len(bufferA.shape) - 1) - parseDict['transA']) BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) - BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, - dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + dimIdx = (len(bufferB.shape) - 2) + parseDict['transB']) + # ===== ADD CONSTRAINTS ===== # VIC: We don't want to deal with intermediate results between kernel calls tilerModel.addConstraint(ASecondDimVar == parseDict['N']) tilerModel.addConstraint(BFirstDimVar == parseDict['N']) @@ -106,53 +97,115 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # Get output cubes outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + # Get names, optimizer variables, 
buffers, and other information for elements of interest addrNames = ['A', 'B', 'data_out'] inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, addrNames) - varA = operatorRepresentation['A'] + buffA = ctxt.lookup(operatorRepresentation['A']) + buffB = ctxt.lookup(operatorRepresentation['B']) + buffOut = ctxt.lookup(operatorRepresentation['data_out']) + + transA = operatorRepresentation['transA'] + transB = operatorRepresentation['transB'] + + tensorsShapeLenA = len(buffA.shape) + tensorsShapeLenB = len(buffB.shape) + tensorsShapeOutput = len(buffOut.shape) - NSize = ctxt.lookup(varA).shape[-1] + # NSize depends on transA: if transA=0, N is last dim; if transA=1, N is second-to-last + NSize = buffA.shape[-1] if transA == 0 else buffA.shape[-2] NOffset = 0 + # Prepare input cubes lists inputACubes = [] inputBCubes = [] + # Prepare replacements lists replacements = {"M": [], "O": [], "batch": []} - # Every output is constructed by a pair of inputs. Reconstruct this pair. + # Every output tile is constructed by a pair of input tiles. Reconstruct this pair. for cube in outputCubes: - - BSize = 1 - BOffset = 0 - BatchSize = 1 - BatchOffset = 0 - - if len(cube.offset) == 2: - (MOffset, OOffset) = cube.offset - (MSize, OSize) = cube.dims - elif len(cube.offset) == 3: - (BatchOffset, MOffset, OOffset) = cube.offset - (BatchSize, MSize, OSize) = cube.dims + # Get output dimensions + MOffset, OOffset = cube.offset[-2:] + MSize, OSize = cube.dims[-2:] + + # Check that batch tiling is set up properly + if len(cube.offset) > 2: + BatchSize = math.prod(cube.dims[:-2]) + + if len(cube.offset) > 3: + assert all(off == 0 for off in cube.offset[:-3]), ( + f"Unsupported tiling across leading batch dims: offsets={cube.offset}. 
" + "Only the last batch dim (besides M/O) may be tiled.") else: - (BatchOffset, BOffset, MOffset, OOffset) = cube.offset - (BatchSize, BSize, MSize, OSize) = cube.dims + BatchSize = 1 + # Prepare cube dimensions replacements replacements["M"].append(MSize) replacements["O"].append(OSize) - replacements["batch"].append(BSize) - - ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize)) - BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize)) + replacements["batch"].append(BatchSize) + # ===== Compute A cube information ===== + # Matrix offsets and shape (swap based on transA) + if transA == 0: + AMatrixOffsets = (MOffset, NOffset) + AMatrixShape = (MSize, NSize) + else: + AMatrixOffsets = (NOffset, MOffset) + AMatrixShape = (NSize, MSize) + + # Batch offset and shape (with broadcasting handling) + ABatchOffsets = list() + ABatchShape = list() + + for idx in range(tensorsShapeLenA - 2): + if buffA.shape[tensorsShapeLenA - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]: + ABatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx]) + ABatchShape.append(cube.dims[len(cube.dims) - 3 - idx]) + else: + ABatchOffsets.append(0) + ABatchShape.append(1) + + ACube = HyperRectangle( + tuple(reversed(ABatchOffsets)) + tuple(AMatrixOffsets), + tuple(reversed(ABatchShape)) + tuple(AMatrixShape)) inputACubes.append(ACube) + + # ===== Compute B cube information ===== + # Matrix offsets and shape (swap based on transB) + if transB == 0: + BMatrixOffsets = (NOffset, OOffset) + BMatrixShape = (NSize, OSize) + else: + BMatrixOffsets = (OOffset, NOffset) + BMatrixShape = (OSize, NSize) + + # Batch offset and shape (with broadcasting handling) + BBatchOffsets = list() + BBatchShape = list() + + for idx in range(tensorsShapeLenB - 2): + if buffB.shape[tensorsShapeLenB - 3 - idx] == buffOut.shape[tensorsShapeOutput - 3 - idx]: + BBatchOffsets.append(cube.offset[len(cube.offset) - 3 - idx]) + 
BBatchShape.append(cube.dims[len(cube.dims) - 3 - idx]) + else: + BBatchOffsets.append(0) + BBatchShape.append(1) + + BCube = HyperRectangle( + tuple(reversed(BBatchOffsets)) + tuple(BMatrixOffsets), + tuple(reversed(BBatchShape)) + tuple(BMatrixShape)) inputBCubes.append(BCube) + # Prepare load schedule lists for computed cubes inputLoadSchedule = [] outputLoadSchedule = [] + # Prepare replacements replacements["N"] = [NSize] * len(outputCubes) replacementTypes = { @@ -162,12 +215,15 @@ def serializeTilingSolution( "batch": PointerClass(int8_t) } - for a, b in zip(inputACubes, inputBCubes): + # Update load schedule lists + # *With strict=True to fail fast if different list lengths + for a, b in zip(inputACubes, inputBCubes, strict = True): inputLoadSchedule.append({"A": a, "B": b}) for out in outputCubes: outputLoadSchedule.append({"data_out": out}) + # Prepare tiling schedule object schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) return VariableReplacementScheme(replacements, replacementTypes), schedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolTileConstraint.py index 695ba15812..c0fab8f028 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MaxPoolTileConstraint.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MaxPoolTileConstraint.py -# -# Last edited: 09.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna.
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py new file mode 100644 index 0000000000..b991de80dd --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ReduceMeanConstraint.py @@ -0,0 +1,179 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint32_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ReduceMeanTileConstraint(TileConstraint): + ''' + WARNING: This version of tiling is optimized for the TinyViT ReduceMean layers + (49 elements in the reduced axis). 
Greater sizes of the reduced axis may benefit + from different parallelization and tiling strategies. + ''' + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # ===== GET NECESSARY INFORMATION ===== + # Get I/O buffer names + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + # Get other necessary information + reduceAxes = parseDict['axes'] + keepDims = parseDict['keepdims'] + + # ===== ADD I/O DIMENSIONS TO THE MODEL AS VARIABLES ===== + for bufferName in [inputBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # ===== ADD CONSTRAINTS ===== + # Add constraints for the relationship between the I/O dimensions + # Iterate over input axes and maintain an output index pointer + inputShape = parseDict['data_in_shape'] + output_idx = 0 + for input_ax in range(len(inputShape)): + if input_ax in reduceAxes: + # This axis is reduced + if keepDims: + # Get the output dimension variable and constrain it to 1 + outputDimensionVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = output_idx) + tilerModel.addConstraint(outputDimensionVar == 1) + output_idx += 1 + # If keepDims is false, this axis doesn't exist in output, so don't increment output_idx + else: + # This axis is not reduced, so input and output dimensions need to be equal + inputDimensionVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = input_ax) + outputDimensionVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = output_idx) + tilerModel.addConstraint(outputDimensionVar == inputDimensionVar) + output_idx += 1 + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # ===== GET NECESSARY INFORMATION ===== + # Get I/O buffer names + inputBufferName = parseDict['data_in'] + + # Get other necessary information + inputShape = 
parseDict['data_in_shape'] + reduceAxes = parseDict['axes'] + nonReducedDims = [ax for ax in range(len(inputShape)) if ax not in reduceAxes] + + if len(nonReducedDims) > 0: + biggestNonReducedDim = max(nonReducedDims, key = lambda ax: inputShape[ax]) + else: + biggestNonReducedDim = -1 # No non-reduced dimensions + + # ===== ADD CONSTRAINTS ===== + # Kernel parallelized only on biggest non-reduced dimension, + # so tile only on that dimension + for ax in range(len(inputShape)): + dimVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = ax) + if ax != biggestNonReducedDim: + # This is not the biggest non-reduced dimension, force no tiling + tilerModel.addConstraint(dimVar == inputShape[ax]) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + symbolicParseDict = parseDict.copy() + + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + for ax in range(len(parseDict['data_in_shape'])): + if ax not in parseDict['axes']: + dimVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = ax) + symbolicParseDict['dim_in_' + str(ax)] = dimVar + + return symbolicParseDict + + @staticmethod + def computeInputCubeFromOutputCube(outputCube: HyperRectangle, parseDict: Dict) -> HyperRectangle: + # Get required parameters + originalInputShape = parseDict['data_in_shape'] + keepDims = parseDict['keepdims'] + + # Start from the output cube dimensions and offsets + in_cube_dims = list(originalInputShape).copy() + in_cube_offset = [ + 0, + ] * len(in_cube_dims) + + # Iterate through input axes + out_idx = 0 + for ax in range(len(in_cube_dims)): + if ax in parseDict['axes']: + # This axis is reduced + if keepDims: + # Keepdims is set, so the output cube has a dimension here (which will be 1, as it's the reduction result) + out_idx += 1 + else: + # This axis is not reduced, so copy from output cube + in_cube_dims[ax] = 
outputCube.dims[out_idx] + in_cube_offset[ax] = outputCube.offset[out_idx] + out_idx += 1 + + return HyperRectangle(offset = tuple(in_cube_offset), dims = tuple(in_cube_dims)) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + # Prepare address names + addrNames = ['data_in', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare replacements for non-reduced input sizes + replacements: Dict[str, List[int]] = dict() + replacementTypes = dict() + + for ax in range(len(operatorRepresentation['data_in_shape'])): + if ax not in operatorRepresentation['axes']: + replacements["dim_in_" + str(ax)] = [] + replacementTypes["dim_in_" + str(ax)] = PointerClass(uint32_t) + + # Prepare loading schedule lists + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Iterate over output cubes to compute corresponding input cubes + for out_cube in [cube.rectangle for cube in absoluteOutputCubes]: + # Compute input cube + in_cube = ReduceMeanTileConstraint.computeInputCubeFromOutputCube(out_cube, + parseDict = operatorRepresentation) + + # Add replacements for non-reduced input sizes + for ax in range(len(operatorRepresentation['data_in_shape'])): + if ax not in operatorRepresentation['axes']: + replacements["dim_in_" + str(ax)].append(in_cube.dims[ax]) + + # Append new cubes + inputLoadSchedule.append({"data_in": in_cube}) + outputLoadSchedule.append({"data_out": out_cube}) + + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator 
representation + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ReduceSumTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ReduceSumTileConstraint.py new file mode 100644 index 0000000000..cd404dde73 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ReduceSumTileConstraint.py @@ -0,0 +1,250 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint32_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme + + +class ReduceSumTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + inputBuffer = ctxt.lookup(inputBufferName) + outputBuffer = ctxt.lookup(outputBufferName) + + inputShapeLen = len(inputBuffer.shape) + outputShapeLen = len(outputBuffer.shape) + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # For ReduceSum, we need to handle dimension reduction + # If keepdims=True, all dimensions should 
match (reduced dims become 1) + # If keepdims=False, reduced dimensions are removed from output + + keepdims = parseDict.get('keepdims', True) # Default to True if not specified + + if keepdims: + # keepdims=True: output has same number of dimensions as input + if inputShapeLen == outputShapeLen: + for idx in range(inputShapeLen): + outputDim = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) + inputDim = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = idx) + + # For reduced dimensions, output should be 1 + if 'axis' in parseDict: + axis = parseDict['axis'] + if isinstance(axis, int): + axis = [axis] + + # Handle negative axis indexing + normalized_axis = [] + for ax in axis: + if ax < 0: + ax = inputShapeLen + ax + normalized_axis.append(ax) + + if idx in normalized_axis: + # This dimension is reduced, output should be 1 + tilerModel.addConstraint(outputDim == 1) + else: + # This dimension is preserved + tilerModel.addConstraint(outputDim == inputDim) + else: + # No axis specified, all dimensions are reduced to 1 + tilerModel.addConstraint(outputDim == 1) + else: + raise ValueError("With keepdims=True, input and output should have same number of dimensions") + + else: + # keepdims=False: reduced dimensions are removed from output + if 'axis' in parseDict: + axis = parseDict['axis'] + if isinstance(axis, int): + axis = [axis] + + # Handle negative axis indexing + normalized_axis = [] + for ax in axis: + if ax < 0: + ax = inputShapeLen + ax + normalized_axis.append(ax) + normalized_axis = sorted(normalized_axis) + + # Expected output shape length + expected_output_len = inputShapeLen - len(normalized_axis) + + if outputShapeLen != expected_output_len: + raise ValueError(f"With keepdims=False and axis={axis}, expected output to have " + f"{expected_output_len} dimensions, but got {outputShapeLen}") + + # Map input dimensions to output dimensions (skipping reduced ones) + output_idx = 0 + for input_idx in range(inputShapeLen): 
+ if input_idx not in normalized_axis: + # This dimension is preserved + outputDim = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = output_idx) + inputDim = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = input_idx) + tilerModel.addConstraint(outputDim == inputDim) + output_idx += 1 + + else: + # No axis specified - global reduction, output should be scalar + # In many frameworks, scalar outputs are represented as 1D tensors with size 1 + # or as 0D tensors (empty shape) + if outputShapeLen == 0: + # True scalar (0D tensor) - nothing to constrain + pass + elif outputShapeLen == 1: + # 1D tensor with size 1 representing scalar + outputDim = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + tilerModel.addConstraint(outputDim == 1) + else: + # Allow other representations but warn about potential issues + # Some frameworks might represent scalars differently + # For now, just ensure all output dimensions are 1 + for idx in range(outputShapeLen): + outputDim = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) + tilerModel.addConstraint(outputDim == 1) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # No constraints - let the tiler handle dimensions normally + # We'll handle the actual ReduceSum logic in serializeTilingSolution + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + inputBufferName = parseDict['data_in'] + inputBuffer = ctxt.lookup(inputBufferName) + + symbolicParseDict = parseDict.copy() + + # Since we force all dimensions to be full size, we can use the actual shape + # This ensures the template gets the correct dimensions for the single cube + symbolicParseDict['data_in_shape'] = list(inputBuffer.shape) + + # Add axes information (normalized) + if 'axis' in 
parseDict: + axis = parseDict['axis'] + if isinstance(axis, int): + axes = [axis] + else: + axes = list(axis) + + # Handle negative axis indexing + normalized_axes = [] + for ax in axes: + if ax < 0: + ax = len(inputBuffer.shape) + ax + normalized_axes.append(ax) + + symbolicParseDict['axes'] = normalized_axes + else: + # Global reduction - all axes + symbolicParseDict['axes'] = list(range(len(inputBuffer.shape))) + + # Add keepdims information + symbolicParseDict['keepdims'] = parseDict.get('keepdims', True) + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + + # Get original tensor shapes from context + inputBufferName = operatorRepresentation['data_in'] + outputBufferName = operatorRepresentation['data_out'] + inputBuffer = ctxt.lookup(inputBufferName) + outputBuffer = ctxt.lookup(outputBufferName) + + # Use original dimensions for ReduceSum computation + originalInputShape = list(inputBuffer.shape) + originalOutputShape = list(outputBuffer.shape) + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"data_in_shape": [], "axes": [], "keepdims": [], "reduceLength": []} + replacementTypes = { + "data_in_shape": PointerClass(uint32_t), + "axes": PointerClass(uint32_t), + "keepdims": PointerClass(uint32_t), + "reduceLength": PointerClass(uint32_t) + } + + # Get axis and keepdims information from operator representation + # Note: the key might be 'axes' (plural) instead of 'axis' (singular) + axis = operatorRepresentation.get('axis', operatorRepresentation.get('axes', None)) + keepdims = operatorRepresentation.get('keepdims', True) + + # Calculate axes (normalize 
negative indices) + if axis is not None: + if isinstance(axis, int): + axes = [axis] + else: + axes = list(axis) + + # Handle negative axis indexing + normalized_axes = [] + for ax in axes: + if ax < 0: + ax = len(originalInputShape) + ax + normalized_axes.append(ax) + axes = normalized_axes + else: + # Global reduction - all axes + axes = list(range(len(originalInputShape))) + + # Calculate reduceLength (product of dimensions being reduced) + reduceLength = 1 + for ax in axes: + reduceLength *= originalInputShape[ax] + + # For ReduceSum, we always use the original tensor dimensions + # regardless of how the tiler decides to split them + replacements['data_in_shape'].append(tuple(originalInputShape)) + replacements['axes'].append(tuple(axes)) + replacements['keepdims'].append(1 if keepdims else 0) + replacements['reduceLength'].append(reduceLength) + + # Create scheduling based on original dimensions + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Create HyperRectangles with original dimensions + from Deeploy.TilingExtension.TilingCodegen import HyperRectangle + + inputCube = HyperRectangle(dims = originalInputShape, offset = [0] * len(originalInputShape)) + + outputCube = HyperRectangle(dims = originalOutputShape, offset = [0] * len(originalOutputShape)) + + inputLoadSchedule.append({"data_in": inputCube}) + outputLoadSchedule.append({"data_out": outputCube}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/RequantShiftTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/RequantShiftTileConstraint.py index da3acf015d..eab0c3a4fd 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/RequantShiftTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/RequantShiftTileConstraint.py @@ 
-1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: RequantShiftTileConstraint.py -# -# Last edited: 05.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SGDTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SGDTileConstraint.py index 2a4477bcb1..b7757786e1 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/SGDTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SGDTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: AddTileConstraint.py -# -# Last edited: 21.03.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.Targets.Generic.TileConstraints.BOPTileConstraint import BOPTileConstraint diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py new file mode 100644 index 0000000000..5309300659 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py @@ -0,0 +1,180 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple, Union + +import numpy as np +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class SliceTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # ===== GET NECESSARY INFORMATION ===== + # Get I/O buffer names + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + # Get I/O shapes + inputShape = parseDict['data_in_shape'] + + # Get other necessary information + sliceAxes = parseDict['axes'] + 
sliceSteps = parseDict['steps'] + + # ===== ADD I/O DIMENSIONS TO THE MODEL AS VARIABLES ===== + for bufferName in [inputBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # ===== ADD CONSTRAINTS ===== + # Add constraints for the I/O dimensions + for idx in range(len(inputShape)): + # Get current dimension variables + inputDimensionVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = idx) + outputDimensionVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) + + if idx in sliceAxes: + # For sliced axes, constrain to minimal input dimension + # based on the output dimension and the slicing step + axIndex = list(sliceAxes).index(idx) + axStep = sliceSteps[axIndex] + + tilerModel.addConstraint(inputDimensionVar == ((outputDimensionVar - 1) * axStep + 1)) + else: + # Otherwise, input and output dimensions need to be equal + tilerModel.addConstraint(outputDimensionVar == inputDimensionVar) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + symbolicParseDict = parseDict.copy() + + return symbolicParseDict + + @staticmethod + def computeInputCubeFromOutputCube(outputCube: AbsoluteHyperRectangle, parseDict: Dict) -> HyperRectangle: + # Computes the input cube given the output cube and the slicing parameters. + # + # Will provide a minimal input cube, that only requires the data needed for the output cube + # by ignoring the input data that is outside of the slicing scope, + # as given by the slicing starting and ending parameters. + # + # (It will start with the first element required for the output cube, + # and will end with the last element required for the output cube). + # + # *Function is ready for multiple axes slicing. 
+ + # Start from the output cube dimensions and offsets + in_cube_dims = list(outputCube.dims).copy() + in_cube_offset = list(outputCube.offset).copy() + + # Iterate through the sliced axes + for idx, ax in enumerate(parseDict['axes']): + # Get current sliced ax parameters + start = parseDict['starts'][idx] + step = parseDict['steps'][idx] + + # Compute input cube parameters for the current axis + in_cube_dims[ax] = (outputCube.dims[ax] - 1) * step + 1 + in_cube_offset[ax] = start + outputCube.offset[ax] * step + + return HyperRectangle(offset = tuple(in_cube_offset), dims = tuple(in_cube_dims)) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # Extract rectangle information (offsets and dimensions) from output cubes + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + # Prepare address names + addrNames = ['data_in', 'data_out'] + + # Extract memory base addresses for each of the required components, + # based on the computed memory configuration + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + # Prepare replacement lists for the elements inside the operator representation, + # for the cubes to be computed further down in this function + + # ~~~~~ SEE ISSUE #134: https://github.com/pulp-platform/Deeploy/issues/134 ~~~~~ + # Freeze tiling input and output tiling for now + replacements = { + # "data_in_shape": [], + # "data_out_shape": [], + # "starts": [[ + # 0, + # ] * len(operatorRepresentation['axes'])] * len(outputCubes), + # "ends": [], + "data_in_size": [], + } + + replacementTypes = { + # "data_in_shape": [ + # PointerClass(uint16_t), + # PointerClass(uint16_t), + # PointerClass(uint16_t), + # PointerClass(uint16_t) + # 
], + # "data_out_shape": [ + # PointerClass(uint16_t), + # PointerClass(uint16_t), + # PointerClass(uint16_t), + # PointerClass(uint16_t) + # ], + # "starts": PointerClass(uint16_t), + # "ends": PointerClass(uint16_t), + "data_in_size": PointerClass(uint16_t), + } + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + # Prepare loading schedule lists + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out_cube in outputCubes: + # Compute input cube + in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube, parseDict = operatorRepresentation) + + # Compute new ends for replacement + new_ends = list() + for ax in operatorRepresentation['axes']: + new_ends.append(in_cube.offset[ax] + in_cube.dims[ax]) + + # Append replacement elements + + # ~~~~~ SEE ISSUE #134: https://github.com/pulp-platform/Deeploy/issues/134 ~~~~~ + # Freeze tiling input and output tiling for now + # replacements["data_in_shape"].append(list(in_cube.dims).copy()) + # replacements["data_out_shape"].append(list(out_cube.dims).copy()) + # replacements["ends"].append(new_ends) + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + replacements["data_in_size"].append(int(np.prod(in_cube.dims))) + + # Append new cubes + inputLoadSchedule.append({"data_in": in_cube}) + outputLoadSchedule.append({"data_out": out_cube}) + + # Prepare containing objects with information computed in this function regarding tiling schedule + # and variable replacement inside operator representation + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py index 
343c4970eb..38c984de63 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SoftmaxCrossEntropyTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: SoftmaxCrossEntropyTileConstraint.py -# -# Last edited: 19.03.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Run Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple, Union @@ -114,7 +93,7 @@ def serializeTilingSolution( replacements['num_classes'].append(num_classes) replacements['batch'].append(batch) - labelCube = HyperRectangle((0, cube.offset[0]), (1, batch)) + labelCube = HyperRectangle((cube.offset[0],), (batch,)) inputlabelCubes.append(labelCube) inputLoadSchedule = [] diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/__init__.py b/Deeploy/Targets/PULPOpen/TileConstraints/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/__init__.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/iSoftmaxTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/iSoftmaxTileConstraint.py index 9fb2a6f272..f77834043c 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/iSoftmaxTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/iSoftmaxTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: iSoftmaxTileConstraint.py -# -# Last edited: 13.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, List, Tuple, Union @@ -122,3 +101,88 @@ def serializeTilingSolution( variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) return variableReplacementSchedule, tilingSchedule + + +class SoftmaxGradTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + upstream_grad = parseDict['upstream_grad'] + softmax_output = parseDict['softmax_output'] + softmax_grad = parseDict['softmax_grad'] + + shapeLen = len(ctxt.lookup(upstream_grad).shape) + + for bufferName in [upstream_grad, softmax_output, softmax_grad]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + for idx in range(shapeLen): + upstream_dim = tilerModel.getTensorDimVar(tensorName = upstream_grad, dimIdx = idx) + softmax_out_dim = tilerModel.getTensorDimVar(tensorName = softmax_output, dimIdx = idx) + softmax_grad_dim = tilerModel.getTensorDimVar(tensorName = softmax_grad, dimIdx = idx) + + tilerModel.addConstraint(upstream_dim == softmax_out_dim) + tilerModel.addConstraint(upstream_dim == softmax_grad_dim) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + upstream_grad = parseDict['upstream_grad'] + inputBuffer = ctxt.lookup(upstream_grad) + + lastDimLength = inputBuffer.shape[-1] + lastDimIdx = len(inputBuffer.shape) - 1 + lastDimVar = tilerModel.getTensorDimVar(tensorName = upstream_grad, dimIdx = lastDimIdx) + + tilerModel.addConstraint(lastDimVar == lastDimLength) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + upstream_grad = parseDict['upstream_grad'] + inputBuffer = ctxt.lookup(upstream_grad) + + lastDimIdx = len(inputBuffer.shape) - 1 + + symbolicParseDict = parseDict.copy() + symbolicParseDict['lastDimLength'] = 
tilerModel.getTensorDimVar(upstream_grad, lastDimIdx) + + return symbolicParseDict + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['upstream_grad', 'softmax_output', 'softmax_grad'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements = {"lastDimLength": [], "size": []} + + replacementTypes = {"lastDimLength": PointerClass(uint32_t), "size": PointerClass(uint32_t)} + + for cube in outputCubes: + lastDimLength = cube.dims[-1] + size = np.prod(cube.dims) + + replacements['lastDimLength'].append(lastDimLength) + replacements['size'].append(size) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for out in outputCubes: + inputLoadSchedule.append({"upstream_grad": out, "softmax_output": out}) + outputLoadSchedule.append({"softmax_grad": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index ea48ac9b21..3d7d11f343 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -1,34 +1,9 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPTiler.py -# -# Last edited: 09.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import copy -from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import MemoryPassthroughGeneration -from Deeploy.DeeployTypes import CodeTransformation -from Deeploy.Targets.Generic.Bindings import BasicReshapeBindings from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint @@ -39,25 +14,32 @@ from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint -from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ - PULPFloatGELUBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \ - PULPiRQSGELUBindings, PULPLayernormBinding, PULPMatMulBindings, PULPMaxPool2DBindings, 
PULPMulBindings, \ - PULPReduceSumBindings, PULPReluBinding, PULPRQAddBindings, PULPRQSBindings, PULPRQSConv2DBindings, \ - PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, \ - PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, \ - PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings + PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, PULPFloatGEMMBindings, \ + PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, \ + PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool2DBindings, PULPMulBindings, PULPReduceMeanBindings, \ + PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, PULPRQSBindings, \ + PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, \ + PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, PULPSoftmaxBindings, \ + PULPSoftmaxCrossEntropyLossBindings, PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, \ + PULPTransposeBindings, PULPUniformRQSBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ + RQDWConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import FloatGEMMTileConstraint, GEMMTileConstraint, \ - MatrixVecTileConstraint, TallGEMMTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import iSoftmaxTileConstraint -from 
Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint import LayernormTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GeluTileConstraint import GeluGradTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.GEMMTileConstraint import FloatGEMMTileConstraint, GEMMTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.iSoftmaxTileConstraint import SoftmaxGradTileConstraint, \ + iSoftmaxTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.LayernormTileConstraint import LayernormGradTileConstraint, \ + LayernormTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MatMulTileConstraint import MatMulTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.MaxPoolTileConstraint import MaxPoolCTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ReduceMeanConstraint import ReduceMeanTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ReduceSumTileConstraint import ReduceSumTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.SliceConstraint import SliceTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings @@ -66,11 +48,14 @@ tileConstraint = RQConv2DTileConstraint()) PULPRQSDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSDWConv2DBindings, - tileConstraint = DWConv2DTileConstraint()) + tileConstraint = RQDWConv2DTileConstraint()) PULPConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings, tileConstraint = Conv2DTileConstraint()) +PULPDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConv2DBindings, + 
tileConstraint = DWConv2DTileConstraint()) + PULPRQSGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSGEMMBindings, tileConstraint = GEMMTileConstraint()) @@ -78,10 +63,10 @@ tileConstraint = FloatGEMMTileConstraint()) PULPRQSMatrixVecTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSMatrixVecBindings, - tileConstraint = MatrixVecTileConstraint()) + tileConstraint = GEMMTileConstraint()) PULPRQSTallGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSTallGEMMBindings, - tileConstraint = TallGEMMTileConstraint()) + tileConstraint = GEMMTileConstraint()) PULPMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMatMulBindings, tileConstraint = MatMulTileConstraint()) @@ -95,9 +80,7 @@ PULPRQSiHardswishTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSiHardswishBindings, tileConstraint = RQSiHardswishTileConstraint()) -_BasicFlattenBindings = copy.deepcopy(BasicReshapeBindings) -for binding in _BasicFlattenBindings: - binding.codeTransformer = CodeTransformation([MemoryPassthroughGeneration("L.*"), MemoryPassthroughGeneration()]) +_BasicFlattenBindings = copy.deepcopy(PULPReshapeBindings) PULPFlattenTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _BasicFlattenBindings, tileConstraint = NOPTileConstraint()) @@ -138,9 +121,15 @@ PULPLayernormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPLayernormBinding], tileConstraint = LayernormTileConstraint()) +PULPLayernormGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPLayernormGradBinding], + tileConstraint = LayernormGradTileConstraint()) + PULPFPGELUTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPFloatGELUBinding], tileConstraint = UnaryTileConstraint()) +PULPFPGELUGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = [PULPFloatGELUGradBinding], + tileConstraint = GeluGradTileConstraint()) + PULPGatherTilingReadyBindings = 
TilingReadyNodeBindings(nodeBindings = PULPGatherBindings, tileConstraint = GatherTileConstraint()) @@ -151,10 +140,16 @@ nodeBindings = PULPSoftmaxCrossEntropyLossGradBindings, tileConstraint = SoftmaxCrossEntropyGradTileConstraint()) PULPSoftmaxGradTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSoftmaxGradBindings, - tileConstraint = UntiledTileConstraint()) + tileConstraint = SoftmaxGradTileConstraint()) PULPReduceSumTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceSumBindings, - tileConstraint = UntiledTileConstraint()) + tileConstraint = ReduceSumTileConstraint()) PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings, - tileConstraint = SGDTileConstraint()) \ No newline at end of file + tileConstraint = SGDTileConstraint()) + +PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings, + tileConstraint = SliceTileConstraint()) + +PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceMeanBindings, + tileConstraint = ReduceMeanTileConstraint()) diff --git a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py index aabcbb5928..43d490e80b 100644 --- a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/Passes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPPasses.py -# -# Last edited: 10.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import copy from collections import OrderedDict diff --git a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/__init__.py +++ b/Deeploy/Targets/PULPOpen/TopologyOptimizationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . 
import * diff --git a/Deeploy/Targets/PULPOpen/TypeCheckers.py b/Deeploy/Targets/PULPOpen/TypeCheckers.py index 2685f4d7fc..e309624186 100644 --- a/Deeploy/Targets/PULPOpen/TypeCheckers.py +++ b/Deeploy/Targets/PULPOpen/TypeCheckers.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: PULPCheckers.py -# -# Last edited: 03.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import List, Sequence, Type diff --git a/Deeploy/Targets/PULPOpen/__init__.py b/Deeploy/Targets/PULPOpen/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/PULPOpen/__init__.py +++ b/Deeploy/Targets/PULPOpen/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/Snitch/Bindings.py b/Deeploy/Targets/Snitch/Bindings.py index 37f7800d6b..25b150b553 100644 --- a/Deeploy/Targets/Snitch/Bindings.py +++ b/Deeploy/Targets/Snitch/Bindings.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchBindings.py -# -# Last edited: 30.05.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from functools import partial @@ -35,12 +14,14 @@ from Deeploy.Targets.Generic.Templates import iNoNormTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, GEMMChecker, RQAddChecker, SoftmaxChecker, iNoNormChecker from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling, SnitchCoreFilterPass, \ - SnitchProfileExecutionBlockPass, SnitchSynchCoresPass + SnitchSynchCoresPass +from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma from Deeploy.Targets.Snitch.Templates import AddTemplate, FloatGemmTemplate, RQAddTemplate, iSoftmaxTemplate from Deeploy.Targets.Snitch.Templates.FloatSoftmaxTemplate import FloatSoftmax_Template from Deeploy.Targets.Snitch.Templates.GemmTemplate import SnitchGemm_Template from Deeploy.Targets.Snitch.Templates.RqGemmTemplate import SnitchRqGemm_Template -from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement +from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ + TilingVariableReplacementUpdate TilingCallClosure = partial(ClosureGeneration, closureSuffix = "_tiling_closure") MemoryAwareFunctionCallClosure = partial(MemoryAwareClosureGeneration, @@ -56,11 +37,11 @@ TiledTransformer = CodeTransformation([ SnitchCoreFilterPass("compute"), - SnitchProfileExecutionBlockPass(), TilingVariableReplacement("L1"), TilingCallClosure(writeback = False), SnitchSynchCoresPass(), - SnitchClusterTiling("L1"), + TilingVariableReplacementUpdate("L1"), + SnitchClusterTiling("L2", "L1", SnitchDma()), ArgumentStructGeneration(), MemoryManagementGeneration("L1"), MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterSynch.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterSynch.py index 8173b93ebb..cc0af164ad 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterSynch.py +++ 
b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterSynch.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchClusterSynch.py -# -# Last edited: 31.05.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Tuple diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py index ce513a5355..e8204f6ae2 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py +++ b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTiling.py @@ -1,49 +1,58 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchClusterTiling.py -# -# Last edited: 31.05.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Tuple -from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, \ + NodeTemplate, _NoVerbosity +from Deeploy.TilingExtension.AsyncDma import AsyncDma +from Deeploy.TilingExtension.CodeTransformationPasses.DoubleBufferingTilingCodeGeneration import \ + DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn +from Deeploy.TilingExtension.CodeTransformationPasses.SingleBufferingTilingCodeGeneration import \ + ProfilingSingleBufferingTilingMixIn, SingleBufferingTilingCodeGeneration + -from .SnitchClusterTilingSB import SnitchClusterTilingGenerationSB +class SnitchClusterTilingSB(SingleBufferingTilingCodeGeneration): + pass + + +class SnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration): + pass + + +class ProfilingSnitchClusterTilingSB(SingleBufferingTilingCodeGeneration, ProfilingSingleBufferingTilingMixIn): + _printCycleDifference = NodeTemplate(r""" + printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + """) + + +class 
ProfilingSnitchClusterTilingDB(DoubleBufferingTilingCodeGeneration, ProfilingDoubleBufferingTilingMixIn): + _printCycleDifference = NodeTemplate(r""" + printf("%s%u][Core %d] %s%u%s", ${prefixStr}, ${profileIdxVar}, snrt_global_core_idx(), "${flavorStr}", \ + ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}], ${suffixStr}); + """) class SnitchClusterTiling(CodeTransformationPass): - def __init__(self, targetMemLevel: str): - self.SB = SnitchClusterTilingGenerationSB(targetMemLevel) + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): + self.SB = SnitchClusterTilingSB(externalMemory, localMemory, dma) + self.profilingSB = ProfilingSnitchClusterTilingSB(externalMemory, localMemory, dma) + + self.DB = SnitchClusterTilingDB(externalMemory, localMemory, dma) + self.profilingDB = ProfilingSnitchClusterTilingDB(externalMemory, localMemory, dma) def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - if verbose.tilingProfiling: - raise NotImplementedError("Profiling not implemented for L2") - # ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name) + ctxt, executionBlock = self.profilingSB.apply(ctxt, executionBlock, name) + ctxt, executionBlock = self.profilingDB.apply(ctxt, executionBlock, name) else: ctxt, executionBlock = self.SB.apply(ctxt, executionBlock, name) + ctxt, executionBlock = self.DB.apply(ctxt, executionBlock, name) return ctxt, executionBlock diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTilingSB.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTilingSB.py deleted file mode 100644 index 8e31ee2627..0000000000 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchClusterTilingSB.py +++ /dev/null @@ -1,520 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: SnitchClusterTilingSB.py -# -# 
Last edited: 03.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -from collections import namedtuple -from typing import Dict, List, Literal, Tuple - -from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation -from Deeploy.Targets.Snitch.DataTypes import Snitch_DMA_copy -from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration -from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import SingleBufferingTilingMixIn, TilingMetaInfo -from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint -from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme, \ - calculateRectangleOffset, minimizeRectangleDims - -_openTileLoopTemplate = NodeTemplate(""" - -// TILING LOOP -for (int TILING_I=${numTiles}[*${tileIdxPtr}]; TILING_I<${numTiles}[(*${tileIdxPtr})+1]; TILING_I++){ -""") - -_closeTileLoopTemplate = NodeTemplate(""" - -// CLOSE TILING LOOP -} -*${tileIdxPtr} += 1; - -""") - -_moveTileInTemplate = NodeTemplate(""" - -// IMPORT TILE ${innerTilePtr} from ${outerTilePtr} -if(snrt_is_dm_core()){ - ${stateReference}.tid = snrt_dma_start_2d(${stateReference}.dst, - 
${stateReference}.src, - ${stateReference}.size, - ${stateReference}.dst_stride, - ${stateReference}.src_stride, - ${stateReference}.repeat); -} -""") - -_iteratedMoveTileInTemplate = NodeTemplate(""" - -""") - -_blockTileInTemplate = NodeTemplate(""" - -// BLOCKING IMPORT TILE ${innerTilePtr} -if(snrt_is_dm_core()){ - // snrt_dma_wait(${stateReference}.tid); - snrt_dma_wait_all(); -} -""") - -_moveTileOutTemplate = NodeTemplate(""" - -// EXPORT TILE ${innerTilePtr} to ${outerTilePtr} -if(snrt_is_dm_core()){ - ${stateReference}.tid = snrt_dma_start_2d(${stateReference}.dst, - ${stateReference}.src, - ${stateReference}.size, - ${stateReference}.dst_stride, - ${stateReference}.src_stride, - ${stateReference}.repeat); -} -""") - -_blockTileOutTemplate = NodeTemplate(""" - -// BLOCKING EXPORT TILE ${innerTilePtr} -if(snrt_is_dm_core()){ - //snrt_dma_wait(${stateReference}.tid); - snrt_dma_wait_all(); -} -""") - -_updateDMATransferStructTemplate = NodeTemplate(""" - -// UPDATE DMA STRUCT ${stateReference} -${stateReference}.dst = ((char*)${dstPtr}) + ${dstOffsetPtr}[${tileNum}]; -${stateReference}.src = ((char*)${srcPtr}) + ${srcOffsetPtr}[${tileNum}]; -${stateReference}.size = ${sizePtr}[${tileNum}]; -${stateReference}.dst_stride = ${dstStridePtr}[${tileNum}]; -${stateReference}.src_stride = ${srcStridePtr}[${tileNum}]; -${stateReference}.repeat = ${repeatPtr}[${tileNum}]; -""") - -_updateReferenceTemplate = NodeTemplate(""" - -// UPDATE VARIABLE ${reference} -*${reference} = ${baseReference}[${tileNum}]; -""") - -_DMAUpdate = namedtuple("_DMAUpdate", "dst src size dst_stride src_stride repeat tid direction") - - -class SnitchClusterTilingSB(TilingCodeGeneration): - - _prefix = "TILING_REPLACED_" - - _openTileLoopTemplate = _openTileLoopTemplate - _closeTileLoopTemplate = _closeTileLoopTemplate - - _moveTileInTemplate = _moveTileInTemplate - _iteratedMoveTileInTemplate = _iteratedMoveTileInTemplate - _blockTileInTemplate = _blockTileInTemplate - - _moveTileOutTemplate 
= _moveTileOutTemplate - _blockTileOutTemplate = _blockTileOutTemplate - - _updateDMATransferStructTemplate = _updateDMATransferStructTemplate - _updateReferenceTemplate = _updateReferenceTemplate - - @property - def prefix(self): - return self._prefix + self.targetMemLevel + "_" - - def _DMAStructName(self, tensorName: str, nodeName: str) -> str: - return f"{self.prefix}_DMA_{nodeName}_{tensorName}" - - @classmethod - def _generatePointerUpdates(cls, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, - loadSchedule: List[Dict[str, - HyperRectangle]], nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedule: TilingSchedule) -> Dict[str, _DMAUpdate]: - updateDict = {} - deltaOffsets = {} - - for idx, loadStep in enumerate(loadSchedule): - for _, (key, rect) in enumerate(loadStep.items()): - - if key in tilingSchedule.outputBaseOffsets.keys(): - baseOffsets = tilingSchedule.outputBaseOffsets[key] - direction = "FromL1" - else: - baseOffsets = tilingSchedule.inputBaseOffsets[key] - direction = "ToL1" - - if key not in updateDict.keys(): - updateDict[key] = [] - if key not in deltaOffsets.keys(): - deltaOffsets[key] = 0 - - referenceBuffer = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) - l1Buffer = ctxt.lookup(operatorRepresentation[key]) - - finalMemoryLevel = TilingCodeGeneration.isFinalMemoryLevel(nodeMemoryConstraint, l1Buffer) - - struct = cls._rectToDMAStruct(ctxt, rect, direction, l1Buffer.name, l1Buffer._referenceName, - finalMemoryLevel) - accOffset = calculateRectangleOffset(rect, referenceBuffer) - - lIdx = idx % len(baseOffsets) - - if direction == "ToL1": - src = accOffset - dst = baseOffsets[lIdx] - else: - src = baseOffsets[lIdx] - dst = accOffset - - size = struct.value['size'].value - dst_stride = struct.value['dst_stride'].value - src_stride = struct.value['src_stride'].value - repeat = struct.value['repeat'].value - tid = struct.value['tid'].value - - sol = _DMAUpdate(dst, src, size, dst_stride, 
src_stride, repeat, tid, direction) - - deltaOffsets[key] = accOffset - updateDict[key].append(sol) - - return updateDict - - @classmethod - def _rectToDMAStruct(cls, ctxt: NetworkContext, rectangle: HyperRectangle, direction: Literal["ToL1", "FromL1"], - L1Name: str, L2Name: str, finalMemoryLevel: bool) -> Snitch_DMA_copy: - - referenceBuffer = ctxt.lookup(L2Name) - - rect, referenceRect = minimizeRectangleDims(rectangle, referenceBuffer) - assert len(rect.dims) <= 3, "Snitch's iDMA only 2D transfers are supported!" - - if direction == "FromL1": - _src = L1Name - _dst = referenceBuffer.name - else: - _src = referenceBuffer.name - _dst = L1Name - - transfer_size = rect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - - src_stride = 0 - dst_stride = 0 - repeat = 1 - if len(rect.dims) > 1: - repeat = rect.dims[-2] - if direction == "ToL1": - dst_stride = rect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - src_stride = referenceRect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - else: - dst_stride = referenceRect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - src_stride = rect.dims[-1] * (referenceBuffer._type.referencedType.typeWidth // 8) - - struct = Snitch_DMA_copy( - { - "dst": _dst, - "src": _src, - "size": transfer_size, - "dst_stride": dst_stride, - "src_stride": src_stride, - "repeat": repeat, - "tid": 0 - }, ctxt) - - return struct - - def _hoistDMAUpdates(self, ctxt: NetworkContext, tensorName: str, updateList: List[_DMAUpdate], - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict]: - - operatorRepresentation = operatorRepresentation.copy() - - nodeName = operatorRepresentation['nodeName'] - - dstList = [] - srcList = [] - sizeList = [] - dstStrideList = [] - srcStideList = [] - repeatList = [] - for update in updateList: - dstList.append(int(update.dst)) - srcList.append(int(update.src)) - sizeList.append(int(update.size)) - 
dstStrideList.append(int(update.dst_stride)) - srcStideList.append(int(update.src_stride)) - repeatList.append(int(update.repeat)) - - dmaName = self._DMAStructName(tensorName, nodeName) - - operatorRepresentation['stateReference'] = dmaName - operatorRepresentation['tileNum'] = "TILING_I" - - if updateList[0].direction == "ToL1": - operatorRepresentation['dstPtr'] = ctxt.lookup(operatorRepresentation[tensorName]).name - operatorRepresentation['srcPtr'] = ctxt.lookup(operatorRepresentation[tensorName])._referenceName - - dstOffsetList = [0] * len(updateList) - srcOffsetList = [srcList[i] - srcList[0] for i in range(0, len(srcList))] - # srcOffsetList = [0] + [sum(sizeList[:i+1]) for i in range(0, len(sizeList)-1)] - else: - operatorRepresentation['dstPtr'] = ctxt.lookup(operatorRepresentation[tensorName])._referenceName - operatorRepresentation['srcPtr'] = ctxt.lookup(operatorRepresentation[tensorName]).name - - dstOffsetList = [dstList[i] - dstList[0] for i in range(0, len(dstList))] - # dstOffsetList = [0] + [sum(sizeList[:i+1]) for i in range(0, len(sizeList)-1)] - srcOffsetList = [0] * len(updateList) - - namePrefix = self.prefix + f"{nodeName}_{tensorName}" - - name = namePrefix + "_dst_offset" - cb = ctxt.ConstantBuffer(name, [len(updateList)], dstOffsetList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'dstOffsetPtr') - - name = namePrefix + "_src_offset" - cb = ctxt.ConstantBuffer(name, [len(updateList)], srcOffsetList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'srcOffsetPtr') - - name = namePrefix + "_size" - cb = ctxt.ConstantBuffer(name, [len(updateList)], sizeList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'sizePtr', - Snitch_DMA_copy.structTypeDict['size']) - - name = namePrefix + "_dst_stride" - cb = ctxt.ConstantBuffer(name, [len(updateList)], 
dstStrideList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'dstStridePtr', - Snitch_DMA_copy.structTypeDict['dst_stride']) - - name = namePrefix + "_src_stride" - cb = ctxt.ConstantBuffer(name, [len(updateList)], srcStideList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'srcStridePtr', - Snitch_DMA_copy.structTypeDict['src_stride']) - - name = namePrefix + "_repeat" - cb = ctxt.ConstantBuffer(name, [len(updateList)], repeatList) - ctxt, operatorRepresentation = self._hoistConstantAndReference(ctxt, cb, operatorRepresentation, nodeName, - 'repeatPtr', - Snitch_DMA_copy.structTypeDict['repeat']) - - return ctxt, operatorRepresentation - - def _generateEgressPointerUpdates( - self, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: - - updates = [] - newCtxt = ctxt.copy() - - updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, - nodeMemoryConstraint, tilingSchedule) - - for key, updateList in updateDict.items(): - - newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, key, updateList, operatorRepresentation) - updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) - - return newCtxt, updates - - def _generateIngressPointerUpdates( - self, nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, List[CodeSnippet]]: - - updates = [] - newCtxt = ctxt.copy() - - updateDict = self._generatePointerUpdates(ctxt, operatorRepresentation, tilingSchedule.inputLoadSchedule, - nodeMemoryConstraint, tilingSchedule) - - for key, updateList in updateDict.items(): - - newCtxt, newNodeRep = self._hoistDMAUpdates(newCtxt, 
key, updateList, operatorRepresentation) - updates.append(CodeSnippet(self._updateDMATransferStructTemplate, newNodeRep)) - - return newCtxt, updates - - def _generateVariableUpdates(self, tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, - ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> List[CodeSnippet]: - - updates = [] - - for key in variableReplacement.perTileReplacements.keys(): - - buf = ctxt.lookup(operatorRepresentation[key]) - reference = str(buf._instance) - - updates.append( - CodeSnippet(self._updateReferenceTemplate, { - "reference": reference, - "tileNum": "TILING_I", - "baseReference": buf._referenceName - })) - - return updates - - def _generateDMACode(self, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation, loadSchedule: List[Dict[str, HyperRectangle]], - direction: Literal["ToL1", "FromL1"]) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - DMATransferCalls = [] - DMAWaitStatements = [] - transferNodeRep = {} - - loadStep = loadSchedule[0] - - for idx, (key, rectangle) in enumerate(loadStep.items()): - - permName = f"in{idx}_perm" - - externalPtr = ctxt.lookup(ctxt.lookup(operatorRepresentation[key])._referenceName) - internalPtr = ctxt.lookup(operatorRepresentation[key]) - - tensorName = key - nodeName = operatorRepresentation['nodeName'] - dmaName = self._DMAStructName(tensorName, nodeName) - - transferNodeRep = { - **transferNodeRep, - **{ - 'innerTilePtr': str(internalPtr._instance), - "outerTilePtr": str(externalPtr._instance), - "stateReference": dmaName - } - } - - finalMemoryLevel = TilingCodeGeneration.isFinalMemoryLevel(nodeMemoryConstraint, internalPtr) - struct = self._rectToDMAStruct(ctxt, rectangle, direction, internalPtr.name, externalPtr.name, - finalMemoryLevel) - - transferNodeRep["stateStruct"] = struct - _ = ctxt.hoistStruct(struct, dmaName, Snitch_DMA_copy) - ctxt.lookup(dmaName)._users += 
[operatorRepresentation['nodeName']] - - if permName in operatorRepresentation and direction == "ToL1": - - DMATransferCalls.append(CodeSnippet(self._iteratedMoveTileInTemplate, transferNodeRep)) - else: - DMATransferCalls.append(CodeSnippet(self._moveTileInTemplate, transferNodeRep)) - - DMAWaitStatements.append(CodeSnippet(self._blockTileInTemplate, transferNodeRep)) - - return DMATransferCalls, DMAWaitStatements - - def _generateIngressDMACode( - self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - importLoadStep = tilingSchedule.inputLoadSchedule - ingressDMATransferCalls, ingressDMAWaitStatements = self._generateDMACode(nodeMemoryConstraint, ctxt, - operatorRepresentation, - importLoadStep, "ToL1") - return ingressDMATransferCalls, ingressDMAWaitStatements - - def _generateEgressDMACode( - self, tilingSchedule: TilingSchedule, nodeMemoryConstraint: NodeMemoryConstraint, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[List[CodeSnippet], List[CodeSnippet]]: - - exportLoadStep = tilingSchedule.outputLoadSchedule - egressDMATransferCalls, egressDMAWaitStatements = self._generateDMACode(nodeMemoryConstraint, ctxt, - operatorRepresentation, exportLoadStep, - "FromL1") - - return egressDMATransferCalls, egressDMAWaitStatements - - def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, - nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, - variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - tileIdxPtr = self._hoistTileIdxPtr(ctxt, operatorRepresentation) - - ingressDMATransferCalls, ingressDMAWaitStatements = self._generateIngressDMACode( - tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) - - egressDMATransferCalls, 
egressDMAWaitStatements = self._generateEgressDMACode( - tilingSchedule, nodeMemoryConstraint, ctxt, operatorRepresentation) - - ctxt, ingressDMAUpdates = self._generateIngressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt, - operatorRepresentation) - ctxt, egressDMAUpdates = self._generateEgressPointerUpdates(nodeMemoryConstraint, tilingSchedule, ctxt, - operatorRepresentation) - - openLoopStatement = [ - CodeSnippet(self._openTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - closeLoopStatement = [ - CodeSnippet(self._closeTileLoopTemplate, { - "numTiles": operatorRepresentation["numTiles"], - "tileIdxPtr": tileIdxPtr - }) - ] - - variableUpdates = self._generateVariableUpdates(tilingSchedule, variableReplacement, ctxt, - operatorRepresentation) - - metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + "_L2", - nodeOps = operatorRepresentation['nodeOps'], - numTiles = len(tilingSchedule.outputLoadSchedule), - tileIdxVar = "TILING_I", - kernelLevelTiling = True) - - newExecutionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMATransferCalls, - ingressDMAWaitStatements, ingressDMAUpdates, - egressDMATransferCalls, egressDMAWaitStatements, - egressDMAUpdates, variableUpdates, openLoopStatement, - closeLoopStatement, [], []) - - return ctxt, newExecutionBlock, True - - def generateTilingLoop( - self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, - operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - - flatTilingSchedule = copy.copy(tilingSchedules[0]) - for tilingSchedule in tilingSchedules[1:]: - flatTilingSchedule += tilingSchedule - - # SCHEREMO: hoist numTiles - - offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) 
- - if len(offsetLists) == 0: - return ctxt, executionBlock, False - - for offsetList in offsetLists: - if not len(offsetList) == 1: - return ctxt, executionBlock, False - - operatorRepresentation["numTiles"] = self._hoistNumTiles(ctxt, operatorRepresentation['nodeName'], - tilingSchedules) - - return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, flatTilingSchedule, variableReplacement, - operatorRepresentation) - - -class SnitchClusterTilingGenerationSB(SnitchClusterTilingSB, SingleBufferingTilingMixIn): - pass diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchCoreFilter.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchCoreFilter.py index 6ea21f478b..a864c5db8f 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchCoreFilter.py +++ b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchCoreFilter.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchCoreFilter.py -# -# Last edited: 04.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Luka Macan, luka.macan@unibo.it, University of Bologna -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Literal, Tuple diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchProfileExecutionBlock.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchProfileExecutionBlock.py index 7559a1a54c..ec17b36011 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchProfileExecutionBlock.py +++ b/Deeploy/Targets/Snitch/CodeTransformationPasses/SnitchProfileExecutionBlock.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchProfileExecutionBlock.py -# -# Last edited: 05.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Tuple diff --git a/Deeploy/Targets/Snitch/CodeTransformationPasses/__init__.py b/Deeploy/Targets/Snitch/CodeTransformationPasses/__init__.py index d3281dd173..a45f50652e 100644 --- a/Deeploy/Targets/Snitch/CodeTransformationPasses/__init__.py +++ b/Deeploy/Targets/Snitch/CodeTransformationPasses/__init__.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 03.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from .SnitchClusterSynch import * from .SnitchClusterTiling import * diff --git a/Deeploy/Targets/Snitch/DMA/SnitchDma.py b/Deeploy/Targets/Snitch/DMA/SnitchDma.py new file mode 100644 index 0000000000..ac0c622cc8 --- /dev/null +++ b/Deeploy/Targets/Snitch/DMA/SnitchDma.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, Tuple + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy + + +class SnitchBarrierFuture(Future): + _initTemplate = NodeTemplate("") + _deinitTemplate = NodeTemplate("") + _allocTemplate = NodeTemplate("") + _waitTemplate = NodeTemplate("if (snrt_is_dm_core()) snrt_dma_wait_all();") + + +# LMACAN: TODO: Add single transfer waiting +class SnitchFuture(Future): + _initTemplate = NodeTemplate("snrt_dma_txid_t ${name} = (snrt_dma_txid_t) -1;") + + _deinitTemplate = NodeTemplate("") + + _allocTemplate = NodeTemplate("") + + _waitTemplate = NodeTemplate( + "if ( (${name} != ( (snrt_dma_txid_t) -1) ) && snrt_is_dm_core() ) snrt_dma_wait(${name});") + + +class SnitchDma(AsyncDma): + + _transferTemplates = { + 2: + 
NodeTemplate(""" + if (snrt_is_dm_core()) { + ${future} = snrt_dma_start_2d(${dest}, ${src}, ${size}, ${stride_dest}, ${stride_src}, ${repeat}); + // WIESEP: Hack as otherwise the last commited DMA transaction ID can never be resolved. + snrt_dma_start_2d(${dest}, ${dest}, 1, 0, 0, 0); + } + """), + } + _waitingStrategy = PerTensorWaitingStrategy(SnitchFuture) + + def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: + super().__init__(transferTemplates) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + super().checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + assert strideLoc[1] == 1 and strideExt[1] == 1, f"Supports only contigous transfers in the innermost dimension" + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + operatorRepresentation: OperatorRepresentation = { + "dest": localBuffer.name if direction == "ExternalToLocal" else externalBuffer.name, + "src": externalBuffer.name if direction == "ExternalToLocal" else localBuffer.name, + "repeat": shape[0], + "size": shape[1], + "stride_dest": strideLoc[0] if direction == "ExternalToLocal" else strideExt[0], + "stride_src": strideExt[0] if direction == "ExternalToLocal" else strideLoc[0], + "future": future.name + } + return operatorRepresentation diff --git a/Deeploy/Targets/Snitch/DataTypes.py b/Deeploy/Targets/Snitch/DataTypes.py index b1d3a92eda..16cd6e8cbe 100644 --- a/Deeploy/Targets/Snitch/DataTypes.py +++ b/Deeploy/Targets/Snitch/DataTypes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 
2024 ETH Zurich and University of Bologna # -# File: SnitchDataTypes.py -# -# Last edited: 03.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.AbstractDataTypes import PointerClass, Struct, VoidType from Deeploy.CommonExtensions.DataTypes import uint16_t diff --git a/Deeploy/Targets/Snitch/Deployer.py b/Deeploy/Targets/Snitch/Deployer.py index ff32066902..7c3922a6bb 100644 --- a/Deeploy/Targets/Snitch/Deployer.py +++ b/Deeploy/Targets/Snitch/Deployer.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchDeployer.py -# -# Last edited: 23.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Authors: -# - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Callable, Dict, Type diff --git a/Deeploy/Targets/Snitch/Parsers.py b/Deeploy/Targets/Snitch/Parsers.py index dfd32484cb..0051994686 100644 --- a/Deeploy/Targets/Snitch/Parsers.py +++ b/Deeploy/Targets/Snitch/Parsers.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchParsers.py -# -# Last edited: 07.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Luka Macan, luka.macan@unibo.it, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the Lic -# ense is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Tuple diff --git a/Deeploy/Targets/Snitch/Platform.py b/Deeploy/Targets/Snitch/Platform.py index b6bed5a57a..d62d1c3802 100644 --- a/Deeploy/Targets/Snitch/Platform.py +++ b/Deeploy/Targets/Snitch/Platform.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchPlatform.py -# -# Last edited: 23.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Authors: -# - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import List @@ -160,7 +138,8 @@ class SnitchStructBuffer(StructBuffer): MergeConstAddAndRequantPass(), AddRequantMergePass(), GEMMRequantMergePass(), -]) +], + name = "SnitchOptimizer") _includeList = [ "snrt.h", diff --git a/Deeploy/Targets/Snitch/Templates/AddTemplate.py b/Deeploy/Targets/Snitch/Templates/AddTemplate.py index f60462516c..428f087300 100644 --- a/Deeploy/Targets/Snitch/Templates/AddTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/AddTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: AddTemplate.py -# -# Last edited: 11.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple @@ -54,5 +33,5 @@ def alignToContext(self, ctxt: NetworkContext, referenceTemplate = _SnitchAddTemplate(""" // Snitch Add (Name: ${nodeName}, Op: ${nodeOp}) -SnitchAdd(${data_in_1}, ${data_in_2}, ${data_out}, ${size}, ${offset}); +SnitchAdd(${data_in_1}, ${data_in_2}, ${data_out}, ${size}, ${offset}); """) diff --git a/Deeploy/Targets/Snitch/Templates/AllocateTemplate.py b/Deeploy/Targets/Snitch/Templates/AllocateTemplate.py index aa37b58df5..6c1d898645 100644 --- a/Deeploy/Targets/Snitch/Templates/AllocateTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/AllocateTemplate.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: AllocateTemplate.py -# -# Last edited: 23.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Authors: -# - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Snitch/Templates/FloatGemmTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatGemmTemplate.py index 5db62baacb..17729a2eec 100644 --- a/Deeploy/Targets/Snitch/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/FloatGemmTemplate.py @@ -1,11 +1,15 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" uint32_t compute_num = snrt_cluster_compute_core_num(); - + % if transB: gemm_fp32_transB_opt(${M} / compute_num, ${O}, ${N}, ${A}, ${N} * compute_num, ${B}, ${N}, ${C}, ${O} * compute_num, ${data_out}, 1, 1 ); -% else: +% else: gemm_fp32_opt(${M} / compute_num, ${O}, ${N}, ${A}, ${N} * compute_num, ${B}, ${O}, ${C}, ${O} * compute_num, ${data_out}, 1, 1 ); %endif """) diff --git a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py index 877c02fef0..216ff35b9a 100644 --- a/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/FloatSoftmaxTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iSoftmaxTemplate.py -# -# Last edited: 30.05.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple @@ -50,7 +29,7 @@ def alignToContext(self, ctxt: NetworkContext, uint32_t compute_num = 1; //snrt_cluster_compute_core_num(); int32_t ldI = compute_num * ${input_samples}; int32_t batch_offset = ${seq_len} * ${input_samples}; - + // JUNGVI: This implementation is broken and has memory leak. if (snrt_hartid() == 0){ ${kernelName}(${data_in}, ${data_out}, ldI, batch_offset, batch_size, ${seq_len}, ${input_samples}); diff --git a/Deeploy/Targets/Snitch/Templates/FreeTemplate.py b/Deeploy/Targets/Snitch/Templates/FreeTemplate.py index 39e08ef429..e8e5fbe4d9 100644 --- a/Deeploy/Targets/Snitch/Templates/FreeTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/FreeTemplate.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: FreeTemplate.py -# -# Last edited: 23.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Authors: -# - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/Snitch/Templates/GemmTemplate.py b/Deeploy/Targets/Snitch/Templates/GemmTemplate.py index 8bc0fee698..d72b3c11f2 100644 --- a/Deeploy/Targets/Snitch/Templates/GemmTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/GemmTemplate.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Tuple from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation diff --git a/Deeploy/Targets/Snitch/Templates/RQAddTemplate.py b/Deeploy/Targets/Snitch/Templates/RQAddTemplate.py index afc637c21a..ceacb1c657 100644 --- a/Deeploy/Targets/Snitch/Templates/RQAddTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/RQAddTemplate.py @@ -1,29 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: RQAddTemplate.py -# -# Last edited: 11.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.Targets.Generic.Templates.RQAddTemplate import RQAddTemplate diff --git a/Deeploy/Targets/Snitch/Templates/RqGemmTemplate.py b/Deeploy/Targets/Snitch/Templates/RqGemmTemplate.py index 918690e4e0..f77b6d6127 100644 --- a/Deeploy/Targets/Snitch/Templates/RqGemmTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/RqGemmTemplate.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Tuple from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation diff --git a/Deeploy/Targets/Snitch/Templates/__init__.py b/Deeploy/Targets/Snitch/Templates/__init__.py index b9742821a6..aa29624681 100644 --- a/Deeploy/Targets/Snitch/Templates/__init__.py +++ b/Deeploy/Targets/Snitch/Templates/__init__.py @@ -1 +1,5 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + from . import * \ No newline at end of file diff --git a/Deeploy/Targets/Snitch/Templates/iSoftmaxTemplate.py b/Deeploy/Targets/Snitch/Templates/iSoftmaxTemplate.py index 9a15d91104..b4b5abbf16 100644 --- a/Deeploy/Targets/Snitch/Templates/iSoftmaxTemplate.py +++ b/Deeploy/Targets/Snitch/Templates/iSoftmaxTemplate.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iSoftmaxTemplate.py -# -# Last edited: 30.05.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.Targets.Generic.Templates.iSoftmaxPreAllocatedBuffTemplate import iSoftmaxPreAllocatedBuffTemplate diff --git a/Deeploy/Targets/Snitch/TileConstraints/GemmTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/GemmTileConstraint.py index 99fdddd21a..e5d35c4a42 100644 --- a/Deeploy/Targets/Snitch/TileConstraints/GemmTileConstraint.py +++ b/Deeploy/Targets/Snitch/TileConstraints/GemmTileConstraint.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass diff --git a/Deeploy/Targets/Snitch/TileConstraints/RqGemmTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/RqGemmTileConstraint.py index 5feae3b206..47bf7e29cc 100644 --- a/Deeploy/Targets/Snitch/TileConstraints/RqGemmTileConstraint.py +++ b/Deeploy/Targets/Snitch/TileConstraints/RqGemmTileConstraint.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass diff --git a/Deeploy/Targets/Snitch/TileConstraints/__init__.py b/Deeploy/Targets/Snitch/TileConstraints/__init__.py index 93b3563586..947a6fd82a 
100644 --- a/Deeploy/Targets/Snitch/TileConstraints/__init__.py +++ b/Deeploy/Targets/Snitch/TileConstraints/__init__.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 03.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * from .iNoNormTileConstraint import * diff --git a/Deeploy/Targets/Snitch/TileConstraints/iNoNormTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/iNoNormTileConstraint.py index ab7b0cf10d..770b78902c 100644 --- a/Deeploy/Targets/Snitch/TileConstraints/iNoNormTileConstraint.py +++ b/Deeploy/Targets/Snitch/TileConstraints/iNoNormTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: iNoNormTileConstraint.py -# -# Last edited: 06.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple diff --git a/Deeploy/Targets/Snitch/TileConstraints/iSoftmaxTileConstraint.py b/Deeploy/Targets/Snitch/TileConstraints/iSoftmaxTileConstraint.py index 55284915a3..aa405bbcb6 100644 --- a/Deeploy/Targets/Snitch/TileConstraints/iSoftmaxTileConstraint.py +++ b/Deeploy/Targets/Snitch/TileConstraints/iSoftmaxTileConstraint.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: iSoftmaxTileConstraint.py -# -# Last edited: 13.11.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. from typing import Dict, List, Tuple, Union diff --git a/Deeploy/Targets/Snitch/Tiler.py b/Deeploy/Targets/Snitch/Tiler.py index 38ba29f2dd..475a425779 100644 --- a/Deeploy/Targets/Snitch/Tiler.py +++ b/Deeploy/Targets/Snitch/Tiler.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SnitchTiler.py -# -# Last edited: 03.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint from Deeploy.Targets.Snitch.Bindings import SnitchAddBindings, SnitchGemmBindings, SnitchiNoNormBindings, \ diff --git a/Deeploy/Targets/Snitch/__init__.py b/Deeploy/Targets/Snitch/__init__.py index b9742821a6..aa29624681 100644 --- a/Deeploy/Targets/Snitch/__init__.py +++ b/Deeploy/Targets/Snitch/__init__.py @@ -1 +1,5 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + from . 
import * \ No newline at end of file diff --git a/Deeploy/Targets/SoftHier/Deployer.py b/Deeploy/Targets/SoftHier/Deployer.py index db3b1081ec..e4ab37f299 100644 --- a/Deeploy/Targets/SoftHier/Deployer.py +++ b/Deeploy/Targets/SoftHier/Deployer.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: SoftHierDeployer.py -# -# Last edited: 03.04.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: Bowen Wang , ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import Callable, Dict, Type diff --git a/Deeploy/Targets/SoftHier/Platform.py b/Deeploy/Targets/SoftHier/Platform.py index f56093d142..42265bbd81 100644 --- a/Deeploy/Targets/SoftHier/Platform.py +++ b/Deeploy/Targets/SoftHier/Platform.py @@ -1,26 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # -# File: SoftHierPlatform.py -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. 
-# -# Author: -# - Bowen Wang , ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import numpy as np @@ -111,7 +91,7 @@ class SoftHierStructBuffer(StructBuffer): deallocTemplate = NodeTemplate("") -SoftHierOptimizer = TopologyOptimizer([]) +SoftHierOptimizer = TopologyOptimizer([], name = "SoftHierOptimizer") includeList = ["DeeployBasicMath.h", "flex_alloc_api.h"] diff --git a/Deeploy/Targets/SoftHier/Templates/AllocateTemplate.py b/Deeploy/Targets/SoftHier/Templates/AllocateTemplate.py index 66bf552f67..30da2d1998 100644 --- a/Deeploy/Targets/SoftHier/Templates/AllocateTemplate.py +++ b/Deeploy/Targets/SoftHier/Templates/AllocateTemplate.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: AllocateTemplate.py -# -# Last edited: 07.06.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# Bowen Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/SoftHier/Templates/FreeTemplate.py b/Deeploy/Targets/SoftHier/Templates/FreeTemplate.py index bcd78952bc..f798c9e5ab 100644 --- a/Deeploy/Targets/SoftHier/Templates/FreeTemplate.py +++ b/Deeploy/Targets/SoftHier/Templates/FreeTemplate.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: FreeTemplate.py -# -# Last edited: 07.06.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# Bowen Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from Deeploy.DeeployTypes import NodeTemplate diff --git a/Deeploy/Targets/SoftHier/Templates/__init__.py b/Deeploy/Targets/SoftHier/Templates/__init__.py index e86f60fbe6..be436b64a3 100644 --- a/Deeploy/Targets/SoftHier/Templates/__init__.py +++ b/Deeploy/Targets/SoftHier/Templates/__init__.py @@ -1,27 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 07.06.2025 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# Bowen Wang, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/Targets/__init__.py b/Deeploy/Targets/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/Targets/__init__.py +++ b/Deeploy/Targets/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/TilingExtension/AsyncDma.py b/Deeploy/TilingExtension/AsyncDma.py new file mode 100644 index 0000000000..9679681051 --- /dev/null +++ b/Deeploy/TilingExtension/AsyncDma.py @@ -0,0 +1,276 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from abc import ABC, abstractmethod +from typing import Dict, List, Literal, Set, Tuple, Type + +from Deeploy.DeeployTypes import CodeSnippet, NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer, \ + _ReferenceBuffer +from Deeploy.TilingExtension.TilingCodegen import padShape, padStride + +DmaDirection = Literal["ExternalToLocal", "LocalToExternal"] + + +class Future: + + _initTemplate: NodeTemplate + _allocTemplate: NodeTemplate + _deinitTemplate: NodeTemplate + _waitTemplate: NodeTemplate + + def __init__(self, name: str): + self.name = name + + def _operatorRepresentation(self) -> OperatorRepresentation: + return {"name": self.name} + + def init(self) -> CodeSnippet: + return CodeSnippet(self._initTemplate, self._operatorRepresentation()) + + def alloc(self) -> CodeSnippet: + return CodeSnippet(self._allocTemplate, self._operatorRepresentation()) + + def deinit(self) -> CodeSnippet: + return 
CodeSnippet(self._deinitTemplate, self._operatorRepresentation()) + + def wait(self) -> CodeSnippet: + return CodeSnippet(self._waitTemplate, self._operatorRepresentation()) + + +class AsyncDmaWaitingStrategy(ABC): + + def __init__(self, FutureCls: Type[Future]) -> None: + self.FutureCls = FutureCls + + @abstractmethod + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + pass + + +class PerTensorWaitingStrategy(AsyncDmaWaitingStrategy): + + def __init__(self, FutureCls: Type[Future]) -> None: + super().__init__(FutureCls) + # map (tensorName, direction) -> Future instance so the same Future + # object is returned for repeated requests for the same tensor/direction + self._futures: Dict[Tuple[str, DmaDirection], Future] = {} + + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + key = (tensorName, direction) + if key not in self._futures: + # include direction in the future name to avoid accidental name + # collisions between directions for the same tensor + future_name = f"{tensorName}_{direction}" + self._futures[key] = self.FutureCls(future_name) + return self._futures[key] + + +class DirectionWaitingStrategy(AsyncDmaWaitingStrategy): + + def __init__(self, FutureCls: Type[Future], asyncGroupName: str) -> None: + super().__init__(FutureCls) + self.asyncGroupName = asyncGroupName + self.asyncGroupFutures = { + "ExternalToLocal": FutureCls(asyncGroupName + "_input"), + "LocalToExternal": FutureCls(asyncGroupName + "_output") + } + + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + _ = tensorName + return self.asyncGroupFutures[direction] + + +class BarrierWaitingStrategy(AsyncDmaWaitingStrategy): + + def __init__(self, FutureCls: Type[Future], barrierName: str) -> None: + super().__init__(FutureCls) + self.barrier = FutureCls(barrierName) + + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + _ = tensorName, direction + return self.barrier + + +class AsyncDma(ABC): + + 
_waitingStrategy: AsyncDmaWaitingStrategy + + def __init__(self, transferTemplates: Dict[int, NodeTemplate]) -> None: + self._transferTemplates = transferTemplates + + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + return self._waitingStrategy.getFuture(tensorName, direction) + + def supportedTransferRanks(self) -> Set[int]: + return set(self._transferTemplates.keys()) + + def checkTransfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection) -> None: + transferRank = len(shape) + assert transferRank == len(strideLoc) and transferRank == len( + strideExt), f"The shape and stride rank should match" + assert transferRank in self.supportedTransferRanks( + ), f"Unsupported transfer rank {transferRank}. Supported ranks are {self.supportedTransferRanks()}" + + @abstractmethod + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + return {"loc": localBuffer.name, "ext": externalBuffer.name, "future": future.name} + + def transfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection, future: Future) -> List[CodeSnippet]: + self.checkTransfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction) + opRepr = self.transferOpRepr(externalBuffer, localBuffer, shape, strideExt, strideLoc, direction, future) + template = self._transferTemplates[len(shape)] + return [CodeSnippet(template, opRepr)] + + +class EmptyFuture(Future): + + _initTemplate = NodeTemplate("") + _allocTemplate = NodeTemplate("") + _deinitTemplate = NodeTemplate("") + _waitTemplate = 
NodeTemplate("") + + +class BlockingDmaFromAsyncDmaAdapter(AsyncDma): + + _waitingStrategy = PerTensorWaitingStrategy(EmptyFuture) + + def __init__(self, dma: AsyncDma) -> None: + self.dma = dma + + @property + def _transferTemplates(self) -> Dict[int, NodeTemplate]: + return self.dma._transferTemplates + + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + return self.dma.getFuture(tensorName, direction) + + def transferOpRepr(self, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, shape: Tuple[int, ...], + strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], direction: DmaDirection, + future: Future) -> OperatorRepresentation: + return self.dma.transferOpRepr(externalBuffer, localBuffer, shape, strideExt, strideLoc, direction, future) + + def transfer(self, ctxt: NetworkContext, externalBuffer: VariableBuffer, localBuffer: VariableBuffer, + shape: Tuple[int, ...], strideExt: Tuple[int, ...], strideLoc: Tuple[int, ...], + direction: DmaDirection, future: Future) -> List[CodeSnippet]: + callStack = [] + dma_code = self.dma.transfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction, future) + callStack.append(future.alloc()) + callStack.extend(dma_code) + callStack.append(future.wait()) + return callStack + + +class AnydimAsyncDmaTransferAdapter: + + class NestedForLoopOpenTemplate(NodeTemplate): + + def __init__(self, depth: int): + templateStr = "" + for level in range(depth): + iter = f"i_{level}" + templateStr += f"for (uint32_t {iter} = 0; {iter} < ${{end_{level}}}; {iter}++) {{" + super().__init__(templateStr) + + class NestedForLoopCloseTemplate(NodeTemplate): + + def __init__(self, depth: int): + templateStr = "" + for _ in range(depth): + templateStr += "}" + super().__init__(templateStr) + + class OffsetCalculationTemplate(NodeTemplate): + + def __init__(self, name: str, depth: int): + templateStr = f"const uint32_t {name} = " + for i in range(depth): + templateStr += f"i_{i} * ${{stride_{i}}}" 
+ if i < depth - 1: + templateStr += " + " + templateStr += ";" + super().__init__(templateStr) + + offsetPtrTemplate = NodeTemplate("void * const ${resultPtr} = (void *)((char *)${basePtr} + ${offset});") + + def __init__(self, dma: AsyncDma) -> None: + self.dma = dma + + def getFuture(self, tensorName: str, direction: DmaDirection) -> Future: + return self.dma.getFuture(tensorName, direction) + + def nearestSupportedTransferRank(self, transfer_rank: int) -> int: + sortedRanks = sorted(self.dma.supportedTransferRanks()) + + # Find nearest smaller + for rank in reversed(sortedRanks): + if rank <= transfer_rank: + return rank + + # All supported ranks are bigger so return the smallest one + return sortedRanks[0] + + def transfer(self, + ctxt: NetworkContext, + externalBuffer: VariableBuffer, + localBuffer: VariableBuffer, + shape: Tuple[int, ...], + strideExt: Tuple[int, ...], + strideLoc: Tuple[int, ...], + direction: DmaDirection, + future: Future, + strideExtPad: int = 0) -> List[CodeSnippet]: + transferRank = len(shape) + kernelRank = self.nearestSupportedTransferRank(transferRank) + + if kernelRank < transferRank: + nestedLoopDepth = transferRank - kernelRank + + nestedLoopOpRepr = {f"end_{level}": shape[level] for level in range(nestedLoopDepth)} + locOffsetCalculationOpRepr = {f"stride_{level}": strideLoc[level] for level in range(nestedLoopDepth)} + extOffsetCalculationOpRepr = {f"stride_{level}": strideExt[level] for level in range(nestedLoopDepth)} + + callStack = [] + callStack.append(CodeSnippet(self.NestedForLoopOpenTemplate(nestedLoopDepth), nestedLoopOpRepr)) + callStack.append( + CodeSnippet(self.OffsetCalculationTemplate("ext_offset", nestedLoopDepth), extOffsetCalculationOpRepr)) + callStack.append( + CodeSnippet(self.OffsetCalculationTemplate("loc_offset", nestedLoopDepth), locOffsetCalculationOpRepr)) + + localBufferOffseted = _ReferenceBuffer("local_buffer_offsetted", localBuffer) + localBufferOffseted._memoryLevel = localBuffer._memoryLevel + 
callStack.append( + CodeSnippet(self.offsetPtrTemplate, { + "resultPtr": "local_buffer_offsetted", + "basePtr": localBuffer.name, + "offset": "loc_offset" + })) + + externalBufferOffseted = _ReferenceBuffer("external_buffer_offsetted", externalBuffer) + externalBufferOffseted._memoryLevel = externalBuffer._memoryLevel + callStack.append( + CodeSnippet(self.offsetPtrTemplate, { + "resultPtr": externalBufferOffseted.name, + "basePtr": externalBuffer.name, + "offset": "ext_offset" + })) + + dma_code = self.dma.transfer(ctxt, externalBufferOffseted, localBufferOffseted, shape[-kernelRank:], + strideExt[-kernelRank:], strideLoc[-kernelRank:], direction, future) + + callStack.extend(dma_code) + callStack.append(CodeSnippet(self.NestedForLoopCloseTemplate(nestedLoopDepth), {})) + return callStack + elif kernelRank == transferRank: + return self.dma.transfer(ctxt, externalBuffer, localBuffer, shape, strideExt, strideLoc, direction, future) + else: + return self.dma.transfer(ctxt, externalBuffer, localBuffer, padShape(shape, kernelRank), + padStride(strideExt, kernelRank, strideExtPad), + padStride(strideLoc, kernelRank, math.prod(shape)), direction, future) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py new file mode 100644 index 0000000000..ad9c6ad012 --- /dev/null +++ b/Deeploy/TilingExtension/CodeTransformationPasses/DoubleBufferingTilingCodeGeneration.py @@ -0,0 +1,366 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from typing import List, Set, Tuple + +from Deeploy.AbstractDataTypes import VoidType +from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NetworkContext, NodeTemplate, OperatorRepresentation, \ + VariableBuffer, _ReferenceBuffer +from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, Future 
+from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration +from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \ + PrototypeTilingMixIn, TilingMetaInfo +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, stridesFromShape + + +class DoubleBufferingTilingCodeGeneration(TilingCodeGeneration): + + _moveTileInCheckOpenStatement = NodeTemplate(""" + // DOUBLE BUFFERING CHECK TILE LOAD + if ((${tileIdxVar}) < ${numTiles}[*${tileIdxPtr}+1]) { + """) + + _moveTileInCheckCloseStatement = NodeTemplate(""" + } + """) + + # LMACAN: The brackets around ${tileIdxVar} are important to ensure correct order + # of the modulo operation. Breaking case without the brackets is when we + # put "TILING_I + 1" for tileIdxVar. 
+ _switchOpen = NodeTemplate("switch((${tileIdxVar}) % ${bufferCount}) {") + _caseOpen = NodeTemplate("case ${case}:") + _caseClose = NodeTemplate("break;") + + _blockClose = NodeTemplate(""" + } + """) + + _referenceUpdate = NodeTemplate("${reference} = (${type})${update};") + + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): + super().__init__(externalMemory, localMemory, dma, 2) + + def _switch(self, caseBlocks: List[List[CodeSnippet]], tileIdxVar: str) -> List[CodeSnippet]: + assert len(caseBlocks) == self.bufferCount, f"Expected {self.bufferCount} cases, got {len(caseBlocks)}`" + callStack = [CodeSnippet(self._switchOpen, {"tileIdxVar": tileIdxVar, "bufferCount": self.bufferCount})] + for i, block in enumerate(caseBlocks): + callStack.append(CodeSnippet(self._caseOpen, {"case": i})) + callStack.extend(block) + callStack.append(CodeSnippet(self._caseClose, {})) + callStack.append(CodeSnippet(self._blockClose, {})) + return callStack + + def _generateBufferChoice(self, reference: VariableBuffer, + buffers: List[_ReferenceBuffer]) -> List[List[CodeSnippet]]: + return [[ + CodeSnippet(self._referenceUpdate, { + "reference": reference.name, + "type": reference._type.typeName, + "update": buff.name + }) + ] for buff in buffers] + + def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, + nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, + variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + + # Double Buffering Tiling Loop Strategy + # =================================== + # - 1) Initialize all futures + # - 2) Start transfer for first input tile + # - 3) Update input reference for second tile + # - 4) for TILING_I in numTiles: + # - 4.1) Choose buffers for current tile (inputs and outputs) + # - 4.2) Input data transfer for next tile (see "4.2) Input Data Transfers") + # - 4.3) Process current tile + 
# - 4.4) Output data transfer for current tile (see "4.4) Output Data Transfers") + # - 5) Wait for final output tile to be ready + # - 6) Deinitialize all futures + + # 4.2) Input Data Transfers + # ----------------------------------- + # - for each input tensor: + # - 4.2.1) Wait for current input tile + # - 4.2.2) if there is a next tile: + # - 4.2.3) Choose buffers for next tile + # - 4.2.4) Start transfer for next input tile + # - 4.2.5) Update input reference for next tile + + # 4.4) Output Data Transfers + # ----------------------------------- + # - for each output tensor: + # - 4.4.1) Wait for previous output tile + # - 4.4.2) Start transfer for current output tile + # - 4.4.3) Update outut reference for next tile + + setupStatements: List[CodeSnippet] = [] + openLoopStatements: List[CodeSnippet] = [CodeSnippet(self._openTileLoopTemplate, {**operatorRepresentation})] + + ingressDMAStatements: List[CodeSnippet] = [] + ingressFutures: Set[Future] = set() + + egressDMAStatements: List[CodeSnippet] = [] + egressFutures: Set[Future] = set() + + closeLoopStatements: List[CodeSnippet] = [CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})] + teardownStatements: List[CodeSnippet] = [] + + # 4.2) Input Data Transfers + # ----------------------------------- + + buffer_choices: List[List[CodeSnippet]] = [[], []] + for tensorName, rectangles in dictOfArrays(tilingSchedule.inputLoadSchedule).items(): + localBuffer = ctxt.lookup(operatorRepresentation[tensorName]) + assert localBuffer._memoryLevel == self.localMemory + assert isinstance(localBuffer, _ReferenceBuffer) + externalBuffer = ctxt.lookup(localBuffer._referenceName) + assert isinstance(externalBuffer, VariableBuffer) + tensorMemoryConstraint = nodeMemoryConstraint.inputTensorMemoryConstraints[externalBuffer.name] + externalBufferShape = tensorMemoryConstraint.memoryConstraints[self.externalMemory].shape + assert externalBufferShape is not None + + rectangles, externalBufferShape = 
self._legalizeTransfers(rectangles, tuple(externalBufferShape), + localBuffer._type.referencedType.typeWidth, + self.isFinalMemoryLevel(tensorMemoryConstraint)) + + externalBufferRef = self._hoistReference(ctxt, + externalBuffer.name + "_ref", + externalBuffer, + externalBufferShape, + override_type = VoidType) + + tensorMemoryConstraint = nodeMemoryConstraint.inputTensorMemoryConstraints[externalBuffer.name] + l1BuffersReferences = self._hoistMultibufferReferences(ctxt, localBuffer, tensorMemoryConstraint) + + nextLocalBufferReference = self._hoistReference(ctxt, f"{tensorName}_next", l1BuffersReferences[1]) + + future = self.dma.getFuture(tensorName, "ExternalToLocal") + + # 2) Load initial input tiles + anydimAdapter = AnydimAsyncDmaTransferAdapter(self.dma) + initialDmaTransferCalls = anydimAdapter.transfer(ctxt, externalBufferRef, localBuffer, rectangles[0].dims, + stridesFromShape(externalBufferShape), + stridesFromShape(rectangles[0].dims), "ExternalToLocal", + future, math.prod(externalBufferShape)) + if future not in ingressFutures: + setupStatements.append(future.alloc()) + setupStatements.extend(initialDmaTransferCalls) + + # 4.1) Choose buffers for current tile (inputs and outputs) + _buffer_choice = self._generateBufferChoice(localBuffer, l1BuffersReferences) + for i in range(len(buffer_choices)): + buffer_choices[i].extend(_buffer_choice[i]) + + # 4.2.1) Wait for current input tile + ingressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Wait for current input tile"})) + + if future not in ingressFutures: + ingressDMAStatements.append(future.wait()) + + # 4.2.2) if there is a next tile: + ingressDMAStatements.append( + CodeSnippet(self._moveTileInCheckOpenStatement, { + **operatorRepresentation, "tileIdxVar": "TILING_I+1" + })) + + # 4.2.3) Choose buffers for next tile + ingressDMAStatements += self._switch( + self._generateBufferChoice(nextLocalBufferReference, l1BuffersReferences), "TILING_I+1") + + # 4.2.4) Start transfer for next 
input tile + ingressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Transfer next input tile"})) + + # Allocate the future for the next transfer + if future not in ingressFutures: + ingressDMAStatements.append(future.alloc()) + + ingressDMAStatements.extend( + self._generateDmaTransferCalls(ctxt, tensorName, rectangles, "TILING_I+1", nextLocalBufferReference, + externalBufferRef, "ExternalToLocal", future)) + # 4.2.5) Update external reference for next til + referenceUpdate = self._generateExternalReferenceUpdate(ctxt, tensorName, rectangles, "TILING_I+1", + externalBufferRef) + if referenceUpdate is not None: + ingressDMAStatements.append(referenceUpdate) + + # 3) Update input reference for second tile + initialReferenceUpdate = CodeSnippet(referenceUpdate.template, + operatorRepresentation = { + **referenceUpdate.operatorRepresentation, + "tileIdxVar": 0, + }) + setupStatements.append(initialReferenceUpdate) + + # Close the "if there is a next tile" block + ingressDMAStatements.append(CodeSnippet(self._moveTileInCheckCloseStatement, {})) + + # Add future to the set to prevent double wait/allocation + ingressFutures.add(future) + + # 4.4) Output Data Transfers + # ----------------------------------- + for tensorName, rectangles in dictOfArrays(tilingSchedule.outputLoadSchedule).items(): + localBuffer = ctxt.lookup(operatorRepresentation[tensorName]) + assert localBuffer._memoryLevel == self.localMemory + assert isinstance(localBuffer, _ReferenceBuffer) + externalBuffer = ctxt.lookup(localBuffer._referenceName) + assert isinstance(externalBuffer, VariableBuffer) + tensorMemoryConstraint = nodeMemoryConstraint.outputTensorMemoryConstraints[externalBuffer.name] + externalBufferShape = tensorMemoryConstraint.memoryConstraints[self.externalMemory].shape + assert externalBufferShape is not None + + rectangles, externalBufferShape = self._legalizeTransfers(rectangles, tuple(externalBufferShape), + localBuffer._type.referencedType.typeWidth, + 
self.isFinalMemoryLevel(tensorMemoryConstraint)) + + externalBufferRef = self._hoistReference(ctxt, + externalBuffer.name + "_ref", + externalBuffer, + externalBufferShape, + override_type = VoidType) + + tensorMemoryConstraint = nodeMemoryConstraint.outputTensorMemoryConstraints[externalBuffer.name] + l1BuffersReferences = self._hoistMultibufferReferences(ctxt, localBuffer, tensorMemoryConstraint) + + # 4.1) Choose buffers for current tile (inputs and outputs) + _buffer_choice = self._generateBufferChoice(localBuffer, l1BuffersReferences) + for i in range(len(buffer_choices)): + buffer_choices[i].extend(_buffer_choice[i]) + + # 4.4.1) Wait for previous output tile + future = self.dma.getFuture(tensorName, "LocalToExternal") + + egressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Wait for previous output tile"})) + + if future not in egressFutures: + egressDMAStatements.append(future.wait()) + + # 4.4.2) Start transfer for current output tile + dmaTransferCalls = self._generateDmaTransferCalls(ctxt, tensorName, rectangles, "TILING_I", localBuffer, + externalBufferRef, "LocalToExternal", future) + + egressDMAStatements.append(CodeSnippet(self._lineComment, {"comment": "Transfer current output tile"})) + # Allocate the future for the next transfer + if future not in egressFutures: + egressDMAStatements.append(future.alloc()) + + egressDMAStatements.extend(dmaTransferCalls) + + # 4.4.3) Update outut reference for next tile + referenceUpdate = self._generateExternalReferenceUpdate(ctxt, tensorName, rectangles, "TILING_I", + externalBufferRef) + if referenceUpdate is not None: + egressDMAStatements.append(referenceUpdate) + + # Add future to the set to prevent double wait/allocation + egressFutures.add(future) + + # 4.2. + openLoopStatements += self._switch(buffer_choices, "TILING_I") + + # 1. 
Initialize all futures + setupStatements = [f.init() for f in ingressFutures | egressFutures] + setupStatements + setupStatements = [CodeSnippet(self._lineComment, {"comment": "Initialize DMA future"})] + setupStatements + + # 5. Wait for final output tile to be ready + teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Wait for final output tile"})) + teardownStatements.extend([f.wait() for f in egressFutures]) + + # 6. Deinitialize all futures + + teardownStatements.append(CodeSnippet(self._lineComment, {"comment": "Deinitialize DMA future"})) + teardownStatements.extend(f.deinit() for f in ingressFutures | egressFutures) + + metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", + nodeOps = operatorRepresentation['nodeOps'], + numTiles = operatorRepresentation['numTiles'], + totalNumTiles = len(tilingSchedule.outputLoadSchedule), + tileIdxPtr = operatorRepresentation['tileIdxPtr'], + tileIdxVar = "TILING_I", + kernelLevelTiling = True) + + executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements, + openLoopStatements, closeLoopStatements, setupStatements, + teardownStatements) + + return ctxt, executionBlock, True + + +class ProfilingDoubleBufferingTilingMixIn(PrototypeTilingMixIn, ProfilingPrototypeMixIn): + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: List[CodeSnippet], + teardownStatements: List[CodeSnippet]) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + totalNumTiles = metaInfo.totalNumTiles + + executionBlock.addLeft(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_start_measurements", + "tileIdxVar": 0 + }) + + executionBlock = cls.measurementArrayDeclaration(executionBlock, metaInfo, bufferingStr = "DB") + + executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, + 
teardownStatements) + executionBlock.addRight(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_end_measurements", + "tileIdxVar": totalNumTiles - 1 + }) + + executionBlock = cls.injectPrintCycleDiff(executionBlock, metaInfo) + + return executionBlock + + @classmethod + def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], + egressDMAStatements: List[CodeSnippet], + closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + tileIdxVar = metaInfo.tileIdxVar + + _openLoopStatements = [openLoopStatements[0]] + _openLoopStatements.append(CodeSnippet(cls._measureConditionSetup, {"cond": f"{tileIdxVar} > 0"})) + _openLoopStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_start_measurements", + "tileIdxVar": tileIdxVar + })) + _openLoopStatements.append(CodeSnippet(cls._measureConditionEnd, {})) + _openLoopStatements += openLoopStatements[1:] + + _ingressDMAStatements = [] + _ingressDMAStatements += ingressDMAStatements + _ingressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_end_measurements", + "tileIdxVar": tileIdxVar + })) + + executionBlock = cls.kernelProfilingWrap(executionBlock, metaInfo) + + _egressDMAStatements = [] + _egressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_start_measurements", + "tileIdxVar": f"{tileIdxVar}" + })) + _egressDMAStatements += egressDMAStatements + _egressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_end_measurements", + "tileIdxVar": f"{tileIdxVar}" + })) + + executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements, + _egressDMAStatements, closeLoopStatements) + return executionBlock diff --git 
a/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py new file mode 100644 index 0000000000..ea1e938b58 --- /dev/null +++ b/Deeploy/TilingExtension/CodeTransformationPasses/SingleBufferingTilingCodeGeneration.py @@ -0,0 +1,193 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Set, Tuple + +from Deeploy.AbstractDataTypes import VoidType +from Deeploy.DeeployTypes import CodeSnippet, ExecutionBlock, NetworkContext, OperatorRepresentation, VariableBuffer, \ + _ReferenceBuffer +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future +from Deeploy.TilingExtension.CodeTransformationPasses.TilingCodeGeneration import TilingCodeGeneration +from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import dictOfArrays +from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import ProfilingPrototypeMixIn, \ + PrototypeTilingMixIn, TilingMetaInfo +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint +from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme + + +class SingleBufferingTilingCodeGeneration(TilingCodeGeneration): + + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): + super().__init__(externalMemory, localMemory, dma, 1) + + def _generateTransferScheduleCalls( + self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, + transferSchedule: List[Dict[str, HyperRectangle]], tensorMemoryConstraintDict: Dict[str, + TensorMemoryConstraint], + tileIdxVar: str, direction: DmaDirection) -> Tuple[NetworkContext, List[CodeSnippet], Set[Future]]: + callStack: List[CodeSnippet] = [] + futures: Set[Future] = set() + + for tensorName, rectangles in 
dictOfArrays(transferSchedule).items(): + localBuffer = ctxt.lookup(operatorRepresentation[tensorName]) + assert localBuffer._memoryLevel == self.localMemory + assert isinstance(localBuffer, _ReferenceBuffer) + externalBuffer = ctxt.lookup(localBuffer._referenceName) + assert isinstance(externalBuffer, VariableBuffer) + tensorMemoryConstraint = tensorMemoryConstraintDict[externalBuffer.name] + externalBufferShape = tensorMemoryConstraint.memoryConstraints[self.externalMemory].shape + assert externalBufferShape is not None + + rectangles, externalBufferShape = self._legalizeTransfers(rectangles, tuple(externalBufferShape), + localBuffer._type.referencedType.typeWidth, + self.isFinalMemoryLevel(tensorMemoryConstraint)) + + externalBufferRef = self._hoistReference(ctxt, + externalBuffer.name + "_ref", + externalBuffer, + shape = externalBufferShape, + override_type = VoidType) + + future = self.dma.getFuture(tensorName, direction) + + # Allocate a future for this transfer + if future not in futures: + callStack.append(future.alloc()) + + try: + callStack.extend( + self._generateDmaTransferCalls(ctxt, tensorName, rectangles, tileIdxVar, localBuffer, + externalBufferRef, direction, future)) + except AssertionError as e: + raise AssertionError(f"{e} while generating DMA transfer for tensor '{tensorName}'") from e + + referenceUpdate = self._generateExternalReferenceUpdate(ctxt, tensorName, rectangles, tileIdxVar, + externalBufferRef) + if referenceUpdate is not None: + callStack.append(referenceUpdate) + + futures.add(future) + + return ctxt, callStack, futures + + def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, + nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, + variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + + # Single Buffering Tiling Loop Strategy + # =================================== + # - 1) Initialize all 
futures + # - 2) for TILING_I in numTiles: + # - 2.1) Input data transfer for current tile (see "4.2) Input Data Transfers") + # - 2.2) Process current tile + # - 2.3) Output data transfer for current tile (see "4.4) Output Data Transfers") + # - 3) Deinitialize all futures + + # 2) for TILING_I in numTiles: + openLoopStatements = [CodeSnippet(self._openTileLoopTemplate, {**operatorRepresentation})] + + # 2.2) Input data transfer for current tile + ctxt, ingressDMAStatements, ingressFutures = self._generateTransferScheduleCalls( + ctxt, operatorRepresentation, tilingSchedule.inputLoadSchedule, + nodeMemoryConstraint.inputTensorMemoryConstraints, "TILING_I", "ExternalToLocal") + + ingressDMAStatements = [CodeSnippet(self._lineComment, {"comment": "Transfer input tiles"}) + ] + ingressDMAStatements + ingressDMAStatements += [CodeSnippet(self._lineComment, {"comment": "Wait for input tiles"})] + ingressDMAStatements += [future.wait() for future in ingressFutures] + + # 2.4) Output data transfer for current tile + ctxt, egressDMAStatements, egressFutures = self._generateTransferScheduleCalls( + ctxt, operatorRepresentation, tilingSchedule.outputLoadSchedule, + nodeMemoryConstraint.outputTensorMemoryConstraints, "TILING_I", "LocalToExternal") + egressDMAStatements = [CodeSnippet(self._lineComment, {"comment": "Transfer output tiles"}) + ] + egressDMAStatements + egressDMAStatements += [CodeSnippet(self._lineComment, {"comment": "Wait for output tiles"})] + egressDMAStatements += [future.wait() for future in egressFutures] + + # 1) Initialize all futures + setupStatements = [CodeSnippet(self._lineComment, {"comment": "Initialize DMA futures"})] + setupStatements.extend([f.init() for f in ingressFutures | egressFutures]) + + # 3) Deinitialize all futures + teardownStatements = [CodeSnippet(self._lineComment, {"comment": "Deinitialize DMA futures"})] + teardownStatements.extend([f.deinit() for f in ingressFutures | egressFutures]) + + closeLoopStatements = 
[CodeSnippet(self._closeTileLoopTemplate, {**operatorRepresentation})] + + metaInfo = TilingMetaInfo(nodeName = operatorRepresentation['nodeName'] + f"_{self.externalMemory}", + nodeOps = operatorRepresentation['nodeOps'], + numTiles = operatorRepresentation['numTiles'], + totalNumTiles = len(tilingSchedule.outputLoadSchedule), + tileIdxPtr = operatorRepresentation['tileIdxPtr'], + tileIdxVar = "TILING_I", + kernelLevelTiling = True) + + executionBlock = self.generateAllTilingCode(executionBlock, metaInfo, ingressDMAStatements, egressDMAStatements, + openLoopStatements, closeLoopStatements, setupStatements, + teardownStatements) + + return ctxt, executionBlock, True + + +class ProfilingSingleBufferingTilingMixIn(PrototypeTilingMixIn, ProfilingPrototypeMixIn): + + @classmethod + def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + setupStatements: List[CodeSnippet], + teardownStatements: List[CodeSnippet]) -> ExecutionBlock: + + executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, + teardownStatements) + + executionBlock = cls.measurementArrayDeclaration(executionBlock, metaInfo, bufferingStr = "SB") + + executionBlock = cls.injectPrintCycleDiff(executionBlock, metaInfo) + + return executionBlock + + @classmethod + def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, + openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], + egressDMAStatements: List[CodeSnippet], + closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: + + nodeName = metaInfo.nodeName + tileIdxVar = metaInfo.tileIdxVar + + _openLoopStatements = [openLoopStatements[0]] + _openLoopStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_start_measurements", + "tileIdxVar": tileIdxVar + })) + _openLoopStatements += openLoopStatements[1:] + + _ingressDMAStatements = [] + _ingressDMAStatements += 
ingressDMAStatements + _ingressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_ingress_dma_wait_end_measurements", + "tileIdxVar": tileIdxVar + })) + + executionBlock = cls.kernelProfilingWrap(executionBlock, metaInfo) + + _egressDMAStatements = [] + _egressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_start_measurements", + "tileIdxVar": tileIdxVar + })) + _egressDMAStatements += egressDMAStatements + _egressDMAStatements.append( + CodeSnippet(cls._measureCycles, { + "measurements": f"{nodeName}_egress_dma_wait_end_measurements", + "tileIdxVar": tileIdxVar + })) + + executionBlock = super().generateLoopCode(executionBlock, metaInfo, _openLoopStatements, _ingressDMAStatements, + _egressDMAStatements, closeLoopStatements) + return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py index f4e4d9aae9..51f87534ea 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py @@ -1,172 +1,242 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: TilingCodeGeneration.py -# -# Last edited: 24.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +import copy +import math from abc import abstractmethod -from typing import Dict, List, Optional, Tuple, Type +from typing import List, Optional, Tuple, TypeVar + +import numpy as np -import Deeploy.CommonExtensions.DataTypes as BasicDataTypes -from Deeploy.AbstractDataTypes import Immediate, PointerClass from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureExecutionBlock from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \ IntrospectiveCodeTransformationMixIn from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration -from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ConstantBuffer, ExecutionBlock, \ +from Deeploy.DeeployTypes import CodeGenVerbosity, CodeSnippet, CodeTransformationPass, ExecutionBlock, \ NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer, _NoVerbosity +from Deeploy.TilingExtension.AsyncDma import AnydimAsyncDmaTransferAdapter, AsyncDma, DmaDirection, Future +from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import TilingHoistingMixIn from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PrototypeTilingMixIn -from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint -from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, minimizeVariableReplacement +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint +from Deeploy.TilingExtension.TilingCodegen 
import HyperRectangle, TilingSchedule, VariableReplacementScheme, \ + calculateFlatOffset, minimizeRectangle, minimizeVariableReplacement, padOffset, padShape, stridesFromShape +T = TypeVar('T') -class TilingCodeGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn, PrototypeTilingMixIn): - def __init__(self, targetMemLevel: str): - self.targetMemLevel = targetMemLevel - self.argStructGeneration = ArgumentStructGeneration() +def transposeListOfLists(listOfLists: List[List[T]]) -> List[List[T]]: + transposedListOfLists = [] + for _list in listOfLists: + for i, element in enumerate(_list): + if i >= len(transposedListOfLists): + assert i == len(transposedListOfLists) + transposedListOfLists.append([element]) + else: + transposedListOfLists[i].append(element) + return transposedListOfLists + + +class TilingCodeGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn, PrototypeTilingMixIn, + TilingHoistingMixIn): + + _lineComment = NodeTemplate("\n// ${comment}") + + _relativeOffsetReferenceUpdateTemplate = NodeTemplate(""" + // UPDATE VARIABLE ${reference} + ${reference} = (${typeName}*)((char*)(${reference}) + ${relativeOffset}); + """) + + _relativeOffsetReferenceUpdateTiledTemplate = NodeTemplate(""" + // UPDATE VARIABLE ${reference} + ${reference} = (${typeName}*)((char*)(${reference}) + ${relativeOffset}[${tileIdxVar}]); + """) + + _openTileLoopTemplate = NodeTemplate(""" + // TILING LOOP + for (int TILING_I=${numTiles}[*${tileIdxPtr}]; TILING_I<${numTiles}[(*${tileIdxPtr})+1]; TILING_I++){ + """) + + _closeTileLoopTemplate = NodeTemplate(""" + // CLOSE TILING LOOP + } + *${tileIdxPtr} += 1; + """) @abstractmethod + def _tilingLoop(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, + nodeMemoryConstraint: NodeMemoryConstraint, tilingSchedule: TilingSchedule, + variableReplacement: VariableReplacementScheme, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: + pass + def 
generateTilingLoop( self, ctxt: NetworkContext, executionBlock: ExecutionBlock, nodeMemoryConstraint: NodeMemoryConstraint, - tilingSchedule: TilingSchedule, variableReplacement: VariableReplacementScheme, + tilingSchedules: List[TilingSchedule], variableReplacement: VariableReplacementScheme, operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, ExecutionBlock, bool]: - return ctxt, executionBlock, False + flatTilingSchedule = copy.copy(tilingSchedules[0]) + for tilingSchedule in tilingSchedules[1:]: + flatTilingSchedule += tilingSchedule - # SCHEREMO: internalPtr refers to the HIGHER memory level of a transfer, - # e.g. in both an L2 -> L1 and L1 -> L2 transfer, the internalPtr is in L1. - @staticmethod - def isFinalMemoryLevel(nodeMemoryConstraint: NodeMemoryConstraint, internalPtr: VariableBuffer) -> bool: - externalName = internalPtr._referenceName - tensorMemoryConstraint = nodeMemoryConstraint.tensorMemoryConstraints[externalName] - if len(tensorMemoryConstraint.memoryConstraints.keys()) <= 2: - return True + offsetLists = list({**flatTilingSchedule.inputBaseOffsets, **flatTilingSchedule.outputBaseOffsets}.values()) - finalMemoryLevels = list(tensorMemoryConstraint.memoryConstraints.keys())[:2] - memoryLevel = internalPtr._memoryLevel + if len(offsetLists) == 0: + return ctxt, executionBlock, False - return memoryLevel in finalMemoryLevels + for offsetList in offsetLists: + if not len(offsetList) == self.bufferCount: + return ctxt, executionBlock, False - def _hoistTileIdxPtr(self, - ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation, - sourceMemoryLevel: str = "L2") -> str: + numTiles, tileIdxPtr = self._hoistTileNumAndIdxPtr(ctxt, tilingSchedules) + operatorRepresentation["numTiles"] = numTiles.name + operatorRepresentation["tileIdxPtr"] = tileIdxPtr.name - newPtrName = self.prefix + operatorRepresentation['nodeName'] + "_tileIdxPtr" + return self._tilingLoop(ctxt, executionBlock, nodeMemoryConstraint, 
flatTilingSchedule, variableReplacement, + operatorRepresentation) - tilePtrBuffer = ctxt.VariableBuffer(newPtrName, shape = [1]) - ctxt.add(tilePtrBuffer, "local") + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, bufferCount: int): + self.externalMemory = externalMemory + self.localMemory = localMemory + self.dma = dma + self.bufferCount = bufferCount + TilingHoistingMixIn.__init__(self, localMemory) + self.argStructGeneration = ArgumentStructGeneration() - _type = ctxt.lookup(self.prefix + operatorRepresentation['nodeName'] + "_numTiles")._type + # SCHEREMO: internalPtr refers to the HIGHER memory level of a transfer, + # e.g. in both an L2 -> L1 and L1 -> L2 transfer, the internalPtr is in L1. + def isFinalMemoryLevel(self, tensorMemoryConstraint: TensorMemoryConstraint) -> bool: + memoryOrder = list(tensorMemoryConstraint.memoryConstraints.keys()) + assert self.localMemory in memoryOrder, f"Memory {self.localMemory} does not exist in the tensor memory constraint {tensorMemoryConstraint}" + if len(memoryOrder) < 2: + return True + return self.localMemory in memoryOrder[:2] - tilePtrBuffer._type = _type - tilePtrBuffer._instance = tilePtrBuffer._type(newPtrName, ctxt) - tilePtrBuffer._memoryLevel = sourceMemoryLevel + def _generateDmaTransferCalls(self, ctxt: NetworkContext, tensorName: str, transfers: List[HyperRectangle], + tileIdxVar: str, localBuffer: VariableBuffer, externalBuffer: VariableBuffer, + direction: DmaDirection, future: Future) -> List[CodeSnippet]: + assert all(len(transfers[0].dims) == len(rect.dims) for rect in transfers), \ + "Currently supporting only rectangles of same rank" - tilePtrBuffer.allocTemplate = NodeTemplate("") - tilePtrBuffer.deallocTemplate = NodeTemplate("") - tilePtrBuffer.initTemplate = NodeTemplate(""" - ${type.referencedType.typeName} bu_${name} = 0; - ${type.referencedType.typeName}* ${name} = &bu_${name};""") + assert len(transfers[0].dims) > 0, "Expecting transfers of rank greater than 0" - 
return newPtrName + assert len(transfers[0].dims) == len(externalBuffer.shape), \ + "External buffer's rank should be equal to the internal buffer's" - def _hoistNumTiles(self, - ctxt: NetworkContext, - nodeName: str, - tilingSchedules: List[TilingSchedule], - sourceMemoryLevel: str = "L2") -> str: + anydimAdapter = AnydimAsyncDmaTransferAdapter(self.dma) - newPtrName = self.prefix + nodeName + "_numTiles" + initSnippets = anydimAdapter.transfer(ctxt, externalBuffer, localBuffer, transfers[0].dims, + stridesFromShape(externalBuffer.shape), + stridesFromShape(transfers[0].dims), direction, future, + math.prod(externalBuffer.shape,)) - numTiles = [len(tilingSchedule.outputLoadSchedule) for tilingSchedule in tilingSchedules] - cumNumTiles = [0] - for idx in list(range(len(numTiles))): - cumNumTiles.append(cumNumTiles[-1] + numTiles[idx]) + # Add allocation snippets + templates = [snippet.template for snippet in initSnippets] + opReprUpdates = [[] for _ in range(len(initSnippets))] - cb = ctxt.ConstantBuffer(newPtrName, [len(cumNumTiles)], values = cumNumTiles) - ctxt.add(cb, "global") + for rect in transfers: + snippets = anydimAdapter.transfer(ctxt, externalBuffer, localBuffer, rect.dims, + stridesFromShape(externalBuffer.shape), stridesFromShape(rect.dims), + direction, future, math.prod(externalBuffer.shape)) + for i, snippet in enumerate(snippets): + opReprUpdates[i].append(snippet.operatorRepresentation) - minType = None - if BasicDataTypes.uint8_t.checkValue(cumNumTiles): - minType = BasicDataTypes.uint8_t - elif BasicDataTypes.uint16_t.checkValue(cumNumTiles): - minType = BasicDataTypes.uint16_t - else: - minType = BasicDataTypes.uint32_t + tiledSnippets: List[CodeSnippet] = [ + CodeSnippet(*self._tileTemplate(ctxt, opReprUpdate, template, tileIdxVar, f"{tensorName}_")) + for template, opReprUpdate in zip(templates, opReprUpdates) + ] - cb._type = PointerClass(minType) - cb._instance = cb._type(newPtrName, ctxt) - cb._memoryLevel = sourceMemoryLevel + return 
tiledSnippets - return newPtrName + def _generateExternalReferenceUpdate(self, ctxt: NetworkContext, tensorName: str, transfers: List[HyperRectangle], + tileIdxVar: str, externalBuffer: VariableBuffer) -> Optional[CodeSnippet]: + externalBufferStrides = stridesFromShape(externalBuffer.shape) + offsets = [calculateFlatOffset(rect.offset, externalBufferStrides) for rect in transfers] + relativeOffsets = [_next - _prev for _prev, _next in zip(offsets[:-1], offsets[1:])] - def _hoistConstantAndReference(self, - ctxt: NetworkContext, - constBuf: ConstantBuffer, - operatorRepresentation: OperatorRepresentation, - nodeName: str, - operatorRepresentationName: str, - immediateType: Optional[Type[Immediate]] = None) -> Tuple[NetworkContext, Dict]: + if len(relativeOffsets) == 0 or all(offset == 0 for offset in relativeOffsets): + return None - if immediateType is None: - _type = PointerClass(BasicDataTypes.int32_t) + operatorRepresentation: OperatorRepresentation = { + "reference": externalBuffer.name, + "tileIdxVar": tileIdxVar, + "typeName": externalBuffer._type.referencedType.typeName, + } + + if all(relativeOffsets[0] == offset for offset in relativeOffsets): + operatorRepresentation["relativeOffset"] = relativeOffsets[0] + template = self._relativeOffsetReferenceUpdateTemplate + else: + relativeOffsets.append(0) # To have the same length as the number of tiles + buffer = self._hoistValues(ctxt, f'{tensorName}_relativeOffset', relativeOffsets) + operatorRepresentation["relativeOffset"] = buffer.name + operatorRepresentation["tileIdxVar"] = tileIdxVar + template = self._relativeOffsetReferenceUpdateTiledTemplate + + return CodeSnippet(template, operatorRepresentation) + + # TODO: Not super sure this should go here. It could be shared, but it seems a little bit too specific + # with the `isFinalMemory` thing. 
+ def _legalizeTransfers(self, transfers: List[HyperRectangle], outerShape: Tuple[int, ...], typeWidth: int, + isFinalMemoryLevel: bool) -> Tuple[List[HyperRectangle], Tuple[int, ...]]: + transfersCommonRank = max(len(rect.dims) for rect in transfers) + commonRank = max(transfersCommonRank, len(outerShape)) + outerShape = padShape(outerShape, commonRank) + + minOuterShape = None + + if isFinalMemoryLevel: + minimizedTransfers = [] + for rect in transfers: + paddedRect = HyperRectangle(padOffset(rect.offset, commonRank), padShape(rect.dims, commonRank)) + minRect, newMinOuterShape = minimizeRectangle(paddedRect, outerShape) + if minOuterShape is None: + minOuterShape = newMinOuterShape + else: + if minOuterShape != newMinOuterShape: + rectStr = "\n".join(str(trans) for trans in transfers[:transfers.index(rect)]) + raise RuntimeError(f"""Currently support a single minimal outer shape. +Old minOuterShape: {minOuterShape} vs. new minOuterShape {newMinOuterShape}. +New minOuterShape produced by outerDims: {outerShape} and rect: {rect}. 
+Old minOuterShape produced by outerDims: {outerShape} and rects: +{rectStr}""") + minimizedTransfers.append(minRect) else: - _type = PointerClass(immediateType) + minimizedTransfers = [HyperRectangle((0,), (int(np.prod(rect.dims)),)) for rect in transfers] + minOuterShape = (int(np.prod(outerShape)),) + + if minOuterShape is not None: + outerShape = minOuterShape + transfers = minimizedTransfers - name = constBuf.name + def sizeInBytes(length: int, typeWidth: int) -> int: + return int(np.ceil((length * typeWidth) / 8)) - ctxt.add(constBuf, "global") - constBuf._type = _type - constBuf._instance = constBuf._type(name, ctxt) - constBuf._users = [nodeName] - constBuf._memoryLevel = self.targetMemLevel + outerShape = outerShape[:-1] + (sizeInBytes(outerShape[-1], typeWidth),) - refName = name + "_ref" - reference = ctxt.hoistReference(name, refName) - ctxt.lookup(reference)._memoryLevel = self.targetMemLevel + inBytesTransfers = [] + for rect in transfers: + newOffset = rect.offset[:-1] + (sizeInBytes(rect.offset[-1], typeWidth),) + newDims = rect.dims[:-1] + (sizeInBytes(rect.dims[-1], typeWidth),) + inBytesTransfers.append(HyperRectangle(newOffset, newDims)) + transfers = inBytesTransfers - operatorRepresentation[operatorRepresentationName] = refName + return transfers, outerShape - return ctxt, operatorRepresentation + def _tileTemplate(self, ctxt: NetworkContext, perTileOpReprs: List[OperatorRepresentation], template: NodeTemplate, + tileIdxVar: str, prefix: str) -> Tuple[NodeTemplate, OperatorRepresentation]: + opRepr, hoistedNames = self._hoistOpReprUpdates(ctxt, perTileOpReprs, prefix) + if len(hoistedNames) > 0: + template = copy.deepcopy(template) + self.indexVars(template.template, hoistedNames, "tileIdxVar") + opRepr["tileIdxVar"] = tileIdxVar + return template, opRepr def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - - def 
unravelReference(ctxt: NetworkContext, name: str) -> str: - - if name not in ctxt.localObjects.keys() and name not in ctxt.globalObjects.keys(): - return name - - refBuffer = ctxt.lookup(name) - if not hasattr(refBuffer, "_referenceName"): - return name - - return unravelReference(ctxt, refBuffer._referenceName) - if isinstance(executionBlock, ClosureExecutionBlock): baseExecutionBlock = executionBlock.baseBlock else: @@ -190,25 +260,24 @@ def unravelReference(ctxt: NetworkContext, name: str) -> str: templateNode = possibleTemplateNodes[0] - operatorRepresentation = templateNode.operatorRepresentation - unravelRep = operatorRepresentation.copy() - for key in unravelRep.keys(): - - val = unravelRep[key] - if not isinstance(val, str): - continue - - unravelRep[key] = unravelReference(ctxt, val) + self._initPrefix(templateNode.operatorRepresentation['nodeName']) + operatorRepresentation = templateNode.operatorRepresentation template = templateNode.template + unraveledOpRepr = operatorRepresentation.copy() + for key, value in unraveledOpRepr.items(): + if ctxt.is_buffer(value): + buffer = ctxt.lookup(value) + assert isinstance(buffer, VariableBuffer) + unraveledOpRepr[key] = ctxt.unravelReference(buffer).name + variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( - nodeMemoryConstraint, self.targetMemLevel, ctxt, unravelRep) + nodeMemoryConstraint, self.localMemory, ctxt, unraveledOpRepr) - minimalVariableReplacement, newNodeRep = minimizeVariableReplacement(variableReplacement, - templateNode.operatorRepresentation) - for key, value in newNodeRep.items(): - templateNode.operatorRepresentation[key] = value + minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) + + operatorRepresentation.update(newOpRepr) ctxt, executionBlock, applicable = self.generateTilingLoop(ctxt, executionBlock, nodeMemoryConstraint, tilingSchedules, minimalVariableReplacement, @@ -216,4 +285,6 @@ def 
unravelReference(ctxt: NetworkContext, name: str) -> str: if applicable: ctxt, executionBlock = self.argStructGeneration.apply(ctxt, executionBlock, name) + self._deinitPrefix() + return ctxt, executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py new file mode 100644 index 0000000000..f1a6b1ca23 --- /dev/null +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from typing import List, Mapping, Optional, Sequence, Tuple, Type, TypeVar, Union + +import Deeploy.CommonExtensions.DataTypes as BasicDataTypes +from Deeploy.AbstractDataTypes import BaseType, PointerClass, VoidType +from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer, \ + _ReferenceBuffer +from Deeploy.TilingExtension.MemoryConstraints import TensorMemoryConstraint +from Deeploy.TilingExtension.TilingCodegen import TilingSchedule + +KT = TypeVar('KT') +VT = TypeVar('VT') + + +def dictOfArrays(arrayOfDicts: Sequence[Mapping[KT, VT]]) -> Mapping[KT, List[VT]]: + ret: Mapping[KT, List[VT]] = {} + for i, _dict in enumerate(arrayOfDicts): + if i == 0: + ret.update({key: [value] for key, value in _dict.items()}) + else: + assert set(ret.keys()) == set(_dict.keys()), "Keys should be the same" + for key, value in _dict.items(): + ret[key].append(value) + return ret + + +class TilingHoistingMixIn: + + _DEFAULT_HOIST_PREFIX = "TILING_CODEGEN_" + + def __init__(self, memory: str) -> None: + self.memory = memory + self._prefix = None + + def _initPrefix(self, nodeName: str) -> None: + self._prefix = f"{self._DEFAULT_HOIST_PREFIX}{self.memory}_{nodeName}_" + + def _deinitPrefix(self) -> None: + self._prefix = None + + @property + def prefix(self) -> str: + assert 
self._prefix is not None, "Prefix is not initialized!" + return self._prefix + + def _hoistValues(self, + ctxt: NetworkContext, + name: str, + values: List[int], + override_type: Optional[Type[BaseType]] = None) -> ConstantBuffer: + assert all(isinstance(value, int) for value in values) + cb = ctxt.ConstantBuffer(self.prefix + name, [len(values)], values) + ctxt.add(cb, 'global') + if override_type is not None: + cb._type = PointerClass(override_type) + else: + cb._type = PointerClass(BasicDataTypes.minimalIntegerType(values)) + cb._instance = cb._type(cb.name, ctxt) + cb._memoryLevel = self.memory + return cb + + def _hoistReference(self, + ctxt: NetworkContext, + name: str, + reference: VariableBuffer, + shape: Tuple[int, ...] = (1,), + offset: Union[int, str, VariableBuffer] = 0, + override_type: Optional[Type[BaseType]] = None) -> _ReferenceBuffer: + ref = ctxt.hoistReference(self.prefix + name, reference, shape, offset, override_type) + ref._memoryLevel = self.memory + return ref + + def _hoistTileNumAndIdxPtr(self, ctxt: NetworkContext, + tilingSchedules: List[TilingSchedule]) -> Tuple[ConstantBuffer, VariableBuffer]: + stepsNumTiles = [len(tilingSchedule.outputLoadSchedule) for tilingSchedule in tilingSchedules] + + cumulativeNumTiles = [0] + for numTiles in stepsNumTiles: + cumulativeNumTiles.append(cumulativeNumTiles[-1] + numTiles) + + tileNum = self._hoistValues(ctxt, "numTiles", cumulativeNumTiles) + + tileIdxPtr = ctxt.VariableBuffer(f"{self.prefix}tileIdxPtr", shape = [1]) + ctxt.add(tileIdxPtr, "local") + + tileIdxPtr._type = tileNum._type + tileIdxPtr._instance = tileIdxPtr._type(tileIdxPtr.name, ctxt) + # LMACAN: Intentionally don't annotate memory level so it gets allocated + # outside of the tiling loops + + tileIdxPtr.allocTemplate = NodeTemplate("") + tileIdxPtr.deallocTemplate = NodeTemplate("") + tileIdxPtr.initTemplate = NodeTemplate(""" + ${type.referencedType.typeName} bu_${name} = 0; + ${type.referencedType.typeName}* ${name} = 
&bu_${name};""") + + return (tileNum, tileIdxPtr) + + def _hoistOpReprUpdates(self, + ctxt: NetworkContext, + opReprs: List[OperatorRepresentation], + prefix: str = "") -> Tuple[OperatorRepresentation, List[str]]: + # Early exit if the opReprs list is empty because the following code assumes at least 1 opRepr is in the list + if len(opReprs) == 0: + return {}, [] + + newOpRepr = {} + hoistedReprNames = [] + for var, updates in dictOfArrays(opReprs).items(): + if all(update == updates[0] for update in updates): + newOpRepr[var] = updates[0] + else: + cb = self._hoistValues(ctxt, f"{prefix}{var}", updates) + newOpRepr[var] = cb.name + hoistedReprNames.append(var) + return newOpRepr, hoistedReprNames + + def _hoistMultibufferReferences(self, ctxt: NetworkContext, buffer: VariableBuffer, + tensorMemoryConstraint: TensorMemoryConstraint) -> List[_ReferenceBuffer]: + tensorName = tensorMemoryConstraint.tensorName + memoryConstraint = tensorMemoryConstraint.memoryConstraints[self.memory] + assert memoryConstraint.addrSpace is not None, "Assuming address space is set" + totalSize = memoryConstraint.addrSpace[1] - memoryConstraint.addrSpace[0] + assert isinstance(memoryConstraint.multiBufferCoefficient, + int), "Assuming multi buffer coefficient has been assigned" + assert totalSize % memoryConstraint.multiBufferCoefficient == 0, "Assuming total size is divisible by the multi buffer coefficient" + bufferSize = totalSize // memoryConstraint.multiBufferCoefficient + + assert memoryConstraint.multiBufferCoefficient == 2, "Multi buffer coefficient has to be equal to 2 since this is for double buffering" + assert memoryConstraint.shape is not None + assert len(memoryConstraint.shape) > 0 + assert isinstance(memoryConstraint.shape[0], int) + tileLength = math.prod(memoryConstraint.shape) + tileSize = int(math.ceil(tileLength * buffer._type.referencedType.typeWidth / 8)) + + assert bufferSize >= tileSize, f"Provided buffer size is not enough to fit the tile. 
Buffer size: {bufferSize}, tile size: {tileSize}" + + refs = [ + self._hoistReference( + ctxt, + f"{tensorName}_buffer_{i}", + buffer, + memoryConstraint.shape, + offset = i * bufferSize, + override_type = VoidType, + ) for i in range(memoryConstraint.multiBufferCoefficient) + ] + + return refs diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py index 6b4b297da3..09a4ef56eb 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingPrototypes.py @@ -1,31 +1,8 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: TilingPrototypes.py -# -# Last edited: 17.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Authors: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from abc import ABC, abstractmethod +from abc import ABC from dataclasses import dataclass from typing import List, Literal @@ -36,61 +13,19 @@ class TilingMetaInfo: nodeName: str nodeOps: int - numTiles: int + numTiles: str + totalNumTiles: int + tileIdxPtr: str tileIdxVar: str kernelLevelTiling: bool -_CodeSegmentType = List[CodeSnippet] - -_measureCycles = NodeTemplate(""" -${nodeName}_${measurementName}_measurements[${tileIdx}] = getCycles(); -""") - -_measurementArrayDeclaration = NodeTemplate(""" -uint32_t ${nodeName}_${measurementName}_measurements[${numTiles}]; -""") - -_printPrefixAndSufixDeclaration = NodeTemplate(""" -const static char ${nodeName}_prefix[] = "[${nodeName}][${buffering}][${nodeOps} ops][Tile "; -const static char ${nodeName}_suffix[] = " cycles \\n"; -""") - -_measureConditionSetup = NodeTemplate(""" -if(${cond}){ -""") - -_measureConditionEnd = NodeTemplate(""" -} -""") - -_printLoopSetup = NodeTemplate(""" -StopTimer(); -<% -current_level_num = nodeName[-1] -lower_level_num = str(int(current_level_num) - 1) -%> -for (int printLoopIdx = DeeployNetwork_TILING_REPLACED_L${lower_level_num}_${nodeName[:-3]}_numTiles[*DeeployNetwork_TILING_REPLACED_L${lower_level_num}_${nodeName[:-3]}_tileIdxPtr -1]; - printLoopIdx < DeeployNetwork_TILING_REPLACED_L${lower_level_num}_${nodeName[:-3]}_numTiles[*DeeployNetwork_TILING_REPLACED_L${lower_level_num}_${nodeName[:-3]}_tileIdxPtr]; - printLoopIdx++){ -""") -_printCycleDifference = NodeTemplate(r""" -printf("%s%u] %s%u%s", ${nodeName}_prefix,${tileIdx},"${flavorStr}", \ -${nodeName}_${endMeasurementName}_measurements[${tileIdx}] - ${nodeName}_${startMeasurementName}_measurements[${tileIdx}],${nodeName}_suffix); -""") - -_printLoopTeardown = NodeTemplate(""" -} -StartTimer(); -""") - - class PrototypeTilingMixIn(ABC): @classmethod def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - setupStatements: _CodeSegmentType, - teardownStatements: _CodeSegmentType) 
-> ExecutionBlock: + setupStatements: List[CodeSnippet], + teardownStatements: List[CodeSnippet]) -> ExecutionBlock: for transaction in reversed(setupStatements): executionBlock.addLeft(transaction.template, transaction.operatorRepresentation) @@ -102,53 +37,83 @@ def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: @classmethod def generateLoopCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - openLoopStatements: _CodeSegmentType, closeLoopStatements: _CodeSegmentType) -> ExecutionBlock: + openLoopStatements: List[CodeSnippet], ingressDMAStatements: List[CodeSnippet], + egressDMAStatements: List[CodeSnippet], + closeLoopStatements: List[CodeSnippet]) -> ExecutionBlock: - for transaction in reversed(openLoopStatements): + for transaction in reversed(openLoopStatements + ingressDMAStatements): executionBlock.addLeft(transaction.template, transaction.operatorRepresentation) - for transaction in closeLoopStatements: + for transaction in egressDMAStatements + closeLoopStatements: executionBlock.addRight(transaction.template, transaction.operatorRepresentation) return executionBlock @classmethod def generateAllTilingCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, - ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, - egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, - variableUpdates: _CodeSegmentType, openLoopStatements: _CodeSegmentType, - closeLoopStatements: _CodeSegmentType, setupStatements: _CodeSegmentType, - teardownStatements: _CodeSegmentType) -> ExecutionBlock: - - if not hasattr(cls, "generateInnerCode"): - raise Exception("You need to mix in a code gen strategy!") - - newExecutionBlock = cls.generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls, - ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls, - 
egressDMAWaitStatements, egressDMAUpdates, variableUpdates) - - newExecutionBlock = cls.generateLoopCode(newExecutionBlock, metaInfo, openLoopStatements, closeLoopStatements) - - newExecutionBlock = cls.generateSetupAndTeardownCode(newExecutionBlock, metaInfo, setupStatements, - teardownStatements) - - return newExecutionBlock + ingressDMAStatements: List[CodeSnippet], egressDMAStatements: List[CodeSnippet], + openLoopStatements: List[CodeSnippet], closeLoopStatements: List[CodeSnippet], + setupStatements: List[CodeSnippet], + teardownStatements: List[CodeSnippet]) -> ExecutionBlock: + executionBlock = cls.generateLoopCode(executionBlock, metaInfo, openLoopStatements, ingressDMAStatements, + egressDMAStatements, closeLoopStatements) -class TilingCodeGenMixin(ABC): - - @abstractmethod - def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, - ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, - egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, - variableUpdates: _CodeSegmentType) -> ExecutionBlock: + executionBlock = cls.generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, teardownStatements) return executionBlock class ProfilingPrototypeMixIn(ABC): + _measureCycles = NodeTemplate(""" + ${measurements}[${tileIdxVar}] = getCycles(); + """) + + _measurementArrayDeclaration = NodeTemplate(""" + uint32_t ${measurements}[${totalNumTiles}]; + """) + + _stringDeclaration = NodeTemplate(""" + const static char ${name}[] = "${string}"; + """) + + _printLoopSetup = NodeTemplate(""" + StopTimer(); + printf("===== Profiling ${nodeName} =====\\n"); + for (int ${profileIdxVar} = ((*${tileIdxPtr} > 0) ? 
${numTiles}[(*${tileIdxPtr} - 1)] : 0); + ${profileIdxVar} < ${numTiles}[*${tileIdxPtr}]; + ${profileIdxVar}++){ + """) + + _measurementDeclaration = NodeTemplate(""" + uint32_t ${measurement} = ${measurementsEnd}[${profileIdxVar}] - ${measurementsStart}[${profileIdxVar}]; + """) + + _printCycleDifference = NodeTemplate(""" + printf("%s%u] %s%6u%s", ${prefixStr}, ${profileIdxVar}, "${flavorStr}", \ + ${measurement}, ${suffixStr}); + """) + + _printCycleContribution = NodeTemplate(""" + uint32_t total = ${measurementInput} + ${measurementKernel} + ${measurementOutput}; + uint32_t dma = ${measurementInput} + ${measurementOutput}; + float overhead_percentage = (total == 0) ? 0 : dma * 100.0f / total; + float kernel_percentage = (total == 0) ? 0 : ${measurementKernel} * 100.0f / total; + printf("%s%u] Total :%6u cycles (%2.1f%% Kernel + %2.1f%% Overhead, %u + %u)\\n", ${prefixStr}, ${profileIdxVar}, total, kernel_percentage, overhead_percentage , ${measurementKernel}, dma); + """) + + _printLoopTeardown = NodeTemplate(""" + } + StartTimer(); + """) + + _measureConditionSetup = NodeTemplate(""" + if(${cond}){ + """) + + _measureConditionEnd = NodeTemplate(""" + } + """) @classmethod def measurementArrayDeclaration(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, @@ -156,26 +121,30 @@ def measurementArrayDeclaration(cls, executionBlock: ExecutionBlock, metaInfo: T nodeName = metaInfo.nodeName numTiles = metaInfo.numTiles + totalNumTiles = metaInfo.totalNumTiles nodeOps = metaInfo.nodeOps - measurementNameList = [ + measurementsList = [ "ingress_dma_wait_start", "ingress_dma_wait_end", "egress_dma_wait_start", "egress_dma_wait_end" ] if metaInfo.kernelLevelTiling: - measurementNameList = ["kernel_start", "kernel_end"] + measurementNameList + measurementsList = ["kernel_start", "kernel_end"] + measurementsList - for measurementName in measurementNameList: - executionBlock.addLeft(_measurementArrayDeclaration, { - "nodeName": nodeName, - "measurementName": 
measurementName, - "numTiles": numTiles + for measurements in measurementsList: + executionBlock.addLeft(cls._measurementArrayDeclaration, { + "measurements": f"{nodeName}_{measurements}_measurements", + "totalNumTiles": totalNumTiles }) - executionBlock.addLeft(_printPrefixAndSufixDeclaration, { - "nodeName": nodeName, - "nodeOps": nodeOps, - "buffering": bufferingStr + executionBlock.addLeft(cls._stringDeclaration, { + "name": f"{nodeName}_prefix", + "string": f"[{nodeName}][{bufferingStr}][{nodeOps} ops][Tile ", + }) + + executionBlock.addLeft(cls._stringDeclaration, { + "name": f"{nodeName}_suffix", + "string": " cycles \\n", }) return executionBlock @@ -185,270 +154,98 @@ def injectPrintCycleDiff(cls, executionBlock: ExecutionBlock, metaInfo: TilingMe numTiles = metaInfo.numTiles nodeName = metaInfo.nodeName + tileIdxPtr = metaInfo.tileIdxPtr + profileIdxVar = "PROFILING_I" - executionBlock.addRight(_printLoopSetup, {"numTiles": numTiles, "nodeName": nodeName}) + executionBlock.addRight(cls._printLoopSetup, { + "numTiles": numTiles, + "nodeName": nodeName, + "profileIdxVar": profileIdxVar, + "tileIdxPtr": tileIdxPtr, + }) executionBlock.addRight( - _printCycleDifference, { - "nodeName": nodeName, - "flavorStr": "Input DMA took ", - "startMeasurementName": "ingress_dma_wait_start", - "endMeasurementName": "ingress_dma_wait_end", - "tileIdx": "printLoopIdx" + cls._measurementDeclaration, { + "measurement": f"{nodeName}_ingress_dma_wait_measurement", + "measurementsStart": f"{nodeName}_ingress_dma_wait_start_measurements", + "measurementsEnd": f"{nodeName}_ingress_dma_wait_end_measurements", + "profileIdxVar": profileIdxVar, }) + if metaInfo.kernelLevelTiling: executionBlock.addRight( - _printCycleDifference, { - "nodeName": nodeName, - "flavorStr": "Kernel took ", - "startMeasurementName": "kernel_start", - "endMeasurementName": "kernel_end", - "tileIdx": "printLoopIdx" + cls._measurementDeclaration, { + "measurement": f"{nodeName}_kernel_measurement", + 
"measurementsStart": f"{nodeName}_kernel_start_measurements", + "measurementsEnd": f"{nodeName}_kernel_end_measurements", + "profileIdxVar": profileIdxVar, }) + executionBlock.addRight( - _printCycleDifference, { - "nodeName": nodeName, - "flavorStr": "Output DMA took ", - "startMeasurementName": "egress_dma_wait_start", - "endMeasurementName": "egress_dma_wait_end", - "tileIdx": "printLoopIdx" + cls._measurementDeclaration, { + "measurement": f"{nodeName}_egress_dma_wait_measurement", + "measurementsStart": f"{nodeName}_egress_dma_wait_start_measurements", + "measurementsEnd": f"{nodeName}_egress_dma_wait_end_measurements", + "profileIdxVar": profileIdxVar, }) - executionBlock.addRight(_printLoopTeardown, {}) - - return executionBlock - - @classmethod - def kernelProfilingWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock: - nodeName = metaInfo.nodeName - tileIdxVar = metaInfo.tileIdxVar - - if metaInfo.kernelLevelTiling: - executionBlock.addLeft(_measureCycles, { - "nodeName": nodeName, - "measurementName": "kernel_start", - "tileIdx": tileIdxVar - }) - executionBlock.addRight(_measureCycles, { - "nodeName": nodeName, - "measurementName": "kernel_end", - "tileIdx": tileIdxVar + executionBlock.addRight( + cls._printCycleDifference, { + "prefixStr": f"{nodeName}_prefix", + "suffixStr": f"{nodeName}_suffix", + "flavorStr": "Pre-Kernel :", + "measurement": f"{nodeName}_ingress_dma_wait_measurement", + "profileIdxVar": profileIdxVar, }) - return executionBlock - - -class SingleBufferingTilingMixIn(PrototypeTilingMixIn, TilingCodeGenMixin): - - @classmethod - def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, - ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, - egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, - variableUpdates: _CodeSegmentType) -> 
ExecutionBlock: - - # Structure: - # Update DMA Structs - # Transfer in tiles (async) - # Update tile variables - # Wait for tiles - - # Kernel execution - - # Update DMA Structs - # Transfer out tiles (async) - # Wait for out transfers - - for transaction in reversed(ingressDMAUpdates + ingressDMATransferCalls + variableUpdates + - ingressDMAWaitStatements): - executionBlock.addLeft(transaction.template, transaction.operatorRepresentation) - - for transaction in (egressDMAUpdates + egressDMATransferCalls + egressDMAWaitStatements): - executionBlock.addRight(transaction.template, transaction.operatorRepresentation) - - return executionBlock - - -class ProfilingSingleBufferingTilingMixIn(SingleBufferingTilingMixIn, ProfilingPrototypeMixIn): - - @classmethod - def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - setupStatements: _CodeSegmentType, - teardownStatements: _CodeSegmentType) -> ExecutionBlock: - - executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, - teardownStatements) - - executionBlock = cls.measurementArrayDeclaration(executionBlock, metaInfo, bufferingStr = "SB") - - executionBlock = cls.injectPrintCycleDiff(executionBlock, metaInfo) - - return executionBlock - - @classmethod - def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, - ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, - egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, - variableUpdates: _CodeSegmentType) -> ExecutionBlock: - - nodeName = metaInfo.nodeName - tileIdxVar = metaInfo.tileIdxVar - - executionBlock = cls.kernelProfilingWrap(executionBlock, metaInfo) - - _ingressDMAWaitStatements = [] - _ingressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "nodeName": nodeName, - "measurementName": 
"ingress_dma_wait_start", - "tileIdx": tileIdxVar - })) - _ingressDMAWaitStatements += ingressDMAWaitStatements - _ingressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "nodeName": nodeName, - "measurementName": "ingress_dma_wait_end", - "tileIdx": tileIdxVar - })) - - _egressDMAWaitStatements = [] - _egressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "nodeName": nodeName, - "measurementName": "egress_dma_wait_start", - "tileIdx": tileIdxVar - })) - _egressDMAWaitStatements += egressDMAWaitStatements - _egressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "nodeName": nodeName, - "measurementName": "egress_dma_wait_end", - "tileIdx": tileIdxVar - })) - - executionBlock = super().generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls, - _ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls, - _egressDMAWaitStatements, egressDMAUpdates, variableUpdates) - - return executionBlock - - -class DoubleBufferingTilingMixIn(PrototypeTilingMixIn, TilingCodeGenMixin): - - @classmethod - def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, - ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, - egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, - variableUpdates: _CodeSegmentType) -> ExecutionBlock: - - # Structure: - - # Update input DMA Structs - # Update tile variables - # Wait for current input tiles - # Transfer in next input tiles (async) - # Update output DMA Structs - # Wait for current output tiles - - # Kernel execution - - # Transfer out tiles (async) - - for transaction in reversed(ingressDMAWaitStatements + ingressDMAUpdates + ingressDMATransferCalls + - variableUpdates + egressDMAWaitStatements + egressDMAUpdates): - executionBlock.addLeft(transaction.template, transaction.operatorRepresentation) - - for transaction in 
egressDMATransferCalls: - executionBlock.addRight(transaction.template, transaction.operatorRepresentation) - - return executionBlock - - -class ProfilingDoubleBufferingTilingMixIn(DoubleBufferingTilingMixIn, ProfilingPrototypeMixIn): - - @classmethod - def generateSetupAndTeardownCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - setupStatements: _CodeSegmentType, - teardownStatements: _CodeSegmentType) -> ExecutionBlock: - - nodeName = metaInfo.nodeName - numTiles = metaInfo.numTiles - - executionBlock.addLeft(_measureCycles, { - "nodeName": nodeName, - "measurementName": "ingress_dma_wait_start", - "tileIdx": 0 - }) + if metaInfo.kernelLevelTiling: + executionBlock.addRight( + cls._printCycleDifference, { + "prefixStr": f"{nodeName}_prefix", + "suffixStr": f"{nodeName}_suffix", + "flavorStr": "Kernel :", + "measurement": f"{nodeName}_kernel_measurement", + "profileIdxVar": profileIdxVar, + }) - executionBlock = cls.measurementArrayDeclaration(executionBlock, metaInfo, bufferingStr = "DB") + executionBlock.addRight( + cls._printCycleDifference, { + "prefixStr": f"{nodeName}_prefix", + "suffixStr": f"{nodeName}_suffix", + "flavorStr": "Post-Kernel:", + "measurement": f"{nodeName}_egress_dma_wait_measurement", + "profileIdxVar": profileIdxVar, + }) - executionBlock.addRight(_measureCycles, { - "nodeName": nodeName, - "measurementName": "egress_dma_wait_start", - "tileIdx": numTiles - 1 - }) - executionBlock = super().generateSetupAndTeardownCode(executionBlock, metaInfo, setupStatements, - teardownStatements) - executionBlock.addRight(_measureCycles, { - "nodeName": nodeName, - "measurementName": "egress_dma_wait_end", - "tileIdx": numTiles - 1 - }) + # Total Time: Input + Kernel + Output + # Overhead: (Input + Output) / Total + if metaInfo.kernelLevelTiling: + executionBlock.addRight( + cls._printCycleContribution, { + "prefixStr": f"{nodeName}_prefix", + "measurementInput": f"{nodeName}_ingress_dma_wait_measurement", + "measurementKernel": 
f"{nodeName}_kernel_measurement", + "measurementOutput": f"{nodeName}_egress_dma_wait_measurement", + "profileIdxVar": profileIdxVar, + }) - executionBlock = cls.injectPrintCycleDiff(executionBlock, metaInfo) + executionBlock.addRight(cls._printLoopTeardown, {}) return executionBlock @classmethod - def generateInnerCode(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo, - ingressDMATransferCalls: _CodeSegmentType, ingressDMAWaitStatements: _CodeSegmentType, - ingressDMAUpdates: _CodeSegmentType, egressDMATransferCalls: _CodeSegmentType, - egressDMAWaitStatements: _CodeSegmentType, egressDMAUpdates: _CodeSegmentType, - variableUpdates: _CodeSegmentType) -> ExecutionBlock: - + def kernelProfilingWrap(cls, executionBlock: ExecutionBlock, metaInfo: TilingMetaInfo) -> ExecutionBlock: nodeName = metaInfo.nodeName tileIdxVar = metaInfo.tileIdxVar - executionBlock = cls.kernelProfilingWrap(executionBlock, metaInfo) - - _ingressDMAWaitStatements = [] - _ingressDMAWaitStatements.append(CodeSnippet(_measureConditionSetup, {"cond": f"{tileIdxVar} > 0"})) - _ingressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "nodeName": nodeName, - "measurementName": "ingress_dma_wait_start", - "tileIdx": tileIdxVar - })) - _ingressDMAWaitStatements.append(CodeSnippet(_measureConditionEnd, {})) - _ingressDMAWaitStatements += ingressDMAWaitStatements - _ingressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "nodeName": nodeName, - "measurementName": "ingress_dma_wait_end", - "tileIdx": tileIdxVar - })) - - _egressDMAWaitStatements = [] - _egressDMAWaitStatements.append(CodeSnippet(_measureConditionSetup, {"cond": f"{tileIdxVar} > 0"})) - _egressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "nodeName": nodeName, - "measurementName": "egress_dma_wait_start", - "tileIdx": f"{tileIdxVar} - 1" - })) - _egressDMAWaitStatements += egressDMAWaitStatements - _egressDMAWaitStatements.append( - CodeSnippet(_measureCycles, { - "nodeName": nodeName, - 
"measurementName": "egress_dma_wait_end", - "tileIdx": f"{tileIdxVar} - 1" - })) - _egressDMAWaitStatements.append(CodeSnippet(_measureConditionEnd, {})) - - executionBlock = super().generateInnerCode(executionBlock, metaInfo, ingressDMATransferCalls, - _ingressDMAWaitStatements, ingressDMAUpdates, egressDMATransferCalls, - _egressDMAWaitStatements, egressDMAUpdates, variableUpdates) + if metaInfo.kernelLevelTiling: + executionBlock.addLeft(cls._measureCycles, { + "measurements": f"{nodeName}_kernel_start_measurements", + "tileIdxVar": tileIdxVar + }) + executionBlock.addRight(cls._measureCycles, { + "measurements": f"{nodeName}_kernel_end_measurements", + "tileIdxVar": tileIdxVar + }) return executionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py index fd910cc16b..76eacd10dd 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py @@ -1,215 +1,209 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: TilingVariableReplacement.py -# -# Last edited: 28.09.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. import copy -from typing import Dict, List, Tuple, Type +import itertools +from typing import List, Tuple -from mako.parsetree import Expression, Node, Text - -from Deeploy.AbstractDataTypes import Pointer +from Deeploy.AbstractDataTypes import Struct from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureExecutionBlock from Deeploy.CommonExtensions.CodeTransformationPasses.IntrospectiveCodeTransformation import \ IntrospectiveCodeTransformationMixIn from Deeploy.DeeployTypes import CodeGenVerbosity, CodeSnippet, CodeTransformationPass, ExecutionBlock, \ - NetworkContext, NodeTemplate, OperatorRepresentation, TransientBuffer, _NoVerbosity + NetworkContext, NodeTemplate, OperatorRepresentation, TransientBuffer, VariableBuffer, _NoVerbosity, \ + _ReferenceBuffer +from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import TilingHoistingMixIn from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TilerExtension import Tiler from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, minimizeVariableReplacement -class TilingVariableReplacement(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): - - _prefix = "TILING_REPLACED_" +class TilingVariableReplacement(CodeTransformationPass, IntrospectiveCodeTransformationMixIn, TilingHoistingMixIn): def __init__(self, targetMemLevel: str): self.targetMemLevel = targetMemLevel - self._name: str + TilingHoistingMixIn.__init__(self, targetMemLevel) @property - def prefix(self): - return self._prefix + f"{self._name}_" + self.targetMemLevel + "_" + def arenaName(self): + return f"{Tiler.arenaName}_{self.targetMemLevel}" - def _dereferencePointer(self, nodes: List[Node], name: str) -> List[Node]: - instanceIdxs = [idx for idx, node in enumerate(nodes) if isinstance(node, Expression) and 
node.text == name] + def _arenaAllocate(self, ctxt: NetworkContext, buffer: VariableBuffer, offset: int) -> VariableBuffer: + arena = ctxt.lookup(self.arenaName) + buffer.allocTemplate = NodeTemplate(" \ + ${type.typeName} ${name} = (${type.typeName}) " + f"((char*){str(arena._instance)} + {offset});") + buffer.deallocTemplate = NodeTemplate("") + return buffer - for offset, idx in enumerate(instanceIdxs): - text = Text("*", source = "*", lineno = 0, pos = 0, filename = None) - nodes.insert(offset + idx, text) - - return nodes + def _replaceTransients(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, + nodeMemoryConstraint: NodeMemoryConstraint) -> NetworkContext: + for value in operatorRepresentation.values(): + if not (isinstance(value, str) and ctxt.is_local(value)): + continue - def _replaceImmediate(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, - variableReplacement: Tuple[str, - List], dataType: Type[Pointer]) -> Tuple[NetworkContext, Dict]: + buffer = ctxt.lookup(value) - varName = variableReplacement[0] - varVal = variableReplacement[1] + if not (isinstance(buffer, TransientBuffer) and buffer._memoryLevel == self.targetMemLevel): + continue - newConstName = self.prefix + varName - newRefName = self.prefix + "ref_" + varName + memoryConstraints = nodeMemoryConstraint.tensorMemoryConstraints[buffer.name].memoryConstraints + assert len(memoryConstraints) == 1, f"Tiled transient buffer {buffer.name} has more than one memory level!" + constraint = next(iter(memoryConstraints.values())) + assert constraint.addrSpace is not None, f"Address space of {constraint} cannot be None!" 
+ offset = constraint.addrSpace[0] + self._arenaAllocate(ctxt, buffer, offset) - cb = ctxt.ConstantBuffer(newConstName, shape = (len(varVal),), values = varVal) - ctxt.add(cb, "global") + return ctxt - cb._type = dataType - cb._instance = dataType(newConstName, ctxt) - cb._memoryLevel = self.targetMemLevel + def _replaceVariableReplacements(self, ctxt: NetworkContext, snippet: CodeSnippet, + variableReplacement: VariableReplacementScheme) -> NetworkContext: + operatorRepresentation = snippet.operatorRepresentation + template = snippet.template - reference = ctxt.hoistReference(newConstName, newRefName) - ctxt.lookup(reference)._memoryLevel = self.targetMemLevel + replacedVars = [] - operatorRepresentation[varName] = reference + for name, values in variableReplacement.perTileReplacements.items(): + # Case where we have already replaced the variable + if isinstance(operatorRepresentation[name], str): + continue + _type = variableReplacement.replacementTypes[name] + # LMACAN: Hoist values expects integers (should be the only thing we deal with for now...) 
+ intValues = [int(v) for v in values] + assert all(intV == v for intV, v in zip(intValues, values)), f"Received non-int values" + buff = self._hoistValues(ctxt, name, intValues, _type.referencedType) + ref = self._hoistReference(ctxt, name + "_ref", buff) + operatorRepresentation[name] = ref.name + replacedVars.append(name) - return ctxt, operatorRepresentation + self.dereferenceVars(template.template, replacedVars) - def _hoistTileReference(self, ctxt: NetworkContext, reference: str, name: str, offset: int) -> NetworkContext: + return ctxt - refName = ctxt.hoistReference(reference, name) - refBuf = ctxt.lookup(refName) + def _replaceTiledTensors(self, ctxt: NetworkContext, snippet: CodeSnippet, + tilingSchedule: TilingSchedule) -> NetworkContext: + operatorRepresentation = snippet.operatorRepresentation - staticBuf = ctxt.lookup(f"MEMORYARENA_{self.targetMemLevel}") + for name, offsets in itertools.chain(tilingSchedule.inputBaseOffsets.items(), + tilingSchedule.outputBaseOffsets.items()): + buffer = ctxt.lookup(operatorRepresentation[name]) + assert isinstance(buffer, VariableBuffer) + unraveledBuffer = ctxt.unravelReference(buffer) - refBuf.allocTemplate = NodeTemplate(" \ - ${type.typeName} ${name} = (${type.typeName}) " + f"((char*){str(staticBuf._instance)} + {offset});") - refBuf._memoryLevel = self.targetMemLevel + ref = self._hoistReference(ctxt, name + "_ref", unraveledBuffer) + ref = self._arenaAllocate(ctxt, ref, offsets[0]) + operatorRepresentation[name] = ref.name return ctxt - def _replaceReferences(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, - tilingSchedule: TilingSchedule, name: str) -> Tuple[NetworkContext, Dict]: + def apply(self, + ctxt: NetworkContext, + executionBlock: ExecutionBlock, + name: str, + verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: + self._initPrefix(name) - def unravelOldRef(refName): - oldBuf = ctxt.lookup(refName) - if hasattr(oldBuf, "_referenceName"): - 
return unravelOldRef(oldBuf._referenceName) - return oldBuf.name + if isinstance(executionBlock, ClosureExecutionBlock): + baseExecutionBlock = executionBlock.baseBlock + else: + baseExecutionBlock = executionBlock - newRefName = self.prefix + "ref_" + name - oldRefName = operatorRepresentation[name] + patternMemoryConstraint = baseExecutionBlock.patternMemoryConstraint - if name in tilingSchedule.inputBaseOffsets: - offset = tilingSchedule.inputBaseOffsets[name] - elif name in tilingSchedule.outputBaseOffsets: - offset = tilingSchedule.outputBaseOffsets[name] - else: - raise RuntimeError(f"Name {name} not found in TilingSchedule {tilingSchedule}") + if patternMemoryConstraint is None: + return ctxt, executionBlock - unravelRef = unravelOldRef(oldRefName) + assert len(patternMemoryConstraint.nodeConstraints) == 1, "Only layerwise supported for now!" + #assert len(executionBlock.codeSnippets) == 1, "Only layerwise supported for now!" - ctxt = self._hoistTileReference(ctxt, unravelRef, newRefName, offset[0]) - operatorRepresentation[name] = newRefName + nodeMemoryConstraint = patternMemoryConstraint.nodeConstraints[0] - return ctxt, operatorRepresentation + possibleSnippets = [ + node for node in baseExecutionBlock.codeSnippets if hasattr(node.template, 'tileConstraint') + ] - def _replaceTransients(self, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation, - nodeMemoryConstraint: NodeMemoryConstraint, name: str) -> Tuple[NetworkContext, Dict]: + assert len(possibleSnippets) == 1, "More than one template node with TCF found" - memoryConstraints = nodeMemoryConstraint.tensorMemoryConstraints[operatorRepresentation[name]].memoryConstraints - assert len(memoryConstraints - ) == 1, f"Tiled transient buffer {operatorRepresentation[name]} has more than one memory level!" - key = list(memoryConstraints.keys())[0] - constraint = memoryConstraints[key] - assert constraint.addrSpace is not None, f"Address space of {constraint} cannot be None!" 
- offset = constraint.addrSpace[0] + snippet = possibleSnippets[0] + operatorRepresentation = snippet.operatorRepresentation + template = snippet.template - refBuf = ctxt.lookup(operatorRepresentation[name]) + unraveledOpRepr = { + key: ctxt.unravelReference(ctxt.lookup(value)).name if ctxt.is_buffer(value) else value + for key, value in operatorRepresentation.items() + } - if refBuf._memoryLevel != self.targetMemLevel: - return ctxt, operatorRepresentation + variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( + nodeMemoryConstraint, self.targetMemLevel, ctxt, unraveledOpRepr) - staticBuf = ctxt.lookup(f"MEMORYARENA_{self.targetMemLevel}") + minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) + operatorRepresentation.update(newOpRepr) - refBuf.allocTemplate = NodeTemplate(" \ - ${type.typeName} ${name} = (${type.typeName}) " + f"((char*){str(staticBuf._instance)} + {offset});") - refBuf.deallocTemplate = NodeTemplate("") - refBuf._memoryLevel = self.targetMemLevel + flatTilingSchedule = copy.copy(tilingSchedules[0]) + for tilingSchedule in tilingSchedules[1:]: + flatTilingSchedule += tilingSchedule - return ctxt, operatorRepresentation + ctxt = self._replaceVariableReplacements(ctxt, snippet, minimalVariableReplacement) + ctxt = self._replaceTiledTensors(ctxt, snippet, flatTilingSchedule) + ctxt = self._replaceTransients(ctxt, operatorRepresentation, nodeMemoryConstraint) - def _replaceTiledExpressions(self, ctxt: NetworkContext, templateNode: CodeSnippet, - variableReplacement: VariableReplacementScheme, tilingSchedule: TilingSchedule, - nodeMemoryConstraint: NodeMemoryConstraint) -> NetworkContext: + tilingReplacedRefMap = {} + for key in list(flatTilingSchedule.inputBaseOffsets.keys()) + list(flatTilingSchedule.outputBaseOffsets.keys()): + tilingReplacedRefMap[unraveledOpRepr[key]] = operatorRepresentation[key] - operatorRepresentation = 
templateNode.operatorRepresentation - template = templateNode.template + # Swap any original tensor occurances with the tiled targetMemLevel-local tensor + for codeSnippet in executionBlock.codeSnippets: + template, opRepr = codeSnippet.template, codeSnippet.operatorRepresentation - immediateList = [(key, value) - for key, value in variableReplacement.perTileReplacements.items() - if type(operatorRepresentation[key]) != str] + for key, value in opRepr.items(): + if isinstance(value, str) and value in tilingReplacedRefMap: + opRepr[key] = tilingReplacedRefMap[value] - inoutSchedule = {**tilingSchedule.inputBaseOffsets, **tilingSchedule.outputBaseOffsets} - variableList = [key for key, value in inoutSchedule.items() if type(operatorRepresentation[key]) == str] + if "closureStructArgs" in opRepr: + closureArgsStruct: Struct = opRepr['closureStructArgs'] + structDict = closureArgsStruct.value - transientBufferList = [] - for key, value in operatorRepresentation.items(): - if not isinstance(value, str): - continue - if (ctxt.is_local(value) and isinstance(ctxt.lookup(value), TransientBuffer)): - transientBufferList.append(key) + for key, value in structDict.items(): + if value.referenceName in tilingReplacedRefMap: + structDict[key] = type(value)(tilingReplacedRefMap[value.referenceName], ctxt) - parseTree = IntrospectiveCodeTransformationMixIn._generateParseTree(template) - newParseTree = copy.copy(parseTree) - nodes = parseTree.nodes + self._deinitPrefix() - newNodes = copy.copy(nodes) + return ctxt, executionBlock - for rep in immediateList: - ctxt, operatorRepresentation = self._replaceImmediate(ctxt, operatorRepresentation, rep, - variableReplacement.replacementTypes[rep[0]]) - newNodes = self._dereferencePointer(newNodes, rep[0]) - for rep in variableList: - ctxt, operatorRepresentation = self._replaceReferences(ctxt, operatorRepresentation, tilingSchedule, rep) +class TilingVariableReplacementUpdate(CodeTransformationPass, IntrospectiveCodeTransformationMixIn, + 
TilingHoistingMixIn): - for rep in transientBufferList: - ctxt, operatorRepresentation = self._replaceTransients(ctxt, operatorRepresentation, nodeMemoryConstraint, - rep) + _updateReferenceTemplate = NodeTemplate(""" + // UPDATE VARIABLE ${reference} + *${reference} = ${baseReference}[${tileIdxVar}]; + """) - newParseTree.nodes = newNodes - IntrospectiveCodeTransformationMixIn._reconstructCode(template, newParseTree) + def __init__(self, targetMemLevel: str, tileIdxVar: str = "TILING_I"): + super().__init__() + self.tileIdxVar = tileIdxVar + self.targetMemLevel = targetMemLevel - return ctxt + def _generateVariableUpdates(self, variableReplacement: VariableReplacementScheme, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[CodeSnippet]: + updates = [] + for key in variableReplacement.perTileReplacements.keys(): + ref = ctxt.lookup(operatorRepresentation[key]) + assert isinstance(ref, _ReferenceBuffer) + updates.append( + CodeSnippet(self._updateReferenceTemplate, { + "reference": ref.name, + "tileIdxVar": self.tileIdxVar, + "baseReference": ref._referenceName + })) + return updates def apply(self, ctxt: NetworkContext, executionBlock: ExecutionBlock, name: str, verbose: CodeGenVerbosity = _NoVerbosity) -> Tuple[NetworkContext, ExecutionBlock]: - - def unravelReference(ctxt: NetworkContext, name: str) -> str: - - if name not in ctxt.localObjects.keys() and name not in ctxt.globalObjects.keys(): - return name - - refBuffer = ctxt.lookup(name) - if not hasattr(refBuffer, "_referenceName"): - return name - - return unravelReference(ctxt, refBuffer._referenceName) - - self._name = name - if isinstance(executionBlock, ClosureExecutionBlock): baseExecutionBlock = executionBlock.baseBlock else: @@ -221,61 +215,33 @@ def unravelReference(ctxt: NetworkContext, name: str) -> str: return ctxt, executionBlock assert len(patternMemoryConstraint.nodeConstraints) == 1, "Only layerwise supported for now!" 
- #assert len(executionBlock.codeSnippets) == 1, "Only layerwise supported for now!" nodeMemoryConstraint = patternMemoryConstraint.nodeConstraints[0] - possibleTemplateNodes = [ + possibleSnippets = [ node for node in baseExecutionBlock.codeSnippets if hasattr(node.template, 'tileConstraint') ] - assert len(possibleTemplateNodes) == 1, "More than one template node with TCF found" + assert len(possibleSnippets) == 1, "More than one template node with TCF found" - templateNode = possibleTemplateNodes[0] - operatorRepresentation = templateNode.operatorRepresentation + snippet = possibleSnippets[0] + operatorRepresentation = snippet.operatorRepresentation + template = snippet.template - unravelRep = operatorRepresentation.copy() - for key in unravelRep.keys(): + unraveledOpRepr = { + key: ctxt.unravelReference(ctxt.lookup(value)).name if ctxt.is_buffer(value) else value + for key, value in operatorRepresentation.items() + } - val = unravelRep[key] - if not isinstance(val, str): - continue - - unravelRep[key] = unravelReference(ctxt, val) + variableReplacement, _ = template.tileConstraint.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, + ctxt, unraveledOpRepr) - template = templateNode.template + minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) + operatorRepresentation.update(newOpRepr) - variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( - nodeMemoryConstraint, self.targetMemLevel, ctxt, unravelRep) + updates = self._generateVariableUpdates(minimalVariableReplacement, ctxt, operatorRepresentation) - minimalVariableReplacement, newNodeRep = minimizeVariableReplacement(variableReplacement, - templateNode.operatorRepresentation) - for key, value in newNodeRep.items(): - templateNode.operatorRepresentation[key] = value + for snippet in updates: + executionBlock.addLeft(snippet.template, snippet.operatorRepresentation) - flatTilingSchedule = 
copy.copy(tilingSchedules[0]) - for tilingSchedule in tilingSchedules[1:]: - flatTilingSchedule += tilingSchedule - - ctxt = self._replaceTiledExpressions(ctxt, templateNode, minimalVariableReplacement, flatTilingSchedule, - nodeMemoryConstraint) - - for codeSnippet in executionBlock.codeSnippets: - - template, nRep = codeSnippet.template, codeSnippet.operatorRepresentation - - if not "closureStructArgs" in nRep: - continue - - keyList = {} - - for key in list(flatTilingSchedule.inputBaseOffsets.keys()) + list( - flatTilingSchedule.outputBaseOffsets.keys()): - keyList[unravelRep[key]] = operatorRepresentation[key] - - for key in copy.copy(nRep['closureStructArgs'].value).keys(): - if nRep['closureStructArgs'].value[key].referenceName in keyList.keys(): - nRep['closureStructArgs'].value[key] = type(nRep['closureStructArgs'].value[key])( - keyList[nRep['closureStructArgs'].value[key].referenceName], ctxt) - - return ctxt, executionBlock + return super().apply(ctxt, executionBlock, name, verbose) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/__init__.py b/Deeploy/TilingExtension/CodeTransformationPasses/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/__init__.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/TilingExtension/GenericFlow.py b/Deeploy/TilingExtension/GenericFlow.py index 626bef0bf7..cdb2d5c728 100644 --- a/Deeploy/TilingExtension/GenericFlow.py +++ b/Deeploy/TilingExtension/GenericFlow.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: Flow.py -# -# Last edited: 28.07.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from abc import abstractmethod from dataclasses import dataclass diff --git a/Deeploy/TilingExtension/HtmlTemplates.py b/Deeploy/TilingExtension/HtmlTemplates.py index 9466fd966f..5755621489 100644 --- a/Deeploy/TilingExtension/HtmlTemplates.py +++ b/Deeploy/TilingExtension/HtmlTemplates.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna # -# File: HtmlTemplates.py -# -# Last edited: 20.03.2025 -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. # Create Monad that take a Deployer and make it TilerAware # Define Tiler Obj centralize all tilling related functionalities for a given deployer. diff --git a/Deeploy/TilingExtension/MemoryConstraintFlows.py b/Deeploy/TilingExtension/MemoryConstraintFlows.py index 4696498ef5..4e74945d92 100644 --- a/Deeploy/TilingExtension/MemoryConstraintFlows.py +++ b/Deeploy/TilingExtension/MemoryConstraintFlows.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MemoryConstraintFlows.py -# -# Last edited: 01.08.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import copy from collections import namedtuple diff --git a/Deeploy/TilingExtension/MemoryConstraints.py b/Deeploy/TilingExtension/MemoryConstraints.py index 0c12368250..756a6ccc48 100644 --- a/Deeploy/TilingExtension/MemoryConstraints.py +++ b/Deeploy/TilingExtension/MemoryConstraints.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MemoryConstraints.py -# -# Last edited: 27.07.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from __future__ import annotations @@ -42,7 +21,7 @@ def __init__(self, memoryLevel: str, size: Union[IntVar, int]): self.size: Union[int, IntVar] = size self.multiBufferCoefficient: Union[int, IntVar] = 1 - self.shape: Optional[Tuple[int]] = None + self.shape: Optional[Tuple[int, ...]] = None self.addrSpace: Optional[Tuple[int, int]] = None def __repr__(self) -> str: diff --git a/Deeploy/TilingExtension/MemoryScheduler.py b/Deeploy/TilingExtension/MemoryScheduler.py index a6f6a75bc4..e46f50e6f7 100644 --- a/Deeploy/TilingExtension/MemoryScheduler.py +++ b/Deeploy/TilingExtension/MemoryScheduler.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: MemoryScheduler.py -# -# Last edited: 06.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from __future__ import annotations @@ -33,8 +12,7 @@ import numpy as np from ortools.constraint_solver.pywrapcp import IntVar -from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ - _permuteList +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import _permute from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, TransientBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy from Deeploy.TilingExtension.MemoryConstraints import PatternMemoryConstraints, TensorMemoryConstraint @@ -540,7 +518,7 @@ def scheduleMemoryConstraints(self, def constraintTileBuffersWithOverlappingLifetime(self, tilerModel: TilerModel, ctxt: NetworkContext, patternMemoryConstraint: PatternMemoryConstraints, memoryHierarchy: MemoryHierarchy): - """This method adds the necessay constraints for tiling to be performed before the static memory allocation of the tile buffers. + """This method adds the necessary constraints for tiling to be performed before the static memory allocation of the tile buffers. To perform static memory allocation after tiling (i.e. decouple tiling and memory alloc), we need to do two assumptions 1. All tile buffers for each node have overlapping lifetime, so we can find their memory footprint by just summing their sizes and hence we don't need to know the specific memory allocation. This assumption is true as soon as we don't do tile several nodes together (ask me if you don't know what I mean here). 
@@ -575,7 +553,7 @@ def constraintTileBuffersWithOverlappingLifetime(self, tilerModel: TilerModel, c if memoryLevel.name == infoDict['memoryLevel']: sumExpr += infoDict['sizeVar'] * infoDict['typeWidthFactor'] * infoDict['multiBufferCoeff'] if sumExpr != 0: - tilerModel.addConstraint(sumExpr + constantTensorOffset <= memoryLevel.size) + tilerModel.addConstraint(sumExpr + constantTensorOffset, memoryLevel = memoryLevel) def getSymbolicCostName(self, patternIdx: int, memoryLevel: str) -> str: stringSuffix = self._stringSuffix + f"_{memoryLevel}" @@ -659,7 +637,7 @@ def permMatrix2permList(permMatrix: np.ndarray) -> List[int]: permList = permMatrix2permList(_permutationMatrix) if pattern != [] and len(pattern) > 1: - permPattern = _permuteList(pattern, permList) + permPattern = _permute(pattern, permList) else: permPattern = pattern diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py index eed22b0961..5b067b2ce9 100644 --- a/Deeploy/TilingExtension/TileConstraint.py +++ b/Deeploy/TilingExtension/TileConstraint.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: TileConstraint.py -# -# Last edited: 26.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. import copy from abc import abstractmethod @@ -31,12 +9,11 @@ import numpy as np from ortools.constraint_solver.pywrapcp import IntVar -#from Deeploy import TilerModel from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint, TensorMemoryConstraint from Deeploy.TilingExtension.TilerModel import TilerModel from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, MemoryTransfer, \ - TilingSchedule, VariableReplacementScheme, computeHyperRectangleList + TilingSchedule, VariableReplacementScheme, computeTileHyperRectangles class TileConstraint(): @@ -65,19 +42,15 @@ def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, @staticmethod def getBaseAddr(tilingSolution, targetMemLevel, name) -> List[Optional[int]]: + mc = tilingSolution.tensorMemoryConstraints[name].memoryConstraints[targetMemLevel] - block = tilingSolution.tensorMemoryConstraints[name].memoryConstraints[targetMemLevel] - - if block.addrSpace is None: + if mc.addrSpace is None: return [None] - baseAddr = block.addrSpace[0] - endAddr = block.addrSpace[1] - sol = [] - for it in range(block.multiBufferCoefficient): - addr = ((endAddr - baseAddr) // block.multiBufferCoefficient) * it + baseAddr - sol.append(addr) - return sol + start, end = mc.addrSpace + bufferSize = (end - start) // mc.multiBufferCoefficient + + return [start + bufferSize * i for i in range(mc.multiBufferCoefficient)] @staticmethod def extractBaseAddr(tilingSolution: NodeMemoryConstraint, targetMemLevel: str, @@ -102,9 +75,6 @@ def extractBaseAddr(tilingSolution: NodeMemoryConstraint, targetMemLevel: str, @staticmethod def sanitizeTilingSchedule(tilingSchedule: TilingSchedule) -> TilingSchedule: - - _tilingSchedule = tilingSchedule - for baseOffsetName, baseOffsetValue in 
tilingSchedule.inputBaseOffsets.copy().items(): if baseOffsetValue == [None]: for step in tilingSchedule.inputLoadSchedule: @@ -117,7 +87,7 @@ def sanitizeTilingSchedule(tilingSchedule: TilingSchedule) -> TilingSchedule: del step[baseOffsetName] del tilingSchedule.outputBaseOffsets[baseOffsetName] - return _tilingSchedule + return tilingSchedule @classmethod def wrapTilingSolution( @@ -144,14 +114,13 @@ def _offsetAdd(offsetA: Tuple[int, ...], offsetB: Tuple[int, ...]) -> Tuple[int, def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List[AbsoluteHyperRectangle], sourceMemoryLevel: str, targetMemoryLevel: str) -> Tuple[List[AbsoluteHyperRectangle], List[int]]: - solution = [] solutionLengths = [] for sourceCube in sourceCubes: memTransfer = getMemoryTransfer(tensorConstraint, sourceCube.rectangle, sourceMemoryLevel, targetMemoryLevel) - solutionCubes = computeHyperRectangleList(memTransfer) + solutionCubes = computeTileHyperRectangles(memTransfer) solutionAbsoluteCubes = [ AbsoluteHyperRectangle(rectangle = cube, absoluteOffset = _offsetAdd(sourceCube.absoluteOffset, cube.offset)) @@ -162,32 +131,29 @@ def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: List return solution, solutionLengths - assert len(tilingSolution.outputTensorMemoryConstraints.keys()) == 1, "Expected node to have only one output!" - varOut = list(tilingSolution.outputTensorMemoryConstraints.keys())[0] + assert len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!" 
+ + outVar, outTensorConstraint = next(iter(tilingSolution.outputTensorMemoryConstraints.items())) + memoryPath = list(outTensorConstraint.memoryConstraints.keys()) - outTensorConstraint = tilingSolution.tensorMemoryConstraints[varOut] - outTensorMemoryLevelPath = list(outTensorConstraint.memoryConstraints.keys()) - targetIdxs = [idx for idx, key in enumerate(outTensorMemoryLevelPath) if key == targetMemLevel] + assert targetMemLevel in memoryPath, \ + f"Target memory level {targetMemLevel} does not exist in the memory path {memoryPath}" - assert len(targetIdxs) == 1, f"Received more than one spec for memoryLevel {targetMemLevel}" - targetIdx = targetIdxs[0] + targetIdx = memoryPath.index(targetMemLevel) if targetIdx == 0: # SCHEREMO: Watch out - this happens if inputs are in L(N+1) but outputs only in L(N) targetIdx = 1 - fullShape = ctxt.lookup(varOut).shape - initialOffset = tuple([0] * len(fullShape)) + fullShape = ctxt.lookup(outVar).shape + initialOffset = (0,) * len(fullShape) outputCubes = [ AbsoluteHyperRectangle(rectangle = HyperRectangle(offset = initialOffset, dims = tuple(fullShape)), absoluteOffset = initialOffset) ] - for targetIdx in list(range(targetIdx + 1))[1:]: - sourceMemoryLevel = outTensorMemoryLevelPath[targetIdx - 1] - targetMemoryLevel = outTensorMemoryLevelPath[targetIdx] - outputCubes, solutionLengths = getCubeTransfers(outTensorConstraint, outputCubes, sourceMemoryLevel, - targetMemoryLevel) + for source, target in zip(memoryPath[:targetIdx], memoryPath[1:targetIdx + 1]): + outputCubes, solutionLengths = getCubeTransfers(outTensorConstraint, outputCubes, source, target) arrayOfCubes = [] _idx = 0 diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index abd3e38329..9b48d9456c 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# 
SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: TilerExtension.py -# -# Last edited: 09.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. # Create Monad that take a Deployer and make it TilerAware # Define Tiler Obj centralize all tilling related functionalities for a given deployer. 
@@ -45,6 +23,8 @@ from Deeploy.CommonExtensions.NetworkDeployers.NetworkDeployerWrapper import NetworkDeployerWrapper from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeBinding, NodeTemplate, ONNXLayer, Schedule, \ SubGraph, TransientBuffer +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Logging import SUCCESS_MARK from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper, \ MemoryLevelAwareDeployer, MemoryPlatform, MemoryPlatformWrapper, TargetMemoryLevelMapping @@ -58,6 +38,7 @@ from Deeploy.TilingExtension.TilerModel import TilerModel TilingSolution = List[PatternMemoryConstraints] +MemoryMap = Dict[str, List[List[MemoryBlock]]] _deallocTemplate = NodeTemplate("") @@ -71,7 +52,7 @@ class Tiler(): _MINIMALLOC_OUTPUT_FILENAME = "output_minimalloc" # Initialize with the list of TemplateTCFbinding - def __init__(self, memoryHierarchy: MemoryHierarchy): + def __init__(self, memoryHierarchy: MemoryHierarchy, testName: Optional[str] = None, workDir: Optional[str] = None): self.memoryHierarchy = memoryHierarchy self.tilerModel: Optional[TilerModel] = None @@ -85,6 +66,23 @@ def __init__(self, memoryHierarchy: MemoryHierarchy): self.memoryAllocStrategy: Literal["TetrisRandom", "TetrisCo-Opt", "MiniMalloc"] = "TetrisRandom" self.searchStrategy: Literal["min", "max", "random-max"] = "random-max" + if workDir is not None: + os.makedirs(workDir, exist_ok = True) + minimalloc_base = os.path.join(workDir, self._MINIMALLOC_INPUT_FILENAME) + minimalloc_output_base = os.path.join(workDir, self._MINIMALLOC_OUTPUT_FILENAME) + else: + minimalloc_base = self._MINIMALLOC_INPUT_FILENAME + minimalloc_output_base = self._MINIMALLOC_OUTPUT_FILENAME + + if testName is not None: + # VJUNG: Sanitize path + safe_test_name = testName.replace("/", "_").replace("\\", "_") + self._minimalloc_input = f"{minimalloc_base}_{safe_test_name}" + 
self._minimalloc_output = f"{minimalloc_output_base}_{safe_test_name}" + else: + self._minimalloc_input = minimalloc_base + self._minimalloc_output = minimalloc_output_base + @property def worstCaseBufferSize(self): return self._worstCaseBufferSize @@ -126,6 +124,12 @@ def plotSingleMemoryLevel(memoryLevel: MemoryLevel): for memoryMapStep in memoryMap[memoryLevel.name]: for buffer in memoryMapStep: + if not hasattr(buffer, "_addrSpace") or buffer._addrSpace is None: + log.warning( + f"Buffer {buffer.name} has no address space assigned, skipping it in the memory allocation plot." + ) + continue + fig.add_trace( go.Scatter(x = [ buffer._lifetime[0] - 0.5, buffer._lifetime[0] - 0.5, buffer._lifetime[1] + 0.5, @@ -251,7 +255,7 @@ def _convertCtxtToStaticSchedule(self, ctxt: NetworkContext, def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memoryLevel: str): - with open(f"{self._MINIMALLOC_INPUT_FILENAME}.csv", mode = "w", newline = "") as file: + with open(f"{self._minimalloc_input}.csv", mode = "w", newline = "") as file: writer = csv.writer(file, lineterminator = "\n") writer.writerow(["id", "lower", "upper", "size"]) for memoryBlock in memoryMap: @@ -285,19 +289,19 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor raise KeyError("MINIMALLOC_INSTALL_DIR symbol not found!") minimallocOutput = subprocess.run([ - f"{minimallocInstallDir}/minimalloc", f"--capacity={capacity}", - f"--input={self._MINIMALLOC_INPUT_FILENAME}.csv", f"--output={self._MINIMALLOC_OUTPUT_FILENAME}.csv" + f"{minimallocInstallDir}/minimalloc", f"--capacity={capacity}", f"--input={self._minimalloc_input}.csv", + f"--output={self._minimalloc_output}.csv" ], capture_output = True, text = True) if minimallocOutput.returncode != 0: - print( - f"\033[91mError: Memory allocator failed with return code {minimallocOutput.returncode} at memory level {memoryLevel} with capacity of {capacity} bytes \033[0m" + log.error( + f"Memory allocator failed 
with return code {minimallocOutput.returncode} at memory level {memoryLevel} with capacity of {capacity} bytes!" ) raise subprocess.CalledProcessError(minimallocOutput.returncode, " ".join(minimallocOutput.args)) - with open(f"{self._MINIMALLOC_OUTPUT_FILENAME}.csv", mode = "r", newline = "") as file: + with open(f"{self._minimalloc_output}.csv", mode = "r", newline = "") as file: reader = csv.reader(file) header = next(reader) for row in reader: @@ -307,23 +311,25 @@ def minimalloc(self, memoryMap, ctxt, nodeMemoryConstraint, capacity: int, memor return memoryMap - def computeTilingSchedule(self, ctxt: NetworkContext) -> Tuple[TilingSolution, Dict[str, List[List[MemoryBlock]]]]: - + def computeTilingSchedule(self, ctxt: NetworkContext) -> TilingSolution: assert self.tilerModel is not None and self.symbolicMemoryConstraints is not None, "Set up the model before trying to compute a schedule!" - collector = self.tilerModel.trySolveModel() - tilingSchedule = self._getTilingSolution(self.tilerModel, ctxt, collector, self.symbolicMemoryConstraints) - + tilingSolution = self._getTilingSolution(self.tilerModel, ctxt, collector, self.symbolicMemoryConstraints) if not self.memoryAllocStrategy == "MiniMalloc": + assert self.tilerModel is not None + log.debug(" - Extract Memory Allocation") self.innerMemoryScheduler.annotateSolution(ctxt, self.tilerModel) self.outerMemoryScheduler.annotateSolution(ctxt, self.tilerModel) + return tilingSolution + def computeMemoryMap(self, ctxt: NetworkContext, tilingSolution: TilingSolution) -> MemoryMap: memoryMap = {} for key in self.innerMemoryScheduler.memoryMap.keys(): memoryMap[key] = [*self.innerMemoryScheduler.memoryMap[key], *self.outerMemoryScheduler.memoryMap[key]] if self.memoryAllocStrategy == "MiniMalloc": + log.debug(" - Solve Memory Allocation with MiniMalloc") for memoryLevel in memoryMap.keys(): constantTensorOffset = self.outerMemoryScheduler.getConstantTensorOffset(ctxt, memoryLevel) if memoryLevel == 
self.memoryHierarchy._defaultMemoryLevel.name: @@ -334,11 +340,15 @@ def computeTilingSchedule(self, ctxt: NetworkContext) -> Tuple[TilingSolution, D for idx, memMap in enumerate(memoryMap[memoryLevel]): if len(memoryMap[memoryLevel][idx]) != 0: memoryMap[memoryLevel][idx] = self.minimalloc( - memMap, ctxt, tilingSchedule[idx].nodeConstraints[0], + memMap, ctxt, tilingSolution[idx].nodeConstraints[0], self.memoryHierarchy.memoryLevels[memoryLevel].size - constantTensorOffset, memoryLevel) - print(f"\033[92mMemory allocation sucessful!\033[0m") + log.info(f" {SUCCESS_MARK} Memory allocation successful!") - for idx, pattern in enumerate(tilingSchedule): + return memoryMap + + def annotateMemoryLevel(self, ctxt: NetworkContext, tilingSolution: TilingSolution, + memoryMap: Dict) -> NetworkContext: + for idx, pattern in enumerate(tilingSolution): for nodeIdx, nodeConstraint in enumerate(pattern.nodeConstraints): for tensorConstraint in nodeConstraint.tensorMemoryConstraints.values(): for memoryConstraint in tensorConstraint.memoryConstraints.values(): @@ -359,10 +369,7 @@ def computeTilingSchedule(self, ctxt: NetworkContext) -> Tuple[TilingSolution, D block = _block[0] memoryConstraint.addrSpace = block.addrSpace - - self._convertCtxtToStaticSchedule(ctxt, memoryMap) - - return tilingSchedule, memoryMap + return ctxt def setupModel(self, ctxt: NetworkContext, schedule: Schedule, layerBinding: OrderedDict[str, ONNXLayer], targetMemoryLevelMapping: TargetMemoryLevelMapping) -> NetworkContext: @@ -445,7 +452,8 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: NetworkCo if not isinstance(ctxt.lookup(tensorName), TransientBuffer): - tensorShapeLen = len(ctxt.lookup(tensorName).shape) + tensorShapeLen = 1 if isinstance(ctxt.lookup(tensorName).shape, int) else len( + ctxt.lookup(tensorName).shape) newShape: List[int] = [] if isinstance(memoryConstraint.size, int): @@ -456,7 +464,7 @@ def _resolveTensorMemoryConstraint(self, tilerModel: TilerModel, ctxt: 
NetworkCo newShape.append( self.tilerModel._resolveVariable(tilerModel.getTensorDimVar(tensorName, i, copyIdx))) - newMemoryConstraint.shape = tuple(newShape) + newMemoryConstraint.shape = (newShape,) if isinstance(newShape, int) else tuple(newShape) solvedTensorConstraint.addMemoryConstraint(newMemoryConstraint) @@ -911,6 +919,19 @@ def assertUniformMemoryLevelAllocation(self, ctxt: NetworkContext, defaultMemory return False return True + def testTilingSolutionCorrectness(self, tilingSolution: TilingSolution) -> None: + # LMACAN: Assert buffer sizes are word aligned as per comment in MemoryScheduler.py:MemoryScheduler._buildCostVector() + byteAlignment = MemoryScheduler.byteAlignment + for patternMemoryConstraint in tilingSolution: + for nodeMemoryConstraint in patternMemoryConstraint.nodeConstraints: + for tensorMemoryConstraint in nodeMemoryConstraint.tensorMemoryConstraints.values(): + for memoryConstraint in tensorMemoryConstraint.memoryConstraints.values(): + if memoryConstraint.addrSpace is not None: + assert isinstance(memoryConstraint.multiBufferCoefficient, int) + bufferSize = (memoryConstraint.addrSpace[1] - + memoryConstraint.addrSpace[0]) // memoryConstraint.multiBufferCoefficient + assert bufferSize % byteAlignment == 0, f"Buffer in {memoryConstraint} is not {byteAlignment} byte aligned" + def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]], graph: gs.Graph, schedule: Schedule) -> None: @@ -919,8 +940,8 @@ def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]] } # JUNGVI: Assert output buffers are alive until the end - for outputBuffer in graph.outputs: - assert memoryBlockMap[outputBuffer.name]._lifetime[-1] == len( + for tensor in graph.outputs: + assert memoryBlockMap[tensor.name]._lifetime[-1] == len( schedule), "Invalid memory map! Output buffer is not alive at the last step!" 
# JUNGVI: Assert input buffers are alive at the beginning @@ -940,26 +961,27 @@ def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]] class TilerDeployerWrapper(NetworkDeployerWrapper): - def __init__(self, deployer: Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper], tilerCls: Type[Tiler] = Tiler): + def __init__(self, + deployer: Union[MemoryLevelAwareDeployer, MemoryDeployerWrapper], + tilerCls: Type[Tiler] = Tiler, + testName: Optional[str] = None, + workDir: Optional[str] = None): super().__init__(deployer) assert isinstance(self.Platform, (MemoryPlatform, MemoryPlatformWrapper)), \ f"Platform should be a MemoryPlatform or MemoryPlatformWrapper! Got {type(self.Platform).__name__}" - self.tiler = tilerCls(self.Platform.memoryHierarchy) + self.tiler = tilerCls(self.Platform.memoryHierarchy, testName = testName, workDir = workDir) @property def worstCaseBufferSize(self): - maxAddr: Dict[str, int] = self.tiler.worstCaseBufferSize - - # WIESEP: Memory map form tiler does not include inputs and outputs - for node in (self.inputs() + self.outputs()): - maxAddr[node._memoryLevel] += np.prod(node.shape) * node._type.referencedType.typeWidth // 8 + return self.tiler.worstCaseBufferSize - return maxAddr + def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optional[MemoryMap] = None): + assert (tilingSolution is None and memoryMap is None) or (tilingSolution is not None and memoryMap is not None), \ + "You need to provide both the manual tilingSolution and the memoryMap to override tiling." - def tile(self, tilingSolution: Optional[TilingSolution] = None): - if tilingSolution is None: - schedule = self.scheduler(self.graph) + schedule = self.scheduler(self.graph) + if tilingSolution is None and memoryMap is None: # JUNGVI: Currently using MiniMalloc is only supported for layer-wise execution and all tensors in the default memory level. 
if self.tiler.memoryAllocStrategy == "MiniMalloc": assert self.tiler.assertLayerWiseTiling(schedule), "Using MiniMalloc and DFT is not supported!" @@ -967,15 +989,31 @@ def tile(self, tilingSolution: Optional[TilingSolution] = None): self.ctxt, self.Platform.memoryHierarchy._defaultMemoryLevel.name ), "All tensors have to be in the default memory level when using MiniMalloc!" + log.debug(" - Setup Constraint Model") self.tiler.setupModel(ctxt = self.ctxt, schedule = schedule, layerBinding = self.layerBinding, targetMemoryLevelMapping = self.getTargetMemoryLevelMapping()) - tilingSolution, memoryMap = self.tiler.computeTilingSchedule(self.ctxt) - if self.tiler.visualizeMemoryAlloc: - self.tiler.plotMemoryAlloc(memoryMap, self.ctxt, self.deeployStateDir, self.Platform.memoryHierarchy) + tilingSolution = self.tiler.computeTilingSchedule(self.ctxt) + + memoryMap = self.tiler.computeMemoryMap(self.ctxt, tilingSolution) + + assert tilingSolution is not None and memoryMap is not None - self.tiler.testMemoryMapCorrectness(memoryMap, self.graph, schedule) + log.debug(" - Test Tiling Solution Correctness") + self.tiler.testTilingSolutionCorrectness(tilingSolution) + + log.debug(" - Annotate Memory Levels") + self.tiler.annotateMemoryLevel(self.ctxt, tilingSolution, memoryMap) + + self.ctxt = self.tiler._convertCtxtToStaticSchedule(self.ctxt, memoryMap) + + if self.tiler.visualizeMemoryAlloc: + log.info(f" > Export Memory Allocation Visualization to {self.deeployStateDir}") + self.tiler.plotMemoryAlloc(memoryMap, self.ctxt, self.deeployStateDir, self.Platform.memoryHierarchy) + + log.debug(" - Test Memory Map Correctness") + self.tiler.testMemoryMapCorrectness(memoryMap, self.graph, schedule) # SCHEREMO: Annotate execution block with solution for layer, pattern in zip(self.layerBinding.values(), tilingSolution): @@ -987,9 +1025,25 @@ def bind(self): if not super().bind(): return False + log.info("- Performing Tiling and Memory Allocation") self.tile() return True + def 
_printMemorySummary(self): + log.info("") + log.info("Memory Usage Report:") + log.info(f" {'Level':<14} {'Capacity (bytes)':>10} {'Total':>10} ( Static + Dynamic ) (Usage )") + log.info(" " + "-" * 78) + + for level, dynamicSize in self.worstCaseBufferSize.items(): + staticSize = self.tiler.outerMemoryScheduler.getConstantTensorOffset(self.ctxt, level) + capacity = self.tiler.memoryHierarchy.memoryLevels[level].size + total = staticSize + dynamicSize + + log.info(f" {level:<20} {capacity:10,} {total:10,d} " + f"({staticSize:10,d} + {dynamicSize:10,d}) " + f"({total / capacity * 100:5.1f}%)") + def TilingReadyNodeBindings(nodeBindings: List[NodeBinding], tileConstraint: TileConstraint) -> List[NodeBinding]: ''' diff --git a/Deeploy/TilingExtension/TilerModel.py b/Deeploy/TilingExtension/TilerModel.py index fb3645e0f6..db83974f0c 100644 --- a/Deeploy/TilingExtension/TilerModel.py +++ b/Deeploy/TilingExtension/TilerModel.py @@ -1,29 +1,8 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: TilerModel.py -# -# Last edited: 25.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+import logging from dataclasses import dataclass from pprint import pformat from typing import Dict, List, Literal, Optional, Tuple, Union @@ -32,6 +11,7 @@ from ortools.constraint_solver.pywrapcp import IntExpr, IntVar, SolutionCollector, Solver from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryLevel _COPYIDXSUFFIX = "_copyIdx_" @@ -124,6 +104,10 @@ def addConstraint(self, constraintExpression: IntExpr, memoryLevel: Optional[MemoryLevel] = None, strategy: Optional[AddConstraintStrategy] = None): + # Skip TrueConstraints + if constraintExpression.DebugString() == "TrueConstraint()": + return + if isinstance(strategy, PerformanceHint): if memoryLevel is None: self._performanceConstraints.append((strategy.priority, constraintExpression)) @@ -163,7 +147,9 @@ def addTensorDimToModel(self, ctxt: NetworkContext, tensorName: str, copyIdx: Op ''' tensor = ctxt.lookup(tensorName) - for idx, dim in enumerate(tensor.shape): + for idx, dim in enumerate([ + tensor.shape, + ] if isinstance(tensor.shape, int) else tensor.shape): varName = f"{tensor.name}_dim_{idx}" + self._getSuffix(copyIdx) @@ -186,7 +172,9 @@ def addTensorNumOfEltToModel(self, ctxt: NetworkContext, tensorName: str, copyId tensorDimProductExpr = 1 - for idx, _ in enumerate(tensor.shape): + for idx, _ in enumerate([ + tensor.shape, + ] if isinstance(tensor.shape, int) else tensor.shape): varNameIdx = f"{tensor.name}_dim_{idx}" + self._getSuffix(copyIdx) tensorDimProductExpr *= self._variables[varNameIdx] @@ -404,9 +392,13 @@ def _solveModel( timeLimit = self._model.TimeLimit(_SOLVERTIMEOUT) - log = self._model.SearchLog(1000000) + log.debug(" - Solve Constraint Model") - _ = self._model.Solve(decisionBuilder, [objective, collector, log, timeLimit]) + if log.getEffectiveLevel() <= logging.DEBUG: + searchLog = self._model.SearchLog(1000000) + _ = self._model.Solve(decisionBuilder, 
[objective, collector, searchLog, timeLimit]) + else: + _ = self._model.Solve(decisionBuilder, [objective, collector, None, timeLimit]) assert collector.SolutionCount() > 0, "Error in Tiler: No solution found" diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py index 6a2ff26674..0974fa337b 100644 --- a/Deeploy/TilingExtension/TilingCodegen.py +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -1,37 +1,17 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: TilingCodegen.py -# -# Last edited: 11.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from __future__ import annotations from dataclasses import dataclass -from typing import Dict, Iterable, List, Optional, Tuple, Type +from typing import Dict, Generator, List, Sequence, Tuple, Type import numpy as np from Deeploy.AbstractDataTypes import Pointer -from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint +from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer +from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint @dataclass @@ -51,8 +31,8 @@ def __init__(self, offset: Tuple[int, ...], dims: Tuple[int, ...]): assert len(offset) == len( dims), f"HyperRectangle offset and dims for mismatching dimensions {offset} and {dims}" - self.offset = offset - self.dims = dims + self.offset = tuple(offset) if not isinstance(offset, tuple) else offset + self.dims = tuple(dims) if not isinstance(dims, tuple) else dims @dataclass @@ -194,177 +174,117 @@ def minimizeVariableReplacement( return VariableReplacementScheme(newPerTileRep, newRepTypes), operatorRepresentation -def minimizeRectangleDims(hyperRectangle: HyperRectangle, - referenceBuffer: VariableBuffer) -> Tuple[HyperRectangle, HyperRectangle]: - - rectDims = hyperRectangle.dims - rectOffset = hyperRectangle.offset - shape = referenceBuffer.shape - newDims: List[int] = [] - newOffset: List[int] = [] - - newBaseline = [] - - reversedRectOffset = list(reversed(rectOffset)) +def minimizeRectangle(rect: HyperRectangle, referenceShape: Sequence[int]) -> Tuple[HyperRectangle, Tuple[int, ...]]: + minRectShape: List[int] = [] + minRectOffset: List[int] = [] + minReferenceShape: List[int] = [] # SCHEREMO: Collapse dimensions right to left - acc = 0 - for idx, (tileDim, bufDim) in enumerate(zip(reversed(rectDims), reversed(shape))): - - if tileDim == bufDim: - assert reversedRectOffset[idx] == 0, "Can't not tile a dimension and have an offset, tf" - - # SCHEREMO: Collapse if equal - if tileDim == bufDim and acc != 0: - acc *= tileDim - elif tileDim == 
bufDim and acc == 0: - acc = tileDim - elif tileDim != bufDim and acc != 0: - newDims.insert(0, acc * tileDim) - newBaseline.insert(0, acc * bufDim) - newOffset.insert(0, acc * reversedRectOffset[idx]) - acc = 0 + currentCollapsedDim = 1 + for rectDim, rectOffset, referenceDim in zip(reversed(rect.dims), reversed(rect.offset), reversed(referenceShape)): + if rectDim == referenceDim: + assert rectOffset == 0, f"Rectangle offset should be zero when the dimensions are the same. Received rectangle {rect} and reference shape {referenceShape}" + currentCollapsedDim *= rectDim else: - newDims.insert(0, tileDim) - newBaseline.insert(0, bufDim) - newOffset.insert(0, reversedRectOffset[idx]) - - if acc > 1: - newDims.insert(0, acc) - newBaseline.insert(0, acc) - newOffset.insert(0, acc * reversedRectOffset[idx]) + minRectShape.insert(0, currentCollapsedDim * rectDim) + minReferenceShape.insert(0, currentCollapsedDim * referenceDim) + minRectOffset.insert(0, currentCollapsedDim * rectOffset) + currentCollapsedDim = 1 - # JUNGVI: If the function collapsed all dimensions of the tensor, set it to dim 1 and offset 0 - if len(newDims) == 0: - newDims = [1] - newBaseline = [1] - newOffset = [0] + if currentCollapsedDim > 1 or len(minRectShape) == 0: + minRectShape.insert(0, currentCollapsedDim) + minReferenceShape.insert(0, currentCollapsedDim) + minRectOffset.insert(0, currentCollapsedDim * rect.offset[0]) - newRect = HyperRectangle(tuple(newOffset), tuple(newDims)) - newBaseline = HyperRectangle(tuple([0] * len(newOffset)), tuple(newBaseline)) + return HyperRectangle(tuple(minRectOffset), tuple(minRectShape)), tuple(minReferenceShape) - return newRect, newBaseline +def padShape(shape: Tuple[int, ...], rank: int) -> Tuple[int, ...]: + assert rank >= len( + shape), f"Cannot pad to rank smaller then shape's. 
Received rank: {rank}, shape rank: {len(shape)}" + ret = tuple([1] * (rank - len(shape))) + shape + assert len(ret) == rank + return ret -def calculateRectangleOffset(hyperRectangle: HyperRectangle, referenceBuffer: VariableBuffer) -> int: - minimalRect, baselineRect = minimizeRectangleDims(hyperRectangle, referenceBuffer) +def padOffset(offset: Tuple[int, ...], rank: int) -> Tuple[int, ...]: + assert rank >= len( + offset), f"Cannot pad to rank smaller then offset's. Received rank: {rank}, offset rank: {len(offset)}" + ret = tuple([0] * (rank - len(offset))) + offset + assert len(ret) == rank + return ret - offsetMult = [1] - for dim in reversed(baselineRect.dims[1:]): - offsetMult.insert(0, dim * np.prod(offsetMult)) - accOffset = 0 - for offsetIdx, mult in zip(minimalRect.offset, offsetMult): - accOffset += offsetIdx * mult +def padStride(stride: Tuple[int, ...], rank: int, paddingStride: int) -> Tuple[int, ...]: + assert rank >= len( + stride), f"Cannot pad to rank smaller then stride's. Received rank: {rank}, stride rank: {len(stride)}" + ret = tuple([paddingStride] * (rank - len(stride))) + stride + assert len(ret) == rank + return ret - return int(accOffset * (referenceBuffer._type.referencedType.typeWidth // 8)) +def stridesFromShape(shape: Sequence[int]) -> Tuple[int, ...]: + strides = [1] * len(shape) + for idx, dim in enumerate(reversed(shape[1:])): + strides[idx + 1] = strides[idx] * dim + return tuple(reversed(strides)) -def extractTilingTransfer(tilingSolution: NodeMemoryConstraint, targetMemLevel: str, - tensorName: str) -> Optional[MemoryTransfer]: - for name, constraint in tilingSolution.tensorMemoryConstraints.items(): - if not name == tensorName: - continue +def calculateFlatOffset(offsets: Sequence[int], strides: Sequence[int]) -> int: + assert len(offsets) == len(strides), \ + f"Offsets and strides have to have the same number of dimensions. 
Length offsets: {len(offsets)}, strides: {len(strides)}" + return sum(offset * stride for offset, stride in zip(offsets, strides)) - sourceIdx = 0 - for idx, memConstraint in enumerate(constraint.memoryConstraints.values()): - if memConstraint.memoryLevel != targetMemLevel: - continue +def calculateFlatOffsetInBytes(tile: HyperRectangle, referenceBuffer: VariableBuffer) -> int: + return int( + calculateFlatOffset(tile.offset, stridesFromShape(referenceBuffer.shape)) * + (referenceBuffer._type.referencedType.typeWidth // 8)) - sourceIdx = idx - targetIdx = idx - 1 - if sourceIdx == 0: - return None +def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRectangle]: + assert memoryTransfer.source.shape is not None, "Source transfer shape cannot be undefined!" + assert memoryTransfer.destination.shape is not None, "Destination transfer shape cannot be undefined!" - return MemoryTransfer( - list(constraint.memoryConstraints.values())[targetIdx], - list(constraint.memoryConstraints.values())[sourceIdx]) + assert len(memoryTransfer.source.shape) == len(memoryTransfer.destination.shape), \ + f"Source and target of memory transfer {memoryTransfer} don't have the same number of dimensions!" - raise RuntimeError(f"{tensorName} not found in tilingSolution!") + largeShape = memoryTransfer.source.shape + smallShape = memoryTransfer.destination.shape + for dimIdx, (dimSizeSmall, dimSizeLarge) in enumerate(zip(smallShape, largeShape)): + assert dimSizeSmall <= dimSizeLarge, f"smallShape[{dimIdx}] should not be bigger then largeShape[{dimIdx}]. 
({dimSizeSmall} > {dimSizeLarge})" -def computeHyperRectangleList(memTrans: MemoryTransfer) -> List[HyperRectangle]: - - def nextElement(idxVec: List[int], targetVector: List[int]) -> Optional[List[int]]: - nextIdx = [] - - countUp = True - for vecIdx, maxIdx in zip(reversed(idxVec), reversed(targetVector)): - if countUp: - if vecIdx == maxIdx: - nextIdx.append(1) + def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]: + tileCount = np.prod(tileIndexEnd) + tileIndex = [0] * len(tileIndexEnd) + for _ in range(tileCount): + yield tileIndex + for dimIdx, (idx, end) in enumerate(zip(tileIndex, tileIndexEnd)): + if idx + 1 < end: + tileIndex[dimIdx] = idx + 1 + break else: - nextIdx.append(vecIdx + 1) - countUp = False - else: - nextIdx.append(vecIdx) - - nextIdx.reverse() - - if countUp: - return None - - return nextIdx - - def calculateCost(idxVec: Iterable[int], smallShape: Tuple[int]) -> List[int]: - outVec = [] - for idx, step in zip(idxVec, smallShape): - outVec.append((idx - 1) * step) - - return outVec - - def calculateDim(idxVec: List[int], numTiles: List[int], smallShape: Tuple[int], - largeShape: Tuple[int]) -> List[int]: - - dimVec = [] - - for idx, (vecIdx, maxIdx) in enumerate(zip(idxVec, numTiles)): - if vecIdx != maxIdx: - dimVec.append(smallShape[idx]) - continue - if largeShape[idx] % smallShape[idx] == 0: - dimVec.append(smallShape[idx]) - continue - dimVec.append(largeShape[idx] % smallShape[idx]) - - return dimVec - - src = memTrans.source - dst = memTrans.destination - - largeShape = src.shape - smallShape = dst.shape - - assert largeShape is not None, "Transfer shapes cannot be undefined!" - assert smallShape is not None, "Transfer shapes cannot be undefined!" - - assert len(smallShape) == len( - largeShape), f"Source and target of memory transfer {memTrans} don't have the same number of dimensions!" 
- for idx, (dim1, dim2) in enumerate(zip(smallShape, largeShape)): - assert dim1 <= dim2, f"Large shape is smaller in dimension {idx}" - - totNumTiles = 1 - numTiles: List[int] = [] + tileIndex[dimIdx] = 0 - for (dim1, dim2) in zip(smallShape, largeShape): - totNumTiles *= np.ceil(dim2 / dim1) - numTiles.append(int(np.ceil(dim2 / dim1))) + tileHyperRectangles = [] - cubeList: List[HyperRectangle] = [] - idxVec = [1] * len(smallShape) + tileIndexEnd = [ + int(np.ceil(dimSizeLarge / dimSizeSmall)) for dimSizeLarge, dimSizeSmall in zip(largeShape, smallShape) + ] + for tileIndex in nextTileIndex(tileIndexEnd): + tileOffset = tuple(dimIdx * dimSizeSmall for dimIdx, dimSizeSmall in zip(tileIndex, smallShape)) + for dimIdx, (dimOffset, dimSizeLarge) in enumerate(zip(tileOffset, largeShape)): + assert dimOffset >= 0, f"tileOffset[{dimIdx}] shoud not be smaller then zero ({dimOffset} < 0)" + assert dimOffset < dimSizeLarge, f"tileOffset[{dimIdx}] should not be bigger or equal then largeShape[{dimIdx}] ({dimOffset} >= {dimSizeLarge})" - for i in range(int(totNumTiles)): - offsetVec = calculateCost(idxVec, smallShape) - dimVec = calculateDim(idxVec, numTiles, smallShape, largeShape) - cubeList.append(HyperRectangle(tuple(offsetVec), tuple(dimVec))) + tileSize = tuple( + min(dimSizeSmall, dimSizeLarge - dimOffset) + for dimSizeSmall, dimSizeLarge, dimOffset in zip(smallShape, largeShape, tileOffset)) + for dimIdx, (dimSize, dimSizeSmall) in enumerate(zip(tileSize, smallShape)): + assert dimSize > 0, f"tileOffset[{dimIdx}] shoud not be smaller or equal then zero ({dimSize} <= 0)" + assert dimSize <= dimSizeSmall, f"tileSize[{dimIdx}] should not be bigger then smallShape[{dimIdx}] ({dimSize} > {dimSizeSmall})" - nextVec = nextElement(idxVec, numTiles) - if nextVec is None: - break - idxVec = nextVec + tileHyperRectangles.append(HyperRectangle(tileOffset, tileSize)) - return cubeList + return tileHyperRectangles diff --git a/Deeploy/TilingExtension/__init__.py 
b/Deeploy/TilingExtension/__init__.py index b50445f83c..be436b64a3 100644 --- a/Deeploy/TilingExtension/__init__.py +++ b/Deeploy/TilingExtension/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 10.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/Deeploy/__init__.py b/Deeploy/__init__.py index 65ec809815..be436b64a3 100644 --- a/Deeploy/__init__.py +++ b/Deeploy/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index a48d6c8acb..0634b4ba0f 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + include_directories(${GENERATED_SOURCE}) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) diff --git a/DeeployTest/Platforms/Chimera/CMakeLists.txt b/DeeployTest/Platforms/Chimera/CMakeLists.txt index 6d3b0aa74e..a424729cc3 100644 --- a/DeeployTest/Platforms/Chimera/CMakeLists.txt +++ b/DeeployTest/Platforms/Chimera/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(ProjectId ${TESTNAME}) file(GLOB_RECURSE SOURCES diff --git a/DeeployTest/Platforms/Chimera/main.c b/DeeployTest/Platforms/Chimera/main.c index 425aafa265..1beda44640 100644 --- a/DeeployTest/Platforms/Chimera/main.c +++ b/DeeployTest/Platforms/Chimera/main.c @@ -1,9 +1,7 @@ /* - * Copyright 2025 ETH Zurich. - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
- * SPDX-License-Identifier: Apache-2.0 + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna * - * Victor Jung + * SPDX-License-Identifier: Apache-2.0 */ #include diff --git a/DeeployTest/Platforms/Generic/CMakeLists.txt b/DeeployTest/Platforms/Generic/CMakeLists.txt index 2a53f51c38..f97f1cdf1b 100644 --- a/DeeployTest/Platforms/Generic/CMakeLists.txt +++ b/DeeployTest/Platforms/Generic/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(ProjectId ${TESTNAME}) file(GLOB_RECURSE SOURCES diff --git a/DeeployTest/Platforms/Generic/main.c b/DeeployTest/Platforms/Generic/main.c index ebfa116fd6..e2b0449fb5 100644 --- a/DeeployTest/Platforms/Generic/main.c +++ b/DeeployTest/Platforms/Generic/main.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: main.c - * Description: - * - * Date: 15.03.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include diff --git a/DeeployTest/Platforms/MemPool/CMakeLists.txt b/DeeployTest/Platforms/MemPool/CMakeLists.txt index 0b4b94b031..e237425c58 100644 --- a/DeeployTest/Platforms/MemPool/CMakeLists.txt +++ b/DeeployTest/Platforms/MemPool/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(ProjectId ${TESTNAME}) file(GLOB_RECURSE SOURCES diff --git a/DeeployTest/Platforms/MemPool/main.c b/DeeployTest/Platforms/MemPool/main.c index 5603bf2395..a0eda86c83 100644 --- a/DeeployTest/Platforms/MemPool/main.c +++ b/DeeployTest/Platforms/MemPool/main.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: main.c - * Description: - * - * Date: 15.03.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include diff --git a/DeeployTest/Platforms/PULPOpen/CMakeLists.txt b/DeeployTest/Platforms/PULPOpen/CMakeLists.txt index 8810862885..9dfd2f466f 100644 --- a/DeeployTest/Platforms/PULPOpen/CMakeLists.txt +++ b/DeeployTest/Platforms/PULPOpen/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(ProjectId ${TESTNAME}) file(GLOB_RECURSE SOURCES diff --git a/DeeployTest/Platforms/PULPOpen/inc/CycleCounter.h b/DeeployTest/Platforms/PULPOpen/inc/CycleCounter.h index 21d501e031..1bb48d7cb6 100644 --- a/DeeployTest/Platforms/PULPOpen/inc/CycleCounter.h +++ b/DeeployTest/Platforms/PULPOpen/inc/CycleCounter.h @@ -1,42 +1,22 @@ -/* ===================================================================== - * Title: CycleCounter.h - * Description: - * - * $Date: 26.07.2024 - * - * ===================================================================== */ -/* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef CYCLECOUNTER -#define CYCLECOUNTER - -// Resets the internal cycle counter to zero -void ResetTimer(void); - -// Starts the internal cycle counter -void StartTimer(void); - -// Stops the internal cycle counter -void StopTimer(void); - -// Returns the current number of cycles according to the internal cycle counter -unsigned int getCycles(void); - -#endif +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CYCLECOUNTER +#define CYCLECOUNTER + +// Resets the internal cycle counter to zero +void ResetTimer(void); + +// Starts the internal cycle counter +void StartTimer(void); + +// Stops the internal cycle counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +unsigned int getCycles(void); + +#endif diff --git a/DeeployTest/Platforms/PULPOpen/src/CycleCounter.c b/DeeployTest/Platforms/PULPOpen/src/CycleCounter.c index 1f35d31891..e69af95d93 100644 --- a/DeeployTest/Platforms/PULPOpen/src/CycleCounter.c +++ b/DeeployTest/Platforms/PULPOpen/src/CycleCounter.c @@ -1,39 +1,19 @@ -/* ===================================================================== - * Title: CycleCounter.c - * Description: - * - * $Date: 26.07.2024 - * - * ===================================================================== */ -/* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "CycleCounter.h" -#include "pmsis.h" - -void ResetTimer() { - pi_perf_conf(PI_PERF_CYCLES); - pi_perf_cl_reset(); -} - -void StartTimer() { pi_perf_cl_start(); } - -void StopTimer() { pi_perf_cl_stop(); } - -unsigned int getCycles() { return pi_perf_cl_read(PI_PERF_CYCLES); } +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "CycleCounter.h" +#include "pmsis.h" + +void ResetTimer() { + pi_perf_conf(PI_PERF_CYCLES); + pi_perf_cl_reset(); +} + +void StartTimer() { pi_perf_cl_start(); } + +void StopTimer() { pi_perf_cl_stop(); } + +unsigned int getCycles() { return pi_perf_cl_read(PI_PERF_CYCLES); } diff --git a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c index 525852d8d5..11d889e48d 100644 --- a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c +++ b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: deeploytest.c - * Description: - * - * $Date: 26.12.2021 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "CycleCounter.h" @@ -32,82 +11,160 @@ #include "testinputs.h" #include "testoutputs.h" +#define MAINSTACKSIZE 8000 +#define SLAVESTACKSIZE 3800 + struct pi_device cluster_dev; -void main(void) { +typedef struct { + void *expected; + void *actual; + uint32_t num_elements; + uint32_t output_buf_index; + uint32_t *err_count; +} FloatCompareArgs; + +void CompareFloatOnCluster(void *args) { + + if (pi_core_id() == 0) { + FloatCompareArgs *compare_args = (FloatCompareArgs *)args; + float *expected = (float *)compare_args->expected; + float *actual = (float *)compare_args->actual; + uint32_t num_elements = compare_args->num_elements; + uint32_t output_buf_index = compare_args->output_buf_index; + uint32_t *err_count = compare_args->err_count; + + uint32_t local_err_count = 0; + + for (uint32_t i = 0; i < num_elements; i++) { + float expected_val = expected[i]; + float actual_val = actual[i]; + float diff = expected_val - actual_val; + + if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) { + local_err_count += 1; + + printf("Expected: %10.6f ", expected_val); + printf("Actual: %10.6f ", actual_val); + printf("Diff: %10.6f at Index %12u in Output %u\r\n", diff, i, + output_buf_index); + } + } + + *err_count = local_err_count; + } +} + +int main(void) { #ifndef CI printf("HELLO WORLD:\r\n"); #endif - struct pi_cluster_conf conf; pi_cluster_conf_init(&conf); conf.id = 0; pi_open_from_conf(&cluster_dev, &conf); if (pi_cluster_open(&cluster_dev)) - return; + return -1; - struct pi_cluster_task cluster_task_mem_init; + mem_init(); +#ifndef NOFLASH + open_fs(); +#endif - pi_cluster_task(&cluster_task_mem_init, mem_init, NULL); - cluster_task_mem_init.stack_size = 5000; - cluster_task_mem_init.slave_stack_size = 3800; - pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task_mem_init); + printf("Intializing\r\n"); struct pi_cluster_task cluster_task; 
pi_cluster_task(&cluster_task, InitNetwork, NULL); - cluster_task.stack_size = 5000; - cluster_task.slave_stack_size = 3800; + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); #ifndef CI printf("Initialized\r\n"); #endif - for (int buf = 0; buf < DeeployNetwork_num_inputs; buf++) { - memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], - DeeployNetwork_inputs_bytes[buf]); + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + if ((uint32_t)DeeployNetwork_inputs[buf] >= 0x10000000) { + memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], + DeeployNetwork_inputs_bytes[buf]); + } } + #ifndef CI printf("Input copied\r\n"); #endif - ResetTimer(); - StartTimer(); - // RunNetwork(0, 1); pi_cluster_task(&cluster_task, RunNetwork, NULL); - cluster_task.stack_size = 5000; - cluster_task.slave_stack_size = 3800; + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + ResetTimer(); + StartTimer(); pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); -#ifndef CI - printf("Run\r\n"); -#endif StopTimer(); + #ifndef CI printf("Output:\r\n"); #endif - int32_t diff, tot_err; + + uint32_t tot_err, tot_tested; tot_err = 0; - for (int buf = 0; buf < DeeployNetwork_num_outputs; buf++) { - for (int i = 0; i < DeeployNetwork_outputs_bytes[buf]; i++) { - diff = ((char *)testOutputVector[buf])[i] - - ((char *)DeeployNetwork_outputs[buf])[i]; - if (diff) { - tot_err += 1; -#ifndef CI - printf("Expected: %i\t\t", ((int8_t *)testOutputVector[buf])[i]); - printf("Actual: %i \t\t", ((int8_t *)DeeployNetwork_outputs[buf])[i]); -#endif -#ifndef CI - printf("Diff: %i at Index %u \r\n", diff, i); -#endif - } else { - /* #ifndef CI */ - /* printf("\r\n"); */ - /* #endif */ + tot_tested = 0; + void *compbuf; + FloatCompareArgs float_compare_args; + uint32_t float_error_count = 0; + + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; 
buf++) { + tot_tested += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); + + if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) { + compbuf = pi_l2_malloc((int)DeeployNetwork_outputs_bytes[buf]); + ram_read(compbuf, DeeployNetwork_outputs[buf], + DeeployNetwork_outputs_bytes[buf]); + } else { + compbuf = DeeployNetwork_outputs[buf]; + } + + if (ISOUTPUTFLOAT) { + float_error_count = 0; + float_compare_args.expected = testOutputVector[buf]; + float_compare_args.actual = compbuf; + float_compare_args.num_elements = + DeeployNetwork_outputs_bytes[buf] / sizeof(float); + float_compare_args.output_buf_index = buf; + float_compare_args.err_count = &float_error_count; + + pi_cluster_task(&cluster_task, CompareFloatOnCluster, + &float_compare_args); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + tot_err += float_error_count; + } else { + + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; + int32_t error = expected - actual; + OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? 
-error : error); + + if (diff) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); + } } } + if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) { + pi_l2_free(compbuf, (int)DeeployNetwork_outputs_bytes[buf]); + } } + printf("Runtime: %u cycles\r\n", getCycles()); - printf("Errors: %u out of %u \r\n", tot_err, DeeployNetwork_output_0_len); -} + printf("Errors: %u out of %u \r\n", tot_err, tot_tested); + + return (int)tot_err; +} \ No newline at end of file diff --git a/DeeployTest/Platforms/QEMU_ARM/CMakeLists.txt b/DeeployTest/Platforms/QEMU_ARM/CMakeLists.txt index 9820396bd0..8c5dca824b 100644 --- a/DeeployTest/Platforms/QEMU_ARM/CMakeLists.txt +++ b/DeeployTest/Platforms/QEMU_ARM/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(ProjectId ${TESTNAME}) file(GLOB_RECURSE SOURCES diff --git a/DeeployTest/Platforms/QEMU_ARM/inc/CycleCounter.h b/DeeployTest/Platforms/QEMU_ARM/inc/CycleCounter.h index e9ca1c1801..5d679e0b11 100644 --- a/DeeployTest/Platforms/QEMU_ARM/inc/CycleCounter.h +++ b/DeeployTest/Platforms/QEMU_ARM/inc/CycleCounter.h @@ -1,46 +1,26 @@ -/* ===================================================================== - * Title: CycleCounter.h - * Description: - * - * $Date: 26.07.2024 - * - * ===================================================================== */ -/* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef CYCLECOUNTER -#define CYCLECOUNTER - -extern volatile unsigned int *DWT_CYCCNT; -extern volatile unsigned int *DWT_CONTROL; -extern volatile unsigned int *SCB_DEMCR; - -// Resets the internal cycle counter to zero -void ResetTimer(void); - -// Starts the internal cycle counter -void StartTimer(void); - -// Stops the internal cycle counter -void StopTimer(void); - -// Returns the current number of cycles according to the internal cycle counter -unsigned int getCycles(void); - -#endif +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CYCLECOUNTER +#define CYCLECOUNTER + +extern volatile unsigned int *DWT_CYCCNT; +extern volatile unsigned int *DWT_CONTROL; +extern volatile unsigned int *SCB_DEMCR; + +// Resets the internal cycle counter to zero +void ResetTimer(void); + +// Starts the internal cycle counter +void StartTimer(void); + +// Stops the internal cycle counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +unsigned int getCycles(void); + +#endif diff --git a/DeeployTest/Platforms/QEMU_ARM/src/CycleCounter.c b/DeeployTest/Platforms/QEMU_ARM/src/CycleCounter.c index 3fbdc12b74..d522f6517d 100644 --- a/DeeployTest/Platforms/QEMU_ARM/src/CycleCounter.c +++ b/DeeployTest/Platforms/QEMU_ARM/src/CycleCounter.c @@ -1,64 +1,44 @@ -/* ===================================================================== - * Title: CycleCounter.c - * Description: - * - * $Date: 26.07.2024 - * - * 
===================================================================== */ -/* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "CycleCounter.h" - -volatile unsigned int *DWT_CYCCNT = - (unsigned int *)0xE0001004; // address of the register -volatile unsigned int *DWT_CONTROL = - (unsigned int *)0xE0001000; // address of the register -volatile unsigned int *SCB_DEMCR = - (unsigned int *)0xE000EDFC; // address of the register - -static unsigned int prev_val = 0; -static int stopped = 0; - -void ResetTimer() { - - *SCB_DEMCR = *SCB_DEMCR | 0x01000000; - *DWT_CYCCNT = 0; // reset the counter - *DWT_CONTROL = 1; - stopped = 1; - prev_val = 0; -} - -void StartTimer() { - prev_val = *DWT_CYCCNT; - stopped = 0; -} - -void StopTimer() { - prev_val = *DWT_CYCCNT - prev_val; - stopped = 1; -} - -unsigned int getCycles() { - if (stopped) { - return prev_val; - } else { - return *DWT_CYCCNT - prev_val; - } -} +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "CycleCounter.h" + +volatile unsigned int *DWT_CYCCNT = + (unsigned int *)0xE0001004; // address of the register +volatile unsigned int *DWT_CONTROL = + (unsigned int *)0xE0001000; // address of the register +volatile unsigned int *SCB_DEMCR = + (unsigned int *)0xE000EDFC; // address 
of the register + +static unsigned int prev_val = 0; +static int stopped = 0; + +void ResetTimer() { + + *SCB_DEMCR = *SCB_DEMCR | 0x01000000; + *DWT_CYCCNT = 0; // reset the counter + *DWT_CONTROL = 1; + stopped = 1; + prev_val = 0; +} + +void StartTimer() { + prev_val = *DWT_CYCCNT; + stopped = 0; +} + +void StopTimer() { + prev_val = *DWT_CYCCNT - prev_val; + stopped = 1; +} + +unsigned int getCycles() { + if (stopped) { + return prev_val; + } else { + return *DWT_CYCCNT - prev_val; + } +} diff --git a/DeeployTest/Platforms/QEMU_ARM/src/deeploytest.c b/DeeployTest/Platforms/QEMU_ARM/src/deeploytest.c index 0043941460..63ec1e1963 100644 --- a/DeeployTest/Platforms/QEMU_ARM/src/deeploytest.c +++ b/DeeployTest/Platforms/QEMU_ARM/src/deeploytest.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: deeploytest.c - * Description: - * - * Date: 15.03.2023 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "Network.h" diff --git a/DeeployTest/Platforms/Siracusa/CMakeLists.txt b/DeeployTest/Platforms/Siracusa/CMakeLists.txt index e42b250a71..45e6191490 100644 --- a/DeeployTest/Platforms/Siracusa/CMakeLists.txt +++ b/DeeployTest/Platforms/Siracusa/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(ProjectId ${TESTNAME}) file(GLOB_RECURSE SOURCES diff --git a/DeeployTest/Platforms/Siracusa/inc/CycleCounter.h b/DeeployTest/Platforms/Siracusa/inc/CycleCounter.h index 21d501e031..1bb48d7cb6 100644 --- a/DeeployTest/Platforms/Siracusa/inc/CycleCounter.h +++ b/DeeployTest/Platforms/Siracusa/inc/CycleCounter.h @@ -1,42 +1,22 @@ -/* ===================================================================== - * Title: CycleCounter.h - * Description: - * - * $Date: 26.07.2024 - * - * ===================================================================== */ -/* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#ifndef CYCLECOUNTER -#define CYCLECOUNTER - -// Resets the internal cycle counter to zero -void ResetTimer(void); - -// Starts the internal cycle counter -void StartTimer(void); - -// Stops the internal cycle counter -void StopTimer(void); - -// Returns the current number of cycles according to the internal cycle counter -unsigned int getCycles(void); - -#endif +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef CYCLECOUNTER +#define CYCLECOUNTER + +// Resets the internal cycle counter to zero +void ResetTimer(void); + +// Starts the internal cycle counter +void StartTimer(void); + +// Stops the internal cycle counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +unsigned int getCycles(void); + +#endif diff --git a/DeeployTest/Platforms/Siracusa/src/CycleCounter.c b/DeeployTest/Platforms/Siracusa/src/CycleCounter.c index 1f35d31891..e69af95d93 100644 --- a/DeeployTest/Platforms/Siracusa/src/CycleCounter.c +++ b/DeeployTest/Platforms/Siracusa/src/CycleCounter.c @@ -1,39 +1,19 @@ -/* ===================================================================== - * Title: CycleCounter.c - * Description: - * - * $Date: 26.07.2024 - * - * ===================================================================== */ -/* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -#include "CycleCounter.h" -#include "pmsis.h" - -void ResetTimer() { - pi_perf_conf(PI_PERF_CYCLES); - pi_perf_cl_reset(); -} - -void StartTimer() { pi_perf_cl_start(); } - -void StopTimer() { pi_perf_cl_stop(); } - -unsigned int getCycles() { return pi_perf_cl_read(PI_PERF_CYCLES); } +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "CycleCounter.h" +#include "pmsis.h" + +void ResetTimer() { + pi_perf_conf(PI_PERF_CYCLES); + pi_perf_cl_reset(); +} + +void StartTimer() { pi_perf_cl_start(); } + +void StopTimer() { pi_perf_cl_stop(); } + +unsigned int getCycles() { return pi_perf_cl_read(PI_PERF_CYCLES); } diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytest.c b/DeeployTest/Platforms/Siracusa/src/deeploytest.c index 9a0d8f39db..11d889e48d 100644 --- a/DeeployTest/Platforms/Siracusa/src/deeploytest.c +++ b/DeeployTest/Platforms/Siracusa/src/deeploytest.c @@ -1,29 +1,7 @@ -/* ===================================================================== - * Title: deeploytest.c - * Description: - * - * $Date: 26.12.2021 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich - * Author: Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "CycleCounter.h" @@ -41,9 +19,9 @@ struct pi_device cluster_dev; typedef struct { void *expected; void *actual; - int num_elements; - int output_buf_index; - int *err_count; + uint32_t num_elements; + uint32_t output_buf_index; + uint32_t *err_count; } FloatCompareArgs; void CompareFloatOnCluster(void *args) { @@ -52,13 +30,13 @@ void CompareFloatOnCluster(void *args) { FloatCompareArgs *compare_args = (FloatCompareArgs *)args; float *expected = (float *)compare_args->expected; float *actual = (float *)compare_args->actual; - int num_elements = compare_args->num_elements; - int output_buf_index = compare_args->output_buf_index; - int *err_count = compare_args->err_count; + uint32_t num_elements = compare_args->num_elements; + uint32_t output_buf_index = compare_args->output_buf_index; + uint32_t *err_count = compare_args->err_count; - int local_err_count = 0; + uint32_t local_err_count = 0; - for (int i = 0; i < num_elements; i++) { + for (uint32_t i = 0; i < num_elements; i++) { float expected_val = expected[i]; float actual_val = actual[i]; float diff = expected_val - actual_val; @@ -77,7 +55,7 @@ void CompareFloatOnCluster(void *args) { } } -void main(void) { +int main(void) { #ifndef CI printf("HELLO WORLD:\r\n"); #endif @@ -87,7 +65,7 @@ void main(void) { conf.id = 0; pi_open_from_conf(&cluster_dev, &conf); if (pi_cluster_open(&cluster_dev)) - return; + return -1; mem_init(); #ifndef NOFLASH @@ -106,8 +84,8 @@ void main(void) { #ifndef CI printf("Initialized\r\n"); #endif - for (int buf = 0; buf < DeeployNetwork_num_inputs; buf++) { - if 
(DeeployNetwork_inputs[buf] >= 0x10000000) { + for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { + if ((uint32_t)DeeployNetwork_inputs[buf] >= 0x10000000) { memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]); } @@ -116,7 +94,7 @@ void main(void) { #ifndef CI printf("Input copied\r\n"); #endif - // RunNetwork(0, 1); + pi_cluster_task(&cluster_task, RunNetwork, NULL); cluster_task.stack_size = MAINSTACKSIZE; cluster_task.slave_stack_size = SLAVESTACKSIZE; @@ -136,11 +114,11 @@ void main(void) { FloatCompareArgs float_compare_args; uint32_t float_error_count = 0; - for (int buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { tot_tested += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); - if (DeeployNetwork_outputs[buf] < 0x1000000) { - compbuf = pi_l2_malloc(DeeployNetwork_outputs_bytes[buf]); + if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) { + compbuf = pi_l2_malloc((int)DeeployNetwork_outputs_bytes[buf]); ram_read(compbuf, DeeployNetwork_outputs[buf], DeeployNetwork_outputs_bytes[buf]); } else { @@ -165,11 +143,12 @@ void main(void) { tot_err += float_error_count; } else { - for (int i = 0; + for (uint32_t i = 0; i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; - OUTPUTTYPE diff = expected - actual; + int32_t error = expected - actual; + OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? 
-error : error); if (diff) { tot_err += 1; @@ -179,11 +158,13 @@ void main(void) { } } } - if (DeeployNetwork_outputs[buf] < 0x1000000) { - pi_l2_free(compbuf, DeeployNetwork_outputs_bytes[buf]); + if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) { + pi_l2_free(compbuf, (int)DeeployNetwork_outputs_bytes[buf]); } } printf("Runtime: %u cycles\r\n", getCycles()); printf("Errors: %u out of %u \r\n", tot_err, tot_tested); + + return (int)tot_err; } \ No newline at end of file diff --git a/DeeployTest/Platforms/Snitch/CMakeLists.txt b/DeeployTest/Platforms/Snitch/CMakeLists.txt index 44568890de..03a40258eb 100644 --- a/DeeployTest/Platforms/Snitch/CMakeLists.txt +++ b/DeeployTest/Platforms/Snitch/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(ProjectId ${TESTNAME}) file(GLOB_RECURSE SOURCES diff --git a/DeeployTest/Platforms/Snitch/main.c b/DeeployTest/Platforms/Snitch/main.c index b493fd52b9..a7251f3844 100644 --- a/DeeployTest/Platforms/Snitch/main.c +++ b/DeeployTest/Platforms/Snitch/main.c @@ -1,28 +1,7 @@ /* - * ---------------------------------------------------------------------- + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * - * File: deeploytest.c - * - * Last edited: 23.04.2024 - * - * Copyright (C) 2024, ETH Zurich and University of Bologna. - * - * Author: Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich - * - * ---------------------------------------------------------------------- * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "CycleCounter.h" @@ -98,23 +77,20 @@ int main(void) { snrt_cluster_hw_barrier(); -#if !defined(BANSHEE_SIMULATION) && !defined(GVSOC_SIMULATION) - if (snrt_is_dm_core()) { - ResetTimer(); - StartTimer(); - } -#endif // BANSHEE_SIMULATION and GVSOC_SIMULATION - + ResetTimer(); + StartTimer(); + snrt_cluster_hw_barrier(); RunNetwork(compute_core_id, num_compute_cores); uint32_t runtimeCycles = 0; -#if !defined(BANSHEE_SIMULATION) && !defined(GVSOC_SIMULATION) if (snrt_is_dm_core()) { runtimeCycles = getCycles(); +#if !defined(BANSHEE_SIMULATION) && !defined(GVSOC_SIMULATION) DUMP(runtimeCycles); - StopTimer(); - } #endif // BANSHEE_SIMULATION and GVSOC_SIMULATION + } + + StopTimer(); snrt_cluster_hw_barrier(); diff --git a/DeeployTest/Platforms/SoftHier/CMakeLists.txt b/DeeployTest/Platforms/SoftHier/CMakeLists.txt index 22ebafc2f9..413136a533 100644 --- a/DeeployTest/Platforms/SoftHier/CMakeLists.txt +++ b/DeeployTest/Platforms/SoftHier/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(ProjectId ${TESTNAME}) file(GLOB_RECURSE SOURCES diff --git a/DeeployTest/Platforms/SoftHier/main.c b/DeeployTest/Platforms/SoftHier/main.c index 86eb31f712..157c048159 100644 --- a/DeeployTest/Platforms/SoftHier/main.c +++ b/DeeployTest/Platforms/SoftHier/main.c @@ -1,28 +1,7 @@ /* - * ---------------------------------------------------------------------- + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna * - * File: main.c - * - * Last edited: 
26.05.2025 - * - * Copyright (C) 2025, ETH Zurich and University of Bologna. - * - * Author: Bowen Wang (bowwang@iis.ee.ethz.ch), ETH Zurich - * - * ---------------------------------------------------------------------- * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include diff --git a/DeeployTest/README.md b/DeeployTest/README.md new file mode 100644 index 0000000000..ac56399e62 --- /dev/null +++ b/DeeployTest/README.md @@ -0,0 +1,71 @@ + + +# How to use the DeeployTest PyTest Suite? + +### Executing and Collecting Test Groups + +The test suite is located in the `DeeployTest` folder, all commands below are assumed to be executed from the `DeeployTest` folder. The test suite is grouped with different markers, you can list the markers with `pytest --markers`. This will return something like: +``` +@pytest.mark.generic: mark test as a Generic platform test +@pytest.mark.cortexm: mark test as a Cortex-M (QEMU-ARM) platform test +@pytest.mark.mempool: mark test as a MemPool platform test +``` + +You can run all test from a given mark group with `pytest -m -v`. Each platform has a given marker, if you want to run all tests from the generic platform, you can use: +``` +pytest -m generic -v +``` + +You can use boolean expressions on the markers to execute unions or intersections of markers. 
For instance, to execute only the kernel tests from the generic platform, one can use: +``` +pytest -m 'generic and kernels' -v +``` + +To display the tests captured by a given marker or expression, you can use the `--collect-only` flag. For instance, to list the kernel tests on the Siracusa with Neureka platform that are from L2 and single-buffered, you can use `pytest -m 'siracusa_neureka_tiled and kernels and l2 and singlebuffer' -v --collect-only`, which returns: + +``` +platform linux -- Python 3.10.0, pytest-9.0.2, pluggy-1.6.0 -- /usr/scratch/normandie/jungvi/micromamba/envs/deeploy/bin/python3.10 +cachedir: .pytest_cache +rootdir: /scratch/jungvi/Deeploy/DeeployTest +configfile: pytest.ini +plugins: xdist-3.8.0 +collected 378 items / 370 deselected / 8 selected + + + + + + + + + + + +``` + +### Executing a Single Test + +To run a single test, one can use the test identifier from the `--collect-only` output, for instance: +``` +pytest 'test_platforms.py::test_siracusa_neureka_tiled_kernels_l2_singlebuffer[testRequantizedLinear-16000-L2-singlebuffer]' -v +``` + +### Controlling Test Verbosity + +By default, the test output is captured and displayed only if a test fails. If you want to see the captured output, use the `-s` flag. To increase the verbosity of the test, you can add more `v` to the `-v` flag, for instance, `-vvv` will display the commands executed during the test. You can filter the level of the messages from Python's built-in logging module with `--log-cli-level=`. For instance, the following line captures only the commands executed by the tests: +``` +pytest test_platforms.py -m "generic and kernels" -vvv --log-cli-level=DEBUG +``` + +### Parallelized Test Execution + +You can run tests in parallel with the `-n` flag followed by the number of parallel threads. 
For instance, to run all generic tests with 16 threads, you can use: +``` +pytest test_platforms.py -m generic -v -n 16 +``` + +### Misc + +- When running `pytest -m ` in a folder, PyTest will scan each file looking for tests. To speed up the detection you can specify the platform test file like `pytest test_platforms.py -m `. +- If you place a breakpoint like `import IPython; IPython.embed()`, you need to run the test with `-s` to be able to enter breakpoints. +- The `--pdb` flag is very useful as it drops a debugger session on failure. \ No newline at end of file diff --git a/DeeployTest/Tests/Adder/inputs.npz b/DeeployTest/Tests/Adder/inputs.npz deleted file mode 100644 index e74c7e5ad2..0000000000 Binary files a/DeeployTest/Tests/Adder/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/Adder/outputs.npz b/DeeployTest/Tests/Adder/outputs.npz deleted file mode 100644 index 9a8d6680e3..0000000000 Binary files a/DeeployTest/Tests/Adder/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/largeFloatAdd/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Add/Large/inputs.npz similarity index 100% rename from DeeployTest/Tests/largeFloatAdd/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Add/Large/inputs.npz diff --git a/DeeployTest/Tests/largeFloatAdd/network.onnx b/DeeployTest/Tests/Kernels/FP32/Add/Large/network.onnx similarity index 100% rename from DeeployTest/Tests/largeFloatAdd/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Add/Large/network.onnx diff --git a/DeeployTest/Tests/largeFloatAdd/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Add/Large/outputs.npz similarity index 100% rename from DeeployTest/Tests/largeFloatAdd/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Add/Large/outputs.npz diff --git a/DeeployTest/Tests/TestiNoNorm/activations.npz b/DeeployTest/Tests/Kernels/FP32/Add/Regular/activations.npz similarity index 100% rename from DeeployTest/Tests/TestiNoNorm/activations.npz rename to 
DeeployTest/Tests/Kernels/FP32/Add/Regular/activations.npz diff --git a/DeeployTest/Tests/testFloatAdder/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Add/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatAdder/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Add/Regular/inputs.npz diff --git a/DeeployTest/Tests/testFloatAdder/network.onnx b/DeeployTest/Tests/Kernels/FP32/Add/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatAdder/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Add/Regular/network.onnx diff --git a/DeeployTest/Tests/testFloatAdder/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Add/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatAdder/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Add/Regular/outputs.npz diff --git a/DeeployTest/Tests/testFloat2DDWConvolutionBias/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_Bias/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolutionBias/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_Bias/inputs.npz diff --git a/DeeployTest/Tests/testFloat2DDWConvolutionBias/network.onnx b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_Bias/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolutionBias/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_Bias/network.onnx diff --git a/DeeployTest/Tests/testFloat2DDWConvolutionBias/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_Bias/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolutionBias/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_Bias/outputs.npz diff --git a/DeeployTest/Tests/testFloat2DDWConvolution/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_NoBias/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolution/inputs.npz rename to 
DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_NoBias/inputs.npz diff --git a/DeeployTest/Tests/testFloat2DDWConvolution/network.onnx b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_NoBias/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolution/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_NoBias/network.onnx diff --git a/DeeployTest/Tests/testFloat2DDWConvolution/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_NoBias/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolution/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_NoBias/outputs.npz diff --git a/DeeployTest/Tests/testFloat2DDWConvolutionZeroBias/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_ZeroValuedBias/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolutionZeroBias/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_ZeroValuedBias/inputs.npz diff --git a/DeeployTest/Tests/testFloat2DDWConvolutionZeroBias/network.onnx b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_ZeroValuedBias/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolutionZeroBias/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_ZeroValuedBias/network.onnx diff --git a/DeeployTest/Tests/testFloat2DDWConvolutionZeroBias/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_ZeroValuedBias/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DDWConvolutionZeroBias/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/DW_2D_ZeroValuedBias/outputs.npz diff --git a/DeeployTest/Tests/testFloat2DConvolutionBias/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_Bias/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolutionBias/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_Bias/inputs.npz diff --git a/DeeployTest/Tests/testFloat2DConvolutionBias/network.onnx 
b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_Bias/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolutionBias/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_Bias/network.onnx diff --git a/DeeployTest/Tests/testFloat2DConvolutionBias/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_Bias/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolutionBias/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_Bias/outputs.npz diff --git a/DeeployTest/Tests/testFloat2DConvolution/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_NoBias/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolution/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_NoBias/inputs.npz diff --git a/DeeployTest/Tests/testFloat2DConvolution/network.onnx b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_NoBias/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolution/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_NoBias/network.onnx diff --git a/DeeployTest/Tests/testFloat2DConvolution/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_NoBias/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolution/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_NoBias/outputs.npz diff --git a/DeeployTest/Tests/testFloat2DConvolutionZeroBias/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_ZeroValuedBias/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolutionZeroBias/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_ZeroValuedBias/inputs.npz diff --git a/DeeployTest/Tests/testFloat2DConvolutionZeroBias/network.onnx b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_ZeroValuedBias/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolutionZeroBias/network.onnx 
rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_ZeroValuedBias/network.onnx diff --git a/DeeployTest/Tests/testFloat2DConvolutionZeroBias/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_ZeroValuedBias/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DConvolutionZeroBias/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Conv/Regular_2D_ZeroValuedBias/outputs.npz diff --git a/DeeployTest/Tests/testFloatDiv/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Div/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatDiv/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Div/inputs.npz diff --git a/DeeployTest/Tests/testFloatDiv/network.onnx b/DeeployTest/Tests/Kernels/FP32/Div/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatDiv/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Div/network.onnx diff --git a/DeeployTest/Tests/testFloatDiv/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Div/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatDiv/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Div/outputs.npz diff --git a/DeeployTest/Tests/testFloatGelu/inputs.npz b/DeeployTest/Tests/Kernels/FP32/GELU/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatGelu/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/GELU/inputs.npz diff --git a/DeeployTest/Tests/testFloatGelu/network.onnx b/DeeployTest/Tests/Kernels/FP32/GELU/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatGelu/network.onnx rename to DeeployTest/Tests/Kernels/FP32/GELU/network.onnx diff --git a/DeeployTest/Tests/testFloatGelu/outputs.npz b/DeeployTest/Tests/Kernels/FP32/GELU/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatGelu/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/GELU/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/inputs.npz b/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/inputs.npz new file mode 
100644 index 0000000000..47728a3608 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/network.onnx b/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/network.onnx new file mode 100644 index 0000000000..59264acd46 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/outputs.npz b/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/outputs.npz new file mode 100644 index 0000000000..4e0debdf54 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GEMM/NoBias/outputs.npz differ diff --git a/DeeployTest/Tests/testFloatGEMM/inputs.npz b/DeeployTest/Tests/Kernels/FP32/GEMM/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatGEMM/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/GEMM/Regular/inputs.npz diff --git a/DeeployTest/Tests/testFloatGEMM/network.onnx b/DeeployTest/Tests/Kernels/FP32/GEMM/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatGEMM/network.onnx rename to DeeployTest/Tests/Kernels/FP32/GEMM/Regular/network.onnx diff --git a/DeeployTest/Tests/testFloatGEMM/outputs.npz b/DeeployTest/Tests/Kernels/FP32/GEMM/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatGEMM/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/GEMM/Regular/outputs.npz diff --git a/DeeployTest/Tests/testFloatGEMMtransB/inputs.npz b/DeeployTest/Tests/Kernels/FP32/GEMM/TransB/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatGEMMtransB/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/GEMM/TransB/inputs.npz diff --git a/DeeployTest/Tests/testFloatGEMMtransB/network.onnx b/DeeployTest/Tests/Kernels/FP32/GEMM/TransB/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatGEMMtransB/network.onnx rename to DeeployTest/Tests/Kernels/FP32/GEMM/TransB/network.onnx diff --git 
a/DeeployTest/Tests/testFloatGEMMtransB/outputs.npz b/DeeployTest/Tests/Kernels/FP32/GEMM/TransB/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatGEMMtransB/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/GEMM/TransB/outputs.npz diff --git a/DeeployTest/Tests/testFloatLayerNorm/inputs.npz b/DeeployTest/Tests/Kernels/FP32/LayerNorm/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatLayerNorm/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/LayerNorm/inputs.npz diff --git a/DeeployTest/Tests/testFloatLayerNorm/network.onnx b/DeeployTest/Tests/Kernels/FP32/LayerNorm/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatLayerNorm/network.onnx rename to DeeployTest/Tests/Kernels/FP32/LayerNorm/network.onnx diff --git a/DeeployTest/Tests/testFloatLayerNorm/outputs.npz b/DeeployTest/Tests/Kernels/FP32/LayerNorm/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatLayerNorm/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/LayerNorm/outputs.npz diff --git a/DeeployTest/Tests/testFloatMatmul/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatMatmul/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/MatMul/inputs.npz diff --git a/DeeployTest/Tests/testFloatMatmul/network.onnx b/DeeployTest/Tests/Kernels/FP32/MatMul/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatMatmul/network.onnx rename to DeeployTest/Tests/Kernels/FP32/MatMul/network.onnx diff --git a/DeeployTest/Tests/testFloatMatmul/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MatMul/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatMatmul/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/MatMul/outputs.npz diff --git a/DeeployTest/Tests/testFloatMaxPool/inputs.npz b/DeeployTest/Tests/Kernels/FP32/MaxPool/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatMaxPool/inputs.npz rename 
to DeeployTest/Tests/Kernels/FP32/MaxPool/inputs.npz diff --git a/DeeployTest/Tests/testFloatMaxPool/network.onnx b/DeeployTest/Tests/Kernels/FP32/MaxPool/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatMaxPool/network.onnx rename to DeeployTest/Tests/Kernels/FP32/MaxPool/network.onnx diff --git a/DeeployTest/Tests/testFloatMaxPool/outputs.npz b/DeeployTest/Tests/Kernels/FP32/MaxPool/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatMaxPool/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/MaxPool/outputs.npz diff --git a/DeeployTest/Tests/testFloatMul/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Mul/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatMul/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Mul/inputs.npz diff --git a/DeeployTest/Tests/testFloatMul/network.onnx b/DeeployTest/Tests/Kernels/FP32/Mul/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatMul/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Mul/network.onnx diff --git a/DeeployTest/Tests/testFloatMul/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Mul/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatMul/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Mul/outputs.npz diff --git a/DeeployTest/Tests/testFloat2DPadding/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Pad/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DPadding/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Pad/inputs.npz diff --git a/DeeployTest/Tests/testFloat2DPadding/network.onnx b/DeeployTest/Tests/Kernels/FP32/Pad/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloat2DPadding/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Pad/network.onnx diff --git a/DeeployTest/Tests/testFloat2DPadding/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Pad/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloat2DPadding/outputs.npz rename to 
DeeployTest/Tests/Kernels/FP32/Pad/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/inputs.npz new file mode 100644 index 0000000000..1a1fbacabc Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/network.onnx b/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/network.onnx new file mode 100644 index 0000000000..50701ff540 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/outputs.npz new file mode 100644 index 0000000000..8e99e82420 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Pow/Scalar/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Pow/Vector/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Pow/Vector/inputs.npz new file mode 100644 index 0000000000..fe8942ac67 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Pow/Vector/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Pow/Vector/network.onnx b/DeeployTest/Tests/Kernels/FP32/Pow/Vector/network.onnx new file mode 100644 index 0000000000..91be88483c --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Pow/Vector/network.onnx @@ -0,0 +1,23 @@ + +deeploy_test_generator: +3 +data_in +exponentdata_outPow_Vector_Test"Powtest_float_pow_vectorZ! 
+data_in + + + + +Z" +exponent + + + + +b" +data_out + + + + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Pow/Vector/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Pow/Vector/outputs.npz new file mode 100644 index 0000000000..ebe9468d52 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Pow/Vector/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/inputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm/inputs.npz new file mode 100644 index 0000000000..60df101e2e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/network.onnx b/DeeployTest/Tests/Kernels/FP32/RMSNorm/network.onnx new file mode 100644 index 0000000000..906e25d254 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/RMSNorm/outputs.npz b/DeeployTest/Tests/Kernels/FP32/RMSNorm/outputs.npz new file mode 100644 index 0000000000..eb8c1c4942 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/RMSNorm/outputs.npz differ diff --git a/DeeployTest/Tests/testFloatRelu/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReLU/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatRelu/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/ReLU/inputs.npz diff --git a/DeeployTest/Tests/testFloatRelu/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReLU/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatRelu/network.onnx rename to DeeployTest/Tests/Kernels/FP32/ReLU/network.onnx diff --git a/DeeployTest/Tests/testFloatRelu/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReLU/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatRelu/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/ReLU/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/inputs.npz 
b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/network.onnx new file mode 100644 index 0000000000..5bb612d31c Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/outputs.npz new file mode 100644 index 0000000000..48b72a636f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/network.onnx new file mode 100644 index 0000000000..8675da8b4e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/outputs.npz new file mode 100644 index 0000000000..241904dd35 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add/outputs.npz differ diff --git 
a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/inputs.npz new file mode 100644 index 0000000000..8970f698ad Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/network.onnx new file mode 100644 index 0000000000..c8c9d011f7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/outputs.npz new file mode 100644 index 0000000000..13ebdd8585 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/AllAxes/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/inputs.npz new file mode 100644 index 0000000000..8970f698ad Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/network.onnx new file mode 100644 index 0000000000..589670ea8e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/outputs.npz new file mode 100644 index 0000000000..13ebdd8585 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/inputs.npz 
b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/network.onnx new file mode 100644 index 0000000000..aeaf210c69 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/outputs.npz new file mode 100644 index 0000000000..b39a9200bd Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes1_3/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/network.onnx new file mode 100644 index 0000000000..7941bad80c Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/outputs.npz new file mode 100644 index 0000000000..b108ed17bf Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axes2_1/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/inputs.npz new file mode 100644 index 
0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/network.onnx new file mode 100644 index 0000000000..3bc8a3a93a Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/outputs.npz new file mode 100644 index 0000000000..21761d2281 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis0/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/network.onnx new file mode 100644 index 0000000000..1f6be48e8e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/outputs.npz new file mode 100644 index 0000000000..180cd2795f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/Axis2/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/inputs.npz 
differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/network.onnx new file mode 100644 index 0000000000..babf226169 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/outputs.npz new file mode 100644 index 0000000000..48b72a636f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/network.onnx new file mode 100644 index 0000000000..3efac8fdf7 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/outputs.npz new file mode 100644 index 0000000000..3eef9f240f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/network.onnx new file mode 100644 index 0000000000..887b122ea8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/outputs.npz new file mode 100644 index 0000000000..4a4bc781c8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/inputs.npz new file mode 100644 index 0000000000..8970f698ad Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/network.onnx new file mode 100644 index 0000000000..5526be4a80 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/outputs.npz new file mode 100644 index 0000000000..e2e66e5824 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/AllAxes/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/inputs.npz new file mode 100644 index 0000000000..8970f698ad Binary files /dev/null 
and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/network.onnx new file mode 100644 index 0000000000..b21abffd28 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/outputs.npz new file mode 100644 index 0000000000..abe06e5a28 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/network.onnx new file mode 100644 index 0000000000..524e462371 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/outputs.npz new file mode 100644 index 0000000000..7de7741ef8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/network.onnx new file mode 100644 index 0000000000..24a7ad68d6 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/outputs.npz new file mode 100644 index 0000000000..9db04e582f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/network.onnx new file mode 100644 index 0000000000..249858b03b Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/outputs.npz new file mode 100644 index 0000000000..847ceaf4ec Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis0/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/inputs.npz differ diff --git 
a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/network.onnx new file mode 100644 index 0000000000..496f7a9e18 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/outputs.npz new file mode 100644 index 0000000000..95e79e2a23 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/Axis2/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/inputs.npz new file mode 100644 index 0000000000..42c1f86253 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/network.onnx b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/network.onnx new file mode 100644 index 0000000000..40188be091 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/outputs.npz new file mode 100644 index 0000000000..3eef9f240f Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add/outputs.npz differ diff --git a/DeeployTest/Tests/testFloatReduceSum/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceSum/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatReduceSum/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/ReduceSum/inputs.npz diff --git a/DeeployTest/Tests/testFloatReduceSum/network.onnx 
b/DeeployTest/Tests/Kernels/FP32/ReduceSum/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatReduceSum/network.onnx rename to DeeployTest/Tests/Kernels/FP32/ReduceSum/network.onnx diff --git a/DeeployTest/Tests/testFloatReduceSum/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ReduceSum/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatReduceSum/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/ReduceSum/outputs.npz diff --git a/DeeployTest/Tests/testFloatReshape/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Reshape/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatReshape/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Reshape/Regular/inputs.npz diff --git a/DeeployTest/Tests/testFloatReshape/network.onnx b/DeeployTest/Tests/Kernels/FP32/Reshape/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatReshape/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Reshape/Regular/network.onnx diff --git a/DeeployTest/Tests/testFloatReshape/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Reshape/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatReshape/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Reshape/Regular/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/inputs.npz new file mode 100644 index 0000000000..36567a96ce Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/network.onnx b/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/network.onnx new file mode 100644 index 0000000000..5eb3ae446e Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/outputs.npz 
b/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/outputs.npz new file mode 100644 index 0000000000..0e2e55fcfe Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Reshape/SkipConnection/outputs.npz differ diff --git a/DeeployTest/Tests/testFloatSoftmaxCrossEntropy/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSoftmaxCrossEntropy/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/inputs.npz diff --git a/DeeployTest/Tests/testFloatSoftmaxCrossEntropy/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatSoftmaxCrossEntropy/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/network.onnx diff --git a/DeeployTest/Tests/testFloatSoftmaxCrossEntropy/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSoftmaxCrossEntropy/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropy/outputs.npz diff --git a/DeeployTest/Tests/testFloatSoftmaxCrossEntropyGrad/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropyGrad/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSoftmaxCrossEntropyGrad/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropyGrad/inputs.npz diff --git a/DeeployTest/Tests/testFloatSoftmaxCrossEntropyGrad/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropyGrad/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatSoftmaxCrossEntropyGrad/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropyGrad/network.onnx diff --git a/DeeployTest/Tests/testFloatSoftmaxCrossEntropyGrad/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropyGrad/outputs.npz similarity index 100% rename from 
DeeployTest/Tests/testFloatSoftmaxCrossEntropyGrad/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Softmax/CrossEntropyGrad/outputs.npz diff --git a/DeeployTest/Tests/testFloatSoftmaxGrad/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Grad/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSoftmaxGrad/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Softmax/Grad/inputs.npz diff --git a/DeeployTest/Tests/testFloatSoftmaxGrad/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/Grad/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatSoftmaxGrad/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Softmax/Grad/network.onnx diff --git a/DeeployTest/Tests/testFloatSoftmaxGrad/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Grad/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSoftmaxGrad/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Softmax/Grad/outputs.npz diff --git a/DeeployTest/Tests/testFloatSoftmax/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSoftmax/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Softmax/Regular/inputs.npz diff --git a/DeeployTest/Tests/testFloatSoftmax/network.onnx b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatSoftmax/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Softmax/Regular/network.onnx diff --git a/DeeployTest/Tests/testFloatSoftmax/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Softmax/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSoftmax/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Softmax/Regular/outputs.npz diff --git a/DeeployTest/Tests/Kernels/FP32/Sqrt/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Sqrt/inputs.npz new file mode 100644 index 0000000000..c54577cb24 Binary files /dev/null and 
b/DeeployTest/Tests/Kernels/FP32/Sqrt/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Sqrt/network.onnx b/DeeployTest/Tests/Kernels/FP32/Sqrt/network.onnx new file mode 100644 index 0000000000..c2f27907fa Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Sqrt/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/Sqrt/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Sqrt/outputs.npz new file mode 100644 index 0000000000..f6d42c73a1 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Sqrt/outputs.npz differ diff --git a/DeeployTest/Tests/testFloatSqueeze/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Squeeze/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSqueeze/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Squeeze/inputs.npz diff --git a/DeeployTest/Tests/testFloatSqueeze/network.onnx b/DeeployTest/Tests/Kernels/FP32/Squeeze/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatSqueeze/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Squeeze/network.onnx diff --git a/DeeployTest/Tests/testFloatSqueeze/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Squeeze/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSqueeze/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Squeeze/outputs.npz diff --git a/DeeployTest/Tests/testFloatTranspose/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Transpose/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatTranspose/inputs.npz rename to DeeployTest/Tests/Kernels/FP32/Transpose/inputs.npz diff --git a/DeeployTest/Tests/testFloatTranspose/network.onnx b/DeeployTest/Tests/Kernels/FP32/Transpose/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatTranspose/network.onnx rename to DeeployTest/Tests/Kernels/FP32/Transpose/network.onnx diff --git a/DeeployTest/Tests/testFloatTranspose/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Transpose/outputs.npz similarity index 100% rename from 
DeeployTest/Tests/testFloatTranspose/outputs.npz rename to DeeployTest/Tests/Kernels/FP32/Transpose/outputs.npz diff --git a/DeeployTest/Tests/TestAdderLarge/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Add/Large/inputs.npz similarity index 100% rename from DeeployTest/Tests/TestAdderLarge/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Add/Large/inputs.npz diff --git a/DeeployTest/Tests/TestAdderLarge/network.onnx b/DeeployTest/Tests/Kernels/Integer/Add/Large/network.onnx similarity index 100% rename from DeeployTest/Tests/TestAdderLarge/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Add/Large/network.onnx diff --git a/DeeployTest/Tests/TestAdderLarge/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Add/Large/outputs.npz similarity index 100% rename from DeeployTest/Tests/TestAdderLarge/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Add/Large/outputs.npz diff --git a/DeeployTest/Tests/MultIO/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Add/MultIO/inputs.npz similarity index 100% rename from DeeployTest/Tests/MultIO/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Add/MultIO/inputs.npz diff --git a/DeeployTest/Tests/MultIO/network.onnx b/DeeployTest/Tests/Kernels/Integer/Add/MultIO/network.onnx similarity index 100% rename from DeeployTest/Tests/MultIO/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Add/MultIO/network.onnx diff --git a/DeeployTest/Tests/MultIO/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Add/MultIO/outputs.npz similarity index 100% rename from DeeployTest/Tests/MultIO/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Add/MultIO/outputs.npz diff --git a/DeeployTest/Tests/Kernels/Integer/Add/Regular/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Add/Regular/inputs.npz new file mode 100644 index 0000000000..6b104c7770 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Add/Regular/inputs.npz differ diff --git a/DeeployTest/Tests/Adder/network.onnx 
b/DeeployTest/Tests/Kernels/Integer/Add/Regular/network.onnx similarity index 51% rename from DeeployTest/Tests/Adder/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Add/Regular/network.onnx index 851801aaa5..6c836a0ff3 100644 --- a/DeeployTest/Tests/Adder/network.onnx +++ b/DeeployTest/Tests/Kernels/Integer/Add/Regular/network.onnx @@ -1,7 +1,8 @@ -pytorch1.11.0: -) +pytorch2.7.1: +( onnx::Add_0 - onnx::Add_12Add_0"Addtorch-jit-exportZ% + onnx::Add_12/Add"Add +main_graphZ% onnx::Add_0   @@ -19,10 +20,4 @@    -j -2 - - - - -B \ No newline at end of file +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/Integer/Add/Regular/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Add/Regular/outputs.npz new file mode 100644 index 0000000000..f73b28d551 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Add/Regular/outputs.npz differ diff --git a/DeeployTest/Tests/TestRQAdd/activations.npz b/DeeployTest/Tests/Kernels/Integer/Add/Regular_RQ/activations.npz similarity index 100% rename from DeeployTest/Tests/TestRQAdd/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Add/Regular_RQ/activations.npz diff --git a/DeeployTest/Tests/TestRQAdd/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Add/Regular_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/TestRQAdd/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Add/Regular_RQ/inputs.npz diff --git a/DeeployTest/Tests/TestRQAdd/network.onnx b/DeeployTest/Tests/Kernels/Integer/Add/Regular_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/TestRQAdd/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Add/Regular_RQ/network.onnx diff --git a/DeeployTest/Tests/TestRQAdd/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Add/Regular_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/TestRQAdd/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Add/Regular_RQ/outputs.npz diff --git a/DeeployTest/Tests/Attention/activations.npz 
b/DeeployTest/Tests/Kernels/Integer/Attention/activations.npz similarity index 100% rename from DeeployTest/Tests/Attention/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Attention/activations.npz diff --git a/DeeployTest/Tests/Attention/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Attention/inputs.npz similarity index 100% rename from DeeployTest/Tests/Attention/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Attention/inputs.npz diff --git a/DeeployTest/Tests/Attention/network.onnx b/DeeployTest/Tests/Kernels/Integer/Attention/network.onnx similarity index 100% rename from DeeployTest/Tests/Attention/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Attention/network.onnx diff --git a/DeeployTest/Tests/Attention/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Attention/outputs.npz similarity index 100% rename from DeeployTest/Tests/Attention/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Attention/outputs.npz diff --git a/DeeployTest/Tests/TestiSoftmaxLarge/activations.npz b/DeeployTest/Tests/Kernels/Integer/Concat/activations.npz similarity index 100% rename from DeeployTest/Tests/TestiSoftmaxLarge/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Concat/activations.npz diff --git a/DeeployTest/Tests/testConcat/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Concat/inputs.npz similarity index 100% rename from DeeployTest/Tests/testConcat/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Concat/inputs.npz diff --git a/DeeployTest/Tests/testConcat/network.onnx b/DeeployTest/Tests/Kernels/Integer/Concat/network.onnx similarity index 100% rename from DeeployTest/Tests/testConcat/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Concat/network.onnx diff --git a/DeeployTest/Tests/testConcat/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Concat/outputs.npz similarity index 100% rename from DeeployTest/Tests/testConcat/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Concat/outputs.npz diff --git 
a/DeeployTest/Tests/test1DDWConvolution/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_1D/inputs.npz similarity index 100% rename from DeeployTest/Tests/test1DDWConvolution/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_1D/inputs.npz diff --git a/DeeployTest/Tests/test1DDWConvolution/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/DW_1D/network.onnx similarity index 100% rename from DeeployTest/Tests/test1DDWConvolution/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_1D/network.onnx diff --git a/DeeployTest/Tests/test1DDWConvolution/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_1D/outputs.npz similarity index 100% rename from DeeployTest/Tests/test1DDWConvolution/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_1D/outputs.npz diff --git a/DeeployTest/Tests/test2DDWConvolution/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D/inputs.npz similarity index 100% rename from DeeployTest/Tests/test2DDWConvolution/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_2D/inputs.npz diff --git a/DeeployTest/Tests/test2DDWConvolution/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D/network.onnx similarity index 100% rename from DeeployTest/Tests/test2DDWConvolution/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_2D/network.onnx diff --git a/DeeployTest/Tests/test2DDWConvolution/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D/outputs.npz similarity index 100% rename from DeeployTest/Tests/test2DDWConvolution/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_2D/outputs.npz diff --git a/DeeployTest/Tests/testRequantizedDWConv/activations.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ/activations.npz similarity index 100% rename from DeeployTest/Tests/testRequantizedDWConv/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ/activations.npz diff --git a/DeeployTest/Tests/testRequantizedDWConv/inputs.npz 
b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRequantizedDWConv/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ/inputs.npz diff --git a/DeeployTest/Tests/testRequantizedDWConv/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/testRequantizedDWConv/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ/network.onnx diff --git a/DeeployTest/Tests/testRequantizedDWConv/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRequantizedDWConv/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/DW_2D_RQ/outputs.npz diff --git a/DeeployTest/Tests/testPointwise/activations.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D/activations.npz similarity index 100% rename from DeeployTest/Tests/testPointwise/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D/activations.npz diff --git a/DeeployTest/Tests/testPointwise/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D/inputs.npz similarity index 100% rename from DeeployTest/Tests/testPointwise/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D/inputs.npz diff --git a/DeeployTest/Tests/testPointwise/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D/network.onnx similarity index 100% rename from DeeployTest/Tests/testPointwise/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D/network.onnx diff --git a/DeeployTest/Tests/testPointwise/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D/outputs.npz similarity index 100% rename from DeeployTest/Tests/testPointwise/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D/outputs.npz diff --git a/DeeployTest/Tests/testPointwiseConvBNReLU/activations.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ/activations.npz 
similarity index 100% rename from DeeployTest/Tests/testPointwiseConvBNReLU/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ/activations.npz diff --git a/DeeployTest/Tests/testPointwiseConvBNReLU/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/testPointwiseConvBNReLU/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ/inputs.npz diff --git a/DeeployTest/Tests/testPointwiseConvBNReLU/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/testPointwiseConvBNReLU/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ/network.onnx diff --git a/DeeployTest/Tests/testPointwiseConvBNReLU/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/testPointwiseConvBNReLU/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ/outputs.npz diff --git a/DeeployTest/Tests/testPointwiseUnsignedWeights/activations.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ/activations.npz similarity index 100% rename from DeeployTest/Tests/testPointwiseUnsignedWeights/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ/activations.npz diff --git a/DeeployTest/Tests/testPointwiseUnsignedWeights/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/testPointwiseUnsignedWeights/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ/inputs.npz diff --git a/DeeployTest/Tests/testPointwiseUnsignedWeights/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/testPointwiseUnsignedWeights/network.onnx 
rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ/network.onnx diff --git a/DeeployTest/Tests/testPointwiseUnsignedWeights/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/testPointwiseUnsignedWeights/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ/outputs.npz diff --git a/DeeployTest/Tests/test1DConvolution/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_1D/inputs.npz similarity index 100% rename from DeeployTest/Tests/test1DConvolution/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_1D/inputs.npz diff --git a/DeeployTest/Tests/test1DConvolution/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_1D/network.onnx similarity index 100% rename from DeeployTest/Tests/test1DConvolution/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_1D/network.onnx diff --git a/DeeployTest/Tests/test1DConvolution/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_1D/outputs.npz similarity index 100% rename from DeeployTest/Tests/test1DConvolution/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_1D/outputs.npz diff --git a/DeeployTest/Tests/test2DConvolution/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D/inputs.npz similarity index 100% rename from DeeployTest/Tests/test2DConvolution/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D/inputs.npz diff --git a/DeeployTest/Tests/test2DConvolution/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D/network.onnx similarity index 100% rename from DeeployTest/Tests/test2DConvolution/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D/network.onnx diff --git a/DeeployTest/Tests/test2DConvolution/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D/outputs.npz similarity index 100% rename from 
DeeployTest/Tests/test2DConvolution/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D/outputs.npz diff --git a/DeeployTest/Tests/testRQConv/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRQConv/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D_RQ/inputs.npz diff --git a/DeeployTest/Tests/testRQConv/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/testRQConv/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D_RQ/network.onnx diff --git a/DeeployTest/Tests/testRQConv/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRQConv/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/Regular_2D_RQ/outputs.npz diff --git a/DeeployTest/Tests/test2DRequantizedStriddedPaddedConv/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/StriddedPadded_2D_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/test2DRequantizedStriddedPaddedConv/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/StriddedPadded_2D_RQ/inputs.npz diff --git a/DeeployTest/Tests/test2DRequantizedStriddedPaddedConv/network.onnx b/DeeployTest/Tests/Kernels/Integer/Conv/StriddedPadded_2D_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/test2DRequantizedStriddedPaddedConv/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Conv/StriddedPadded_2D_RQ/network.onnx diff --git a/DeeployTest/Tests/test2DRequantizedStriddedPaddedConv/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Conv/StriddedPadded_2D_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/test2DRequantizedStriddedPaddedConv/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Conv/StriddedPadded_2D_RQ/outputs.npz diff --git 
a/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/inputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/inputs.npz new file mode 100644 index 0000000000..beb5343bfa Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/network.onnx b/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/network.onnx new file mode 100644 index 0000000000..eaf45f0ff4 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/outputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/outputs.npz new file mode 100644 index 0000000000..6578113295 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/GEMM/Batch_RQ/outputs.npz differ diff --git a/DeeployTest/Tests/testGEMM/inputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/testGEMM/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular/inputs.npz diff --git a/DeeployTest/Tests/testGEMM/network.onnx b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/testGEMM/network.onnx rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular/network.onnx diff --git a/DeeployTest/Tests/testGEMM/outputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/testGEMM/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular/outputs.npz diff --git a/DeeployTest/Tests/testRequantizedLinear/activations.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerColumn/activations.npz similarity index 100% rename from DeeployTest/Tests/testRequantizedLinear/activations.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerColumn/activations.npz diff --git a/DeeployTest/Tests/testRequantizedLinear/inputs.npz 
b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerColumn/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRequantizedLinear/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerColumn/inputs.npz diff --git a/DeeployTest/Tests/testRequantizedLinear/network.onnx b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerColumn/network.onnx similarity index 100% rename from DeeployTest/Tests/testRequantizedLinear/network.onnx rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerColumn/network.onnx diff --git a/DeeployTest/Tests/testRequantizedLinear/outputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerColumn/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRequantizedLinear/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerColumn/outputs.npz diff --git a/DeeployTest/Tests/testRQGEMM/inputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerRow/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRQGEMM/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerRow/inputs.npz diff --git a/DeeployTest/Tests/testRQGEMM/network.onnx b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerRow/network.onnx similarity index 100% rename from DeeployTest/Tests/testRQGEMM/network.onnx rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerRow/network.onnx diff --git a/DeeployTest/Tests/testRQGEMM/outputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerRow/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRQGEMM/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/Regular_RQPerRow/outputs.npz diff --git a/DeeployTest/Tests/testRQGEMMTransB/inputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/TransB_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRQGEMMTransB/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/TransB_RQ/inputs.npz diff --git 
a/DeeployTest/Tests/testRQGEMMTransB/network.onnx b/DeeployTest/Tests/Kernels/Integer/GEMM/TransB_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/testRQGEMMTransB/network.onnx rename to DeeployTest/Tests/Kernels/Integer/GEMM/TransB_RQ/network.onnx diff --git a/DeeployTest/Tests/testRQGEMMTransB/outputs.npz b/DeeployTest/Tests/Kernels/Integer/GEMM/TransB_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRQGEMMTransB/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/GEMM/TransB_RQ/outputs.npz diff --git a/DeeployTest/Tests/Hardswish/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Hardswish/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/Hardswish/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Hardswish/Regular/inputs.npz diff --git a/DeeployTest/Tests/Hardswish/network.onnx b/DeeployTest/Tests/Kernels/Integer/Hardswish/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/Hardswish/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Hardswish/Regular/network.onnx diff --git a/DeeployTest/Tests/Hardswish/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Hardswish/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/Hardswish/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Hardswish/Regular/outputs.npz diff --git a/DeeployTest/Tests/RQHardswish/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Hardswish/Regular_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/RQHardswish/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Hardswish/Regular_RQ/inputs.npz diff --git a/DeeployTest/Tests/RQHardswish/network.onnx b/DeeployTest/Tests/Kernels/Integer/Hardswish/Regular_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/RQHardswish/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Hardswish/Regular_RQ/network.onnx diff --git a/DeeployTest/Tests/RQHardswish/outputs.npz 
b/DeeployTest/Tests/Kernels/Integer/Hardswish/Regular_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/RQHardswish/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Hardswish/Regular_RQ/outputs.npz diff --git a/DeeployTest/Tests/testRemoveIdentityOp/inputIdentity/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Identity/InputIdentity/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/inputIdentity/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Identity/InputIdentity/inputs.npz diff --git a/DeeployTest/Tests/testRemoveIdentityOp/inputIdentity/network.onnx b/DeeployTest/Tests/Kernels/Integer/Identity/InputIdentity/network.onnx similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/inputIdentity/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Identity/InputIdentity/network.onnx diff --git a/DeeployTest/Tests/testRemoveIdentityOp/inputIdentity/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Identity/InputIdentity/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/inputIdentity/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Identity/InputIdentity/outputs.npz diff --git a/DeeployTest/Tests/testRemoveIdentityOp/multiOutputIdentity/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Identity/MultiOutputIdentity/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/multiOutputIdentity/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Identity/MultiOutputIdentity/inputs.npz diff --git a/DeeployTest/Tests/testRemoveIdentityOp/multiOutputIdentity/network.onnx b/DeeployTest/Tests/Kernels/Integer/Identity/MultiOutputIdentity/network.onnx similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/multiOutputIdentity/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Identity/MultiOutputIdentity/network.onnx diff --git a/DeeployTest/Tests/testRemoveIdentityOp/multiOutputIdentity/outputs.npz 
b/DeeployTest/Tests/Kernels/Integer/Identity/MultiOutputIdentity/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/multiOutputIdentity/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Identity/MultiOutputIdentity/outputs.npz diff --git a/DeeployTest/Tests/testRemoveIdentityOp/outputIdentity/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Identity/OutputIdentity/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/outputIdentity/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Identity/OutputIdentity/inputs.npz diff --git a/DeeployTest/Tests/testRemoveIdentityOp/outputIdentity/network.onnx b/DeeployTest/Tests/Kernels/Integer/Identity/OutputIdentity/network.onnx similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/outputIdentity/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Identity/OutputIdentity/network.onnx diff --git a/DeeployTest/Tests/testRemoveIdentityOp/outputIdentity/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Identity/OutputIdentity/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRemoveIdentityOp/outputIdentity/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Identity/OutputIdentity/outputs.npz diff --git a/DeeployTest/Tests/testMatMulAdd/inputs.npz b/DeeployTest/Tests/Kernels/Integer/MatMul/Add/inputs.npz similarity index 100% rename from DeeployTest/Tests/testMatMulAdd/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/MatMul/Add/inputs.npz diff --git a/DeeployTest/Tests/testMatMulAdd/network.onnx b/DeeployTest/Tests/Kernels/Integer/MatMul/Add/network.onnx similarity index 100% rename from DeeployTest/Tests/testMatMulAdd/network.onnx rename to DeeployTest/Tests/Kernels/Integer/MatMul/Add/network.onnx diff --git a/DeeployTest/Tests/testMatMulAdd/outputs.npz b/DeeployTest/Tests/Kernels/Integer/MatMul/Add/outputs.npz similarity index 100% rename from DeeployTest/Tests/testMatMulAdd/outputs.npz rename to 
DeeployTest/Tests/Kernels/Integer/MatMul/Add/outputs.npz diff --git a/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/inputs.npz b/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/inputs.npz new file mode 100644 index 0000000000..d9098e15d3 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/network.onnx b/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/network.onnx new file mode 100644 index 0000000000..f8e86371f0 Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/outputs.npz b/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/outputs.npz new file mode 100644 index 0000000000..a3ed1f597a Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/MatMul/Batch/outputs.npz differ diff --git a/DeeployTest/Tests/testMatMul/inputs.npz b/DeeployTest/Tests/Kernels/Integer/MatMul/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/testMatMul/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/MatMul/Regular/inputs.npz diff --git a/DeeployTest/Tests/testMatMul/network.onnx b/DeeployTest/Tests/Kernels/Integer/MatMul/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/testMatMul/network.onnx rename to DeeployTest/Tests/Kernels/Integer/MatMul/Regular/network.onnx diff --git a/DeeployTest/Tests/testMatMul/outputs.npz b/DeeployTest/Tests/Kernels/Integer/MatMul/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/testMatMul/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/MatMul/Regular/outputs.npz diff --git a/DeeployTest/Tests/testRQMatMul/inputs.npz b/DeeployTest/Tests/Kernels/Integer/MatMul/Regular_RQ/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRQMatMul/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/MatMul/Regular_RQ/inputs.npz diff --git 
a/DeeployTest/Tests/testRQMatMul/network.onnx b/DeeployTest/Tests/Kernels/Integer/MatMul/Regular_RQ/network.onnx similarity index 100% rename from DeeployTest/Tests/testRQMatMul/network.onnx rename to DeeployTest/Tests/Kernels/Integer/MatMul/Regular_RQ/network.onnx diff --git a/DeeployTest/Tests/testRQMatMul/outputs.npz b/DeeployTest/Tests/Kernels/Integer/MatMul/Regular_RQ/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRQMatMul/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/MatMul/Regular_RQ/outputs.npz diff --git a/DeeployTest/Tests/testMaxPool/inputs.npz b/DeeployTest/Tests/Kernels/Integer/MaxPool/inputs.npz similarity index 100% rename from DeeployTest/Tests/testMaxPool/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/MaxPool/inputs.npz diff --git a/DeeployTest/Tests/testMaxPool/network.onnx b/DeeployTest/Tests/Kernels/Integer/MaxPool/network.onnx similarity index 100% rename from DeeployTest/Tests/testMaxPool/network.onnx rename to DeeployTest/Tests/Kernels/Integer/MaxPool/network.onnx diff --git a/DeeployTest/Tests/testMaxPool/outputs.npz b/DeeployTest/Tests/Kernels/Integer/MaxPool/outputs.npz similarity index 100% rename from DeeployTest/Tests/testMaxPool/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/MaxPool/outputs.npz diff --git a/DeeployTest/Tests/test1DPad/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Pad/Regular_1D/inputs.npz similarity index 100% rename from DeeployTest/Tests/test1DPad/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Pad/Regular_1D/inputs.npz diff --git a/DeeployTest/Tests/test1DPad/network.onnx b/DeeployTest/Tests/Kernels/Integer/Pad/Regular_1D/network.onnx similarity index 100% rename from DeeployTest/Tests/test1DPad/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Pad/Regular_1D/network.onnx diff --git a/DeeployTest/Tests/test1DPad/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Pad/Regular_1D/outputs.npz similarity index 100% rename from 
DeeployTest/Tests/test1DPad/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Pad/Regular_1D/outputs.npz diff --git a/DeeployTest/Tests/test2DPad/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Pad/Regular_2D/inputs.npz similarity index 100% rename from DeeployTest/Tests/test2DPad/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Pad/Regular_2D/inputs.npz diff --git a/DeeployTest/Tests/test2DPad/network.onnx b/DeeployTest/Tests/Kernels/Integer/Pad/Regular_2D/network.onnx similarity index 100% rename from DeeployTest/Tests/test2DPad/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Pad/Regular_2D/network.onnx diff --git a/DeeployTest/Tests/test2DPad/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Pad/Regular_2D/outputs.npz similarity index 100% rename from DeeployTest/Tests/test2DPad/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Pad/Regular_2D/outputs.npz diff --git a/DeeployTest/Tests/testRMSNorm/activations.npz b/DeeployTest/Tests/Kernels/Integer/RMSNorm/activations.npz similarity index 100% rename from DeeployTest/Tests/testRMSNorm/activations.npz rename to DeeployTest/Tests/Kernels/Integer/RMSNorm/activations.npz diff --git a/DeeployTest/Tests/testRMSNorm/inputs.npz b/DeeployTest/Tests/Kernels/Integer/RMSNorm/inputs.npz similarity index 100% rename from DeeployTest/Tests/testRMSNorm/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/RMSNorm/inputs.npz diff --git a/DeeployTest/Tests/testRMSNorm/network.onnx b/DeeployTest/Tests/Kernels/Integer/RMSNorm/network.onnx similarity index 100% rename from DeeployTest/Tests/testRMSNorm/network.onnx rename to DeeployTest/Tests/Kernels/Integer/RMSNorm/network.onnx diff --git a/DeeployTest/Tests/testRMSNorm/outputs.npz b/DeeployTest/Tests/Kernels/Integer/RMSNorm/outputs.npz similarity index 100% rename from DeeployTest/Tests/testRMSNorm/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/RMSNorm/outputs.npz diff --git a/DeeployTest/Tests/testReduceMean/inputs.npz 
b/DeeployTest/Tests/Kernels/Integer/ReduceMean/inputs.npz similarity index 100% rename from DeeployTest/Tests/testReduceMean/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/ReduceMean/inputs.npz diff --git a/DeeployTest/Tests/testReduceMean/network.onnx b/DeeployTest/Tests/Kernels/Integer/ReduceMean/network.onnx similarity index 100% rename from DeeployTest/Tests/testReduceMean/network.onnx rename to DeeployTest/Tests/Kernels/Integer/ReduceMean/network.onnx diff --git a/DeeployTest/Tests/testReduceMean/outputs.npz b/DeeployTest/Tests/Kernels/Integer/ReduceMean/outputs.npz similarity index 100% rename from DeeployTest/Tests/testReduceMean/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/ReduceMean/outputs.npz diff --git a/DeeployTest/Tests/testReduceSum/inputs.npz b/DeeployTest/Tests/Kernels/Integer/ReduceSum/inputs.npz similarity index 100% rename from DeeployTest/Tests/testReduceSum/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/ReduceSum/inputs.npz diff --git a/DeeployTest/Tests/testReduceSum/network.onnx b/DeeployTest/Tests/Kernels/Integer/ReduceSum/network.onnx similarity index 100% rename from DeeployTest/Tests/testReduceSum/network.onnx rename to DeeployTest/Tests/Kernels/Integer/ReduceSum/network.onnx diff --git a/DeeployTest/Tests/testReduceSum/outputs.npz b/DeeployTest/Tests/Kernels/Integer/ReduceSum/outputs.npz similarity index 100% rename from DeeployTest/Tests/testReduceSum/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/ReduceSum/outputs.npz diff --git a/DeeployTest/Tests/iSoftmax/activations.npz b/DeeployTest/Tests/Kernels/Integer/Slice/activations.npz similarity index 100% rename from DeeployTest/Tests/iSoftmax/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Slice/activations.npz diff --git a/DeeployTest/Tests/testSlice/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Slice/inputs.npz similarity index 100% rename from DeeployTest/Tests/testSlice/inputs.npz rename to 
DeeployTest/Tests/Kernels/Integer/Slice/inputs.npz diff --git a/DeeployTest/Tests/testSlice/network.onnx b/DeeployTest/Tests/Kernels/Integer/Slice/network.onnx similarity index 100% rename from DeeployTest/Tests/testSlice/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Slice/network.onnx diff --git a/DeeployTest/Tests/testSlice/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Slice/outputs.npz similarity index 100% rename from DeeployTest/Tests/testSlice/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Slice/outputs.npz diff --git a/DeeployTest/Tests/testBacktracking/activations.npz b/DeeployTest/Tests/Kernels/Integer/Softmax/Large/activations.npz similarity index 100% rename from DeeployTest/Tests/testBacktracking/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Softmax/Large/activations.npz diff --git a/DeeployTest/Tests/TestiSoftmaxLarge/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Softmax/Large/inputs.npz similarity index 100% rename from DeeployTest/Tests/TestiSoftmaxLarge/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Softmax/Large/inputs.npz diff --git a/DeeployTest/Tests/TestiSoftmaxLarge/network.onnx b/DeeployTest/Tests/Kernels/Integer/Softmax/Large/network.onnx similarity index 100% rename from DeeployTest/Tests/TestiSoftmaxLarge/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Softmax/Large/network.onnx diff --git a/DeeployTest/Tests/TestiSoftmaxLarge/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Softmax/Large/outputs.npz similarity index 100% rename from DeeployTest/Tests/TestiSoftmaxLarge/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Softmax/Large/outputs.npz diff --git a/DeeployTest/Tests/testConcat/activations.npz b/DeeployTest/Tests/Kernels/Integer/Softmax/Regular/activations.npz similarity index 100% rename from DeeployTest/Tests/testConcat/activations.npz rename to DeeployTest/Tests/Kernels/Integer/Softmax/Regular/activations.npz diff --git a/DeeployTest/Tests/iSoftmax/inputs.npz 
b/DeeployTest/Tests/Kernels/Integer/Softmax/Regular/inputs.npz similarity index 100% rename from DeeployTest/Tests/iSoftmax/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/Softmax/Regular/inputs.npz diff --git a/DeeployTest/Tests/iSoftmax/network.onnx b/DeeployTest/Tests/Kernels/Integer/Softmax/Regular/network.onnx similarity index 100% rename from DeeployTest/Tests/iSoftmax/network.onnx rename to DeeployTest/Tests/Kernels/Integer/Softmax/Regular/network.onnx diff --git a/DeeployTest/Tests/iSoftmax/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Softmax/Regular/outputs.npz similarity index 100% rename from DeeployTest/Tests/iSoftmax/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/Softmax/Regular/outputs.npz diff --git a/DeeployTest/Tests/trueIntegerDivSandwich/activations.npz b/DeeployTest/Tests/Kernels/Integer/TrueIntegerDiv/activations.npz similarity index 100% rename from DeeployTest/Tests/trueIntegerDivSandwich/activations.npz rename to DeeployTest/Tests/Kernels/Integer/TrueIntegerDiv/activations.npz diff --git a/DeeployTest/Tests/trueIntegerDivSandwich/inputs.npz b/DeeployTest/Tests/Kernels/Integer/TrueIntegerDiv/inputs.npz similarity index 100% rename from DeeployTest/Tests/trueIntegerDivSandwich/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/TrueIntegerDiv/inputs.npz diff --git a/DeeployTest/Tests/trueIntegerDivSandwich/network.onnx b/DeeployTest/Tests/Kernels/Integer/TrueIntegerDiv/network.onnx similarity index 100% rename from DeeployTest/Tests/trueIntegerDivSandwich/network.onnx rename to DeeployTest/Tests/Kernels/Integer/TrueIntegerDiv/network.onnx diff --git a/DeeployTest/Tests/trueIntegerDivSandwich/outputs.npz b/DeeployTest/Tests/Kernels/Integer/TrueIntegerDiv/outputs.npz similarity index 100% rename from DeeployTest/Tests/trueIntegerDivSandwich/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/TrueIntegerDiv/outputs.npz diff --git a/DeeployTest/Tests/testFloatAdder/activations.npz 
b/DeeployTest/Tests/Kernels/Integer/iNoNorm/activations.npz similarity index 100% rename from DeeployTest/Tests/testFloatAdder/activations.npz rename to DeeployTest/Tests/Kernels/Integer/iNoNorm/activations.npz diff --git a/DeeployTest/Tests/TestiNoNorm/inputs.npz b/DeeployTest/Tests/Kernels/Integer/iNoNorm/inputs.npz similarity index 100% rename from DeeployTest/Tests/TestiNoNorm/inputs.npz rename to DeeployTest/Tests/Kernels/Integer/iNoNorm/inputs.npz diff --git a/DeeployTest/Tests/TestiNoNorm/network.onnx b/DeeployTest/Tests/Kernels/Integer/iNoNorm/network.onnx similarity index 100% rename from DeeployTest/Tests/TestiNoNorm/network.onnx rename to DeeployTest/Tests/Kernels/Integer/iNoNorm/network.onnx diff --git a/DeeployTest/Tests/TestiNoNorm/outputs.npz b/DeeployTest/Tests/Kernels/Integer/iNoNorm/outputs.npz similarity index 100% rename from DeeployTest/Tests/TestiNoNorm/outputs.npz rename to DeeployTest/Tests/Kernels/Integer/iNoNorm/outputs.npz diff --git a/DeeployTest/Tests/Dequant/inputs.npz b/DeeployTest/Tests/Kernels/Mixed/Dequant/inputs.npz similarity index 100% rename from DeeployTest/Tests/Dequant/inputs.npz rename to DeeployTest/Tests/Kernels/Mixed/Dequant/inputs.npz diff --git a/DeeployTest/Tests/Dequant/network.onnx b/DeeployTest/Tests/Kernels/Mixed/Dequant/network.onnx similarity index 100% rename from DeeployTest/Tests/Dequant/network.onnx rename to DeeployTest/Tests/Kernels/Mixed/Dequant/network.onnx diff --git a/DeeployTest/Tests/Dequant/outputs.npz b/DeeployTest/Tests/Kernels/Mixed/Dequant/outputs.npz similarity index 100% rename from DeeployTest/Tests/Dequant/outputs.npz rename to DeeployTest/Tests/Kernels/Mixed/Dequant/outputs.npz diff --git a/DeeployTest/Tests/Quant/inputs.npz b/DeeployTest/Tests/Kernels/Mixed/Quant/inputs.npz similarity index 100% rename from DeeployTest/Tests/Quant/inputs.npz rename to DeeployTest/Tests/Kernels/Mixed/Quant/inputs.npz diff --git a/DeeployTest/Tests/Quant/network.onnx 
b/DeeployTest/Tests/Kernels/Mixed/Quant/network.onnx similarity index 100% rename from DeeployTest/Tests/Quant/network.onnx rename to DeeployTest/Tests/Kernels/Mixed/Quant/network.onnx diff --git a/DeeployTest/Tests/Quant/outputs.npz b/DeeployTest/Tests/Kernels/Mixed/Quant/outputs.npz similarity index 100% rename from DeeployTest/Tests/Quant/outputs.npz rename to DeeployTest/Tests/Kernels/Mixed/Quant/outputs.npz diff --git a/DeeployTest/Tests/Models/Autoencoder1D/inputs.npz b/DeeployTest/Tests/Models/Autoencoder1D/inputs.npz new file mode 100644 index 0000000000..cc639dab2f Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder1D/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder1D/network.onnx b/DeeployTest/Tests/Models/Autoencoder1D/network.onnx new file mode 100644 index 0000000000..d70e48e6dd Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder1D/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder1D/outputs.npz b/DeeployTest/Tests/Models/Autoencoder1D/outputs.npz new file mode 100644 index 0000000000..13e8f46fad Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder1D/outputs.npz differ diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_128/inputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_128/inputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_128/inputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_128/inputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_128/network.onnx b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_128/network.onnx similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_128/network.onnx rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_128/network.onnx diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_128/outputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_128/outputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_128/outputs.npz rename to 
DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_128/outputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_32/inputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_32/inputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_32/inputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_32/inputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_32/network.onnx b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_32/network.onnx similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_32/network.onnx rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_32/network.onnx diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_32/outputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_32/outputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_32/outputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_32/outputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_64/inputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_64/inputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_64/inputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_64/inputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_64/network.onnx b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_64/network.onnx similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_64/network.onnx rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_64/network.onnx diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_64/outputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_64/outputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_64/outputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_64/outputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_8/inputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_8/inputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_8/inputs.npz rename to 
DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_8/inputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_8/network.onnx b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_8/network.onnx similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_8/network.onnx rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_8/network.onnx diff --git a/DeeployTest/Tests/CCT/CCT_1_16_16_8/outputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_8/outputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_16_16_8/outputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_16_16_8/outputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_32/inputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_32/inputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_32_32_32/inputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_32/inputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_32/network.onnx b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_32/network.onnx similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_32_32_32/network.onnx rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_32/network.onnx diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_32/outputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_32/outputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_32_32_32/outputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_32/outputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_8/inputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_8/inputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_32_32_8/inputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_8/inputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_8/network.onnx b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_8/network.onnx similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_32_32_8/network.onnx rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_8/network.onnx 
diff --git a/DeeployTest/Tests/CCT/CCT_1_32_32_8/outputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_8/outputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_1_32_32_8/outputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_1_32_32_8/outputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_2_32_32_128/inputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128/inputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_2_32_32_128/inputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128/inputs.npz diff --git a/DeeployTest/Tests/CCT/CCT_2_32_32_128/network.onnx b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128/network.onnx similarity index 100% rename from DeeployTest/Tests/CCT/CCT_2_32_32_128/network.onnx rename to DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128/network.onnx diff --git a/DeeployTest/Tests/CCT/CCT_2_32_32_128/outputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128/outputs.npz similarity index 100% rename from DeeployTest/Tests/CCT/CCT_2_32_32_128/outputs.npz rename to DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128/outputs.npz diff --git a/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/inputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/inputs.npz new file mode 100644 index 0000000000..de35d4d758 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/inputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/network.onnx b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/network.onnx new file mode 100644 index 0000000000..fefd0c2c6d Binary files /dev/null and b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/network.onnx differ diff --git a/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/outputs.npz b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/outputs.npz new file mode 100644 index 0000000000..9686a6345d Binary files /dev/null and 
b/DeeployTest/Tests/Models/CCT/FP32/CCT_2_32_32_128_Opset20/outputs.npz differ diff --git a/DeeployTest/Tests/ICCT/activations.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT/activations.npz similarity index 100% rename from DeeployTest/Tests/ICCT/activations.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT/activations.npz diff --git a/DeeployTest/Tests/ICCT/inputs.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT/inputs.npz similarity index 100% rename from DeeployTest/Tests/ICCT/inputs.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT/inputs.npz diff --git a/DeeployTest/Tests/ICCT/network.onnx b/DeeployTest/Tests/Models/CCT/Int/ICCT/network.onnx similarity index 100% rename from DeeployTest/Tests/ICCT/network.onnx rename to DeeployTest/Tests/Models/CCT/Int/ICCT/network.onnx diff --git a/DeeployTest/Tests/ICCT/outputs.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT/outputs.npz similarity index 100% rename from DeeployTest/Tests/ICCT/outputs.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT/outputs.npz diff --git a/DeeployTest/Tests/ICCT_8/activations.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_8/activations.npz similarity index 100% rename from DeeployTest/Tests/ICCT_8/activations.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT_8/activations.npz diff --git a/DeeployTest/Tests/ICCT_8/inputs.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_8/inputs.npz similarity index 100% rename from DeeployTest/Tests/ICCT_8/inputs.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT_8/inputs.npz diff --git a/DeeployTest/Tests/ICCT_8/network.onnx b/DeeployTest/Tests/Models/CCT/Int/ICCT_8/network.onnx similarity index 100% rename from DeeployTest/Tests/ICCT_8/network.onnx rename to DeeployTest/Tests/Models/CCT/Int/ICCT_8/network.onnx diff --git a/DeeployTest/Tests/ICCT_8/outputs.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_8/outputs.npz similarity index 100% rename from DeeployTest/Tests/ICCT_8/outputs.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT_8/outputs.npz diff --git 
a/DeeployTest/Tests/ICCT_ITA/activations.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_ITA/activations.npz similarity index 100% rename from DeeployTest/Tests/ICCT_ITA/activations.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT_ITA/activations.npz diff --git a/DeeployTest/Tests/ICCT_ITA/inputs.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_ITA/inputs.npz similarity index 100% rename from DeeployTest/Tests/ICCT_ITA/inputs.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT_ITA/inputs.npz diff --git a/DeeployTest/Tests/ICCT_ITA/network.onnx b/DeeployTest/Tests/Models/CCT/Int/ICCT_ITA/network.onnx similarity index 100% rename from DeeployTest/Tests/ICCT_ITA/network.onnx rename to DeeployTest/Tests/Models/CCT/Int/ICCT_ITA/network.onnx diff --git a/DeeployTest/Tests/ICCT_ITA/outputs.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_ITA/outputs.npz similarity index 100% rename from DeeployTest/Tests/ICCT_ITA/outputs.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT_ITA/outputs.npz diff --git a/DeeployTest/Tests/ICCT_ITA_8/activations.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_ITA_8/activations.npz similarity index 100% rename from DeeployTest/Tests/ICCT_ITA_8/activations.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT_ITA_8/activations.npz diff --git a/DeeployTest/Tests/ICCT_ITA_8/inputs.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_ITA_8/inputs.npz similarity index 100% rename from DeeployTest/Tests/ICCT_ITA_8/inputs.npz rename to DeeployTest/Tests/Models/CCT/Int/ICCT_ITA_8/inputs.npz diff --git a/DeeployTest/Tests/ICCT_ITA_8/network.onnx b/DeeployTest/Tests/Models/CCT/Int/ICCT_ITA_8/network.onnx similarity index 100% rename from DeeployTest/Tests/ICCT_ITA_8/network.onnx rename to DeeployTest/Tests/Models/CCT/Int/ICCT_ITA_8/network.onnx diff --git a/DeeployTest/Tests/ICCT_ITA_8/outputs.npz b/DeeployTest/Tests/Models/CCT/Int/ICCT_ITA_8/outputs.npz similarity index 100% rename from DeeployTest/Tests/ICCT_ITA_8/outputs.npz rename to 
DeeployTest/Tests/Models/CCT/Int/ICCT_ITA_8/outputs.npz diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/inputs.npz new file mode 100644 index 0000000000..a9018350f2 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/inputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network.onnx new file mode 100644 index 0000000000..7473d7e5c1 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network_infer.onnx new file mode 100644 index 0000000000..11b0ca1f69 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/network_infer.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/outputs.npz new file mode 100644 index 0000000000..d2ad678b76 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT1/outputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/inputs.npz new file mode 100644 index 0000000000..7af9629e9b Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/inputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network.onnx new file mode 100644 index 0000000000..ac9569fb58 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network_infer.onnx new file mode 100644 index 0000000000..366a0be89e Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/network_infer.onnx differ diff --git 
a/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/outputs.npz new file mode 100644 index 0000000000..c2850ae68a Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_FT2/outputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/inputs.npz new file mode 100644 index 0000000000..c32b8dfd64 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/inputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network.onnx new file mode 100644 index 0000000000..798e35f96b Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network_infer.onnx new file mode 100644 index 0000000000..2eae9e8d7e Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/network_infer.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/outputs.npz new file mode 100644 index 0000000000..bb23f3a08a Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LP/outputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/inputs.npz new file mode 100644 index 0000000000..c4296c01c6 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/inputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network.onnx new file mode 100644 index 0000000000..8f183a9e2c Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network_infer.onnx 
b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network_infer.onnx new file mode 100644 index 0000000000..6cc128149a Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/network_infer.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/outputs.npz new file mode 100644 index 0000000000..e34b4860ed Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA1/outputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/inputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/inputs.npz new file mode 100644 index 0000000000..71d400304c Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/inputs.npz differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network.onnx new file mode 100644 index 0000000000..93a262b786 Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network_infer.onnx b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network_infer.onnx new file mode 100644 index 0000000000..9c5a0963db Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/network_infer.onnx differ diff --git a/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/outputs.npz b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/outputs.npz new file mode 100644 index 0000000000..b134b08d6a Binary files /dev/null and b/DeeployTest/Tests/Models/CCT_Train/CCT2_LoRA2/outputs.npz differ diff --git a/DeeployTest/Tests/simpleCNN/activations.npz b/DeeployTest/Tests/Models/CNN_Linear1/activations.npz similarity index 100% rename from DeeployTest/Tests/simpleCNN/activations.npz rename to DeeployTest/Tests/Models/CNN_Linear1/activations.npz diff --git a/DeeployTest/Tests/simpleCNN/inputs.npz b/DeeployTest/Tests/Models/CNN_Linear1/inputs.npz similarity index 100% rename 
from DeeployTest/Tests/simpleCNN/inputs.npz rename to DeeployTest/Tests/Models/CNN_Linear1/inputs.npz diff --git a/DeeployTest/Tests/simpleCNN/network.onnx b/DeeployTest/Tests/Models/CNN_Linear1/network.onnx similarity index 100% rename from DeeployTest/Tests/simpleCNN/network.onnx rename to DeeployTest/Tests/Models/CNN_Linear1/network.onnx diff --git a/DeeployTest/Tests/simpleCNN/outputs.npz b/DeeployTest/Tests/Models/CNN_Linear1/outputs.npz similarity index 100% rename from DeeployTest/Tests/simpleCNN/outputs.npz rename to DeeployTest/Tests/Models/CNN_Linear1/outputs.npz diff --git a/DeeployTest/Tests/simpleRegression/activations.npz b/DeeployTest/Tests/Models/CNN_Linear2/activations.npz similarity index 100% rename from DeeployTest/Tests/simpleRegression/activations.npz rename to DeeployTest/Tests/Models/CNN_Linear2/activations.npz diff --git a/DeeployTest/Tests/simpleRegression/inputs.npz b/DeeployTest/Tests/Models/CNN_Linear2/inputs.npz similarity index 100% rename from DeeployTest/Tests/simpleRegression/inputs.npz rename to DeeployTest/Tests/Models/CNN_Linear2/inputs.npz diff --git a/DeeployTest/Tests/simpleRegression/network.onnx b/DeeployTest/Tests/Models/CNN_Linear2/network.onnx similarity index 100% rename from DeeployTest/Tests/simpleRegression/network.onnx rename to DeeployTest/Tests/Models/CNN_Linear2/network.onnx diff --git a/DeeployTest/Tests/simpleRegression/outputs.npz b/DeeployTest/Tests/Models/CNN_Linear2/outputs.npz similarity index 100% rename from DeeployTest/Tests/simpleRegression/outputs.npz rename to DeeployTest/Tests/Models/CNN_Linear2/outputs.npz diff --git a/DeeployTest/Tests/EEGFormer/activations.npz b/DeeployTest/Tests/Models/EEGFormer/activations.npz similarity index 100% rename from DeeployTest/Tests/EEGFormer/activations.npz rename to DeeployTest/Tests/Models/EEGFormer/activations.npz diff --git a/DeeployTest/Tests/EEGFormer/inputs.npz b/DeeployTest/Tests/Models/EEGFormer/inputs.npz similarity index 100% rename from 
DeeployTest/Tests/EEGFormer/inputs.npz rename to DeeployTest/Tests/Models/EEGFormer/inputs.npz diff --git a/DeeployTest/Tests/EEGFormer/network.onnx b/DeeployTest/Tests/Models/EEGFormer/network.onnx similarity index 100% rename from DeeployTest/Tests/EEGFormer/network.onnx rename to DeeployTest/Tests/Models/EEGFormer/network.onnx diff --git a/DeeployTest/Tests/EEGFormer/outputs.npz b/DeeployTest/Tests/Models/EEGFormer/outputs.npz similarity index 100% rename from DeeployTest/Tests/EEGFormer/outputs.npz rename to DeeployTest/Tests/Models/EEGFormer/outputs.npz diff --git a/DeeployTest/Tests/MLPerf/AnomalyDetection/activations.npz b/DeeployTest/Tests/Models/MLPerf/AnomalyDetection/activations.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/AnomalyDetection/activations.npz rename to DeeployTest/Tests/Models/MLPerf/AnomalyDetection/activations.npz diff --git a/DeeployTest/Tests/MLPerf/AnomalyDetection/inputs.npz b/DeeployTest/Tests/Models/MLPerf/AnomalyDetection/inputs.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/AnomalyDetection/inputs.npz rename to DeeployTest/Tests/Models/MLPerf/AnomalyDetection/inputs.npz diff --git a/DeeployTest/Tests/MLPerf/AnomalyDetection/network.onnx b/DeeployTest/Tests/Models/MLPerf/AnomalyDetection/network.onnx similarity index 100% rename from DeeployTest/Tests/MLPerf/AnomalyDetection/network.onnx rename to DeeployTest/Tests/Models/MLPerf/AnomalyDetection/network.onnx diff --git a/DeeployTest/Tests/MLPerf/AnomalyDetection/outputs.npz b/DeeployTest/Tests/Models/MLPerf/AnomalyDetection/outputs.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/AnomalyDetection/outputs.npz rename to DeeployTest/Tests/Models/MLPerf/AnomalyDetection/outputs.npz diff --git a/DeeployTest/Tests/MLPerf/ImageClassification/activations.npz b/DeeployTest/Tests/Models/MLPerf/ImageClassification/activations.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/ImageClassification/activations.npz rename to 
DeeployTest/Tests/Models/MLPerf/ImageClassification/activations.npz diff --git a/DeeployTest/Tests/MLPerf/ImageClassification/inputs.npz b/DeeployTest/Tests/Models/MLPerf/ImageClassification/inputs.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/ImageClassification/inputs.npz rename to DeeployTest/Tests/Models/MLPerf/ImageClassification/inputs.npz diff --git a/DeeployTest/Tests/MLPerf/ImageClassification/network.onnx b/DeeployTest/Tests/Models/MLPerf/ImageClassification/network.onnx similarity index 100% rename from DeeployTest/Tests/MLPerf/ImageClassification/network.onnx rename to DeeployTest/Tests/Models/MLPerf/ImageClassification/network.onnx diff --git a/DeeployTest/Tests/MLPerf/ImageClassification/outputs.npz b/DeeployTest/Tests/Models/MLPerf/ImageClassification/outputs.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/ImageClassification/outputs.npz rename to DeeployTest/Tests/Models/MLPerf/ImageClassification/outputs.npz diff --git a/DeeployTest/Tests/MLPerf/KeywordSpotting/activations.npz b/DeeployTest/Tests/Models/MLPerf/KeywordSpotting/activations.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/KeywordSpotting/activations.npz rename to DeeployTest/Tests/Models/MLPerf/KeywordSpotting/activations.npz diff --git a/DeeployTest/Tests/MLPerf/KeywordSpotting/inputs.npz b/DeeployTest/Tests/Models/MLPerf/KeywordSpotting/inputs.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/KeywordSpotting/inputs.npz rename to DeeployTest/Tests/Models/MLPerf/KeywordSpotting/inputs.npz diff --git a/DeeployTest/Tests/MLPerf/KeywordSpotting/network.onnx b/DeeployTest/Tests/Models/MLPerf/KeywordSpotting/network.onnx similarity index 100% rename from DeeployTest/Tests/MLPerf/KeywordSpotting/network.onnx rename to DeeployTest/Tests/Models/MLPerf/KeywordSpotting/network.onnx diff --git a/DeeployTest/Tests/MLPerf/KeywordSpotting/outputs.npz b/DeeployTest/Tests/Models/MLPerf/KeywordSpotting/outputs.npz similarity index 100% rename 
from DeeployTest/Tests/MLPerf/KeywordSpotting/outputs.npz rename to DeeployTest/Tests/Models/MLPerf/KeywordSpotting/outputs.npz diff --git a/DeeployTest/Tests/MLPerf/VisualWakeWords/activations.npz b/DeeployTest/Tests/Models/MLPerf/VisualWakeWords/activations.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/VisualWakeWords/activations.npz rename to DeeployTest/Tests/Models/MLPerf/VisualWakeWords/activations.npz diff --git a/DeeployTest/Tests/MLPerf/VisualWakeWords/inputs.npz b/DeeployTest/Tests/Models/MLPerf/VisualWakeWords/inputs.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/VisualWakeWords/inputs.npz rename to DeeployTest/Tests/Models/MLPerf/VisualWakeWords/inputs.npz diff --git a/DeeployTest/Tests/MLPerf/VisualWakeWords/network.onnx b/DeeployTest/Tests/Models/MLPerf/VisualWakeWords/network.onnx similarity index 100% rename from DeeployTest/Tests/MLPerf/VisualWakeWords/network.onnx rename to DeeployTest/Tests/Models/MLPerf/VisualWakeWords/network.onnx diff --git a/DeeployTest/Tests/MLPerf/VisualWakeWords/outputs.npz b/DeeployTest/Tests/Models/MLPerf/VisualWakeWords/outputs.npz similarity index 100% rename from DeeployTest/Tests/MLPerf/VisualWakeWords/outputs.npz rename to DeeployTest/Tests/Models/MLPerf/VisualWakeWords/outputs.npz diff --git a/DeeployTest/Tests/MobileNetv2/activations.npz b/DeeployTest/Tests/Models/MobileNetv2/activations.npz similarity index 100% rename from DeeployTest/Tests/MobileNetv2/activations.npz rename to DeeployTest/Tests/Models/MobileNetv2/activations.npz diff --git a/DeeployTest/Tests/MobileNetv2/inputs.npz b/DeeployTest/Tests/Models/MobileNetv2/inputs.npz similarity index 100% rename from DeeployTest/Tests/MobileNetv2/inputs.npz rename to DeeployTest/Tests/Models/MobileNetv2/inputs.npz diff --git a/DeeployTest/Tests/MobileNetv2/network.onnx b/DeeployTest/Tests/Models/MobileNetv2/network.onnx similarity index 100% rename from DeeployTest/Tests/MobileNetv2/network.onnx rename to 
DeeployTest/Tests/Models/MobileNetv2/network.onnx diff --git a/DeeployTest/Tests/MobileNetv2/outputs.npz b/DeeployTest/Tests/Models/MobileNetv2/outputs.npz similarity index 100% rename from DeeployTest/Tests/MobileNetv2/outputs.npz rename to DeeployTest/Tests/Models/MobileNetv2/outputs.npz diff --git a/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/inputs.npz b/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/inputs.npz new file mode 100644 index 0000000000..cc4264b282 Binary files /dev/null and b/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/inputs.npz differ diff --git a/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/network.onnx b/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/network.onnx new file mode 100644 index 0000000000..6f07b14c01 Binary files /dev/null and b/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/network.onnx differ diff --git a/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/outputs.npz b/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/outputs.npz new file mode 100644 index 0000000000..7142ff12b2 Binary files /dev/null and b/DeeployTest/Tests/Models/TinyViT/5M/Layers/FP32/ReduceMean/outputs.npz differ diff --git a/DeeployTest/Tests/testFloatDemoTinyViT/inputs.npz b/DeeployTest/Tests/Models/TinyViT/Demo/inputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatDemoTinyViT/inputs.npz rename to DeeployTest/Tests/Models/TinyViT/Demo/inputs.npz diff --git a/DeeployTest/Tests/testFloatDemoTinyViT/network.onnx b/DeeployTest/Tests/Models/TinyViT/Demo/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatDemoTinyViT/network.onnx rename to DeeployTest/Tests/Models/TinyViT/Demo/network.onnx diff --git a/DeeployTest/Tests/testFloatDemoTinyViT/outputs.npz b/DeeployTest/Tests/Models/TinyViT/Demo/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatDemoTinyViT/outputs.npz rename to 
DeeployTest/Tests/Models/TinyViT/Demo/outputs.npz diff --git a/DeeployTest/Tests/Transformer/activations.npz b/DeeployTest/Tests/Models/Transformer/activations.npz similarity index 100% rename from DeeployTest/Tests/Transformer/activations.npz rename to DeeployTest/Tests/Models/Transformer/activations.npz diff --git a/DeeployTest/Tests/Transformer/inputs.npz b/DeeployTest/Tests/Models/Transformer/inputs.npz similarity index 100% rename from DeeployTest/Tests/Transformer/inputs.npz rename to DeeployTest/Tests/Models/Transformer/inputs.npz diff --git a/DeeployTest/Tests/Transformer/network.onnx b/DeeployTest/Tests/Models/Transformer/network.onnx similarity index 100% rename from DeeployTest/Tests/Transformer/network.onnx rename to DeeployTest/Tests/Models/Transformer/network.onnx diff --git a/DeeployTest/Tests/Transformer/outputs.npz b/DeeployTest/Tests/Models/Transformer/outputs.npz similarity index 100% rename from DeeployTest/Tests/Transformer/outputs.npz rename to DeeployTest/Tests/Models/Transformer/outputs.npz diff --git a/DeeployTest/Tests/QuantizedLinear/inputs.npz b/DeeployTest/Tests/Models/Transformer_DeepQuant/inputs.npz similarity index 100% rename from DeeployTest/Tests/QuantizedLinear/inputs.npz rename to DeeployTest/Tests/Models/Transformer_DeepQuant/inputs.npz diff --git a/DeeployTest/Tests/QuantizedLinear/network.onnx b/DeeployTest/Tests/Models/Transformer_DeepQuant/network.onnx similarity index 100% rename from DeeployTest/Tests/QuantizedLinear/network.onnx rename to DeeployTest/Tests/Models/Transformer_DeepQuant/network.onnx diff --git a/DeeployTest/Tests/QuantizedLinear/outputs.npz b/DeeployTest/Tests/Models/Transformer_DeepQuant/outputs.npz similarity index 100% rename from DeeployTest/Tests/QuantizedLinear/outputs.npz rename to DeeployTest/Tests/Models/Transformer_DeepQuant/outputs.npz diff --git a/DeeployTest/Tests/testFloatSGD/inputs.npz b/DeeployTest/Tests/Models/Transformer_Train/inputs.npz similarity index 100% rename from 
DeeployTest/Tests/testFloatSGD/inputs.npz rename to DeeployTest/Tests/Models/Transformer_Train/inputs.npz diff --git a/DeeployTest/Tests/testFloatSGD/network.onnx b/DeeployTest/Tests/Models/Transformer_Train/network.onnx similarity index 100% rename from DeeployTest/Tests/testFloatSGD/network.onnx rename to DeeployTest/Tests/Models/Transformer_Train/network.onnx diff --git a/DeeployTest/Tests/testFloatSGD/outputs.npz b/DeeployTest/Tests/Models/Transformer_Train/outputs.npz similarity index 100% rename from DeeployTest/Tests/testFloatSGD/outputs.npz rename to DeeployTest/Tests/Models/Transformer_Train/outputs.npz diff --git a/DeeployTest/Tests/WaveFormer/inputs.npz b/DeeployTest/Tests/Models/WaveFormer/inputs.npz similarity index 100% rename from DeeployTest/Tests/WaveFormer/inputs.npz rename to DeeployTest/Tests/Models/WaveFormer/inputs.npz diff --git a/DeeployTest/Tests/WaveFormer/network.onnx b/DeeployTest/Tests/Models/WaveFormer/network.onnx similarity index 100% rename from DeeployTest/Tests/WaveFormer/network.onnx rename to DeeployTest/Tests/Models/WaveFormer/network.onnx diff --git a/DeeployTest/Tests/WaveFormer/outputs.npz b/DeeployTest/Tests/Models/WaveFormer/outputs.npz similarity index 100% rename from DeeployTest/Tests/WaveFormer/outputs.npz rename to DeeployTest/Tests/Models/WaveFormer/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama1/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama1/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama1/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama1/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama1/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama1/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama1/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama1/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama1/network.onnx 
b/DeeployTest/Tests/Models/microLlama/microLlama1/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama1/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama1/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama1/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama1/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama1/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama1/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama128/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama128/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama128/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama128/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama128/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama128/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama128/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama128/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama128/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama128/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama128/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama128/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama128/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama128/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama128/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama128/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama16/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama16/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama16/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama16/activations.npz 
diff --git a/DeeployTest/Tests/microLlama/microLlama16/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama16/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama16/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama16/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama16/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama16/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama16/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama16/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama16/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama16/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama16/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama16/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama16_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama16_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama16_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama16_parallel/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama16_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama16_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama16_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama16_parallel/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama16_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama16_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama16_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama16_parallel/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama16_parallel/outputs.npz 
b/DeeployTest/Tests/Models/microLlama/microLlama16_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama16_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama16_parallel/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama1_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama1_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama1_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama1_parallel/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama1_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama1_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama1_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama1_parallel/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama1_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama1_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama1_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama1_parallel/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama1_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama1_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama1_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama1_parallel/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama2/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama2/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama2/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama2/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama2/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama2/inputs.npz similarity index 100% 
rename from DeeployTest/Tests/microLlama/microLlama2/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama2/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama2/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama2/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama2/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama2/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama2/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama2/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama2/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama2/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama256/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama256/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama256/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama256/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama256/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama256/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama256/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama256/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama256/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama256/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama256/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama256/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama256/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama256/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama256/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama256/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama2_parallel/activations.npz 
b/DeeployTest/Tests/Models/microLlama/microLlama2_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama2_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama2_parallel/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama2_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama2_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama2_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama2_parallel/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama2_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama2_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama2_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama2_parallel/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama2_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama2_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama2_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama2_parallel/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama32/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama32/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama32/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama32/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama32/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama32/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama32/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama32/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama32/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama32/network.onnx similarity index 100% rename from 
DeeployTest/Tests/microLlama/microLlama32/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama32/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama32/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama32/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama32/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama32/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama32_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama32_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama32_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama32_parallel/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama32_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama32_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama32_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama32_parallel/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama32_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama32_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama32_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama32_parallel/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama32_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama32_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama32_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama32_parallel/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama4/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama4/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama4/activations.npz rename to 
DeeployTest/Tests/Models/microLlama/microLlama4/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama4/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama4/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama4/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama4/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama4/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama4/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama4/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama4/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama4/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama4/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama4/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama4/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama4_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama4_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama4_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama4_parallel/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama4_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama4_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama4_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama4_parallel/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama4_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama4_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama4_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama4_parallel/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama4_parallel/outputs.npz 
b/DeeployTest/Tests/Models/microLlama/microLlama4_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama4_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama4_parallel/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama64/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama64/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama64/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama64/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama64/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama64/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama64/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama64/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama64/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama64/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama64/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama64/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama64/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama64/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama64/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama64/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama64_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama64_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama64_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama64_parallel/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama64_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama64_parallel/inputs.npz similarity index 100% rename from 
DeeployTest/Tests/microLlama/microLlama64_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama64_parallel/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama64_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama64_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama64_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama64_parallel/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama64_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama64_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama64_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama64_parallel/outputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama8/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama8/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama8/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama8/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama8/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama8/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama8/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama8/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama8/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama8/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama8/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama8/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama8/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama8/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama8/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama8/outputs.npz diff --git 
a/DeeployTest/Tests/microLlama/microLlama8_parallel/activations.npz b/DeeployTest/Tests/Models/microLlama/microLlama8_parallel/activations.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama8_parallel/activations.npz rename to DeeployTest/Tests/Models/microLlama/microLlama8_parallel/activations.npz diff --git a/DeeployTest/Tests/microLlama/microLlama8_parallel/inputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama8_parallel/inputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama8_parallel/inputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama8_parallel/inputs.npz diff --git a/DeeployTest/Tests/microLlama/microLlama8_parallel/network.onnx b/DeeployTest/Tests/Models/microLlama/microLlama8_parallel/network.onnx similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama8_parallel/network.onnx rename to DeeployTest/Tests/Models/microLlama/microLlama8_parallel/network.onnx diff --git a/DeeployTest/Tests/microLlama/microLlama8_parallel/outputs.npz b/DeeployTest/Tests/Models/microLlama/microLlama8_parallel/outputs.npz similarity index 100% rename from DeeployTest/Tests/microLlama/microLlama8_parallel/outputs.npz rename to DeeployTest/Tests/Models/microLlama/microLlama8_parallel/outputs.npz diff --git a/DeeployTest/Tests/miniMobileNet/activations.npz b/DeeployTest/Tests/Models/miniMobileNet/activations.npz similarity index 100% rename from DeeployTest/Tests/miniMobileNet/activations.npz rename to DeeployTest/Tests/Models/miniMobileNet/activations.npz diff --git a/DeeployTest/Tests/miniMobileNet/inputs.npz b/DeeployTest/Tests/Models/miniMobileNet/inputs.npz similarity index 100% rename from DeeployTest/Tests/miniMobileNet/inputs.npz rename to DeeployTest/Tests/Models/miniMobileNet/inputs.npz diff --git a/DeeployTest/Tests/miniMobileNet/network.onnx b/DeeployTest/Tests/Models/miniMobileNet/network.onnx similarity index 100% rename from DeeployTest/Tests/miniMobileNet/network.onnx rename to 
DeeployTest/Tests/Models/miniMobileNet/network.onnx diff --git a/DeeployTest/Tests/miniMobileNet/outputs.npz b/DeeployTest/Tests/Models/miniMobileNet/outputs.npz similarity index 100% rename from DeeployTest/Tests/miniMobileNet/outputs.npz rename to DeeployTest/Tests/Models/miniMobileNet/outputs.npz diff --git a/DeeployTest/Tests/miniMobileNetv2/activations.npz b/DeeployTest/Tests/Models/miniMobileNetv2/activations.npz similarity index 100% rename from DeeployTest/Tests/miniMobileNetv2/activations.npz rename to DeeployTest/Tests/Models/miniMobileNetv2/activations.npz diff --git a/DeeployTest/Tests/miniMobileNetv2/inputs.npz b/DeeployTest/Tests/Models/miniMobileNetv2/inputs.npz similarity index 100% rename from DeeployTest/Tests/miniMobileNetv2/inputs.npz rename to DeeployTest/Tests/Models/miniMobileNetv2/inputs.npz diff --git a/DeeployTest/Tests/miniMobileNetv2/network.onnx b/DeeployTest/Tests/Models/miniMobileNetv2/network.onnx similarity index 100% rename from DeeployTest/Tests/miniMobileNetv2/network.onnx rename to DeeployTest/Tests/Models/miniMobileNetv2/network.onnx diff --git a/DeeployTest/Tests/miniMobileNetv2/outputs.npz b/DeeployTest/Tests/Models/miniMobileNetv2/outputs.npz similarity index 100% rename from DeeployTest/Tests/miniMobileNetv2/outputs.npz rename to DeeployTest/Tests/Models/miniMobileNetv2/outputs.npz diff --git a/DeeployTest/Tests/testSlice/activations.npz b/DeeployTest/Tests/Others/Backtracking/activations.npz similarity index 100% rename from DeeployTest/Tests/testSlice/activations.npz rename to DeeployTest/Tests/Others/Backtracking/activations.npz diff --git a/DeeployTest/Tests/testBacktracking/inputs.npz b/DeeployTest/Tests/Others/Backtracking/inputs.npz similarity index 100% rename from DeeployTest/Tests/testBacktracking/inputs.npz rename to DeeployTest/Tests/Others/Backtracking/inputs.npz diff --git a/DeeployTest/Tests/testBacktracking/network.onnx b/DeeployTest/Tests/Others/Backtracking/network.onnx similarity index 100% rename from 
DeeployTest/Tests/testBacktracking/network.onnx rename to DeeployTest/Tests/Others/Backtracking/network.onnx diff --git a/DeeployTest/Tests/testBacktracking/outputs.npz b/DeeployTest/Tests/Others/Backtracking/outputs.npz similarity index 100% rename from DeeployTest/Tests/testBacktracking/outputs.npz rename to DeeployTest/Tests/Others/Backtracking/outputs.npz diff --git a/DeeployTest/Tests/Others/TypeInference/inputs.npz b/DeeployTest/Tests/Others/TypeInference/inputs.npz new file mode 100644 index 0000000000..db396786bd Binary files /dev/null and b/DeeployTest/Tests/Others/TypeInference/inputs.npz differ diff --git a/DeeployTest/Tests/Others/TypeInference/network.onnx b/DeeployTest/Tests/Others/TypeInference/network.onnx new file mode 100644 index 0000000000..e04e330731 Binary files /dev/null and b/DeeployTest/Tests/Others/TypeInference/network.onnx differ diff --git a/DeeployTest/Tests/Others/TypeInference/outputs.npz b/DeeployTest/Tests/Others/TypeInference/outputs.npz new file mode 100644 index 0000000000..2269dd7292 Binary files /dev/null and b/DeeployTest/Tests/Others/TypeInference/outputs.npz differ diff --git a/DeeployTest/Tests/test2DRequantizedConv/activations.npz b/DeeployTest/Tests/test2DRequantizedConv/activations.npz deleted file mode 100644 index 7b4b8c7309..0000000000 Binary files a/DeeployTest/Tests/test2DRequantizedConv/activations.npz and /dev/null differ diff --git a/DeeployTest/Tests/test2DRequantizedConv/inputs.npz b/DeeployTest/Tests/test2DRequantizedConv/inputs.npz deleted file mode 100644 index e49b356ab2..0000000000 Binary files a/DeeployTest/Tests/test2DRequantizedConv/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/test2DRequantizedConv/network.onnx b/DeeployTest/Tests/test2DRequantizedConv/network.onnx deleted file mode 100644 index c4ba5aa650..0000000000 Binary files a/DeeployTest/Tests/test2DRequantizedConv/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/test2DRequantizedConv/outputs.npz 
b/DeeployTest/Tests/test2DRequantizedConv/outputs.npz deleted file mode 100644 index c9fdd5e818..0000000000 Binary files a/DeeployTest/Tests/test2DRequantizedConv/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz deleted file mode 100644 index a98a6c33b9..0000000000 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx deleted file mode 100644 index ae1b3ac939..0000000000 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz b/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz deleted file mode 100644 index a5d4b6e974..0000000000 Binary files a/DeeployTest/Tests/testFloatReshapeWithSkipConnection/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/inputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/inputs.npz deleted file mode 100644 index 964a5a3551..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/network.onnx b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/network.onnx deleted file mode 100644 index 0216957044..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/outputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/outputs.npz 
deleted file mode 100644 index ced669e248..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_128/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/inputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/inputs.npz deleted file mode 100644 index 3916dc0ff0..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/network.onnx b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/network.onnx deleted file mode 100644 index e2955db62f..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/outputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/outputs.npz deleted file mode 100644 index 8bc8dc897a..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_16/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/inputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/inputs.npz deleted file mode 100644 index 13f2cc4c68..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/network.onnx b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/network.onnx deleted file mode 100644 index 5f13ba3560..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/network.onnx and /dev/null differ diff --git 
a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/outputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/outputs.npz deleted file mode 100644 index 62e9a7ca96..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_32/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/inputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/inputs.npz deleted file mode 100644 index f9f1b89c37..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/network.onnx b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/network.onnx deleted file mode 100644 index cd0cbb25cc..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/outputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/outputs.npz deleted file mode 100644 index c850e02db2..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_64/outputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/inputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/inputs.npz deleted file mode 100644 index de4d4f8e06..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/inputs.npz and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/network.onnx b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/network.onnx deleted file mode 100644 index 3ea0bcc0ba..0000000000 Binary 
files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/network.onnx and /dev/null differ diff --git a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/outputs.npz b/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/outputs.npz deleted file mode 100644 index f5b9700baa..0000000000 Binary files a/DeeployTest/Tests/testTrainCCT/CCT1_Classifier_Training/CCT_1_16_16_8/outputs.npz and /dev/null differ diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py new file mode 100644 index 0000000000..39c24ef6b8 --- /dev/null +++ b/DeeployTest/conftest.py @@ -0,0 +1,155 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import os +from pathlib import Path + +import coloredlogs +import pytest + +from Deeploy.Logging import DEFAULT_FMT +from Deeploy.Logging import DEFAULT_LOGGER as log + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Native PyTest hook: add custom command-line options for Deeploy tests.""" + parser.addoption( + "--skipgen", + action = "store_true", + default = False, + help = "Skip network generation step", + ) + parser.addoption( + "--skipsim", + action = "store_true", + default = False, + help = "Skip simulation step (only generate and build)", + ) + parser.addoption( + "--profile-untiled", + action = "store_true", + default = False, + help = "Enable profiling for untiled Siracusa runs", + ) + parser.addoption( + "--toolchain", + action = "store", + default = "LLVM", + help = "Compiler toolchain to use (LLVM or GCC)", + ) + parser.addoption( + "--toolchain-install-dir", + action = "store", + default = os.environ.get("LLVM_INSTALL_DIR"), + help = "Path to toolchain installation directory", + ) + parser.addoption( + "--cmake-args", + action = "append", + default = [], + help = "Additional CMake arguments (can be used multiple times)", + ) + + +def pytest_configure(config: pytest.Config) -> None: + """Native PyTest 
hook: configure pytest for Deeploy tests.""" + # Register custom markers + config.addinivalue_line("markers", "generic: mark test as a Generic platform test") + config.addinivalue_line("markers", "cortexm: mark test as a Cortex-M (QEMU-ARM) platform test") + config.addinivalue_line("markers", "mempool: mark test as a MemPool platform test") + config.addinivalue_line("markers", "chimera: mark test as a Chimera platform test") + config.addinivalue_line("markers", "softhier: mark test as a SoftHier platform test") + config.addinivalue_line("markers", "snitch: mark test as a Snitch platform test") + config.addinivalue_line("markers", "snitch_tiled: mark test as a Snitch platform test (tiled)") + config.addinivalue_line("markers", "siracusa: mark test as a Siracusa platform test (untiled)") + config.addinivalue_line("markers", "siracusa_tiled: mark test as a Siracusa platform test (tiled)") + config.addinivalue_line("markers", + "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") + config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)") + config.addinivalue_line("markers", "models: mark test as a model test (full networks)") + config.addinivalue_line("markers", "singlebuffer: mark test as single-buffer configuration") + config.addinivalue_line("markers", "doublebuffer: mark test as double-buffer configuration") + config.addinivalue_line("markers", "l2: mark test as L2 default memory level") + config.addinivalue_line("markers", "l3: mark test as L3 default memory level") + config.addinivalue_line("markers", "wmem: mark test as using Neureka weight memory") + config.addinivalue_line("markers", "dma: mark test as DMA test") + config.addinivalue_line( + "markers", + "deeploy_internal: mark test as internal Deeploy test (state serialization, extensions, transformations)") + + # Configure logging based on verbosity + verbosity = config.option.verbose + if verbosity >= 3: + coloredlogs.install(level = 
'DEBUG', logger = log, fmt = DEFAULT_FMT) + elif verbosity >= 2: + coloredlogs.install(level = 'INFO', logger = log, fmt = DEFAULT_FMT) + else: + coloredlogs.install(level = 'WARNING', logger = log, fmt = DEFAULT_FMT) + + +@pytest.fixture(scope = "session") +def deeploy_test_dir(): + """Return the DeeployTest directory path.""" + return Path(__file__).parent + + +@pytest.fixture(scope = "session") +def tests_dir(deeploy_test_dir): + """Return the Tests directory path.""" + return deeploy_test_dir / "Tests" + + +@pytest.fixture(scope = "session") +def toolchain_dir(request): + """Return the toolchain installation directory.""" + toolchain_install = request.config.getoption("--toolchain-install-dir") + if toolchain_install is None: + pytest.skip(reason = "LLVM_INSTALL_DIR not set") + return toolchain_install + + +@pytest.fixture(scope = "session", autouse = True) +def ccache_dir(): + """Setup and return ccache directory.""" + # Use existing CCACHE_DIR if already set + if "CCACHE_DIR" in os.environ: + return Path(os.environ["CCACHE_DIR"]) + + # Fall back to /app/.ccache if it exists (for CI containers) + ccache_path = Path("/app/.ccache") + if ccache_path.exists(): + os.environ["CCACHE_DIR"] = str(ccache_path) + return ccache_path + + return None + + +@pytest.fixture +def skipgen(request): + """Return whether to skip network generation.""" + return request.config.getoption("--skipgen") + + +@pytest.fixture +def skipsim(request): + """Return whether to skip simulation.""" + return request.config.getoption("--skipsim") + + +@pytest.fixture +def profile_untiled(request): + """Return whether untiled profiling is enabled.""" + return request.config.getoption("--profile-untiled") + + +@pytest.fixture +def toolchain(request): + """Return the toolchain to use.""" + return request.config.getoption("--toolchain") + + +@pytest.fixture +def cmake_args(request): + """Return additional CMake arguments.""" + return request.config.getoption("--cmake-args") diff --git 
a/DeeployTest/deeployRunner_chimera.py b/DeeployTest/deeployRunner_chimera.py new file mode 100644 index 0000000000..3026020338 --- /dev/null +++ b/DeeployTest/deeployRunner_chimera.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Deeploy runner for Chimera platform.""" + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add Chimera-specific arguments + def setup_parser(parser): + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cores (default: 8)\n') + + sys.exit( + main(default_platform = "Chimera", + default_simulator = "gvsoc", + tiling_enabled = False, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/deeployRunner_cortexm.py b/DeeployTest/deeployRunner_cortexm.py new file mode 100644 index 0000000000..dddba473e5 --- /dev/null +++ b/DeeployTest/deeployRunner_cortexm.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + sys.exit(main(default_platform = "QEMU-ARM", default_simulator = "qemu", tiling_enabled = False)) diff --git a/DeeployTest/deeployRunner_generic.py b/DeeployTest/deeployRunner_generic.py new file mode 100644 index 0000000000..b0757e3a7e --- /dev/null +++ b/DeeployTest/deeployRunner_generic.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + sys.exit(main(default_platform = "Generic", default_simulator = "host", tiling_enabled = False)) diff --git a/DeeployTest/deeployRunner_mempool.py b/DeeployTest/deeployRunner_mempool.py new file mode 100644 index 
0000000000..400cda92a9 --- /dev/null +++ b/DeeployTest/deeployRunner_mempool.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Deeploy runner for MemPool platform.""" + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add MemPool-specific arguments + def setup_parser(parser): + parser.add_argument('--num-cores', + type = int, + default = 16, + dest = 'num_cores', + help = 'Number of cores (default: 16)\n') + + sys.exit( + main(default_platform = "MemPool", + default_simulator = "banshee", + tiling_enabled = False, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/deeployRunner_siracusa.py b/DeeployTest/deeployRunner_siracusa.py new file mode 100644 index 0000000000..b754a0c233 --- /dev/null +++ b/DeeployTest/deeployRunner_siracusa.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add Siracusa-specific arguments + def setup_parser(parser): + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cores (default: 8)\n') + + sys.exit( + main(default_platform = "Siracusa", + default_simulator = "gvsoc", + tiling_enabled = False, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/deeployRunner_snitch.py b/DeeployTest/deeployRunner_snitch.py new file mode 100644 index 0000000000..aa97933319 --- /dev/null +++ b/DeeployTest/deeployRunner_snitch.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to 
add Snitch-specific arguments + def setup_parser(parser): + parser.add_argument('--num-cores', + type = int, + default = 8, + dest = 'num_cores', + help = 'Number of cores (default: 8)\n') + + sys.exit( + main(default_platform = "Snitch", + default_simulator = "gvsoc", + tiling_enabled = False, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/deeployRunner_softhier.py b/DeeployTest/deeployRunner_softhier.py new file mode 100644 index 0000000000..9c9360770e --- /dev/null +++ b/DeeployTest/deeployRunner_softhier.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Deeploy runner for SoftHier platform.""" + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add SoftHier-specific arguments + def setup_parser(parser): + parser.add_argument('--num-clusters', + type = int, + default = 1, + dest = 'num_clusters', + help = 'Number of clusters (default: 1)\n') + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cores (default: 8)\n') + + sys.exit( + main(default_platform = "SoftHier", + default_simulator = "gvsoc", + tiling_enabled = False, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/deeployRunner_tiled_siracusa.py b/DeeployTest/deeployRunner_tiled_siracusa.py new file mode 100644 index 0000000000..2184a4105c --- /dev/null +++ b/DeeployTest/deeployRunner_tiled_siracusa.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add Siracusa-specific arguments + def setup_parser(parser): + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cores (default: 8)\n') + + sys.exit( + main(default_platform = 
"Siracusa", + default_simulator = "gvsoc", + tiling_enabled = True, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/deeployRunner_tiled_siracusa_w_neureka.py b/DeeployTest/deeployRunner_tiled_siracusa_w_neureka.py new file mode 100644 index 0000000000..6b58844327 --- /dev/null +++ b/DeeployTest/deeployRunner_tiled_siracusa_w_neureka.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add Siracusa+Neureka-specific arguments + def setup_parser(parser): + parser.add_argument('--cores', type = int, default = 8, help = 'Number of cores (default: 8)\n') + parser.add_argument('--neureka-wmem', action = 'store_true', help = 'Enable Neureka weight memory\n') + parser.add_argument('--enable-3x3', action = 'store_true', help = 'Enable 3x3 convolutions\n') + + sys.exit( + main(default_platform = "Siracusa_w_neureka", + default_simulator = "gvsoc", + tiling_enabled = True, + parser_setup_callback = setup_parser)) diff --git a/DeeployTest/deeployRunner_tiled_snitch.py b/DeeployTest/deeployRunner_tiled_snitch.py new file mode 100644 index 0000000000..d6e5ffd196 --- /dev/null +++ b/DeeployTest/deeployRunner_tiled_snitch.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + +from testUtils.deeployRunner import main + +if __name__ == "__main__": + + # Define parser setup callback to add Snitch-specific arguments + def setup_parser(parser): + parser.add_argument('--num-cores', + type = int, + default = 8, + dest = 'num_cores', + help = 'Number of cores (default: 8)\n') + + sys.exit( + main(default_platform = "Snitch", + default_simulator = "gvsoc", + tiling_enabled = True, + parser_setup_callback = setup_parser)) diff --git 
a/DeeployTest/deeployStateEqualityTest.py b/DeeployTest/deeployStateEqualityTest.py index 297e52e65c..1ed0aac9e0 100644 --- a/DeeployTest/deeployStateEqualityTest.py +++ b/DeeployTest/deeployStateEqualityTest.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: deeployStateEqualityTest.py -# -# Last edited: 04.05.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import argparse import copy @@ -32,7 +10,7 @@ import onnx import onnx_graphsurgeon as gs from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform -from testUtils.typeMapping import inferInputType +from testUtils.typeMapping import inferTypeAndOffset from Deeploy.DeeployTypes import NetworkContext, StructBuffer, VariableBuffer, _backendPostBindingFilename, \ _middlewarePreLoweringFilename @@ -46,7 +24,7 @@ metavar = 'testdir', dest = 'dir', type = str, - default = './Tests/simpleRegression', + default = './Tests/Models/CNN_Linear2', help = 'Set the regression test\n') parser.add_argument('-d', metavar = 'dumpdir', @@ -79,7 +57,7 @@ platform, signProp = mapPlatform(args.platform) for index, num in enumerate(test_inputs): - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset diff --git a/DeeployTest/generateNetwork.py b/DeeployTest/generateNetwork.py index a8a533db7b..f029be7361 100644 --- a/DeeployTest/generateNetwork.py +++ b/DeeployTest/generateNetwork.py @@ -1,71 +1,34 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: generateNetwork.py -# -# Last edited: 08.01.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: -# - Moritz Scherer, ETH Zurich -# - Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os +import sys import numpy as np import onnx import onnx_graphsurgeon as gs -from testUtils.codeGenerate import generateTestInputsHeader, generateTestNetworkHeader, \ - generateTestNetworkImplementation, generateTestOutputsHeader +from testUtils.codeGenerate import generateTestNetwork from testUtils.graphDebug import generateDebugConfig from testUtils.platformMapping import mapDeployer, mapPlatform from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.typeMapping import inferInputType +from testUtils.typeMapping import inferTypeAndOffset, parseDataType +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import IntegerDataTypes from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import EmulateCMSISRequantPass from Deeploy.DeeployTypes import _NoVerbosity +from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.Targets.CortexM.Platform import CMSISPlatform -from Deeploy.Targets.PULPOpen.Platform import PULPPlatform +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPPlatform -_TEXT_ALIGN = 30 - -if __name__ == '__main__': - - parser = TestGeneratorArgumentParser(description = "Deeploy Code Generation Utility.") - parser.add_argument('--debug', - dest = 'debug', - action = 'store_true', - default = False, - help = 'Enable debugging mode\n') - parser.add_argument('--profileUntiled', - action = 'store_true', - dest = 'profileUntiled', - default = False, - help = 'Profile Untiled for L2\n') - args = parser.parse_args() +def 
generateNetwork(args): + log.debug("Arguments: %s", args) onnx_graph = onnx.load_model(f'{args.dir}/network.onnx') graph = gs.import_onnx(onnx_graph) - inputTypes = {} - inputOffsets = {} - inputs = np.load(f'{args.dir}/inputs.npz') outputs = np.load(f'{args.dir}/outputs.npz') if os.path.isfile(f'{args.dir}/activations.npz'): @@ -73,7 +36,38 @@ else: activations = None - tensors = graph.tensors() + # build {name, type} and {name, offset} maps + manual_types = {} + manual_offsets = {} + for kv in args.input_type_map: + try: + name, tstr = kv.split('=', 1) + except ValueError as exc: + raise ValueError(f"Invalid --input-type-map entry '{kv}'. Expected NAME=TYPE.") from exc + name, tstr = name.strip(), tstr.strip() + try: + manual_types[name] = parseDataType(tstr) + except ValueError as exc: + raise ValueError(f"Invalid --input-type-map entry '{kv}': {exc}") from exc + for kv in args.input_offset_map: + try: + name, ostr = kv.split('=', 1) + except ValueError as exc: + raise ValueError(f"Invalid --input-offset-map entry '{kv}'. Expected NAME=OFFSET.") from exc + name, ostr = name.strip(), ostr.strip() + try: + manual_offsets[name] = int(ostr) + except ValueError as exc: + raise ValueError(f"Invalid --input-offset-map entry '{kv}': OFFSET must be an integer.") from exc + + # Sanity check for unknown input names + manual_keys = set(manual_types) + assert manual_keys == set( + manual_offsets + ), f"Override inputs should have both type and offset specified. 
Inputs without both specified: {manual_keys ^ set(manual_types)}" + assert manual_keys <= set( + inputs.files + ), f"Unknown input names in overrides: {manual_keys - set(inputs.files)} (Valid names are: {set(inputs.files)})" if args.debug: test_inputs, test_outputs, graph = generateDebugConfig(inputs, outputs, activations, graph) @@ -90,11 +84,46 @@ platform, signProp = mapPlatform(args.platform) - for index, num in enumerate(test_inputs): - # WIESP: Do not infer types and offset of empty arrays - if np.prod(num.shape) == 0: + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + + inputTypes = {} + inputOffsets = {} + + log.debug(f"Platform: {platform} (sign: {signProp})") + + log.debug("Platform Engines:") + for engine in platform.engines: + log.debug(f" - {engine.name}: {engine}") + + for index, (name, values) in enumerate(zip(inputs.files, test_inputs)): + if np.prod(values.shape) == 0: continue - _type, offset = inferInputType(num, signProp)[0] + + if name in manual_keys: + _type = manual_types[name] + offset = manual_offsets[name] + + # Check if the provided values fit into the dereferenced type + vals = values.astype(np.int64) - offset + if not _type.checkPromotion(vals): + lo, hi = _type.typeMin, _type.typeMax + raise RuntimeError(f"Provided type '{_type.typeName}' with offset {offset} " + f"does not match input values in range [{vals.min()}, {vals.max()}] " + f"(expected range [{lo}, {hi}])") + + # Suggest a smaller fitting type if possible + fitting_types = [t for t in sorted(IntegerDataTypes, key = lambda x: x.typeWidth) if t.checkPromotion(vals)] + if fitting_types and fitting_types[0] is not _type: + log.warning(f"Data spans [{int(vals.min())}, {int(vals.max())}], " + f"which would fit in '{fitting_types[0].typeName}', " + f"but user forced '{_type.typeName}'.") + + _type = PointerClass(_type) + else: + _type, offset = inferTypeAndOffset(values, signProp) 
+ inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset @@ -102,9 +131,11 @@ deployer = mapDeployer(platform, graph, inputTypes, deeployStateDir = _DEEPLOYSTATEDIR, inputOffsets = inputOffsets) + log.debug(f"Deployer: {deployer}") + if not isinstance( platform, CMSISPlatform - ) and not "simpleCNN" in args.dir and not "testRQMatMul" in args.dir and not "testRQGEMM" in args.dir: + ) and not "CNN_Linear1" in args.dir and not "GEMM/Regular_RQPerRow" in args.dir and not "MatMul/Regular_RQ" in args.dir: deployer.loweringOptimizer.passes.insert(0, EmulateCMSISRequantPass()) verbosityCfg = _NoVerbosity @@ -112,43 +143,69 @@ verbosityCfg.untiledProfiling = args.profileUntiled # Parse graph and infer output levels and signedness - _ = deployer.generateFunction(verbose = verbosityCfg) - - # Create input and output vectors - os.makedirs(f'{args.dumpdir}', exist_ok = True) - - testInputStr = generateTestInputsHeader(deployer, test_inputs, inputTypes, inputOffsets) - f = open(f'{args.dumpdir}/testinputs.h', "w") - f.write(testInputStr) - f.close() - - testOutputStr = generateTestOutputsHeader(deployer, test_outputs, signProp, verbose = args.verbose) - f = open(f'{args.dumpdir}/testoutputs.h', "w") - f.write(testOutputStr) - f.close() - - # Generate code for Network - testNetworkHeaderStr = generateTestNetworkHeader(deployer, platform) - f = open(f'{args.dumpdir}/Network.h', "w") - f.write(testNetworkHeaderStr) - f.close() - - testNetworkImplementationStr = generateTestNetworkImplementation(deployer, platform, verbose = args.verbose) - f = open(f'{args.dumpdir}/Network.c', "w") - f.write(testNetworkImplementationStr) - f.close() - - clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" - os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/Network.c') - os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/Network.h') - os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/testoutputs.h') - 
os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/testinputs.h') - - if args.verbose: - print() - print("=" * 80) - num_ops = deployer.numberOfOps(args.verbose) - print("=" * 80) - print() - print(f"{'Number of Ops:' :<{_TEXT_ALIGN}} {num_ops}") - print(f"{'Model Parameters: ' :<{_TEXT_ALIGN}} {deployer.getParameterSize()}") \ No newline at end of file + _ = deployer.prepare(verbosityCfg) + + # Offset the input and output values if signprop + if signProp: + test_inputs = [value - inputOffsets[f"input_{i}"] for i, value in enumerate(test_inputs)] + + for i, values in enumerate(test_outputs): + buffer = deployer.ctxt.lookup(f"output_{i}") + if buffer._type.referencedType.typeName == "float32_t": + continue + if not buffer._signed: + values -= buffer.nLevels // 2 + + generateTestNetwork(deployer, test_inputs, test_outputs, args.dumpdir, verbosityCfg) + + +if __name__ == '__main__': + + parser = TestGeneratorArgumentParser(description = "Deeploy Code Generation Utility.") + parser.add_argument('--debug', + dest = 'debug', + action = 'store_true', + default = False, + help = 'Enable debugging mode\n') + parser.add_argument('--profileUntiled', + action = 'store_true', + dest = 'profileUntiled', + default = False, + help = 'Profile Untiled for L2\n') + parser.add_argument('--input-type-map', + nargs = '*', + default = [], + type = str, + help = '(Optional) mapping of input names to data types. ' + 'If not specified, types are inferred from the input data. ' + 'Example: --input-type-map input_0=int8_t input_1=float32_t ...') + parser.add_argument('--input-offset-map', + nargs = '*', + default = [], + type = str, + help = '(Optional) mapping of input names to offsets. ' + 'If not specified, offsets are set to 0. ' + 'Example: --input-offset-map input_0=0 input_1=128 ...') + parser.add_argument('--shouldFail', action = 'store_true') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. 
Currently, required for im2col buffer sizing on Siracusa. Default: 1.", + ) + parser.set_defaults(shouldFail = False) + + args = parser.parse_args() + + try: + generateNetwork(args) + except Exception as e: + if args.shouldFail: + print("\033[92mNetwork generation ended, failed as expected!\033[0m") + sys.exit(0) + else: + raise e + + if args.shouldFail: + raise RuntimeError("Expected to fail!") diff --git a/DeeployTest/profiling2csv.py b/DeeployTest/profiling2csv.py index 7d8feb5950..da080c07d4 100644 --- a/DeeployTest/profiling2csv.py +++ b/DeeployTest/profiling2csv.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: profiling2csv.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import argparse import csv diff --git a/DeeployTest/testComponentGraph.py b/DeeployTest/testComponentGraph.py index e091e0b419..91470e6432 100644 --- a/DeeployTest/testComponentGraph.py +++ b/DeeployTest/testComponentGraph.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: testComponentGraph.py -# -# Last edited: 10.10.2023. -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Luka Macan, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os @@ -33,7 +11,7 @@ from Deeploy.ComponentGraph import extractComponentGraph, extractComponentsFromComponentGraph if __name__ == "__main__": - test_dir = "Tests/WaveFormer" + test_dir = "Tests/Models/WaveFormer" colors = ["red", "green", "blue", "yellow"] component_color = "red" color_attr = "color" diff --git a/DeeployTest/testDebugPrintPass.py b/DeeployTest/testDebugPrintPass.py index bbd7373f10..6cd0e9f037 100644 --- a/DeeployTest/testDebugPrintPass.py +++ b/DeeployTest/testDebugPrintPass.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: testDebugPrinting.py -# -# Last edited: 14.05.2024. 
-# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os @@ -31,7 +9,7 @@ import onnx_graphsurgeon as gs from testUtils.platformMapping import mapDeployer, mapPlatform from testUtils.testRunner import TestGeneratorArgumentParser, getPaths -from testUtils.typeMapping import inferInputType +from testUtils.typeMapping import inferTypeAndOffset from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.DebugPasses import DebugPrintPass from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel @@ -71,10 +49,7 @@ test_inputs = [inputs[x].reshape(-1).astype(np.float64) for x in inputs.files] test_outputs = [outputs[x].reshape(-1).astype(np.float64) for x in outputs.files] for index, num in enumerate(test_inputs): - # WIESP: Do not infer types and offset of empty arrays - if np.prod(num.shape) == 0: - continue - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset diff --git a/DeeployTest/testDmas.py b/DeeployTest/testDmas.py new file mode 100644 index 0000000000..df6926b48d --- /dev/null +++ b/DeeployTest/testDmas.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna 
+# +# SPDX-License-Identifier: Apache-2.0 + +import itertools +import subprocess +from typing import Tuple + + +def test(dma: str, inputShape: Tuple[int, ...], tileShape: Tuple[int, ...], nodeCount: int, dataType: str, + doublebuffer: bool): + cfg_str = f""" + - input shape: {inputShape} + - tile shape: {tileShape} + - node count: {nodeCount} + - data type: {dataType} + - doublebuffering: {doublebuffer} + - dma: {dma} + """ + + print(f"test{dma}: Testing {dma} with followig configuration:" + cfg_str) + + testRunnerMap = { + "MchanDma": "testRunner_siracusa_mchandma.py", + "L3Dma": "testRunner_siracusa_l3dma.py", + "SnitchDma": "testRunner_snitch_dma.py", + } + + assert dma in testRunnerMap, f"{dma} missing its own testRunner mapping" + + testRunner = testRunnerMap[dma] + + cmd = [f"python {testRunner}", f"-t test{dma}", "-DNUM_CORES=8"] + cmd.append(f"--input-shape {' '.join(str(x) for x in inputShape)}") + cmd.append(f"--tile-shape {' '.join(str(x) for x in tileShape)}") + cmd.append(f"--node-count {nodeCount}") + cmd.append(f"--type {dataType}") + if doublebuffer: + cmd.append("--doublebuffer") + + full_cmd = " ".join(cmd) + + print(f"Running command:\n{full_cmd}\n") + + try: + subprocess.run(full_cmd, shell = True, check = True) + except subprocess.CalledProcessError: + print(f"test{dma}: Failed test:" + cfg_str) + print(f"Rerun with command:\n{full_cmd}") + exit(-1) + + +# input shape, tile shape, node count, data type +test_shapes_and_more = [ + ((10, 10), (10, 10), 1, "uint8_t"), + ((10, 10), (10, 4), 1, "uint8_t"), + ((10, 10), (10, 4), 1, "uint16_t"), + ((10, 10), (10, 4), 1, "uint32_t"), + ((10, 10), (3, 4), 1, "uint32_t"), + ((10, 10), (3, 4), 2, "uint32_t"), + ((10, 10, 10), (2, 3, 4), 1, "uint8_t"), + ((10, 10, 10, 10), (2, 3, 5, 4), 1, "uint8_t"), + ((10, 10, 10, 10), (2, 3, 5, 4), 1, "uint32_t"), + ((10, 10, 10, 10, 10), (2, 3, 5, 7, 4), 1, "uint8_t"), +] + +is_doublebuffers = [True, False] +dmas = ["MchanDma", "L3Dma", "SnitchDma"] + +for testShape, 
doublebuffer, dma in itertools.product(test_shapes_and_more, is_doublebuffers, dmas): + inputShape, tileShape, nodeCount, dataType = testShape + test(dma, inputShape, tileShape, nodeCount, dataType, doublebuffer) diff --git a/DeeployTest/testEngineAwareOptimizerWrapper.py b/DeeployTest/testEngineAwareOptimizerWrapper.py index 877230bc75..effe0c3712 100644 --- a/DeeployTest/testEngineAwareOptimizerWrapper.py +++ b/DeeployTest/testEngineAwareOptimizerWrapper.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: testEngineAwareOptimizerWrapper.py -# -# Last edited: 10.10.2023. -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Luka Macan, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import os @@ -44,7 +22,7 @@ def _test_partial_coloring(): - test_dir = "Tests/simpleRegression" + test_dir = "Tests/Models/CNN_Linear2" model = onnx.load(os.path.join(test_dir, "network.onnx")) graph = gs.import_onnx(model).toposort() @@ -101,7 +79,7 @@ def _test_pass(_pass: TopologyOptimizationPass, graph: gs.Graph, engineName: str def _test_passes(): - test_dir = "Tests/simpleRegression" + test_dir = "Tests/Models/CNN_Linear2" model = onnx.load(os.path.join(test_dir, "network.onnx")) graph = gs.import_onnx(model).toposort() passes = [ diff --git a/DeeployTest/testMVP.py b/DeeployTest/testMVP.py index ff7a3ccf3a..01216984af 100644 --- a/DeeployTest/testMVP.py +++ b/DeeployTest/testMVP.py @@ -1,117 +1,34 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: testMVP.py -# -# Last edited: 31.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+import argparse +import hashlib import os import sys from collections import OrderedDict -from typing import List, Union +from typing import List, Tuple import numpy as np import onnx import onnx_graphsurgeon as gs import pytest -from ortools.constraint_solver.pywrapcp import IntVar -from testUtils.codeGenerate import generateL3HexDump, generateTestInputsHeader, generateTestNetworkHeader, \ - generateTestNetworkImplementation, generateTestOutputsHeader +from testUtils.codeGenerate import generateTestNetwork from testUtils.graphDebug import generateDebugConfig from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.typeMapping import inferInputType +from testUtils.tilingUtils import DBOnlyL3Tiler, DBTiler, SBTiler +from testUtils.typeMapping import inferTypeAndOffset -from Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, NetworkContext, NetworkDeployer, ONNXLayer, \ - SubGraph, TransientBuffer +from Deeploy.DeeployTypes import CodeGenVerbosity, NetworkDeployer, ONNXLayer from Deeploy.EngineExtension.NetworkDeployers.EngineColoringDeployer import EngineColoringDeployerWrapper +from Deeploy.Logging import DEFAULT_LOGGER as log from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ AnnotateIOMemoryLevel, AnnotateNeurekaWeightMemoryLevel -from Deeploy.TilingExtension.TilerExtension import Tiler, TilerDeployerWrapper -from Deeploy.TilingExtension.TilerModel import TilerModel - -_TEXT_ALIGN = 30 - - -class DBOnlyL3Tiler(Tiler): - - def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], - hop: str, tensorName: str) -> Union[int, IntVar]: - - varBuffer = 
ctxt.lookup(tensorName) - - generalCoeff = 2 - - if isinstance(varBuffer, TransientBuffer): - coefficient = 1 - elif isinstance(varBuffer, ConstantBuffer): - coefficient = generalCoeff - else: - coefficient = generalCoeff - - if args.defaultMemLevel == "L2": - return coefficient - - if hop == 'L1': - return 1 - - return coefficient - - -class DBTiler(Tiler): - - def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], - hop: str, tensorName: str) -> Union[int, IntVar]: - varBuffer = ctxt.lookup(tensorName) - - generalCoeff = 2 - - if isinstance(varBuffer, TransientBuffer): - coefficient = 1 - elif isinstance(varBuffer, ConstantBuffer): - coefficient = generalCoeff - else: - coefficient = generalCoeff - - return coefficient - - -class SBTiler(Tiler): - - def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], - hop: str, tensorName: str) -> Union[int, IntVar]: - varBuffer = ctxt.lookup(tensorName) - - generalCoeff = 1 - - if isinstance(varBuffer, TransientBuffer): - coefficient = 1 - elif isinstance(varBuffer, ConstantBuffer): - coefficient = generalCoeff - else: - coefficient = generalCoeff - - return coefficient +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine +from Deeploy.TilingExtension.TilerExtension import TilerDeployerWrapper # Mock of the Global Scheduler's inteface @@ -140,7 +57,8 @@ def _filterSchedule(schedule: List[List[gs.Node]], layerBinding: 'OrderedDict[st def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel, - defaultIoMemoryLevel: MemoryLevel, verbose: CodeGenVerbosity) -> NetworkDeployer: + defaultIoMemoryLevel: MemoryLevel, verbose: CodeGenVerbosity, + args: argparse.Namespace) -> Tuple[NetworkDeployer, bool]: inputTypes = {} inputOffsets = {} @@ -160,11 +78,12 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg if args.enableStrides: 
platform.engines[0].enableStrides = True + clusters = [engine for engine in platform.engines if isinstance(engine, PULPClusterEngine)] + for cluster in clusters: + cluster.n_cores = args.cores + for index, num in enumerate(test_inputs): - # WIESP: Do not infer types and offset of empty arrays - if np.prod(num.shape) == 0: - continue - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset @@ -197,16 +116,24 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg deployer = MemoryDeployerWrapper(deployer, memoryLevelAnnotationPasses) # Make the deployer tiler aware + # VJUNG: Create unique ID for the IO files of minimalloc and prevent conflict in case of parallel execution + unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_DB{args.doublebuffer}" + testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] + if args.doublebuffer: - deployer = TilerDeployerWrapper(deployer, DBOnlyL3Tiler) + assert args.defaultMemLevel in ["L3", "L2"] + if args.defaultMemLevel == "L3": + deployer = TilerDeployerWrapper(deployer, DBOnlyL3Tiler, testName = testIdentifier, workDir = args.dumpdir) + else: + deployer = TilerDeployerWrapper(deployer, DBTiler, testName = testIdentifier, workDir = args.dumpdir) else: - deployer = TilerDeployerWrapper(deployer, SBTiler) + deployer = TilerDeployerWrapper(deployer, SBTiler, testName = testIdentifier, workDir = args.dumpdir) deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc deployer.tiler.memoryAllocStrategy = args.memAllocStrategy deployer.tiler.searchStrategy = args.searchStrategy - return deployer + return deployer, signProp if __name__ == '__main__': @@ -276,12 +203,21 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg """) parser.add_argument('--profileTiling', action = "store_true") 
parser.add_argument('--plotMemAlloc', - action = 'store_false', + action = 'store_true', help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n') + parser.add_argument( + "--cores", + type = int, + default = 1, + help = + "Number of cores on which the network is run. Currently, required for im2col buffer sizing on Siracusa. Default: 1." + ) parser.set_defaults(shouldFail = False) args = parser.parse_args() + log.debug("Arguments: %s", args) + verbosityCfg = CodeGenVerbosity(None) if args.profileTiling: @@ -326,20 +262,25 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg memoryHierarchy = MemoryHierarchy(memoryLevels) memoryHierarchy.setDefaultMemoryLevel(args.defaultMemLevel) - deployer = setupDeployer(graph, - memoryHierarchy, - defaultTargetMemoryLevel = L1, - defaultIoMemoryLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel], - verbose = verbosityCfg) + deployer, signProp = setupDeployer(graph, + memoryHierarchy, + defaultTargetMemoryLevel = L1, + defaultIoMemoryLevel = memoryHierarchy.memoryLevels[args.defaultMemLevel], + verbose = verbosityCfg, + args = args) platform = deployer.Platform - signProp = False + + log.debug(f"Platform: {platform} (sign: {signProp})") + + log.debug("Platform Engines:") + for engine in platform.engines: + log.debug(f" - {engine.name}: {engine}") + + log.debug(f"Deployer: {deployer}") for index, num in enumerate(test_inputs): - # WIESP: Do not infer types and offset of empty arrays - if np.prod(num.shape) == 0: - continue - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset @@ -353,50 +294,17 @@ def setupDeployer(graph: gs.Graph, memoryHierarchy: MemoryHierarchy, defaultTarg sys.exit(0) else: - _ = deployer.generateFunction(verbosityCfg) - - # Create input and output vectors - os.makedirs(f'{args.dumpdir}', exist_ok = True) - - 
testInputStr = generateTestInputsHeader(deployer, test_inputs, inputTypes, inputOffsets) - f = open(f'{args.dumpdir}/testinputs.h', "w") - f.write(testInputStr) - f.close() - - testOutputStr = generateTestOutputsHeader(deployer, test_outputs, signProp, args.verbose) - f = open(f'{args.dumpdir}/testoutputs.h', "w") - f.write(testOutputStr) - f.close() - - # Generate code for Network - testNetworkHeaderStr = generateTestNetworkHeader(deployer, platform) - f = open(f'{args.dumpdir}/Network.h', "w") - f.write(testNetworkHeaderStr) - f.close() - - testNetworkImplementationStr = generateTestNetworkImplementation(deployer, platform) - f = open(f'{args.dumpdir}/Network.c', "w") - f.write(testNetworkImplementationStr) - f.close() - - generateL3HexDump(deployer, os.path.join(f'{args.dumpdir}', 'hex'), test_inputs, test_outputs) - - clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" - os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/Network.c') - os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/Network.h') - os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/testoutputs.h') - os.system(f'clang-format -i --style="{clang_format}" {args.dumpdir}/testinputs.h') - - if args.verbose: - print() - print("=" * 80) - num_ops = deployer.numberOfOps(args.verbose) - print("=" * 80) - print() - print(f"{'Number of Ops:' :<{_TEXT_ALIGN}} {num_ops}") - print('Worst Case Buffer Size:') - for level in deployer.worstCaseBufferSize.keys(): - print(f"{' ' + str(level) + ':' :<{_TEXT_ALIGN}} {deployer.worstCaseBufferSize[level]}") - print(f"{'Model Parameters: ' :<{_TEXT_ALIGN}} {deployer.getParameterSize()}") - - print("\033[92mCode Generation test ended, no memory violations!\033[0m") + _ = deployer.prepare(verbosityCfg) + + # Offset the input and output values if signprop + if signProp: + test_inputs = [value - inputOffsets[f"input_{i}"] for i, value in enumerate(test_inputs)] + + for i, values in 
enumerate(test_outputs): + buffer = deployer.ctxt.lookup(f"output_{i}") + if buffer._type.referencedType.typeName == "float32_t": + continue + if not buffer._signed: + values -= buffer.nLevels // 2 + + generateTestNetwork(deployer, test_inputs, test_outputs, args.dumpdir, verbosityCfg) diff --git a/DeeployTest/testMemoryLevelExtension.py b/DeeployTest/testMemoryLevelExtension.py index 5532f5c010..d8ea29bc24 100644 --- a/DeeployTest/testMemoryLevelExtension.py +++ b/DeeployTest/testMemoryLevelExtension.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: testMemoryLevelExtension.py -# -# Last edited: 04.05.2022 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import copy import os @@ -32,7 +10,7 @@ import onnx_graphsurgeon as gs from testUtils.platformMapping import defaultScheduler, mapDeployer, mapPlatform, setupMemoryPlatform from testUtils.testRunner import TestGeneratorArgumentParser, getPaths -from testUtils.typeMapping import inferInputType +from testUtils.typeMapping import inferTypeAndOffset from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ NCHWtoNHWCPass, TransposeMatmulInputsPass @@ -87,10 +65,10 @@ platform, signProp = mapPlatform(args.platform) for index, num in enumerate(test_inputs): - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset - if "simpleRegression" in args.dir: + if "CNN_Linear2" in args.dir: inputOffsets[f"input_{index}"] = 0 deployer = mapDeployer(platform, graph, inputTypes, deeployStateDir = _DEEPLOYSTATEDIR, inputOffsets = inputOffsets) diff --git a/DeeployTest/testPrintInputOutputTransformation.py b/DeeployTest/testPrintInputOutputTransformation.py index 3b2d6d144d..c8f0ee70fe 100644 --- a/DeeployTest/testPrintInputOutputTransformation.py +++ b/DeeployTest/testPrintInputOutputTransformation.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: testPrintInputOutputTransformation.py -# -# Last edited: 15.05.2024. -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os @@ -31,7 +9,7 @@ import onnx_graphsurgeon as gs from testUtils.platformMapping import mapDeployer, mapPlatform from testUtils.testRunner import TestGeneratorArgumentParser, getPaths -from testUtils.typeMapping import inferInputType +from testUtils.typeMapping import inferTypeAndOffset from Deeploy.CommonExtensions.CodeTransformationPasses.PrintInputs import MemoryAwarePrintInputGeneration, \ MemoryAwarePrintOutputGeneration, PrintInputGeneration, PrintOutputGeneration @@ -89,10 +67,7 @@ test_inputs = [inputs[x].reshape(-1).astype(np.float64) for x in inputs.files] test_outputs = [outputs[x].reshape(-1).astype(np.float64) for x in outputs.files] for index, num in enumerate(test_inputs): - # WIESP: Do not infer types and offset of empty arrays - if np.prod(num.shape) == 0: - continue - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset diff --git a/DeeployTest/testRegexMatching.py b/DeeployTest/testRegexMatching.py index 45ecd5134d..352746e3f7 100644 --- a/DeeployTest/testRegexMatching.py +++ b/DeeployTest/testRegexMatching.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: testRegexMatching.py -# -# Last edited: 10.10.2023. -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: -# - Luka Macan, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import re @@ -64,7 +42,7 @@ def __init__(self): if __name__ == "__main__": optimizer = TopologyOptimizer([ConvTestPass()]) - model = onnx.load_model('Tests/simpleCNN/network.onnx') + model = onnx.load_model('Tests/Models/CNN_Linear1/network.onnx') graph = gs.import_onnx(model) match_count = 0 diff --git a/DeeployTest/testReplaceInsertSubgraph.py b/DeeployTest/testReplaceInsertSubgraph.py index c6a8aa53b2..73202c6993 100644 --- a/DeeployTest/testReplaceInsertSubgraph.py +++ b/DeeployTest/testReplaceInsertSubgraph.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: testReplaceInsertSubgraph.py -# -# Last edited: 10.10.2023. -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Luka Macan, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os @@ -33,7 +11,7 @@ from Deeploy.OptimizationPasses.TopologyOptimizationPasses.PULPPasses import PULPConvRequantMergePass if __name__ == "__main__": - test_dir = "Tests/simpleRegression" + test_dir = "Tests/Models/CNN_Linear2" model = onnx.load(os.path.join(test_dir, "network.onnx")) graph = gs.import_onnx(model).toposort() diff --git a/DeeployTest/testRunner_chimera.py b/DeeployTest/testRunner_chimera.py deleted file mode 100644 index 82a94ad815..0000000000 --- a/DeeployTest/testRunner_chimera.py +++ /dev/null @@ -1,46 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_chimera.py -# -# Last edited: 16.06.2025 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Victor Jung, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - - parser = TestRunnerArgumentParser( - tiling_arguments = False, - description = "Deeploy Code Generation Utility for the Chimera Platform (Host, no Tiling).") - - parser.add_argument('--simulator', - metavar = "", - dest = "simulator", - type = str, - choices = ["gvsoc"], - default = "gvsoc", - help = "Select the simulator to use") - - args = parser.parse_args() - - testRunner = TestRunner(platform = "Chimera", simulator = args.simulator, tiling = False, argument_parser = parser) - - testRunner.run() diff --git a/DeeployTest/testRunner_cortexm.py b/DeeployTest/testRunner_cortexm.py deleted file mode 100644 index efa927fd91..0000000000 --- a/DeeployTest/testRunner_cortexm.py +++ /dev/null @@ -1,37 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_qemu.py -# -# Last edited: 17.03.2023 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - - parser = TestRunnerArgumentParser( - tiling_arguments = False, - description = "Deeploy Code Generation Utility for the ARM (QEMU) Platform (no Tiling).") - args = parser.parse_args() - - testRunner = TestRunner(platform = "QEMU-ARM", simulator = "qemu", tiling = False, argument_parser = parser) - - testRunner.run() diff --git a/DeeployTest/testRunner_generic.py b/DeeployTest/testRunner_generic.py deleted file mode 100644 index 70909bf7e1..0000000000 --- a/DeeployTest/testRunner_generic.py +++ /dev/null @@ -1,37 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_generic.py -# -# Last edited: 17.03.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - - parser = TestRunnerArgumentParser( - tiling_arguments = False, - description = "Deeploy Code Generation Utility for the Generic Platform (Host Machine, no Tiling).") - args = parser.parse_args() - - testRunner = TestRunner(platform = "Generic", simulator = "host", tiling = False, argument_parser = parser) - - testRunner.run() diff --git a/DeeployTest/testRunner_mempool.py b/DeeployTest/testRunner_mempool.py deleted file mode 100644 index c3bbd4f0c7..0000000000 --- a/DeeployTest/testRunner_mempool.py +++ /dev/null @@ -1,44 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_mempool.py -# -# Last edited: 17.03.2023 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - parser = TestRunnerArgumentParser( - tiling_arguments = False, description = "Deeploy Code Generation Utility for the MemPool Platform (no Tiling).") - - parser.add_argument('-n', - metavar = 'num_threads', - dest = 'num_threads', - type = int, - default = 16, - help = 'Number of parallel threads\n') - args = parser.parse_args() - - testRunner = TestRunner(platform = "MemPool", simulator = "banshee", tiling = False, argument_parser = parser) - - testRunner.cmake_args += f" -D num_threads={args.num_threads}" - - testRunner.run() diff --git a/DeeployTest/testRunner_siracusa.py b/DeeployTest/testRunner_siracusa.py deleted file mode 100644 index 43e10de8d9..0000000000 --- a/DeeployTest/testRunner_siracusa.py +++ /dev/null @@ -1,53 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_siracusa.py -# -# Last edited: 11.04.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - - parser = TestRunnerArgumentParser( - tiling_arguments = False, - description = "Deeploy Code Generation Utility for the Siracusa Platform (no Tiling).") - - parser.add_argument('--cores', - metavar = '', - dest = 'cores', - type = int, - default = 8, - help = 'Set number of cluster cores') - - parser.add_argument('--profileUntiled', - action = 'store_true', - dest = 'profileUntiled', - default = False, - help = 'Profile Untiled') - - args = parser.parse_args() - - testRunner = TestRunner(platform = "Siracusa", simulator = "gvsoc", tiling = False, argument_parser = parser) - - testRunner.cmake_args += f" -D NUM_CORES={args.cores}" - testRunner.run() diff --git a/DeeployTest/testRunner_snitch.py b/DeeployTest/testRunner_snitch.py deleted file mode 100644 index 49f51f96d0..0000000000 --- a/DeeployTest/testRunner_snitch.py +++ /dev/null @@ -1,52 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_snitch.py -# -# Last edited: 23.04.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Authors: -# - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - - parser = TestRunnerArgumentParser( - tiling_arguments = False, description = "Deeploy Code Generation Utility for the Snitch Platform (no Tiling).") - - parser.add_argument('--cores', - metavar = '', - dest = 'cores', - type = int, - default = 9, - help = 'Set number of cluster cores') - parser.add_argument('--simulator', - metavar = "", - dest = "simulator", - type = str, - choices = ["gvsoc", "banshee", "vsim", "vsim.gui"], - default = "gvsoc", - help = "Select the simulator to use") - args = parser.parse_args() - - testRunner = TestRunner(platform = "Snitch", simulator = args.simulator, tiling = False, argument_parser = parser) - - testRunner.cmake_args += f" -D NUM_CORES={args.cores}" - testRunner.run() diff --git a/DeeployTest/testRunner_softhier.py b/DeeployTest/testRunner_softhier.py deleted file mode 100644 index bc71d7a7cd..0000000000 --- a/DeeployTest/testRunner_softhier.py +++ /dev/null @@ -1,49 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_softhier.py -# -# Copyright (C) 2025, ETH Zurich and University of Bologna. -# -# Author: Bowen Wang , ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - parser = TestRunnerArgumentParser( - tiling_arguments = False, - description = "Deeploy Code Generation Utility for the Single Cluster SoftHier (no Tiling).") - - parser.add_argument('--num_clusters', - metavar = 'num_clusters', - dest = 'num_clusters', - type = int, - default = 1, - help = 'Number of clusters\n') - - parser.add_argument('--verbose', metavar = 'verbose', dest = 'verbose', type = int, default = 2, help = 'verbose\n') - - for action in parser._actions: - if action.dest == 'toolchain_install_dir': - action.default = "${SOFTHIER_INSTALL_DIR}/third_party/toolchain/install" - args = parser.parse_args() - - testRunner = TestRunner(platform = "SoftHier", simulator = "gvsoc", tiling = False, argument_parser = parser) - - testRunner.cmake_args += f" -D num_clusters={args.num_clusters}" - - testRunner.run() diff --git a/DeeployTest/testRunner_tiled_siracusa.py b/DeeployTest/testRunner_tiled_siracusa.py deleted file mode 100644 index 827affbeb5..0000000000 --- a/DeeployTest/testRunner_tiled_siracusa.py +++ /dev/null @@ -1,46 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_tiled_siracusa.py -# -# Last edited: 31.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - - parser = TestRunnerArgumentParser( - tiling_arguments = True, - description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & NEureka).") - - parser.add_argument('--cores', - metavar = '', - dest = 'cores', - type = int, - default = 8, - help = 'Set number of cluster cores') - args = parser.parse_args() - - testRunner = TestRunner(platform = "Siracusa", simulator = "gvsoc", tiling = True, argument_parser = parser) - - testRunner.cmake_args += f" -D NUM_CORES={args.cores}" - - testRunner.run() diff --git a/DeeployTest/testRunner_tiled_siracusa_w_neureka.py b/DeeployTest/testRunner_tiled_siracusa_w_neureka.py deleted file mode 100644 index fef043b7ac..0000000000 --- a/DeeployTest/testRunner_tiled_siracusa_w_neureka.py +++ /dev/null @@ -1,70 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_tiled_siracusa_w_neureka.py -# -# Last edited: 31.10.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - - parser = TestRunnerArgumentParser( - tiling_arguments = True, - description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & NEureka).") - - parser.add_argument('--cores', - metavar = '', - dest = 'cores', - type = int, - default = 8, - help = 'Set number of cluster cores') - parser.add_argument('--neureka-wmem', - dest = "neureka_wmem", - action = "store_true", - default = False, - help = 'Adds weight memory and neureka engine color\n') - parser.add_argument('--enable-3x3', - dest = "enable_3x3", - action = "store_true", - default = False, - help = 'Adds EXPERIMENTAL support for 3x3 convolutions on N-EUREKA\n') - parser.add_argument('--enableStrides', - dest = "enableStrides", - action = "store_true", - default = False, - help = 'Adds EXPERIMENTAL support for strided convolutions on N-EUREKA\n') - args = parser.parse_args() - - testRunner = TestRunner(platform = "Siracusa_w_neureka", - simulator = "gvsoc", - tiling = True, - argument_parser = parser) - - testRunner.cmake_args += f" -D NUM_CORES={args.cores}" - if args.neureka_wmem: - testRunner.gen_args += " --neureka-wmem" - if args.enable_3x3: - testRunner.gen_args += " --enable-3x3" - if args.enableStrides: - testRunner.gen_args += " --enableStrides" - - testRunner.run() diff --git a/DeeployTest/testRunner_tiled_snitch.py b/DeeployTest/testRunner_tiled_snitch.py deleted file mode 100644 index 3c2f4d9bda..0000000000 --- a/DeeployTest/testRunner_tiled_snitch.py +++ /dev/null @@ -1,53 +0,0 @@ -# ---------------------------------------------------------------------- -# -# File: testRunner_tiled_snitch.py -# -# Last edited: 23.04.2024 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from testUtils.testRunner import TestRunner, TestRunnerArgumentParser - -if __name__ == "__main__": - - parser = TestRunnerArgumentParser(tiling_arguments = True, - description = "Deeploy Code Generation Utility for the Snitch Platform (Tiling).") - - parser.add_argument('--cores', - metavar = '', - dest = 'cores', - type = int, - default = 9, - help = 'Set number of cluster cores') - parser.add_argument('--simulator', - metavar = "", - dest = "simulator", - type = str, - choices = ["gvsoc", "banshee", "vsim", "vsim.gui"], - default = "gvsoc", - help = "Select the simulator to use") - - args = parser.parse_args() - - testRunner = TestRunner(platform = "Snitch", simulator = args.simulator, tiling = True, argument_parser = parser) - - testRunner.cmake_args += f" -D NUM_CORES={args.cores}" - - testRunner.run() diff --git a/DeeployTest/testSchedulingExtension.py b/DeeployTest/testSchedulingExtension.py index d6372def22..be77ecce53 100644 --- a/DeeployTest/testSchedulingExtension.py +++ b/DeeployTest/testSchedulingExtension.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: tilerExtensionTest.py -# -# Last edited: 09.05.2023 -# -# Copyright (C) 2023, ETH Zurich 
and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os from collections import OrderedDict @@ -33,7 +12,7 @@ import pytest from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.typeMapping import inferInputType +from testUtils.typeMapping import inferTypeAndOffset from Deeploy.DeeployTypes import NetworkContext, NetworkDeployer, ONNXLayer, Schedule, StructBuffer, TransientBuffer, \ VariableBuffer @@ -249,7 +228,7 @@ def setupDeployer(memoryHierarchy: MemoryHierarchy, graph: gs.Graph) -> NetworkD platform, signProp = mapPlatform(args.platform) for index, num in enumerate(test_inputs): - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset diff --git a/DeeployTest/testSlice_PULP.py b/DeeployTest/testSlice_PULP.py index dda9d13a58..e3ede61bbd 100644 --- a/DeeployTest/testSlice_PULP.py +++ b/DeeployTest/testSlice_PULP.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: testSlice_PULP.py -# -# Last edited: 15.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University 
of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import argparse import os @@ -30,12 +9,12 @@ import numpy as np import onnx import onnx_graphsurgeon as gs -from testUtils.codeGenerate import generateTestInputsHeader, generateTestNetworkHeader, \ - generateTestNetworkImplementation, generateTestOutputsHeader +from testUtils.codeGenerate import generateTestNetwork from testUtils.platformMapping import mapDeployer, setupMemoryPlatform from testUtils.testRunner import escapeAnsi -from testUtils.typeMapping import inferInputType +from testUtils.typeMapping import inferTypeAndOffset +from Deeploy.DeeployTypes import _NoVerbosity from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper from Deeploy.Targets.PULPOpen.Platform import PULPPlatform @@ -62,11 +41,11 @@ signProp = False - onnx_graph = onnx.load_model('./Tests/testSlice/network.onnx') + onnx_graph = onnx.load_model('./Tests/Kernels/Integer/Slice/network.onnx') graph = gs.import_onnx(onnx_graph) - inputs = np.load('./Tests/testSlice/inputs.npz') - outputs = np.load(f'./Tests/testSlice/outputs.npz') + inputs = np.load('./Tests/Kernels/Integer/Slice/inputs.npz') + outputs = np.load(f'./Tests/Kernels/Integer/Slice/outputs.npz') tensors = 
graph.tensors() # Load as int64 and infer types later @@ -86,7 +65,7 @@ platform = PULPPlatform() for index, num in enumerate(test_inputs): - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset @@ -100,9 +79,9 @@ deployer.frontEnd() deployer.parse(deployer.default_channels_first) - deployer.ctxt.lookup('onnx::Slice_5')._memoryLevel = "L1" - deployer.ctxt.lookup('onnx::Slice_5').allocTemplate = pulpL1AllocateTemplate - deployer.ctxt.lookup('onnx::Slice_5').deallocTemplate = pulpL1FreeTemplate + deployer.ctxt.lookup('onnxSlice_5_tensor')._memoryLevel = "L1" + deployer.ctxt.lookup('onnxSlice_5_tensor').allocTemplate = pulpL1AllocateTemplate + deployer.ctxt.lookup('onnxSlice_5_tensor').deallocTemplate = pulpL1FreeTemplate deployer.midEnd() @@ -110,46 +89,29 @@ deployer.prepared = True deployer.generateInferenceCode() - # Create input and output vectors - os.makedirs('TEST_SIRACUSA/Tests/testSlice', exist_ok = True) - - testInputStr = generateTestInputsHeader(deployer, test_inputs, inputTypes, inputOffsets) - f = open('TEST_SIRACUSA/Tests/testSlice/testinputs.h', "w") - f.write(testInputStr) - f.close() - - testOutputStr = generateTestOutputsHeader(deployer, test_outputs, signProp, False) - f = open('TEST_SIRACUSA/Tests/testSlice/testoutputs.h', "w") - f.write(testOutputStr) - f.close() - - # Generate code for Network - testNetworkHeaderStr = generateTestNetworkHeader(deployer, platform) - f = open('TEST_SIRACUSA/Tests/testSlice/Network.h', "w") - f.write(testNetworkHeaderStr) - f.close() + # Offset the values if signprop + if signProp: + test_inputs = [value - inputOffsets[f"input_{i}"] for i, value in enumerate(test_inputs)] - testNetworkImplementationStr = generateTestNetworkImplementation(deployer, platform) - f = open('TEST_SIRACUSA/Tests/testSlice/Network.c', "w") - f.write(testNetworkImplementationStr) - f.close() + for i, values in 
enumerate(test_outputs): + buffer = deployer.ctxt.lookup(f"output_{i}") + isFloat = buffer._type.referencedType.typeName == "float32_t" + if not isFloat and not buffer._signed: + values -= buffer.nLevels // 2 - clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" - os.system(f'clang-format -i --style="{clang_format}" TEST_SIRACUSA/Tests/testSlice/Network.c') - os.system(f'clang-format -i --style="{clang_format}" TEST_SIRACUSA/Tests/testSlice/Network.h') - os.system(f'clang-format -i --style="{clang_format}" TEST_SIRACUSA/Tests/testSlice/testoutputs.h') - os.system(f'clang-format -i --style="{clang_format}" TEST_SIRACUSA/Tests/testSlice/testinputs.h') + generateTestNetwork(deployer, test_inputs, test_outputs, 'TEST_SIRACUSA/Tests/Kernels/Integer/Slice', _NoVerbosity) os.system( - f"$CMAKE -DTOOLCHAIN={args.toolchain} -DTOOLCHAIN_INSTALL_DIR={_TOOLCHAIN_DIR} -DTESTNAME=testSlice -DGENERATED_SOURCE=TEST_SIRACUSA/Tests/testSlice -Dplatform=Siracusa -B TEST_SIRACUSA/build -DNUM_CORES=1 .." + f"$CMAKE -DTOOLCHAIN={args.toolchain} -DTOOLCHAIN_INSTALL_DIR={_TOOLCHAIN_DIR} -DTESTNAME=Slice -DGENERATED_SOURCE=TEST_SIRACUSA/Tests/Kernels/Integer/Slice -Dplatform=Siracusa -B TEST_SIRACUSA/build -DNUM_CORES=1 .." 
) - process = subprocess.Popen(["$CMAKE --build TEST_SIRACUSA/build --target gvsoc_testSlice"], + process = subprocess.Popen(["$CMAKE --build TEST_SIRACUSA/build --target gvsoc_Slice"], stdout = subprocess.PIPE, stderr = subprocess.STDOUT, shell = True, encoding = 'utf-8') fileHandle = open('out.txt', 'a') - fileHandle.write(f"################## Testing Tests/testSlice on SIRACUSA Platform ##################\n") + fileHandle.write( + f"################## Testing Tests/Kernels/Integer/Slice on SIRACUSA Platform ##################\n") result = "" while True: @@ -166,4 +128,4 @@ fileHandle.close() if not "Errors: 0 out of " in result: - raise RuntimeError(f"Found an error in Tests/testSlice") + raise RuntimeError(f"Found an error in Tests/Kernels/Integer/Slice") diff --git a/DeeployTest/testTilerExtension.py b/DeeployTest/testTilerExtension.py index edf1e6d1cc..e6b043eb57 100644 --- a/DeeployTest/testTilerExtension.py +++ b/DeeployTest/testTilerExtension.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: tilerExtensionTest.py -# -# Last edited: 09.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import os from collections import OrderedDict @@ -33,7 +12,7 @@ import pytest from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.typeMapping import inferInputType +from testUtils.typeMapping import inferTypeAndOffset from Deeploy.DeeployTypes import GlobalDefinition, NetworkDeployer, ONNXLayer, Schedule, TransientBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel @@ -165,10 +144,10 @@ def setupDeployer(memoryHierarchy: MemoryHierarchy, graph: gs.Graph) -> NetworkD platform, signProp = mapPlatform(args.platform) for index, num in enumerate(test_inputs): - _type, offset = inferInputType(num, signProp)[0] + _type, offset = inferTypeAndOffset(num, signProp) inputTypes[f"input_{index}"] = _type inputOffsets[f"input_{index}"] = offset - if "simpleRegression" in args.dir: + if "CNN_Linear2" in args.dir: inputOffsets[f"input_{index}"] = 0 deployer = mapDeployer(platform, diff --git a/DeeployTest/testTypes.py b/DeeployTest/testTypes.py index 957a1f6274..9858015229 100644 --- a/DeeployTest/testTypes.py +++ b/DeeployTest/testTypes.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: testTypes.py -# -# Last edited: 15.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import pickle diff --git a/DeeployTest/testUtils/ProfilingTraceParser.py b/DeeployTest/testUtils/ProfilingTraceParser.py index 3374c88fec..398c1e57d0 100644 --- a/DeeployTest/testUtils/ProfilingTraceParser.py +++ b/DeeployTest/testUtils/ProfilingTraceParser.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: ProfilingTraceParser.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import re from dataclasses import dataclass diff --git a/DeeployTest/testUtils/__init__.py b/DeeployTest/testUtils/__init__.py index 65ec809815..be436b64a3 100644 --- a/DeeployTest/testUtils/__init__.py +++ b/DeeployTest/testUtils/__init__.py @@ -1,26 +1,5 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: __init__.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from . import * diff --git a/DeeployTest/testUtils/codeGenerate.py b/DeeployTest/testUtils/codeGenerate.py index 7df306efdb..39a44d9442 100644 --- a/DeeployTest/testUtils/codeGenerate.py +++ b/DeeployTest/testUtils/codeGenerate.py @@ -1,36 +1,15 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: codeGenerate.py -# -# Last edited: 23.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os -from pprint import pprint -from typing import Dict, List, Optional, Tuple +from typing import List, Tuple import numpy as np -from Deeploy.DeeployTypes import ConstantBuffer, DeploymentPlatform, NetworkDeployer, VariableBuffer +from Deeploy.DeeployTypes import CodeGenVerbosity, ConstantBuffer, NetworkDeployer, VariableBuffer from Deeploy.Targets.MemPool.Platform import MemPoolPlatform +from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPPlatform _TEXT_ALIGN = 30 @@ -51,105 +30,77 @@ def _shapeBroadcast(ctxt, value, name): return broadcastNum -def generateTestInputsHeader(deployer: NetworkDeployer, test_inputs: List, inputTypes: Dict, inputOffsets: Dict) -> str: +def generateTestInputsHeader(deployer: NetworkDeployer, test_inputs: List) -> str: + vectors = [] retStr = "" - inputNames = [deployer.ctxt.lookup(buf.name) for buf in deployer.graph.inputs] - inputTypes = {buf.name: buf._type for buf in inputNames} - - for index, num in enumerate(test_inputs): - - if f"input_{index}" not in inputTypes.keys(): + for index, values in enumerate(test_inputs): + # WIESEP: Correctly handle empty arrays + if np.prod(values.shape) == 0: continue - # WIESEP: Correctly handle empty arrays - if np.prod(num.shape) == 0: + bufferName = 
f"input_{index}" + + #LMACAN: We have some tests which have extra inputs and this is a hack to circumvent that + if not deployer.ctxt.is_buffer(bufferName): continue - test_inputs[index] -= inputOffsets[f"input_{index}"] + values = _shapeBroadcast(deployer.ctxt, values, bufferName) - broadcastNum = _shapeBroadcast(deployer.ctxt, num, f"input_{index}") + buffer = deployer.ctxt.lookup(bufferName) + typeName = buffer._type.referencedType.typeName + typeWidth = buffer._type.referencedType.typeWidth - data_type = inputTypes[f"input_{index}"] - data_width = inputTypes[f"input_{index}"].referencedType.typeWidth + vectorName = f"testInputVector{index}" + vectors.append(vectorName) - retStr += f"{data_type.referencedType.typeName} testInputVector{index}[] =" + retStr += f"{typeName} {vectorName}[] =" retStr += "{" - if data_type.referencedType.typeName == 'float32_t': - list_str = (", ").join([f'{x}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in broadcastNum]) + if typeName == 'float32_t': + list_str = (", ").join([f'{x}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values]) else: - list_str = (", ").join([str(x) for x in broadcastNum]) + list_str = (", ").join([str(x) for x in values]) - # WIESEP: Arrays have to be 4 byte alinged (at lest in banshee) - bytes = len(broadcastNum) * (data_width // 8) - if bytes % 4 != 0: - bytes = 4 * int((bytes / 4 + 1)) - padding = (bytes * 8) // data_width - len(broadcastNum) - list_str += ", " - list_str += (", ").join([str(0) for x in range(padding)]) + # WIESEP: Arrays have to be 4 byte aligned (at least in banshee) + total_bytes = (values.size * typeWidth) // 8 + pad_bytes = (-total_bytes) % 4 + if pad_bytes: + paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth + list_str += ", " + (", ").join("0" for _ in range(paddingElements)) retStr += list_str retStr += "};\n" - retStr += f"void* testInputVector[{len(inputTypes)}] = " + "{" - retStr += ", ".join([ - f"testInputVector{idx}" for idx, _ in 
enumerate(test_inputs) - if np.prod(test_inputs[idx].shape) != 0 and f"input_{idx}" in inputTypes.keys() - ]) + retStr += f"void* testInputVector[{len(vectors)}] = {{" + retStr += ", ".join(vectors) retStr += "};\n" return retStr -def generateTestOutputsHeader(deployer: NetworkDeployer, - test_outputs: List, - signProp: Optional[bool] = None, - verbose: Optional[bool] = None) -> str: - - output_signed = {} - output_n_levels = {} - output_data_type = {} - - if signProp is None: - signProp = False - - if verbose is None: - verbose = False - +def generateTestOutputsHeader(deployer: NetworkDeployer, test_outputs: List[np.ndarray]) -> str: retStr = "" + for index, values in enumerate(test_outputs): + typeName = deployer.ctxt.lookup(f'output_{index}')._type.referencedType.typeName + typeWidth = deployer.ctxt.lookup(f'output_{index}')._type.referencedType.typeWidth - for index, num in enumerate(test_outputs): - output_data_type[f"output_{index}"] = deployer.ctxt.lookup(f'output_{index}')._type - - data_type = output_data_type[f"output_{index}"] - isdatafloat = (data_type.referencedType.typeName == "float32_t") - - if signProp and not isdatafloat: - output_n_levels[f"output_{index}"] = deployer.ctxt.lookup(f'output_{index}').nLevels - output_signed[f"output_{index}"] = deployer.ctxt.lookup(f'output_{index}')._signed - test_outputs[index] -= int( - ((1 - output_signed[f"output_{index}"]) * (output_n_levels[f"output_{index}"] / 2))) - - data_width = data_type.referencedType.typeWidth - retStr += f"#define OUTPUTTYPE {data_type.referencedType.typeName}\n" - if isdatafloat: - retStr += f"#define ISOUTPUTFLOAT 1\n" - else: - retStr += f"#define ISOUTPUTFLOAT 0\n" - retStr += f"{data_type.referencedType.typeName} testOutputVector{index}[] =" + retStr += f"#define OUTPUTTYPE {typeName}\n" + retStr += f"#define ISOUTPUTFLOAT {int(typeName == 'float32_t')}\n" + retStr += f"{typeName} testOutputVector{index}[] =" retStr += "{" - # WIESEP: Arrays have to be 4 byte alinged (at lest in 
banshee) - if data_type.referencedType.typeName == 'float32_t': - list_str = (", ").join([f'{x}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in num]) + values = values.flatten() + + if typeName == "float32_t": + list_str = (", ").join([f'{x}f' if not (np.isinf(x) or np.isnan(x)) else str(x) for x in values]) else: - list_str = (", ").join([str(x) for x in num]) + list_str = (", ").join([str(x) for x in values]) - bytes = len(num) * (data_width // 8) - if bytes % 4 != 0: - bytes = 4 * int((bytes / 4 + 1)) - padding = (bytes * 8) // data_width - len(num) - list_str += ", " - list_str += (", ").join([str(0) for x in range(padding)]) + # WIESEP: Arrays have to be 4 byte aligned (at least in banshee) + total_bytes = (len(values) * typeWidth) // 8 + pad_bytes = (-total_bytes) % 4 + if pad_bytes: + paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth + list_str += ", " + (", ").join("0" for _ in range(paddingElements)) retStr += list_str retStr += "};\n" @@ -158,35 +109,33 @@ def generateTestOutputsHeader(deployer: NetworkDeployer, retStr += ", ".join([f"testOutputVector{idx}" for idx, _ in enumerate(test_outputs)]) retStr += "};\n" - if verbose: - if signProp: - print('Output N Levels:') - pprint(output_n_levels, indent = 2, width = 120) - print('Output Signed:') - pprint(output_signed, indent = 2, width = 120) - print('Output Data Type:') - pprint(output_data_type, indent = 2, width = 120) - return retStr -def generateTestNetworkHeader(deployer: NetworkDeployer, platform: DeploymentPlatform) -> str: +def generateTestNetworkHeader(deployer: NetworkDeployer) -> str: retStr = "" retStr += """ - #ifndef __DEEPLOY_HEADER_ - #define __DEEPLOY_HEADER_ + #ifndef __DEEPLOY_HEADER__ + #define __DEEPLOY_HEADER__ #include #include #include """ retStr += deployer.generateIncludeString() - retStr += """ - void RunNetwork(uint32_t core_id, uint32_t numThreads); - void InitNetwork(uint32_t core_id, uint32_t numThread); + if isinstance(deployer.Platform, 
(PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ + void RunNetwork(); + void InitNetwork(); - """ + """ + else: + retStr += """ + void RunNetwork(uint32_t core_id, uint32_t numThreads); + void InitNetwork(uint32_t core_id, uint32_t numThread); + + """ retStr += deployer.generateIOBufferInitializationCode() retStr += """ @@ -196,13 +145,7 @@ def generateTestNetworkHeader(deployer: NetworkDeployer, platform: DeploymentPla return retStr -def generateTestNetworkImplementation(deployer: NetworkDeployer, - platform: DeploymentPlatform, - verbose: Optional[bool] = None) -> str: - - if verbose is None: - verbose = False - +def generateTestNetworkImplementation(deployer: NetworkDeployer, verbosityCfg: CodeGenVerbosity) -> str: retStr = "" retStr += """#include @@ -220,23 +163,35 @@ def generateTestNetworkImplementation(deployer: NetworkDeployer, retStr += deployer.generateGlobalDefinitionCode() # WIESEP: Mempool assigns section attributes to intermediate buffers to allow . 
- if isinstance(platform, MemPoolPlatform): + if isinstance(deployer.Platform, MemPoolPlatform): retStr += deployer.generateInferenceInitializationCode() retStr += """ void RunNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ """ + elif isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ + void RunNetwork(){ + """ + retStr += deployer.generateInferenceInitializationCode() else: retStr += """ void RunNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ """ retStr += deployer.generateInferenceInitializationCode() - retStr += deployer.generateFunction(verbose) - retStr += """ - } + retStr += deployer.generateFunction(verbosityCfg) + if isinstance(deployer.Platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): + retStr += """ + } - void InitNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ - """ + void InitNetwork(){ + """ + else: + retStr += """ + } + + void InitNetwork(__attribute__((unused)) uint32_t core_id, __attribute__((unused)) uint32_t numThreads){ + """ retStr += deployer.generateEngineInitializationCode() retStr += deployer.generateBufferAllocationCode() retStr += """ @@ -269,13 +224,14 @@ def type2TypeStr(dataType) -> Tuple[str, int]: def dumpBuffer(buf: VariableBuffer, path: str): - if "input" in buf.name: - idx = int(buf.name.split("_")[1]) + # Check if buffer name matches exactly "input_N" or "output_N" pattern + parts = buf.name.split("_") + if len(parts) == 2 and parts[0] == "input" and parts[1].isdigit(): + idx = int(parts[1]) array = _shapeBroadcast(deployer.ctxt, test_inputs[idx], f"input_{idx}") - elif "output" in buf.name: - _list = buf.name.split("_") - idx = int(_list[1]) + elif len(parts) == 2 and parts[0] == "output" and parts[1].isdigit(): + idx = int(parts[1]) array = _shapeBroadcast(deployer.ctxt, test_outputs[idx], f"output_{idx}") 
elif isinstance(buf, ConstantBuffer): @@ -298,3 +254,36 @@ def dumpBuffer(buf: VariableBuffer, path: str): if hasattr(buf, "extName"): pathName = os.path.join(path, f"{buf.extName}.hex") dumpBuffer(buf, pathName) + + +def generateTestNetwork(deployer: NetworkDeployer, test_inputs: List[np.ndarray], test_outputs: List[np.ndarray], + dumpdir: str, verbosityCfg: CodeGenVerbosity) -> None: + assert deployer.prepared, "An unprepared deployer was given" + + # Create input and output vectors + os.makedirs(dumpdir, exist_ok = True) + + testInputStr = generateTestInputsHeader(deployer, test_inputs) + with open(f'{dumpdir}/testinputs.h', "w") as f: + f.write(testInputStr) + + testOutputStr = generateTestOutputsHeader(deployer, test_outputs) + with open(f'{dumpdir}/testoutputs.h', "w") as f: + f.write(testOutputStr) + + # Generate code for Network + testNetworkHeaderStr = generateTestNetworkHeader(deployer) + with open(f'{dumpdir}/Network.h', "w") as f: + f.write(testNetworkHeaderStr) + + testNetworkImplementationStr = generateTestNetworkImplementation(deployer, verbosityCfg) + with open(f'{dumpdir}/Network.c', "w") as f: + f.write(testNetworkImplementationStr) + + generateL3HexDump(deployer, os.path.join(f'{dumpdir}', 'hex'), test_inputs, test_outputs) + + clang_format = "{BasedOnStyle: llvm, IndentWidth: 2, ColumnLimit: 160}" + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/Network.c') + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/Network.h') + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/testoutputs.h') + os.system(f'clang-format -i --style="{clang_format}" {dumpdir}/testinputs.h') diff --git a/DeeployTest/testUtils/core/__init__.py b/DeeployTest/testUtils/core/__init__.py new file mode 100644 index 0000000000..b08d81d35b --- /dev/null +++ b/DeeployTest/testUtils/core/__init__.py @@ -0,0 +1,19 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from .config 
import DeeployTestConfig +from .execution import build_binary, configure_cmake, generate_network, run_complete_test, run_simulation +from .output_parser import TestResult +from .paths import get_test_paths + +__all__ = [ + 'DeeployTestConfig', + 'TestResult', + 'get_test_paths', + 'generate_network', + 'configure_cmake', + 'build_binary', + 'run_simulation', + 'run_complete_test', +] diff --git a/DeeployTest/testUtils/core/config.py b/DeeployTest/testUtils/core/config.py new file mode 100644 index 0000000000..0c545e1b73 --- /dev/null +++ b/DeeployTest/testUtils/core/config.py @@ -0,0 +1,33 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import os +from dataclasses import dataclass +from typing import List, Literal, Optional + + +@dataclass +class DeeployTestConfig: + """Configuration for a single test case.""" + test_name: str + test_dir: str + platform: str + simulator: Literal['gvsoc', 'banshee', 'qemu', 'vsim', 'vsim.gui', 'host', 'none'] + tiling: bool + gen_dir: str + build_dir: str + toolchain: str = "LLVM" + toolchain_install_dir: Optional[str] = None + cmake_args: List[str] = None + gen_args: List[str] = None + verbose: int = 0 + debug: bool = False + + def __post_init__(self): + if self.cmake_args is None: + self.cmake_args = [] + if self.gen_args is None: + self.gen_args = [] + if self.toolchain_install_dir is None: + self.toolchain_install_dir = os.environ.get('LLVM_INSTALL_DIR') diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py new file mode 100644 index 0000000000..46ed86d303 --- /dev/null +++ b/DeeployTest/testUtils/core/execution.py @@ -0,0 +1,220 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil +import subprocess +import sys +from pathlib import Path + +from Deeploy.Logging import DEFAULT_LOGGER as log + +from .config import DeeployTestConfig 
+from .output_parser import TestResult, parse_test_output + + +def generate_network(config: DeeployTestConfig, skip: bool = False) -> None: + """ + Generate network code from ONNX model. + + Raises: + RuntimeError: If network generation fails + """ + if skip: + log.info(f"Skipping network generation for {config.test_name}") + return + + script_dir = Path(__file__).parent.parent.parent + + if config.tiling: + generation_script = script_dir / "testMVP.py" + else: + generation_script = script_dir / "generateNetwork.py" + + cmd = [ + "python", + str(generation_script), + "-d", + config.gen_dir, + "-t", + config.test_dir, + "-p", + config.platform, + ] + + # Add verbosity flags + if config.verbose > 0: + cmd.append("-" + "v" * config.verbose) + + # Add debug flag + if config.debug: + cmd.append("--debug") + + # Add additional generation arguments + cmd.extend(config.gen_args) + + log.debug(f"[Execution] Generation command: {' '.join(cmd)}") + + result = subprocess.run(cmd, check = False) + + if result.returncode != 0: + log.error(f"Network generation failed with return code {result.returncode}") + raise RuntimeError(f"Network generation failed for {config.test_name}") + + +def configure_cmake(config: DeeployTestConfig) -> None: + + assert config.toolchain_install_dir is not None, \ + "LLVM_INSTALL_DIR environment variable not set" + + cmake_cmd = os.environ.get("CMAKE", "cmake") + if cmake_cmd == "cmake" and shutil.which("cmake") is None: + raise RuntimeError("CMake not found. 
Please install CMake or set CMAKE environment variable") + + # Build CMake command + cmd = [ + cmake_cmd, + f"-DTOOLCHAIN={config.toolchain}", + f"-DTOOLCHAIN_INSTALL_DIR={config.toolchain_install_dir}", + f"-DGENERATED_SOURCE={config.gen_dir}", + f"-Dplatform={config.platform}", + f"-DTESTNAME={config.test_name}", + f"-B{config.build_dir}", + ] + + for arg in config.cmake_args: + if not arg.startswith("-D"): + arg = "-D" + arg + cmd.append(arg) + + if config.simulator == 'banshee': + cmd.append("-Dbanshee_simulation=ON") + else: + cmd.append("-Dbanshee_simulation=OFF") + + if config.simulator == 'gvsoc': + cmd.append("-Dgvsoc_simulation=ON") + else: + cmd.append("-Dgvsoc_simulation=OFF") + + # Last argument is the source directory + script_dir = Path(__file__).parent.parent.parent + cmd.append(str(script_dir.parent)) + + env = os.environ.copy() + if config.verbose >= 3: + env["VERBOSE"] = "1" + + log.debug(f"[Execution] CMake command: {' '.join(cmd)}") + + result = subprocess.run(cmd, check = False, env = env) + + if result.returncode != 0: + log.error(f"CMake configuration failed with return code {result.returncode}") + raise RuntimeError(f"CMake configuration failed for {config.test_name}") + + +def build_binary(config: DeeployTestConfig) -> None: + + cmake_cmd = os.environ.get("CMAKE", "cmake") + + cmd = [ + cmake_cmd, + "--build", + config.build_dir, + "--target", + config.test_name, + ] + + env = os.environ.copy() + if config.verbose >= 3: + env["VERBOSE"] = "1" + + log.debug(f"[Execution] Build command: {' '.join(cmd)}") + + result = subprocess.run(cmd, check = False, env = env) + + if result.returncode != 0: + log.error(f"Build failed with return code {result.returncode}") + raise RuntimeError(f"Build failed for {config.test_name}") + + +def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: + """ + Run simulation and parse output. 
+ + Raises: + RuntimeError: If simulation cannot be executed + """ + if skip: + log.info(f"Skipping simulation for {config.test_name}") + return TestResult(success = True, error_count = 0, total_count = 0, stdout = "Skipped") + + if config.simulator == 'none': + raise RuntimeError("No simulator specified!") + + if config.simulator == 'host': + # Run binary directly + binary_path = Path(config.build_dir) / "bin" / config.test_name + cmd = [str(binary_path)] + else: + # Run via CMake target + cmake_cmd = os.environ.get("CMAKE", "cmake") + cmd = [ + cmake_cmd, + "--build", + config.build_dir, + "--target", + f"{config.simulator}_{config.test_name}", + ] + + env = os.environ.copy() + if config.verbose >= 3: + env["VERBOSE"] = "1" + + if config.simulator == 'banshee': + if config.verbose == 1: + env["BANSHEE_LOG"] = "warn" + elif config.verbose == 2: + env["BANSHEE_LOG"] = "info" + elif config.verbose >= 3: + env["BANSHEE_LOG"] = "debug" + + log.debug(f"[Execution] Simulation command: {' '.join(cmd)}") + + result = subprocess.run(cmd, capture_output = True, text = True, env = env) + + if result.stdout: + print(result.stdout, end = '') + if result.stderr: + print(result.stderr, end = '', file = sys.stderr) + + # Parse output for error count and cycles + test_result = parse_test_output(result.stdout, result.stderr) + + if not test_result.success and test_result.error_count == -1: + log.warning(f"Could not parse error count from output") + + return test_result + + +def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim: bool = False) -> TestResult: + """ + Run a complete test: generate, configure, build, and simulate. 
+ """ + log.info(f"################## Testing {config.test_name} on {config.platform} Platform ##################") + + # Step 1: Generate network + generate_network(config, skip = skipgen) + + # Step 2: Configure CMake + configure_cmake(config) + + # Step 3: Build binary + build_binary(config) + + # Step 4: Run simulation + result = run_simulation(config, skip = skipsim) + + return result diff --git a/DeeployTest/testUtils/core/output_parser.py b/DeeployTest/testUtils/core/output_parser.py new file mode 100644 index 0000000000..ffb91ce0a7 --- /dev/null +++ b/DeeployTest/testUtils/core/output_parser.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import re +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class TestResult: + success: bool + error_count: int + total_count: int + stdout: str + stderr: str = "" + runtime_cycles: Optional[int] = None + + +def parse_test_output(stdout: str, stderr: str = "") -> TestResult: + + output = stdout + stderr + + # Look for "Errors: X out of Y" pattern + error_match = re.search(r'Errors:\s*(\d+)\s*out\s*of\s*(\d+)', output) + + if error_match: + error_count = int(error_match.group(1)) + total_count = int(error_match.group(2)) + success = (error_count == 0) + else: + # Could not parse output - treat as failure + error_count = -1 + total_count = -1 + success = False + + runtime_cycles = None + cycle_match = re.search(r'Runtime:\s*(\d+)\s*cycles', output) + if cycle_match: + runtime_cycles = int(cycle_match.group(1)) + + return TestResult( + success = success, + error_count = error_count, + total_count = total_count, + stdout = stdout, + stderr = stderr, + runtime_cycles = runtime_cycles, + ) diff --git a/DeeployTest/testUtils/core/paths.py b/DeeployTest/testUtils/core/paths.py new file mode 100644 index 0000000000..016924d1de --- /dev/null +++ b/DeeployTest/testUtils/core/paths.py @@ -0,0 +1,48 @@ +# 
SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from typing import Optional, Tuple + +from Deeploy.Logging import DEFAULT_LOGGER as log + + +def get_test_paths(test_dir: str, platform: str, base_dir: Optional[str] = None) -> Tuple[str, str, str]: + """ + Resolve test paths for generation and build directories. + + Args: + test_dir: Path to test directory (e.g., "Tests/Adder" or absolute path) + platform: Platform name (e.g., "Generic") + base_dir: Base directory for tests (defaults to DeeployTest/) + + Returns: + Tuple of (gen_dir, test_dir_abs, test_name) + """ + if base_dir is None: + # Get the absolute path of this script's parent directory (core -> testUtils -> DeeployTest) + script_path = Path(__file__).resolve() + base_dir = script_path.parent.parent.parent + else: + base_dir = Path(base_dir) + + test_path = Path(test_dir) + if not test_path.is_absolute(): + test_path = base_dir / test_dir + + test_path = test_path.resolve() + test_name = test_path.name + + gen_dir_name = f"TEST_{platform.upper()}" + + # Check if path is inside base_dir + try: + rel_path = test_path.relative_to(base_dir) + gen_dir = base_dir / gen_dir_name / rel_path + except ValueError: + # Path is outside base_dir + gen_dir = base_dir / gen_dir_name / test_name + log.warning(f"Test path {test_path} is outside base directory. 
Using {gen_dir}") + + return str(gen_dir), str(test_path), test_name diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py new file mode 100644 index 0000000000..6e6f57e049 --- /dev/null +++ b/DeeployTest/testUtils/deeployRunner.py @@ -0,0 +1,426 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import codecs +import os +import sys +from pathlib import Path +from typing import Optional + +import coloredlogs + +from Deeploy.Logging import DEFAULT_FMT +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Logging import DETAILED_FILE_LOG_FORMAT + +from .core import DeeployTestConfig, run_complete_test +from .core.paths import get_test_paths + + +def cmake_str(arg_str): + return "-D" + codecs.decode(str(arg_str), 'unicode_escape') + + +class _ArgumentDefaultMetavarTypeFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.MetavarTypeHelpFormatter): + + def __init__(self, prog: str, indent_increment: int = 2, max_help_position: int = 100, width = None) -> None: + super().__init__(prog, indent_increment, max_help_position, width) + + +class DeeployRunnerArgumentParser(argparse.ArgumentParser): + + def __init__(self, + tiling_arguments: bool, + description: Optional[str] = None, + platform_required: bool = True, + allow_extra_args: bool = False): + formatter = _ArgumentDefaultMetavarTypeFormatter + + if description is None: + super().__init__(description = "Deeploy Code Generation and Test Utility.", formatter_class = formatter) + else: + super().__init__(description = description, formatter_class = formatter) + + self.allow_extra_args = allow_extra_args + + self.tiling_arguments = tiling_arguments + + self.add_argument('-t', + metavar = '', + dest = 'dir', + type = str, + required = True, + help = 'Test directory (e.g., Tests/Kernels/Integer/Add/Regular)\n') + self.add_argument('-p', + metavar = '', + dest = 'platform', + type = str, + 
required = platform_required, + default = None, + help = 'Target platform (e.g., Generic, QEMU-ARM, Siracusa, Snitch)\n') + self.add_argument('-s', + metavar = '', + dest = 'simulator', + type = str, + default = None, + help = 'Simulator to use (gvsoc, banshee, qemu, vsim, host, none)\n') + self.add_argument('-v', action = 'count', dest = 'verbose', default = 0, help = 'Increase verbosity level\n') + self.add_argument('-D', + dest = 'cmake', + action = 'extend', + nargs = "*", + type = cmake_str, + help = "Create or update a cmake cache entry\n") + self.add_argument('--debug', + dest = 'debug', + action = 'store_true', + default = False, + help = 'Enable debugging mode\n') + self.add_argument('--skipgen', + dest = 'skipgen', + action = 'store_true', + default = False, + help = 'Skip network generation (reuse existing generated code)\n') + self.add_argument('--skipsim', + dest = 'skipsim', + action = 'store_true', + default = False, + help = 'Skip simulation (build only)\n') + self.add_argument('--profileUntiled', + '--profile-untiled', + dest = 'profileUntiled', + action = 'store_true', + default = False, + help = 'Enable untiled profiling (Siracusa only)\n') + self.add_argument('--toolchain', + metavar = '', + dest = 'toolchain', + type = str, + default = "LLVM", + help = 'Compiler toolchain\n') + self.add_argument('--toolchain-install-dir', + metavar = '', + dest = 'toolchain_install_dir', + type = str, + default = os.environ.get('LLVM_INSTALL_DIR'), + help = 'Toolchain installation directory\n') + self.add_argument('--input-type-map', + nargs = '*', + default = [], + type = str, + help = '(Optional) mapping of input names to data types. ' + 'Example: --input-type-map input_0=int8_t input_1=float32_t\n') + self.add_argument('--input-offset-map', + nargs = '*', + default = [], + type = str, + help = '(Optional) mapping of input names to offsets. 
' + 'Example: --input-offset-map input_0=0 input_1=128\n') + + if self.tiling_arguments: + self.add_argument('--defaultMemLevel', + metavar = '', + dest = 'defaultMemLevel', + type = str, + default = "L2", + help = 'Default memory level (L2 or L3)\n') + self.add_argument('--doublebuffer', action = 'store_true', help = 'Enable double buffering\n') + self.add_argument('--l1', + metavar = '', + dest = 'l1', + type = int, + default = 64000, + help = 'L1 size in bytes\n') + self.add_argument('--l2', + metavar = '', + dest = 'l2', + type = int, + default = 1024000, + help = 'L2 size in bytes\n') + self.add_argument('--randomizedMemoryScheduler', + action = "store_true", + help = 'Enable randomized memory scheduler\n') + self.add_argument('--profileTiling', action = 'store_true', help = 'Enable tiling profiling\n') + self.add_argument('--memAllocStrategy', + metavar = '', + dest = 'memAllocStrategy', + type = str, + default = "MiniMalloc", + help = 'Memory allocation strategy: TetrisRandom, TetrisCo-Opt, MiniMalloc\n') + self.add_argument('--searchStrategy', + metavar = '', + dest = 'searchStrategy', + type = str, + default = "random-max", + help = 'CP solver search strategy: random-max, max, min\n') + self.add_argument('--plotMemAlloc', + action = 'store_true', + help = 'Plot memory allocation and save in deeployState folder\n') + + self.args = None + + def parse_args(self, args = None, namespace = None) -> argparse.Namespace: + + self.args = super().parse_args(args, namespace) + + if self.args.verbose > 2: + coloredlogs.install(level = 'DEBUG', logger = log, fmt = DETAILED_FILE_LOG_FORMAT) + elif self.args.verbose > 1: + coloredlogs.install(level = 'DEBUG', logger = log, fmt = DEFAULT_FMT) + elif self.args.verbose > 0: + coloredlogs.install(level = 'INFO', logger = log, fmt = DEFAULT_FMT) + else: + coloredlogs.install(level = 'WARNING', logger = log, fmt = DEFAULT_FMT) + + return self.args + + +def create_config_from_args(args: argparse.Namespace, + platform: str, + 
simulator: str, + tiling: bool, + platform_specific_cmake_args: Optional[list] = None) -> DeeployTestConfig: + + script_path = Path(__file__).resolve() + base_dir = script_path.parent.parent + + test_dir = args.dir + gen_dir, test_dir_abs, test_name = get_test_paths(test_dir, platform, base_dir = str(base_dir)) + + # Use worker-specific build directory to avoid collisions with parallel execution with pytest-xdist + worker_id = os.environ.get("PYTEST_XDIST_WORKER", "master") + if worker_id == "master": + build_dir = str(base_dir / f"TEST_{platform.upper()}" / "build_master") + else: + build_dir = str(base_dir / f"TEST_{platform.upper()}" / f"build_{worker_id}") + + cmake_args_list = list(args.cmake) if args.cmake else [] + + # Add platform-specific CMake args + if platform_specific_cmake_args: + cmake_args_list.extend(platform_specific_cmake_args) + + # Prepare generation args + gen_args_list = [] + + if args.input_type_map: + gen_args_list.append("--input-type-map") + gen_args_list.extend(args.input_type_map) + if args.input_offset_map: + gen_args_list.append("--input-offset-map") + gen_args_list.extend(args.input_offset_map) + + if tiling: + if hasattr(args, 'defaultMemLevel') and args.defaultMemLevel: + gen_args_list.append(f"--defaultMemLevel={args.defaultMemLevel}") + if hasattr(args, 'doublebuffer') and args.doublebuffer: + gen_args_list.append("--doublebuffer") + if hasattr(args, 'l1') and args.l1: + gen_args_list.append(f"--l1={args.l1}") + if hasattr(args, 'l2') and args.l2 and args.l2 != 1024000: + gen_args_list.append(f"--l2={args.l2}") + if hasattr(args, 'randomizedMemoryScheduler') and args.randomizedMemoryScheduler: + gen_args_list.append("--randomizedMemoryScheduler") + if hasattr(args, 'profileTiling') and args.profileTiling: + gen_args_list.append("--profileTiling") + if hasattr(args, 'memAllocStrategy') and args.memAllocStrategy: + gen_args_list.append(f"--memAllocStrategy={args.memAllocStrategy}") + if hasattr(args, 'searchStrategy') and 
args.searchStrategy: + gen_args_list.append(f"--searchStrategy={args.searchStrategy}") + if hasattr(args, 'plotMemAlloc') and args.plotMemAlloc: + gen_args_list.append("--plotMemAlloc") + + if not tiling and getattr(args, 'profileUntiled', False): + gen_args_list.append("--profileUntiled") + + config = DeeployTestConfig( + test_name = test_name, + test_dir = test_dir_abs, + platform = platform, + simulator = simulator, + tiling = tiling, + gen_dir = gen_dir, + build_dir = build_dir, + toolchain = args.toolchain, + toolchain_install_dir = args.toolchain_install_dir, + cmake_args = cmake_args_list, + gen_args = gen_args_list, + verbose = args.verbose, + debug = args.debug, + ) + + return config + + +def print_colored_result(result, test_name: str): + + GREEN = '\033[92m' + RED = '\033[91m' + RESET = '\033[0m' + + if result.success and result.error_count == 0: + print(f"\n{GREEN}✓ Test {test_name} PASSED - No errors found{RESET}") + if result.runtime_cycles is not None: + print(f"{GREEN} Runtime: {result.runtime_cycles} cycles{RESET}") + else: + print(f"\n{RED}✗ Test {test_name} FAILED - {result.error_count} errors out of {result.total_count}{RESET}") + if result.runtime_cycles is not None: + print(f"{RED} Runtime: {result.runtime_cycles} cycles{RESET}") + + +def print_configuration(config: DeeployTestConfig): + + CYAN = '\033[96m' + BOLD = '\033[1m' + RESET = '\033[0m' + + print(f"\n{BOLD}{CYAN}═══════════════════════════════════════════════════════════════{RESET}") + print(f"{BOLD}{CYAN} Deeploy Test Configuration {RESET}") + print(f"{BOLD}{CYAN}═══════════════════════════════════════════════════════════════{RESET}\n") + + print(f"{BOLD}Test Configuration:{RESET}") + print(f" Test Name : {config.test_name}") + print(f" Test Directory : {config.test_dir}") + print(f" Generation Directory: {config.gen_dir}") + print(f" Build Directory : {config.build_dir}") + + print(f"\n{BOLD}Platform Configuration:{RESET}") + print(f" Platform : {config.platform}") + print(f" 
Simulator : {config.simulator}") + print(f" Tiling Enabled : {'Yes' if config.tiling else 'No'}") + + print(f"\n{BOLD}Build Configuration:{RESET}") + print(f" Toolchain : {config.toolchain}") + if config.toolchain_install_dir: + print(f" Toolchain Directory : {config.toolchain_install_dir}") + if config.cmake_args: + print(f" CMake Arguments : {' '.join(config.cmake_args)}") + + print(f"\n{BOLD}Runtime Configuration:{RESET}") + print(f" Verbosity Level : {config.verbose}") + print(f" Debug Mode : {'Enabled' if config.debug else 'Disabled'}") + if config.gen_args: + print(f" Generation Arguments: {' '.join(config.gen_args)}") + + print(f"\n{BOLD}{CYAN}═══════════════════════════════════════════════════════════════{RESET}\n") + + +def main(default_platform: Optional[str] = None, + default_simulator: Optional[str] = None, + tiling_enabled: bool = False, + platform_specific_cmake_args: Optional[list] = None, + parsed_args: Optional[argparse.Namespace] = None, + parser_setup_callback = None): + """ + Main entry point for Deeploy test runners. 
+ + Args: + default_platform: Default platform if not specified via -p + default_simulator: Default simulator if not specified via -s + tiling_enabled: Whether tiling is enabled + platform_specific_cmake_args: Additional CMake arguments for platform-specific configurations + parsed_args: Pre-parsed arguments (if None, will parse from sys.argv) + parser_setup_callback: Optional callback to configure parser before parsing (receives parser as arg) + """ + + if parsed_args is None: + # Make -p optional if default_platform is provided + parser = DeeployRunnerArgumentParser(tiling_arguments = tiling_enabled, + platform_required = (default_platform is None)) + + # Allow platform-specific runners to add their own arguments + if parser_setup_callback: + parser_setup_callback(parser) + + args = parser.parse_args() + else: + args = parsed_args + + platform_map = { + "generic": "Generic", + "qemu-arm": "QEMU-ARM", + "mempool": "MemPool", + "siracusa": "Siracusa", + "siracusa_w_neureka": "Siracusa_w_neureka", + "snitch": "Snitch", + "chimera": "Chimera", + "softhier": "SoftHier", + } + + if args.platform: + platform = platform_map.get(args.platform.lower(), args.platform) + else: + platform = default_platform + + # Validate platform if default is provided + if default_platform and args.platform: + normalized_specified = platform_map.get(args.platform.lower(), args.platform) + if normalized_specified != default_platform: + RED = '\033[91m' + BOLD = '\033[1m' + RESET = '\033[0m' + print(f"\n{RED}{BOLD}ERROR: Platform mismatch!{RESET}", file = sys.stderr) + print(f"{RED}This runner is designed for the '{default_platform}' platform.{RESET}", file = sys.stderr) + print(f"{RED}You specified platform: '{args.platform}' (normalized to '{normalized_specified}'){RESET}\n", + file = sys.stderr) + print(f"Please use one of the following options:", file = sys.stderr) + print(f" 1. Remove the '-p {args.platform}' argument to use the default platform", file = sys.stderr) + print(f" 2. 
Use the correct platform-specific runner script for '{normalized_specified}'", + file = sys.stderr) + sys.exit(1) + + simulator = args.simulator if args.simulator else default_simulator + + if platform is None: + print("Error: Platform must be specified with -p or provided as default", file = sys.stderr) + sys.exit(1) + + if simulator is None: + simulator_map = { + "Generic": "host", + "QEMU-ARM": "qemu", + "MemPool": "banshee", + "Siracusa": "gvsoc", + "Siracusa_w_neureka": "gvsoc", + "Snitch": "gvsoc", + "Chimera": "gvsoc", + "SoftHier": "gvsoc", + } + simulator = simulator_map.get(platform, "host") + log.info(f"No simulator specified, using default for {platform}: {simulator}") + + # Extract platform-specific CMake args from parsed args if available + if platform_specific_cmake_args is None: + platform_specific_cmake_args = [] + + # Check for platform-specific arguments in args object and build CMake args + if hasattr(args, 'cores'): + platform_specific_cmake_args.append(f"-DNUM_CORES={args.cores}") + elif hasattr(args, 'num_cores'): + platform_specific_cmake_args.append(f"-DNUM_CORES={args.num_cores}") + + if hasattr(args, 'num_clusters'): + platform_specific_cmake_args.append(f"-DNUM_CLUSTERS={args.num_clusters}") + + config = create_config_from_args(args, platform, simulator, tiling_enabled, platform_specific_cmake_args) + + print_configuration(config) + + try: + result = run_complete_test(config, skipgen = args.skipgen, skipsim = args.skipsim) + + print_colored_result(result, config.test_name) + + return 0 if result.success else 1 + + except Exception as e: + RED = '\033[91m' + RESET = '\033[0m' + print(f"\n{RED}✗ Test {config.test_name} FAILED with exception: {e}{RESET}") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/DeeployTest/testUtils/dmaUtils.py b/DeeployTest/testUtils/dmaUtils.py new file mode 100644 index 0000000000..3266ce5129 --- /dev/null +++ b/DeeployTest/testUtils/dmaUtils.py @@ -0,0 +1,377 @@ +# 
SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import math +from typing import Dict, List, Optional, Tuple, Type + +import numpy.typing as npt +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import BaseType, Pointer, PointerClass +from Deeploy.CommonExtensions.DataTypes import minimalIntegerType +from Deeploy.DeeployTypes import NetworkContext, NetworkDeployer, NodeParser, NodeTemplate, NodeTypeChecker, \ + ONNXLayer, OperatorRepresentation, VariableBuffer +from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel +from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryDeployerWrapper, \ + MemoryPlatformWrapper +from Deeploy.MemoryLevelExtension.OptimizationPasses.MemoryLevelAnnotationPasses import AnnotateDefaultMemoryLevel, \ + AnnotateIOMemoryLevel +from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer +from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, PULPOptimizer +from Deeploy.Targets.Snitch.Deployer import SnitchDeployer +from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform +from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint, \ + PatternMemoryConstraints, TensorMemoryConstraint +from Deeploy.TilingExtension.MemoryScheduler import MemoryBlock +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerExtension import MemoryMap, TilerDeployerWrapper, TilingSolution +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme + +from .tilingUtils import DBOnlyL3Tiler, DBTiler, SBTiler + +memcpyTemplate = NodeTemplate(""" +memcpy((void *)${dest}, (void *)${src}, ${size}); +""") + + +# Same interface as NodeTypeChecker but allow any input type and the +# output type matches the input type. 
+class MemcpyTypeChecker(NodeTypeChecker): + + def __init__(self): + super().__init__([], []) + + def typeInferOutput(self, ctxt: NetworkContext, node: gs.Node, + operatorRepresentation: OperatorRepresentation) -> NetworkContext: + assert len(node.inputs) == 1 and len(node.outputs) == 1 + buffer_in = ctxt.lookup(node.inputs[0].name) + ctxt.annotateType(node.outputs[0].name, buffer_in._type) + return ctxt + + def typeCheckNodeInputs(self, ctxt: NetworkContext, node: gs.Node) -> bool: + return True + + def typeInferGlobalCtxt(self, ctxt: NetworkContext, node: gs.Node) -> NetworkContext: + # Whatever it has already annotated, it's good + return ctxt + + +class MemcpyTileConstraint(TileConstraint): + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + inputLoadSchedule = [{"src": absCube.rectangle} for absCube in absoluteOutputCubes] + outputLoadSchedule = [{"dest": absCube.rectangle} for absCube in absoluteOutputCubes] + inputOffsets, outputOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, operatorRepresentation, + ["src", "dest"]) + + def size(abs: AbsoluteHyperRectangle, buffer: VariableBuffer) -> int: + return math.prod(abs.rectangle.dims) * (buffer._type.referencedType.typeWidth // 8) + + buffer_src = ctxt.lookup(operatorRepresentation['src']) + assert isinstance(buffer_src, VariableBuffer) + + replacements: Dict[str, List[int]] = {"size": [size(abs, buffer_src) for abs in absoluteOutputCubes]} + replacement_types = {key: PointerClass(minimalIntegerType(values)) for key, values in replacements.items()} + + return VariableReplacementScheme(replacements, + replacement_types), TilingSchedule(inputOffsets, outputOffsets, + inputLoadSchedule, outputLoadSchedule) + + +class MemcpyParser(NodeParser): + + def parseNode(self, 
node: gs.Node) -> bool: + return len(node.inputs) == 1 and len(node.outputs) == 1 + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + assert len(node.inputs) == 1 and len(node.outputs) == 1 + src = ctxt.lookup(node.inputs[0].name) + self.operatorRepresentation['src'] = src.name + self.operatorRepresentation['dest'] = ctxt.lookup(node.outputs[0].name).name + self.operatorRepresentation['size'] = math.prod(src.shape) * (src._type.referencedType.typeWidth // 8) + return ctxt, True + + +class MemcpyLayer(ONNXLayer): + pass + + +def generate_graph(nodeCount: int, shape: Tuple[int, ...], dtype: npt.DTypeLike) -> gs.Graph: + assert nodeCount > 0 + + tensor_in = gs.Variable(name = "input_0", dtype = dtype, shape = shape) + + nodes = [] + for i in range(nodeCount): + tensor_out = gs.Variable(name = f"out_{i}", dtype = dtype, shape = shape) + nodes.append(gs.Node("Memcpy", f"memcpy_{i}", {}, [tensor_in], [tensor_out])) + tensor_in = tensor_out + + return gs.Graph(nodes, [nodes[0].inputs[0]], [nodes[-1].outputs[0]], "dma_test_graph") + + +def generate_tiling(ctxt: NetworkContext, memoryStart: str, memoryOrder: List[str], memoryHierarchy: MemoryHierarchy, + inputShape: Tuple[int, ...], tileShape: Tuple[int, ...], graph: gs.Graph, _type: BaseType, + doublebuffer: bool) -> Tuple[TilingSolution, MemoryMap]: + assert memoryStart in memoryOrder + memoryStartIndex = memoryOrder.index(memoryStart) + + if memoryStartIndex + 1 < len(memoryOrder): + memoryMultibuffer = memoryOrder[memoryOrder.index(memoryStart) + 1] + else: + memoryMultibuffer = None + + if memoryStartIndex + 2 < len(memoryOrder): + singleTileMemories = memoryOrder[memoryStartIndex + 2:] + else: + singleTileMemories = [] + + inputSize = math.prod(inputShape) + tileSize = math.prod(tileShape) + + def assertFitsInMemory(size: int, memory: str) -> None: + memorySize = memoryHierarchy.memoryLevels[memory].size + assert size <= memorySize, 
f"The required tensor space is too big for the {memory} memory. Required space: {size}, memory space: {memorySize}" + + inputSizeInBytes = inputSize * (_type.typeWidth // 8) + assertFitsInMemory(2 * inputSizeInBytes, memoryStart) + + tileSizeInBytes = tileSize * (_type.typeWidth // 8) + for memory in singleTileMemories: + assertFitsInMemory(2 * tileSizeInBytes, memory) + + if doublebuffer: + multiBufferCoefficient = 2 + else: + multiBufferCoefficient = 1 + + multibufferSizeInBytes = tileSizeInBytes * multiBufferCoefficient + if memoryMultibuffer is not None: + assertFitsInMemory(multibufferSizeInBytes + tileSizeInBytes, memoryMultibuffer) + + inputMultibufferAddrSpace = (0, multibufferSizeInBytes) + outputMultibufferAddrSpace = (multibufferSizeInBytes, 2 * multibufferSizeInBytes) + + inputTileAddrSpace = (0, tileSizeInBytes) + outputTileAddrSpace = (tileSizeInBytes, 2 * tileSizeInBytes) + + # Tiling Solution + + tilingSolution = [] + + def generateMemoryConstraint(memory: str, shape: Tuple[int, ...], multiBufferCoefficient: int, + addrSpace: Optional[Tuple[int, int]]) -> MemoryConstraint: + size = math.prod(shape) + mc = MemoryConstraint(memory, size) + mc.shape = shape + mc.multiBufferCoefficient = multiBufferCoefficient + if addrSpace is not None: + mc.addrSpace = addrSpace + return mc + + for node in graph.nodes: + inputMemoryConstraints = {} + outputMemoryConstraints = {} + for i, memory in enumerate(memoryOrder[memoryOrder.index(memoryStart):]): + if i == 0: + inputMc = generateMemoryConstraint(memory = memory, + shape = inputShape, + multiBufferCoefficient = 1, + addrSpace = None) + outputMc = generateMemoryConstraint(memory = memory, + shape = inputShape, + multiBufferCoefficient = 1, + addrSpace = None) + elif i == 1: + inputMc = generateMemoryConstraint(memory = memory, + shape = tileShape, + multiBufferCoefficient = multiBufferCoefficient, + addrSpace = inputMultibufferAddrSpace) + outputMc = generateMemoryConstraint(memory = memory, + shape = tileShape, 
+ multiBufferCoefficient = multiBufferCoefficient, + addrSpace = outputMultibufferAddrSpace) + else: + inputMc = generateMemoryConstraint(memory = memory, + shape = tileShape, + multiBufferCoefficient = 1, + addrSpace = inputTileAddrSpace) + outputMc = generateMemoryConstraint(memory = memory, + shape = tileShape, + multiBufferCoefficient = 1, + addrSpace = outputTileAddrSpace) + inputMemoryConstraints[memory] = inputMc + outputMemoryConstraints[memory] = outputMc + + inputTensorMemoryConstraint = TensorMemoryConstraint(tensorName = node.inputs[0].name, + constraints = inputMemoryConstraints, + ctxt = ctxt) + + outputTensorMemoryConstraint = TensorMemoryConstraint(tensorName = node.outputs[0].name, + constraints = outputMemoryConstraints, + ctxt = ctxt) + + nodeMemoryConstraint = NodeMemoryConstraint() + nodeMemoryConstraint.addTensorConstraint(inputTensorMemoryConstraint, 'input') + nodeMemoryConstraint.addTensorConstraint(outputTensorMemoryConstraint, 'output') + + patternMemoryConstraints = PatternMemoryConstraints() + patternMemoryConstraints.addConstraint(nodeMemoryConstraint) + + tilingSolution.append(patternMemoryConstraints) + + # Memory Map + + # Initialize an empty memory map + memoryMap = {memory: [[] for _ in range(len(graph.nodes) + 1)] for memory in memoryOrder} + + # Set memoryStart memory + + def appendMemoryMapStart(tensorName: str, lifetime: Tuple[int, int], addrSpace: Tuple[int, int]) -> None: + memoryMap[memoryStart][-1].append(MemoryBlock(tensorName, memoryStart, lifetime, addrSpace)) + + addrSpacePing = (0, inputSizeInBytes) + addrSpacePong = (inputSizeInBytes, 2 * inputSizeInBytes) + + ## First input tensor has a special lifetime (0, 0) + appendMemoryMapStart(graph.nodes[0].inputs[0].name, (0, 0), addrSpacePing) + + for i, node in enumerate(graph.nodes): + # Start with addrSpacePong because we used "Ping" for the first input tensor + appendMemoryMapStart(node.outputs[0].name, (i, i + 1), addrSpacePong if i % 2 == 0 else addrSpacePing) + + ## 
Set the rest + + def setMemoryMapRest(memory: str, inputAddrSpace: Tuple[int, int], outputAddrSpace: Tuple[int, int]) -> None: + for i, node in enumerate(graph.nodes): + # Empirically concluded from looking at produced memory maps + if i + 1 == len(graph.nodes): + endLifetime = i + 1 + else: + endLifetime = i + + memoryMap[memory][i].extend([ + MemoryBlock(name = node.inputs[0].name, level = memory, lifetime = (i, i), addrSpace = inputAddrSpace), + MemoryBlock(name = node.outputs[0].name, + level = memory, + lifetime = (i, endLifetime), + addrSpace = outputAddrSpace), + ]) + + if memoryMultibuffer is not None: + setMemoryMapRest(memoryMultibuffer, inputMultibufferAddrSpace, outputMultibufferAddrSpace) + + for memory in singleTileMemories: + setMemoryMapRest(memory, inputTileAddrSpace, outputTileAddrSpace) + + return tilingSolution, memoryMap + + +def defaultScheduler(graph: gs.Graph) -> List[List[gs.Node]]: + return [[node] for node in graph.nodes] + + +def setup_pulp_deployer(defaultMemory: str, targetMemory: str, graph: gs.Graph, inputTypes: Dict[str, Type[Pointer]], + doublebuffer: bool, deeployStateDir: str) -> NetworkDeployer: + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 1024000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 64000) + memoryLevels = [L3, L2, L1] + memoryLevelMap = {mem.name: mem for mem in memoryLevels} + + assert defaultMemory in memoryLevelMap, f"defaultMemory {defaultMemory} is not part of PULP's memory hierarchy {list(memoryLevelMap.keys())}" + assert targetMemory in memoryLevelMap, f"targetMemory {targetMemory} is not part of PULP's memory hierarchy {list(memoryLevelMap.keys())}" + + memoryHierarchy = MemoryHierarchy(memoryLevels) + memoryHierarchy.setDefaultMemoryLevel(defaultMemory) + + platform = MemoryPULPPlatform(memoryHierarchy, memoryLevelMap[targetMemory]) + + deployer = PULPDeployer(graph, + platform, + inputTypes, 
+ PULPOptimizer, + defaultScheduler, + default_channels_first = True, + deeployStateDir = deeployStateDir) + + memoryLevelAnnotationPasses = [AnnotateIOMemoryLevel(defaultMemory), AnnotateDefaultMemoryLevel(memoryHierarchy)] + # Make the deployer memory-level aware + deployer = MemoryDeployerWrapper(deployer, memoryLevelAnnotationPasses) + + if doublebuffer: + assert defaultMemory in ["L3", "L2"] + if defaultMemory == "L3": + deployer = TilerDeployerWrapper(deployer, DBOnlyL3Tiler) + else: + deployer = TilerDeployerWrapper(deployer, DBTiler) + else: + deployer = TilerDeployerWrapper(deployer, SBTiler) + + return deployer + + +def setup_snitch_deployer(defaultMemory: str, targetMemory: str, graph: gs.Graph, inputTypes: Dict[str, Type[Pointer]], + doublebuffer: bool, deeployStateDir: str) -> NetworkDeployer: + L3 = MemoryLevel(name = "L3", neighbourNames = ["L2"], size = 64000000) + L2 = MemoryLevel(name = "L2", neighbourNames = ["L3", "L1"], size = 1024000) + L1 = MemoryLevel(name = "L1", neighbourNames = ["L2"], size = 64000) + memoryLevels = [L3, L2, L1] + memoryLevelMap = {mem.name: mem for mem in memoryLevels} + + assert defaultMemory in memoryLevelMap, f"defaultMemory {defaultMemory} is not part of PULP's memory hierarchy {list(memoryLevelMap.keys())}" + assert targetMemory in memoryLevelMap, f"targetMemory {targetMemory} is not part of PULP's memory hierarchy {list(memoryLevelMap.keys())}" + + memoryHierarchy = MemoryHierarchy(memoryLevels) + memoryHierarchy.setDefaultMemoryLevel(defaultMemory) + + platform = SnitchPlatform() + platform = MemoryPlatformWrapper(platform, memoryHierarchy, memoryLevelMap[targetMemory]) + + deployer = SnitchDeployer(graph, + platform, + inputTypes, + SnitchOptimizer, + defaultScheduler, + deeployStateDir = deeployStateDir) + memoryLevelAnnotationPasses = [AnnotateIOMemoryLevel(defaultMemory), AnnotateDefaultMemoryLevel(memoryHierarchy)] + # Make the deployer memory-level aware + deployer = MemoryDeployerWrapper(deployer, 
memoryLevelAnnotationPasses) + + assert defaultMemory == "L2" + if doublebuffer: + deployer = TilerDeployerWrapper(deployer, DBTiler) + else: + deployer = TilerDeployerWrapper(deployer, SBTiler) + + return deployer + + +def prepare_deployer_with_custom_tiling(deployer: NetworkDeployer, defaultMemory: str, targetMemory: str, + tileShape: Tuple[int, ...], doublebuffer: bool) -> None: + # Decomposed deployer.prepare() to enter a custom tiling solution + deployer.frontEnd() + super(TilerDeployerWrapper, deployer).bind() + + tilingSolution, memoryMap = generate_tiling( + ctxt = deployer.ctxt, + memoryStart = defaultMemory, + memoryOrder = [defaultMemory, targetMemory], + memoryHierarchy = deployer.Platform.memoryHierarchy, + inputShape = deployer.graph.inputs[0].shape, + tileShape = tileShape, + graph = deployer.graph, + _type = deployer.inputTypes['input_0'].referencedType, + doublebuffer = doublebuffer, + ) + deployer.tile(tilingSolution, memoryMap) + deployer.backEnd() + deployer.prepared = True diff --git a/DeeployTest/testUtils/graphColoring.py b/DeeployTest/testUtils/graphColoring.py index 4437c947ea..d703d81862 100644 --- a/DeeployTest/testUtils/graphColoring.py +++ b/DeeployTest/testUtils/graphColoring.py @@ -1,28 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: graphColoring.py -# -# Last edited: 10.10.2023. -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: -# - Luka Macan, University of Bologna -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from typing import List, Union diff --git a/DeeployTest/testUtils/graphDebug.py b/DeeployTest/testUtils/graphDebug.py index 9aa87d9efd..676fdca56a 100644 --- a/DeeployTest/testUtils/graphDebug.py +++ b/DeeployTest/testUtils/graphDebug.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: graphDebug.py -# -# Last edited: 23.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
from typing import Dict, Tuple diff --git a/DeeployTest/testUtils/graphDiffUtils.py b/DeeployTest/testUtils/graphDiffUtils.py index a3b4d48516..c89e799582 100644 --- a/DeeployTest/testUtils/graphDiffUtils.py +++ b/DeeployTest/testUtils/graphDiffUtils.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: graphDiffUtils.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. from __future__ import annotations diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 3e9639a688..48c5777905 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -1,32 +1,12 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: platformMapping.py -# -# Last edited: 23.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. 
-# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from typing import Callable, Dict, Optional, Tuple, Union +from typing import Callable, Dict, Optional, Tuple, Type, Union import onnx_graphsurgeon as gs +from Deeploy.AbstractDataTypes import Pointer from Deeploy.DeeployTypes import DeploymentPlatform, NetworkDeployer, TopologyOptimizer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper @@ -111,7 +91,7 @@ def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHie def mapDeployer(platform: DeploymentPlatform, graph: gs.Graph, - inputTypes: Dict[str, type], + inputTypes: Dict[str, Type[Pointer]], loweringOptimizer: Optional[TopologyOptimizer] = None, scheduler: Optional[Callable] = None, name: Optional[str] = None, diff --git a/DeeployTest/testUtils/pytestRunner.py b/DeeployTest/testUtils/pytestRunner.py new file mode 100644 index 0000000000..3a02d6fa34 --- /dev/null +++ b/DeeployTest/testUtils/pytestRunner.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import os +from pathlib import Path +from typing import List, Literal, Optional + +from .core import DeeployTestConfig, 
build_binary, configure_cmake, get_test_paths, run_complete_test, run_simulation + +__all__ = [ + 'get_worker_id', + 'create_test_config', + 'run_and_assert_test', + 'build_binary', + 'configure_cmake', + 'run_simulation', +] + + +def get_worker_id() -> str: + """ + Get the pytest-xdist worker ID for parallel test execution. + + Returns: + Worker ID string (e.g., 'gw0', 'gw1', 'master' for non-parallel) + """ + return os.environ.get("PYTEST_XDIST_WORKER", "master") + + +def create_test_config( + test_name: str, + platform: str, + simulator: Literal['gvsoc', 'banshee', 'qemu', 'vsim', 'vsim.gui', 'host', 'none'], + deeploy_test_dir: str, + toolchain: str, + toolchain_dir: Optional[str], + cmake_args: List[str], + tiling: bool = False, + cores: Optional[int] = None, + l1: Optional[int] = None, + l2: int = 1024000, + default_mem_level: str = "L2", + double_buffer: bool = False, + mem_alloc_strategy: str = "MiniMalloc", + search_strategy: str = "random-max", + profile_tiling: bool = False, + plot_mem_alloc: bool = False, + randomized_mem_scheduler: bool = False, + profile_untiled: bool = False, + gen_args: Optional[List[str]] = None, +) -> DeeployTestConfig: + + test_dir = f"Tests/{test_name}" + + gen_dir, test_dir_abs, test_name_clean = get_test_paths(test_dir, platform, base_dir = deeploy_test_dir) + + worker_id = get_worker_id() + + if worker_id == "master": + build_dir = str(Path(deeploy_test_dir) / f"TEST_{platform.upper()}" / "build_master") + else: + build_dir = str(Path(deeploy_test_dir) / f"TEST_{platform.upper()}" / f"build_{worker_id}") + + cmake_args_list = list(cmake_args) if cmake_args else [] + if cores is not None: + cmake_args_list.append(f"NUM_CORES={cores}") + + gen_args_list = list(gen_args) if gen_args else [] + + if cores is not None and platform in ["Siracusa", "Siracusa_w_neureka"]: + gen_args_list.append(f"--cores={cores}") + + if tiling: + if l1 is not None: + gen_args_list.append(f"--l1={l1}") + if l2 != 1024000: + 
gen_args_list.append(f"--l2={l2}") + if default_mem_level != "L2": + gen_args_list.append(f"--defaultMemLevel={default_mem_level}") + if double_buffer: + gen_args_list.append("--doublebuffer") + if mem_alloc_strategy != "MiniMalloc": + gen_args_list.append(f"--memAllocStrategy={mem_alloc_strategy}") + if search_strategy != "random-max": + gen_args_list.append(f"--searchStrategy={search_strategy}") + if profile_tiling: + gen_args_list.append("--profileTiling") + if plot_mem_alloc: + gen_args_list.append("--plotMemAlloc") + if randomized_mem_scheduler: + gen_args_list.append("--randomizedMemoryScheduler") + + if profile_untiled and not tiling and platform == "Siracusa": + gen_args_list.append("--profileUntiled") + + config = DeeployTestConfig( + test_name = test_name_clean, + test_dir = test_dir_abs, + platform = platform, + simulator = simulator, + tiling = tiling, + gen_dir = gen_dir, + build_dir = build_dir, + toolchain = toolchain, + toolchain_install_dir = toolchain_dir, + cmake_args = cmake_args_list, + gen_args = gen_args_list, + ) + + return config + + +def run_and_assert_test(test_name: str, config: DeeployTestConfig, skipgen: bool, skipsim: bool) -> None: + """ + Shared helper function to run a test and assert its results. 
+ + Raises: + AssertionError: If test fails or has errors + """ + result = run_complete_test(config, skipgen = skipgen, skipsim = skipsim) + + assert result.success, (f"Test {test_name} failed with {result.error_count} errors out of {result.total_count}\n" + f"Output:\n{result.stdout}") + + if result.error_count >= 0: + assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests") diff --git a/DeeployTest/testUtils/testRunner.py b/DeeployTest/testUtils/testRunner.py index ce51fda15e..53a5c7b9b9 100644 --- a/DeeployTest/testUtils/testRunner.py +++ b/DeeployTest/testUtils/testRunner.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna # -# File: testRunner.py -# -# Last edited: 17.03.2023 -# -# Copyright (C) 2022, ETH Zurich and University of Bologna. -# -# Author: Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import argparse import codecs @@ -31,6 +10,12 @@ import subprocess from typing import Literal, Tuple +import coloredlogs + +from Deeploy.Logging import DEFAULT_FMT +from Deeploy.Logging import DEFAULT_LOGGER as log +from Deeploy.Logging import DETAILED_FILE_LOG_FORMAT, FAILURE_MARK, SUCCESS_MARK + # Source: https://stackoverflow.com/a/38662876 def escapeAnsi(line): @@ -38,18 +23,6 @@ def escapeAnsi(line): return ansi_escape.sub('', line) -def prRed(skk): - print("\033[91m{}\033[00m".format(skk)) - - -def prGreen(skk): - print("\033[92m{}\033[00m".format(skk)) - - -def prBlue(skk): - print("\033[94m{}\033[00m".format(skk)) - - def getPaths(path_test: str, gendir_name: str) -> Tuple[str, str]: dir_test = os.path.normpath(path_test) @@ -121,6 +94,16 @@ def __init__(self, description = None): def parse_args(self, args = None, namespace = None) -> argparse.Namespace: self.args = super().parse_args(args, namespace) + + # Install logger based on verbosity level + if self.args.verbose > 2: + coloredlogs.install(level = 'DEBUG', logger = log, fmt = DETAILED_FILE_LOG_FORMAT) + elif self.args.verbose > 1: + coloredlogs.install(level = 'DEBUG', logger = log, fmt = DEFAULT_FMT) + elif self.args.verbose > 0: + coloredlogs.install(level = 'INFO', logger = log, fmt = DEFAULT_FMT) + else: + coloredlogs.install(level = 'WARNING', logger = log, fmt = DEFAULT_FMT) return self.args @@ -165,6 +148,12 @@ def __init__(self, tiling_arguments: bool, description = None): action = 'store_true', default = False, help = 'Skip network simulation\n') + self.add_argument('--profileUntiled', + '--profile-untiled', + dest = 'profileUntiled', + action = 'store_true', + default = False, + help = 'Enable untiled profiling (Siracusa only)\n') self.add_argument('--toolchain', metavar = '', dest = 'toolchain', @@ -177,6 +166,20 @@ def __init__(self, tiling_arguments: bool, description = None): type = str, default = os.environ.get('LLVM_INSTALL_DIR'), help = 'Pick compiler install dir\n') + 
self.add_argument('--input-type-map', + nargs = '*', + default = [], + type = str, + help = '(Optional) mapping of input names to data types. ' + 'If not specified, types are inferred from the input data. ' + 'Example: --input-type-map input_0=int8_t input_1=float32_t ...') + self.add_argument('--input-offset-map', + nargs = '*', + default = [], + type = str, + help = '(Optional) mapping of input names to offsets. ' + 'If not specified, offsets are set to 0. ' + 'Example: --input-offset-map input_0=0 input_1=128 ...') if self.tiling_arguments: self.add_argument('--defaultMemLevel', @@ -224,7 +227,7 @@ def __init__(self, tiling_arguments: bool, description = None): """) self.add_argument( '--plotMemAlloc', - action = 'store_false', + action = 'store_true', help = 'Turn on plotting of the memory allocation and save it in the deeployState folder\n') self.args = None @@ -239,11 +242,15 @@ def generate_cmd_args(self) -> str: command = "" if self.args.verbose: - command += " -v" + command += " -" + "v" * self.args.verbose if self.args.debug: command += " --debug" if hasattr(self.args, 'profileUntiled') and self.args.profileUntiled: command += " --profileUntiled" + if self.args.input_type_map: + command += " --input-type-map " + " ".join(self.args.input_type_map) + if self.args.input_offset_map: + command += " --input-offset-map " + " ".join(self.args.input_offset_map) if self.tiling_arguments: if self.args.defaultMemLevel: @@ -304,13 +311,14 @@ def __init__(self, self.gen_args = gen_args self._dir_gen_root = f'TEST_{platform.upper()}' + assert self._args.toolchain_install_dir is not None, f"Environment variable LLVM_INSTALL_DIR is not set" self._dir_toolchain = os.path.normpath(self._args.toolchain_install_dir) self._dir_build = f"{self._dir_gen_root}/build" self._dir_gen, self._dir_test, self._name_test = getPaths(self._args.dir, self._dir_gen_root) if "CMAKE" not in os.environ: if self._args.verbose >= 1: - prRed(f"[TestRunner] CMAKE environment variable not set. 
Falling back to cmake") + log.error(f"[TestRunner] CMAKE environment variable not set. Falling back to cmake") assert shutil.which( "cmake" ) is not None, "CMake not found. Please check that CMake is installed and available in your system’s PATH, or set the CMAKE environment variable to the full path of your preferred CMake executable." @@ -321,7 +329,7 @@ def __init__(self, print("Test Name : ", self._name_test) def run(self,): - prRed(f"################## Testing {self._dir_test} on {self._platform} Platform ##################") + log.info(f"################## Testing {self._dir_test} on {self._platform} Platform ##################") if self._args.skipgen is False: self.generate_test() @@ -340,10 +348,13 @@ def generate_test(self): generation_script = "generateNetwork.py" command = f"python {generation_script} -d {self._dir_gen} -t {self._dir_test} -p {self._platform} {self.gen_args}" + + if self._platform in ["Siracusa", "Siracusa_w_neureka"]: + command += f" --cores={self._args.cores}" + command += self._argument_parser.generate_cmd_args() - if self._args.verbose >= 2: - prBlue(f"[TestRunner] Generation Command: {command}") + log.debug(f"[TestRunner] Generation Command: {command}") err = os.system(command) if err != 0: @@ -366,8 +377,8 @@ def configure_cmake_project(self): if self._args.verbose >= 3: command = "VERBOSE=1 " + command - if self._args.verbose >= 2: - prBlue(f"[TestRunner] Cmake Command: {command}") + + log.debug(f"[TestRunner] Cmake Command: {command}") err = os.system(command) if err != 0: @@ -378,8 +389,8 @@ def build_binary(self): if self._args.verbose >= 3: command = "VERBOSE=1 " + command - if self._args.verbose >= 2: - prBlue(f"[TestRunner] Building Command: {command}") + + log.debug(f"[TestRunner] Building Command: {command}") err = os.system(command) if err != 0: @@ -405,8 +416,7 @@ def run_simulation(self, out_file = 'out.txt'): if self._args.verbose >= 3: command = "BANSHEE_LOG=debug " + command - if self._args.verbose >= 2: - 
prBlue(f"[TestRunner] Simulation Command: {command}") + log.debug(f"[TestRunner] Simulation Command: {command}") process = subprocess.Popen([command], stdout = subprocess.PIPE, @@ -432,7 +442,7 @@ def run_simulation(self, out_file = 'out.txt'): fileHandle.close() if "Errors: 0 out of " not in result: - prRed(f"❌ Found errors in {self._dir_test}") + log.error(f"{FAILURE_MARK} Found errors in {self._dir_test}") raise RuntimeError(f"Found an error in {self._dir_test}") else: - prGreen(f"✅ No errors found in in {self._dir_test}") + log.info(f"{SUCCESS_MARK} No errors found in in {self._dir_test}") diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py new file mode 100644 index 0000000000..0c3986cd6e --- /dev/null +++ b/DeeployTest/testUtils/tilingUtils.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import List, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, SubGraph, TransientBuffer +from Deeploy.TilingExtension.TilerExtension import Tiler +from Deeploy.TilingExtension.TilerModel import TilerModel + + +class DBOnlyL3Tiler(Tiler): + + def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], + hop: str, tensorName: str) -> Union[int, IntVar]: + buffer = ctxt.lookup(tensorName) + + if isinstance(buffer, TransientBuffer): + return 1 + + if hop == 'L1': + return 1 + + return 2 + + +class DBTiler(Tiler): + + def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], + hop: str, tensorName: str) -> Union[int, IntVar]: + buffer = ctxt.lookup(tensorName) + + if isinstance(buffer, TransientBuffer): + return 1 + + return 2 + + +class SBTiler(Tiler): + + def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], + hop: str, 
tensorName: str) -> Union[int, IntVar]: + return 1 diff --git a/DeeployTest/testUtils/typeMapping.py b/DeeployTest/testUtils/typeMapping.py index 4cb18cb8f4..232fd1e274 100644 --- a/DeeployTest/testUtils/typeMapping.py +++ b/DeeployTest/testUtils/typeMapping.py @@ -1,98 +1,131 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: typeMapping.py -# -# Last edited: 22.05.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from collections import namedtuple -from typing import List, Optional +from typing import Tuple, Type import numpy as np +import numpy.typing as npt -from Deeploy.AbstractDataTypes import PointerClass -from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, int8_t +from Deeploy.AbstractDataTypes import BaseType, IntegerImmediate, Pointer, PointerClass +from Deeploy.CommonExtensions.DataTypes import FloatDataTypes, IntegerDataTypes, float32_t, int8_t, int16_t, int32_t, \ + minimalFloatType, minimalIntegerType, uint8_t, uint16_t, uint32_t -offsetType = namedtuple("offsetType", ("type", "offset")) +_ALL_DTYPES: dict[str, type] = {t.typeName: t for t in (*IntegerDataTypes, *FloatDataTypes)} -def isInteger(_input: np.array) -> bool: - if np.abs((_input.astype(int) - _input)).max() > 0.001: - return False - return True +def parseDataType(name: str) -> type: + """Parses a data type from its name. + Parameters + ---------- + name : str + The name of the data type. -def isUnsigned(_input: np.array) -> bool: - if (_input).min() < 0: - return False - return True + Returns + ------- + class + The corresponding data type class. + Raises + ------ + ValueError + If the provided data type name is unknown. + """ + if name not in _ALL_DTYPES: + allowed = ", ".join(sorted(_ALL_DTYPES)) + raise ValueError(f"Unknown data type: {name}. Allowed: {allowed}") + return _ALL_DTYPES[name] -def dataWidth(n): - count = 0 - n = np.abs(int(n - 1)) - while (n > 0): - count += 1 - n = n >> 8 - ret = 2**(count + 2) - if ret < 8: - ret = 8 - return ret +def isInteger(x: npt.NDArray) -> bool: + return np.abs((x.astype(int) - x)).max() <= 0.001 -def inferInputType(_input: np.ndarray, - signProp: Optional[bool] = None, - defaultType = PointerClass(int8_t), - defaultOffset = 0) -> List[offsetType]: +def inferMinimalType(values: np.ndarray, default: Type[BaseType] = int8_t) -> Type[BaseType]: # WIESEP: We cannot do type inference for empty arrays. 
- if np.prod(_input.shape) == 0: - print(f"Warning: Empty input array for type inference for {_input}!") - return [(defaultType, defaultOffset)] - - if signProp is None: - signProp = False - - signedPlatformTypes = [_type for _type in IntegerDataTypes if _type.typeMin < 0] - - matchingTypes = [] - - # FIXME: this is okay for now (3 distinctions are fine), but there is implicit - # knowledge encoded in the order of the checks (i.e. first unsigned, signed - # and then float). It might be good to extract that implicit knowledge into an ordered list. - if signProp and isUnsigned(_input) and isInteger(_input): - for _type in sorted(signedPlatformTypes, key = lambda x: x.typeWidth): - signPropOffset = (2**(_type.typeWidth - 1)) - if _type.checkPromotion(_input - signPropOffset): - matchingTypes.append(offsetType(PointerClass(_type), signPropOffset)) - elif isInteger(_input): - for _type in sorted(IntegerDataTypes, key = lambda x: x.typeWidth): - if _type.checkPromotion(_input): - matchingTypes.append(offsetType(PointerClass(_type), 0)) + if np.prod(values.shape) == 0: + print(f"Warning: Empty input array for type inference for {values}!") + return default + + if isInteger(values): + return minimalIntegerType(values) else: - for _type in sorted(FloatDataTypes, key = lambda x: x.typeWidth): - if _type.checkPromotion(_input): - matchingTypes.append(offsetType(PointerClass(_type), 0)) + return minimalFloatType(values) + + +def signPropTypeAndOffset(_type: Type[IntegerImmediate]) -> Tuple[Type[IntegerImmediate], int]: + if _type.signed: + return _type, 0 - if matchingTypes == []: - raise Exception("Could not find a matching type!") + unsigned2signed = { + unsigned.typeName: signed for unsigned, signed in zip([t for t in IntegerDataTypes if t.typeMin == 0 + ], [t for t in IntegerDataTypes if t.typeMin < 0]) + } - return matchingTypes + signedType = unsigned2signed[_type.typeName] + return signedType, 2**(signedType.typeWidth - 1) + + +def inferTypeAndOffset(values: np.ndarray, 
signProp: bool = False) -> Tuple[Type[Pointer], int]: + """Infers the data type of the provided input array. + + Parameters + ---------- + values : np.ndarray + The input array for which to infer the data type. + + signProp : bool + Whether to consider signedness when inferring the data type. + Returns + ------- + Tuple[Type[BaseType], int] + The inferred type and offset + """ + + _type = inferMinimalType(values) + + if signProp and issubclass(_type, IntegerImmediate): + _type, offset = signPropTypeAndOffset(_type) + else: + offset = 0 + + return PointerClass(_type), offset + + +def baseTypeFromName(name: str) -> Type[BaseType]: + if name == "int8_t": + return int8_t + elif name == "uint8_t": + return uint8_t + elif name == "int16_t": + return int16_t + elif name == "uint16_t": + return uint16_t + elif name == "int32_t": + return int32_t + elif name == "uint32_t": + return uint32_t + elif name == "float32_t": + return float32_t + else: + raise RuntimeError(f"Unrecognized name {name}") + + +def dtypeFromDeeployType(_ty: Type[BaseType]) -> npt.DTypeLike: + if _ty == int8_t: + return np.int8 + elif _ty == uint8_t: + return np.uint8 + elif _ty == int16_t: + return np.int16 + elif _ty == uint16_t: + return np.uint16 + elif _ty == int32_t: + return np.int32 + elif _ty == uint32_t: + return np.uint32 + elif _ty == float32_t: + return np.float32 + else: + raise RuntimeError(f"Unimplemented conversion for type {_ty.typeName}") diff --git a/DeeployTest/test_chimera_config.py b/DeeployTest/test_chimera_config.py new file mode 100644 index 0000000000..bcc846cb75 --- /dev/null +++ b/DeeployTest/test_chimera_config.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Chimera platform.""" + +# Chimera platform uses gvsoc simulator +# Currently only Adder test is in CI + +KERNEL_TESTS = [ + "Kernels/Integer/Add/Regular", +] + +MODEL_TESTS = [] diff --git 
a/DeeployTest/test_cortexm_config.py b/DeeployTest/test_cortexm_config.py new file mode 100644 index 0000000000..dbbd2e4758 --- /dev/null +++ b/DeeployTest/test_cortexm_config.py @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Cortex-M (QEMU-ARM) platform.""" + +KERNEL_TESTS = [ + "Kernels/Integer/Add/Regular", + "Kernels/Integer/Add/MultIO", + "Kernels/Integer/Pad/Regular_1D", + "Kernels/Integer/Pad/Regular_2D", + "Kernels/Integer/MatMul/Regular", + "Kernels/Integer/MatMul/Add", + "Kernels/Integer/MaxPool", + "Kernels/Integer/Conv/Regular_2D_RQ", + "Kernels/Integer/ReduceSum", + "Kernels/Integer/ReduceMean", + "Kernels/Integer/Slice", +] + +MODEL_TESTS = [ + "Models/CNN_Linear2", + "Models/WaveFormer", +] diff --git a/DeeployTest/test_deeploy_internal.py b/DeeployTest/test_deeploy_internal.py new file mode 100644 index 0000000000..14f7c3fc15 --- /dev/null +++ b/DeeployTest/test_deeploy_internal.py @@ -0,0 +1,372 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import subprocess +from pathlib import Path + +import pytest + +# Mark all tests in this module as deeploy_internal +pytestmark = pytest.mark.deeploy_internal + + +@pytest.mark.parametrize("platform", ["QEMU-ARM", "Siracusa", "MemPool", "Generic"]) +def test_deeploy_state_serialization(platform): + """Test that Deeploy state can be serialized and deserialized correctly.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "deeployStateEqualityTest.py"), + "-t", + "./Tests/Models/CNN_Linear2", + "-p", + platform, + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"State serialization test failed for platform {platform}\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + +@pytest.mark.parametrize("platform", ["QEMU-ARM", 
"Siracusa", "MemPool", "Generic"]) +def test_memory_level_extension(platform): + """Test memory level extension functionality.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testMemoryLevelExtension.py"), + "-t", + "./Tests/Models/CNN_Linear2", + "-p", + platform, + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Memory level extension test failed for platform {platform}\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + +class TestMemoryAllocation: + """Test memory allocation strategies and constraints.""" + + def test_minimalloc_sufficient_memory(self): + """Test MiniMalloc strategy with sufficient L2 memory.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testMVP.py"), + "-t", + "Tests/Models/CCT/FP32/CCT_1_16_16_8", + "-p", + "Siracusa", + "--defaultMemLevel=L2", + "--l1=64000", + "--l2=75000", + "--memAllocStrategy=MiniMalloc", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Memory allocation test (MiniMalloc, L2=75000) failed\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + def test_minimalloc_insufficient_memory(self): + """Test that MiniMalloc correctly fails with insufficient L2 memory.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testMVP.py"), + "-t", + "Tests/Models/CCT/FP32/CCT_1_16_16_8", + "-p", + "Siracusa", + "--defaultMemLevel=L2", + "--l1=64000", + "--l2=60000", + "--memAllocStrategy=MiniMalloc", + "--shouldFail", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, ( + f"Memory allocation test (MiniMalloc should fail, L2=60000) did not behave as expected\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + def test_tetrisrandom_sufficient_memory(self): + """Test 
TetrisRandom strategy with sufficient L2 memory.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testMVP.py"), + "-t", + "Tests/Models/CCT/FP32/CCT_1_16_16_8", + "-p", + "Siracusa", + "--defaultMemLevel=L2", + "--l1=64000", + "--l2=90000", + "--memAllocStrategy=TetrisRandom", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Memory allocation test (TetrisRandom, L2=90000) failed\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + def test_tetrisrandom_insufficient_memory(self): + """Test that TetrisRandom correctly fails with insufficient L2 memory.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testMVP.py"), + "-t", + "Tests/Models/CCT/FP32/CCT_1_16_16_8", + "-p", + "Siracusa", + "--defaultMemLevel=L2", + "--l1=64000", + "--l2=75000", + "--memAllocStrategy=TetrisRandom", + "--shouldFail", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, ( + f"Memory allocation test (TetrisRandom should fail, L2=75000) did not behave as expected\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + +class TestTilerExtension: + """Test tiling extension functionality.""" + + @pytest.mark.parametrize("test_path", [ + "./Tests/Models/CNN_Linear2", + "./Tests/Models/CNN_Linear1", + "./Tests/Kernels/Integer/MatMul/Regular", + "./Tests/Kernels/Integer/MaxPool", + ]) + def test_tiler_basic(self, test_path): + """Test that tiler can process various networks without L1 constraints.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testTilerExtension.py"), + "-p", + "Siracusa", + "-t", + test_path, + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Tiler extension test failed for {test_path}\n" + f"stdout: {result.stdout}\n" + f"stderr: 
{result.stderr}") + + @pytest.mark.parametrize("test_path", [ + "./Tests/Models/CNN_Linear2", + "./Tests/Models/CNN_Linear1", + "./Tests/Kernels/Integer/MatMul/Regular", + "./Tests/Kernels/Integer/MaxPool", + ]) + def test_tiler_constrained_should_fail(self, test_path): + """Test that tiler correctly fails when L1 memory is too small.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testTilerExtension.py"), + "-p", + "Siracusa", + "-t", + test_path, + "--l1", + "2000", + "--shouldFail", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, ( + f"Tiler extension test (should fail) did not behave as expected for {test_path}\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + @pytest.mark.parametrize("test_path", [ + "./Tests/Models/CNN_Linear2", + "./Tests/Models/CNN_Linear1", + "./Tests/Models/miniMobileNet", + "./Tests/Models/miniMobileNetv2", + "./Tests/Kernels/Integer/MatMul/Regular", + "./Tests/Kernels/Integer/MaxPool", + ]) + def test_tiler_double_buffer(self, test_path): + """Test tiler with double buffering enabled.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testTilerExtension.py"), + "-p", + "Siracusa", + "-t", + test_path, + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Tiler extension test (double buffer) failed for {test_path}\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + +def test_types(): + """Test Deeploy type system (serialization, equivalence, promotion).""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testTypes.py"), + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Types test failed\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + +class TestDebugTransformations: + 
"""Test debug and diagnostic transformations.""" + + @pytest.mark.parametrize("platform", ["Generic", "Siracusa"]) + def test_print_input_output_transformation(self, platform): + """Test print input/output transformation for debugging.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testPrintInputOutputTransformation.py"), + "-p", + platform, + "-t", + "./Tests/Models/CNN_Linear2", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Print I/O transformation test failed for platform {platform}\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + def test_debug_print_pass(self): + """Test debug print pass transformation.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testDebugPrintPass.py"), + "-p", + "Generic", + "-t", + "./Tests/Models/CNN_Linear2", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Debug print pass test failed\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + +def test_regex_matching(): + """Test regex matching utilities.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "testRegexMatching.py"), + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Regex matching test failed\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + +class TestTypeInference: + """Test type inference functionality with different input type configurations.""" + + def test_type_inference_fail_all_int8(self): + """Test that type inference correctly fails when all inputs are int8.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "generateNetwork.py"), + "-p", + "Generic", + "-t", + "./Tests/Others/TypeInference", + "-v", + "--input-type-map", + "A=int8_t", + "B=int8_t", + 
"C=int8_t", + "--input-offset-map", + "A=0", + "B=0", + "C=0", + "--shouldFail", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Type inference test (should fail with all int8) did not behave as expected\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + def test_type_inference_fail_incompatible_output(self): + """Test that type inference correctly fails with incompatible output type.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "generateNetwork.py"), + "-p", + "Generic", + "-t", + "./Tests/Others/TypeInference", + "-v", + "--input-type-map", + "A=int16_t", + "B=int8_t", + "C=int16_t", + "--input-offset-map", + "A=0", + "B=0", + "C=0", + "--shouldFail", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, ( + f"Type inference test (should fail with incompatible output) did not behave as expected\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") + + def test_type_inference_pass(self): + """Test that type inference succeeds with correct type configuration.""" + script_dir = Path(__file__).parent + cmd = [ + "python", + str(script_dir / "generateNetwork.py"), + "-p", + "Generic", + "-t", + "./Tests/Others/TypeInference", + "-v", + "--input-type-map", + "A=int16_t", + "B=int8_t", + "C=int32_t", + "--input-offset-map", + "A=0", + "B=0", + "C=0", + ] + result = subprocess.run(cmd, cwd = script_dir, capture_output = True, text = True) + + assert result.returncode == 0, (f"Type inference test (should pass) failed\n" + f"stdout: {result.stdout}\n" + f"stderr: {result.stderr}") diff --git a/DeeployTest/test_dmas.py b/DeeployTest/test_dmas.py new file mode 100644 index 0000000000..938459ae62 --- /dev/null +++ b/DeeployTest/test_dmas.py @@ -0,0 +1,372 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 
+"""DMA test suite for Siracusa and Snitch platforms. + +Tests three DMA implementations across various tensor shapes and configurations: +- MchanDma: Siracusa L2→L1 DMA transfers +- L3Dma: Siracusa L3→L2 DMA transfers +- SnitchDma: Snitch L2→L1 DMA transfers + +Total test matrix: 3 DMAs × 10 shapes × 2 buffering modes = 60 tests +""" + +import os +from pathlib import Path + +import numpy as np +import pytest +from testUtils.codeGenerate import generateTestNetwork +from testUtils.dmaUtils import MemcpyLayer, MemcpyParser, MemcpyTileConstraint, MemcpyTypeChecker, generate_graph, \ + memcpyTemplate, prepare_deployer_with_custom_tiling, setup_pulp_deployer, setup_snitch_deployer +from testUtils.pytestRunner import build_binary, configure_cmake, get_test_paths, get_worker_id +from testUtils.typeMapping import baseTypeFromName, dtypeFromDeeployType + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import ArgumentStructGeneration, \ + MemoryManagementGeneration +from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeMapper, _NoVerbosity +from Deeploy.Targets.PULPOpen.Bindings import L3MemoryAwareFunctionCallClosure +from Deeploy.Targets.PULPOpen.Bindings import MemoryAwareFunctionCallClosure as PULPMemoryAwareFunctionCallClosure +from Deeploy.Targets.PULPOpen.Bindings import TilingCallClosure as PULPTilingCallClosure +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling +from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling +from Deeploy.Targets.PULPOpen.DMA.L3Dma import L3Dma, l3DmaHack +from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma +from Deeploy.Targets.Snitch.Bindings import MemoryAwareFunctionCallClosure, TilingCallClosure +from Deeploy.Targets.Snitch.CodeTransformationPasses import SnitchClusterTiling +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchClusterSynch import 
SnitchSynchCoresPass +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchCoreFilter import SnitchCoreFilterPass +from Deeploy.Targets.Snitch.CodeTransformationPasses.SnitchProfileExecutionBlock import SnitchProfileExecutionBlockPass +from Deeploy.Targets.Snitch.DMA.SnitchDma import SnitchDma +from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ + TilingVariableReplacementUpdate +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings + + +@pytest.fixture(autouse = True) +def clear_deeploy_state(): + """Clear dynamically generated struct classes from AbstractDataTypes before each test. + + This prevents state pollution between DMA tests where dynamically generated + struct classes (like _memcpy_0_tiling_closure_args_t) persist and cause + conflicts when tests with different configurations try to create new versions. + """ + import Deeploy.AbstractDataTypes as ADT + + # Get list of all attributes before test + attrs_to_remove = [] + for attr_name in dir(ADT): + # Remove dynamically generated struct classes (closure args, etc.) 
+ if attr_name.startswith('_') and ('closure_args' in attr_name or 'memcpy' in attr_name.lower()): + attr = getattr(ADT, attr_name, None) + if isinstance(attr, type): + attrs_to_remove.append(attr_name) + + # Remove stale struct classes + for attr_name in attrs_to_remove: + delattr(ADT, attr_name) + + yield # Run the test + + # Clean up after test as well + for attr_name in dir(ADT): + if attr_name.startswith('_') and ('closure_args' in attr_name or 'memcpy' in attr_name.lower()): + attr = getattr(ADT, attr_name, None) + if isinstance(attr, type): + try: + delattr(ADT, attr_name) + except AttributeError: + pass + + +# Test shape configurations: (input_shape, tile_shape, node_count, data_type) +DMA_TEST_SHAPES = [ + ((10, 10), (10, 10), 1, "uint8_t"), + ((10, 10), (10, 4), 1, "uint8_t"), + ((10, 10), (10, 4), 1, "uint16_t"), + ((10, 10), (10, 4), 1, "uint32_t"), + ((10, 10), (3, 4), 1, "uint32_t"), + ((10, 10), (3, 4), 2, "uint32_t"), + ((10, 10, 10), (2, 3, 4), 1, "uint8_t"), + ((10, 10, 10, 10), (2, 3, 5, 4), 1, "uint8_t"), + ((10, 10, 10, 10), (2, 3, 5, 4), 1, "uint32_t"), + ((10, 10, 10, 10, 10), (2, 3, 5, 7, 4), 1, "uint8_t"), +] + + +def param_id_dma(val): + """Generate readable test IDs for DMA parametrized tests.""" + if isinstance(val, tuple) and len(val) == 4: + input_shape, tile_shape, node_count, data_type = val + shape_str = "x".join(map(str, input_shape)) + tile_str = "x".join(map(str, tile_shape)) + return f"{shape_str}_tile{tile_str}_n{node_count}_{data_type}" + elif isinstance(val, bool): + return "doublebuffer" if val else "singlebuffer" + return str(val) + + +def setup_dma_deployer(dma_type: str, input_shape: tuple, tile_shape: tuple, node_count: int, data_type: str, + doublebuffer: bool, gen_dir: str): + """ + Set up deployer for DMA testing with custom tiling. 
+ + Args: + dma_type: DMA implementation ("MchanDma", "L3Dma", "SnitchDma") + input_shape: Tensor shape to copy + tile_shape: Tiling dimensions + node_count: Number of memcpy nodes + data_type: Data type (uint8_t, uint16_t, uint32_t) + doublebuffer: Enable double buffering + gen_dir: Generation directory + + Returns: + tuple: (deployer, test_inputs, test_outputs) + """ + _type = baseTypeFromName(data_type) + dtype = dtypeFromDeeployType(_type) + + # Validate shapes + assert len(input_shape) == len(tile_shape), \ + f'Input and tile shape must have same dimensionality: {len(input_shape)}D vs {len(tile_shape)}D' + assert all(tileDim <= inDim for inDim, tileDim in zip(input_shape, tile_shape)), \ + f'Tile shape {tile_shape} must be <= input shape {input_shape}' + + # DMA-specific configuration + if dma_type == "MchanDma": + defaultMemory = "L2" + targetMemory = "L1" + dma_obj = MchanDma() + elif dma_type == "L3Dma": + defaultMemory = "L3" + targetMemory = "L2" + dma_obj = L3Dma() + elif dma_type == "SnitchDma": + defaultMemory = "L2" + targetMemory = "L1" + dma_obj = SnitchDma() + else: + raise ValueError(f"Unknown DMA type: {dma_type}") + + # Generate graph and setup deployer + graph = generate_graph(node_count, input_shape, dtype) + inputTypes = {"input_0": PointerClass(_type)} + _DEEPLOYSTATEDIR = os.path.join(gen_dir, "deeployStates") + + if dma_type == "SnitchDma": + deployer = setup_snitch_deployer(defaultMemory, targetMemory, graph, inputTypes, doublebuffer, _DEEPLOYSTATEDIR) + else: + deployer = setup_pulp_deployer(defaultMemory, targetMemory, graph, inputTypes, doublebuffer, _DEEPLOYSTATEDIR) + + # Create transformer with DMA-specific passes + if dma_type == "SnitchDma": + transformer = CodeTransformation([ + SnitchCoreFilterPass("compute"), + SnitchProfileExecutionBlockPass(), + TilingVariableReplacement(targetMemory), + TilingCallClosure(writeback = False), + SnitchSynchCoresPass(), + TilingVariableReplacementUpdate(targetMemory), + 
SnitchClusterTiling(defaultMemory, targetMemory, dma_obj), + ArgumentStructGeneration(), + MemoryManagementGeneration(targetMemory), + MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration(defaultMemory), + MemoryManagementGeneration(), + ]) + elif dma_type == "L3Dma": + # L3Dma uses PULPL3Tiling and L3MemoryAwareFunctionCallClosure + transformer = CodeTransformation([ + TilingVariableReplacement(targetMemory), + PULPTilingCallClosure(writeback = False, generateStruct = True), + TilingVariableReplacementUpdate(targetMemory), + PULPL3Tiling("L3", "L2", l3DmaHack), + ArgumentStructGeneration(), + L3MemoryAwareFunctionCallClosure(writeback = False), + MemoryManagementGeneration("L2"), + MemoryManagementGeneration("L3.*"), + MemoryManagementGeneration(), + ]) + else: # MchanDma + transformer = CodeTransformation([ + TilingVariableReplacement(targetMemory), + PULPTilingCallClosure(writeback = False, generateStruct = True), + TilingVariableReplacementUpdate(targetMemory), + PULPClusterTiling(defaultMemory, targetMemory, dma_obj), + ArgumentStructGeneration(), + MemoryManagementGeneration(targetMemory), + TilingVariableReplacement(defaultMemory), + PULPMemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), + MemoryManagementGeneration(defaultMemory), + MemoryManagementGeneration(), + ]) + + # Set up bindings + binding = NodeBinding(MemcpyTypeChecker(), memcpyTemplate, transformer) + tilingReadyBindings = TilingReadyNodeBindings([binding], MemcpyTileConstraint()) + memcpyMapper = NodeMapper(MemcpyParser(), tilingReadyBindings) + memcpyMapping = {"Memcpy": MemcpyLayer([memcpyMapper])} + deployer.Platform.engines[0].Mapping.update(memcpyMapping) + + # Prepare custom tiling + prepare_deployer_with_custom_tiling(deployer, defaultMemory, targetMemory, tile_shape, doublebuffer) + + # Generate test inputs/outputs + if dtype == np.float32: + test_inputs = np.random.rand(*input_shape) + else: + test_inputs = 
np.arange(stop = np.prod(input_shape), dtype = dtype).reshape(input_shape) + test_outputs = test_inputs + + return deployer, test_inputs, test_outputs + + +@pytest.mark.deeploy_internal +@pytest.mark.parametrize("test_shape", DMA_TEST_SHAPES, ids = param_id_dma) +@pytest.mark.parametrize("doublebuffer", [True, False], ids = param_id_dma) +def test_mchan_dma(test_shape, doublebuffer, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, + skipsim) -> None: + """Test MchanDma (Siracusa L2→L1 DMA transfers).""" + input_shape, tile_shape, node_count, data_type = test_shape + + # Setup paths + test_name = f"testMchanDma_{param_id_dma(test_shape)}_{param_id_dma(doublebuffer)}" + platform = "Siracusa" + gen_dir, _, test_name_clean = get_test_paths(f"test_dma_gen/{test_name}", platform, base_dir = deeploy_test_dir) + + # Generate network + if not skipgen: + deployer, test_inputs, test_outputs = setup_dma_deployer("MchanDma", input_shape, tile_shape, node_count, + data_type, doublebuffer, gen_dir) + generateTestNetwork(deployer, [test_inputs], [test_outputs], gen_dir, _NoVerbosity) + + # Build and run + worker_id = get_worker_id() + if worker_id == "master": + build_dir = str(Path(deeploy_test_dir) / f"TEST_{platform.upper()}" / "build_master") + else: + build_dir = str(Path(deeploy_test_dir) / f"TEST_{platform.upper()}" / f"build_{worker_id}") + + from testUtils.pytestRunner import DeeployTestConfig + config = DeeployTestConfig( + test_name = test_name_clean, + test_dir = gen_dir, + platform = platform, + simulator = 'gvsoc', + tiling = True, + gen_dir = gen_dir, + build_dir = build_dir, + toolchain = toolchain, + toolchain_install_dir = toolchain_dir, + cmake_args = list(cmake_args) + ["NUM_CORES=8"], + ) + + configure_cmake(config) + build_binary(config) + + if not skipsim: + from testUtils.pytestRunner import run_simulation + result = run_simulation(config) + assert result.success, f"MchanDma test failed with {result.error_count} errors" + assert 
result.error_count == 0, f"Found {result.error_count} errors" + + +@pytest.mark.deeploy_internal +@pytest.mark.parametrize("test_shape", DMA_TEST_SHAPES, ids = param_id_dma) +@pytest.mark.parametrize("doublebuffer", [True, False], ids = param_id_dma) +def test_l3_dma(test_shape, doublebuffer, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, + skipsim) -> None: + """Test L3Dma (Siracusa L3→L2 DMA transfers).""" + input_shape, tile_shape, node_count, data_type = test_shape + + # Setup paths + test_name = f"testL3Dma_{param_id_dma(test_shape)}_{param_id_dma(doublebuffer)}" + platform = "Siracusa" + gen_dir, _, test_name_clean = get_test_paths(f"test_dma_gen/{test_name}", platform, base_dir = deeploy_test_dir) + + # Generate network + if not skipgen: + deployer, test_inputs, test_outputs = setup_dma_deployer("L3Dma", input_shape, tile_shape, node_count, + data_type, doublebuffer, gen_dir) + generateTestNetwork(deployer, [test_inputs], [test_outputs], gen_dir, _NoVerbosity) + + # Build and run + worker_id = get_worker_id() + if worker_id == "master": + build_dir = str(Path(deeploy_test_dir) / f"TEST_{platform.upper()}" / "build_master") + else: + build_dir = str(Path(deeploy_test_dir) / f"TEST_{platform.upper()}" / f"build_{worker_id}") + + from testUtils.pytestRunner import DeeployTestConfig + config = DeeployTestConfig( + test_name = test_name_clean, + test_dir = gen_dir, + platform = platform, + simulator = 'gvsoc', + tiling = True, + gen_dir = gen_dir, + build_dir = build_dir, + toolchain = toolchain, + toolchain_install_dir = toolchain_dir, + cmake_args = list(cmake_args) + ["NUM_CORES=8"], + ) + + configure_cmake(config) + build_binary(config) + + if not skipsim: + from testUtils.pytestRunner import run_simulation + result = run_simulation(config) + assert result.success, f"L3Dma test failed with {result.error_count} errors" + assert result.error_count == 0, f"Found {result.error_count} errors" + + +@pytest.mark.deeploy_internal 
+@pytest.mark.parametrize("test_shape", DMA_TEST_SHAPES, ids = param_id_dma) +@pytest.mark.parametrize("doublebuffer", [True, False], ids = param_id_dma) +def test_snitch_dma(test_shape, doublebuffer, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, + skipsim) -> None: + """Test SnitchDma (Snitch L2→L1 DMA transfers).""" + input_shape, tile_shape, node_count, data_type = test_shape + + # Setup paths + test_name = f"testSnitchDma_{param_id_dma(test_shape)}_{param_id_dma(doublebuffer)}" + platform = "Snitch" + gen_dir, _, test_name_clean = get_test_paths(f"test_dma_gen/{test_name}", platform, base_dir = deeploy_test_dir) + + # Generate network + if not skipgen: + deployer, test_inputs, test_outputs = setup_dma_deployer("SnitchDma", input_shape, tile_shape, node_count, + data_type, doublebuffer, gen_dir) + generateTestNetwork(deployer, [test_inputs], [test_outputs], gen_dir, _NoVerbosity) + + # Build and run + worker_id = get_worker_id() + if worker_id == "master": + build_dir = str(Path(deeploy_test_dir) / f"TEST_{platform.upper()}" / "build_master") + else: + build_dir = str(Path(deeploy_test_dir) / f"TEST_{platform.upper()}" / f"build_{worker_id}") + + from testUtils.pytestRunner import DeeployTestConfig + config = DeeployTestConfig( + test_name = test_name_clean, + test_dir = gen_dir, + platform = platform, + simulator = 'gvsoc', + tiling = True, + gen_dir = gen_dir, + build_dir = build_dir, + toolchain = toolchain, + toolchain_install_dir = toolchain_dir, + cmake_args = list(cmake_args) + ["NUM_CORES=9"], + ) + + configure_cmake(config) + build_binary(config) + + if not skipsim: + from testUtils.pytestRunner import run_simulation + result = run_simulation(config) + assert result.success, f"SnitchDma test failed with {result.error_count} errors" + assert result.error_count == 0, f"Found {result.error_count} errors" diff --git a/DeeployTest/test_generic_config.py b/DeeployTest/test_generic_config.py new file mode 100644 index 0000000000..050b8ae0ba 
--- /dev/null +++ b/DeeployTest/test_generic_config.py @@ -0,0 +1,90 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Generic platform.""" + +KERNEL_TESTS = [ + # FP32 Kernels + "Kernels/FP32/ReLU", + "Kernels/FP32/Softmax/Regular", + "Kernels/FP32/Add/Regular", + "Kernels/FP32/Conv/DW_2D_Bias", + "Kernels/FP32/Conv/DW_2D_NoBias", + "Kernels/FP32/Conv/DW_2D_ZeroValuedBias", + "Kernels/FP32/Conv/Regular_2D_Bias", + "Kernels/FP32/Conv/Regular_2D_NoBias", + "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias", + "Kernels/FP32/Div", + "Kernels/FP32/GEMM/Regular", + "Kernels/FP32/MatMul", + "Kernels/FP32/MaxPool", + "Kernels/FP32/Mul", + "Kernels/FP32/LayerNorm", + "Kernels/FP32/RMSNorm", + "Kernels/FP32/Pow/Scalar", + "Kernels/FP32/Pow/Vector", + "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean", + "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add", + "Kernels/FP32/ReduceMean/KeepDims/AllAxes", + "Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3", + "Kernels/FP32/ReduceMean/KeepDims/Axes1_3", + "Kernels/FP32/ReduceMean/KeepDims/Axes2_1", + "Kernels/FP32/ReduceMean/KeepDims/Axis0", + "Kernels/FP32/ReduceMean/KeepDims/Axis2", + "Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add", + "Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean", + "Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add", + "Kernels/FP32/ReduceMean/NoKeepDims/AllAxes", + "Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3", + "Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3", + "Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1", + "Kernels/FP32/ReduceMean/NoKeepDims/Axis0", + "Kernels/FP32/ReduceMean/NoKeepDims/Axis2", + "Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add", + "Kernels/FP32/Reshape/SkipConnection", + "Kernels/FP32/Sqrt", + "Kernels/FP32/Transpose", + # Integer Kernels + "Kernels/Integer/Softmax/Regular", + "Kernels/Integer/Add/MultIO", + "Kernels/Integer/Add/Regular", + "Kernels/Integer/Conv/DW_1D", + 
"Kernels/Integer/Conv/Regular_1D", + "Kernels/Integer/Conv/DW_2D", + "Kernels/Integer/Conv/Regular_2D", + "Kernels/Integer/GEMM/Regular", + "Kernels/Integer/MatMul/Add", + "Kernels/Integer/MatMul/Regular", + "Kernels/Integer/MaxPool", + "Kernels/Integer/Pad/Regular_1D", + "Kernels/Integer/Pad/Regular_2D", + "Kernels/Integer/ReduceMean", + "Kernels/Integer/ReduceSum", + "Kernels/Integer/Slice", + # Special test from TinyViT model layers + "Models/TinyViT/5M/Layers/FP32/ReduceMean", + # Mixed Precision / Quantization + "Kernels/Mixed/Dequant", + "Kernels/Mixed/Quant", + "Models/Transformer_DeepQuant", + "Kernels/Integer/Conv/DW_2D_RQ", + "Kernels/Integer/Conv/Regular_2D_RQ", + "Kernels/Integer/MatMul/Regular_RQ", +] + +# Model tests - paths from generic-models job in workflow +MODEL_TESTS = [ + "Models/Autoencoder1D", + "Models/CCT/FP32/CCT_1_16_16_8", + "Models/CCT/FP32/CCT_2_32_32_128_Opset20", + "Models/CCT/Int/ICCT", + "Models/CCT/Int/ICCT_8", + "Models/CCT/Int/ICCT_ITA", + "Models/CCT/Int/ICCT_ITA_8", + "Models/miniMobileNet", + "Models/miniMobileNetv2", + "Models/CNN_Linear1", + "Models/TinyViT/Demo", + "Models/WaveFormer", + "Models/CNN_Linear2", +] diff --git a/DeeployTest/test_mempool_config.py b/DeeployTest/test_mempool_config.py new file mode 100644 index 0000000000..fa0cfd7715 --- /dev/null +++ b/DeeployTest/test_mempool_config.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Test configuration for MemPool platform. + +This module defines the test lists and default parameters for MemPool platform tests. 
+""" + +# Default number of threads for MemPool +DEFAULT_NUM_THREADS = 16 + +# Kernel tests (individual operators) +KERNEL_TESTS = [ + "Kernels/Integer/Add/MultIO", + "Kernels/Integer/Add/Regular", + "Kernels/Integer/Conv/DW_1D", + "Kernels/Integer/Conv/Regular_1D", + "Kernels/Integer/Conv/DW_2D", + "Kernels/Integer/Conv/Regular_2D", + "Kernels/Integer/GEMM/Regular", + "Kernels/Integer/MatMul/Add", + "Kernels/Integer/MatMul/Regular", + "Kernels/Integer/MaxPool", + "Kernels/Integer/Pad/Regular_1D", + "Kernels/Integer/Pad/Regular_2D", + "Kernels/Integer/ReduceMean", + "Kernels/Integer/ReduceSum", + "Kernels/Integer/Slice", + "Kernels/Integer/Conv/Regular_2D_RQ", + "Kernels/Integer/Conv/DW_2D_RQ", + "Kernels/Integer/GEMM/Regular_RQPerRow", + "Kernels/Integer/MatMul/Regular_RQ", +] + +# Model tests (full networks) +MODEL_TESTS = [ + "Models/CCT/Int/ICCT", + "Models/CCT/Int/ICCT_8", + "Models/CCT/Int/ICCT_ITA", +] diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py new file mode 100644 index 0000000000..794ae6fe7e --- /dev/null +++ b/DeeployTest/test_platforms.py @@ -0,0 +1,728 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import pytest +# Import platform-specific test configurations +from test_chimera_config import KERNEL_TESTS as CHIMERA_KERNEL_TESTS +from test_chimera_config import MODEL_TESTS as CHIMERA_MODEL_TESTS +from test_cortexm_config import KERNEL_TESTS as CORTEXM_KERNEL_TESTS +from test_cortexm_config import MODEL_TESTS as CORTEXM_MODEL_TESTS +from test_generic_config import KERNEL_TESTS as GENERIC_KERNEL_TESTS +from test_generic_config import MODEL_TESTS as GENERIC_MODEL_TESTS +from test_mempool_config import DEFAULT_NUM_THREADS as MEMPOOL_DEFAULT_NUM_THREADS +from test_mempool_config import KERNEL_TESTS as MEMPOOL_KERNEL_TESTS +from test_mempool_config import MODEL_TESTS as MEMPOOL_MODEL_TESTS +from test_siracusa_config import DEFAULT_CORES as 
SIRACUSA_DEFAULT_CORES +from test_siracusa_config import KERNEL_TESTS as SIRACUSA_KERNEL_TESTS +from test_siracusa_config import MODEL_TESTS as SIRACUSA_MODEL_TESTS +from test_siracusa_neureka_tiled_config import DEFAULT_CORES as NEUREKA_DEFAULT_CORES +from test_siracusa_neureka_tiled_config import L2_DOUBLEBUFFER_KERNELS as NEUREKA_L2_DOUBLEBUFFER_KERNELS +from test_siracusa_neureka_tiled_config import L2_SINGLEBUFFER_KERNELS as NEUREKA_L2_SINGLEBUFFER_KERNELS +from test_siracusa_neureka_tiled_config import L2_SINGLEBUFFER_KERNELS_WMEM as NEUREKA_L2_SINGLEBUFFER_KERNELS_WMEM +from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS as NEUREKA_L3_DOUBLEBUFFER_MODELS +from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS_WMEM as NEUREKA_L3_DOUBLEBUFFER_MODELS_WMEM +from test_siracusa_neureka_tiled_config import L3_SINGLEBUFFER_MODELS as NEUREKA_L3_SINGLEBUFFER_MODELS +from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_KERNELS, \ + L2_SINGLEBUFFER_MODELS, L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS +from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES +from test_snitch_config import KERNEL_TESTS as SNITCH_KERNEL_TESTS +from test_snitch_config import MODEL_TESTS as SNITCH_MODEL_TESTS +from test_snitch_tiled_config import L2_SINGLEBUFFER_KERNELS as SNITCH_L2_SINGLEBUFFER_KERNELS +from test_softhier_config import DEFAULT_NUM_CLUSTERS as SOFTHIER_DEFAULT_NUM_CLUSTERS +from test_softhier_config import KERNEL_TESTS as SOFTHIER_KERNEL_TESTS +from test_softhier_config import MODEL_TESTS as SOFTHIER_MODEL_TESTS +from testUtils.pytestRunner import create_test_config, run_and_assert_test + + +def generate_test_params(test_dict, config_name): + """ + Generate test parameters from a dictionary of test names to L1 values. 
+ + Args: + test_dict: Dictionary mapping test_name -> list of L1 values + config_name: Configuration name for test ID (e.g., "L2-singlebuffer") + + Returns: + List of (test_name, l1_value, config_name) tuples + """ + params = [] + for test_name, l1_values in test_dict.items(): + for l1 in l1_values: + params.append((test_name, l1, config_name)) + return params + + +def param_id(param): + """Generate test ID from parameter tuple.""" + test_name, l1, config = param + return f"{test_name}-{l1}-{config}" + + +### Platform Configuration ### +PLATFORM_CONFIGS = { + "generic": { + "platform": "Generic", + "simulator": "host", + "kernel_tests": GENERIC_KERNEL_TESTS, + "model_tests": GENERIC_MODEL_TESTS, + }, + "cortexm": { + "platform": "QEMU-ARM", + "simulator": "qemu", + "kernel_tests": CORTEXM_KERNEL_TESTS, + "model_tests": CORTEXM_MODEL_TESTS, + }, + "mempool": { + "platform": "MemPool", + "simulator": "banshee", + "kernel_tests": MEMPOOL_KERNEL_TESTS, + "model_tests": MEMPOOL_MODEL_TESTS, + "default_num_threads": MEMPOOL_DEFAULT_NUM_THREADS, + }, + "chimera": { + "platform": "Chimera", + "simulator": "gvsoc", + "kernel_tests": CHIMERA_KERNEL_TESTS, + "model_tests": CHIMERA_MODEL_TESTS, + }, + "softhier": { + "platform": "SoftHier", + "simulator": "gvsoc", + "kernel_tests": SOFTHIER_KERNEL_TESTS, + "model_tests": SOFTHIER_MODEL_TESTS, + "default_num_clusters": SOFTHIER_DEFAULT_NUM_CLUSTERS, + }, + "snitch": { + "platform": "Snitch", + "simulator": "gvsoc", + "kernel_tests": SNITCH_KERNEL_TESTS, + "model_tests": SNITCH_MODEL_TESTS, + "default_num_cores": SNITCH_DEFAULT_NUM_CORES, + }, +} + +### Markers summary ### +# Platform markers: +# generic: tests from the generic platform +# cortexm: tests from the cortex-m (QEMU-ARM) platform +# mempool: tests from the MemPool platform +# chimera: tests from the Chimera platform +# softhier: tests from the SoftHier platform +# snitch: tests from the Snitch platform (untiled) +# snitch_tiled: tests from the Snitch platform 
(tiled) +# siracusa: tests from the Siracusa platform (untiled) +# siracusa_tiled: tests from the Siracusa platform (tiled) +# siracusa_neureka_tiled: tests from the Siracusa + Neureka platform (tiled) +# Test type markers: +# kernels: single kernel (or single layer) tests +# models: full model (multiple layer) tests +# Configuration markers (tiled platforms): +# singlebuffer: single-buffer tests +# doublebuffer: double-buffer tests +# l2: L2 default memory level +# l3: L3 default memory level +# wmem: with Neureka weight memory enabled + + +@pytest.mark.generic +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", GENERIC_KERNEL_TESTS, ids = GENERIC_KERNEL_TESTS) +def test_generic_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["generic"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.generic +@pytest.mark.models +@pytest.mark.parametrize("test_name", GENERIC_MODEL_TESTS, ids = GENERIC_MODEL_TESTS) +def test_generic_models(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["generic"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.cortexm +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", CORTEXM_KERNEL_TESTS, ids = CORTEXM_KERNEL_TESTS) +def 
test_cortexm_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["cortexm"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.cortexm +@pytest.mark.models +@pytest.mark.parametrize("test_name", CORTEXM_MODEL_TESTS, ids = CORTEXM_MODEL_TESTS) +def test_cortexm_models(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["cortexm"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.mempool +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", MEMPOOL_KERNEL_TESTS, ids = MEMPOOL_KERNEL_TESTS) +def test_mempool_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["mempool"] + + # Add MemPool-specific CMake args for number of threads + mempool_cmake_args = cmake_args + [f"num_threads={platform_config['default_num_threads']}"] + + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = mempool_cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.mempool 
+@pytest.mark.models +@pytest.mark.parametrize("test_name", MEMPOOL_MODEL_TESTS, ids = MEMPOOL_MODEL_TESTS) +def test_mempool_models(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["mempool"] + + # Add MemPool-specific CMake args for number of threads + mempool_cmake_args = cmake_args + [f"num_threads={platform_config['default_num_threads']}"] + + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = mempool_cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", SIRACUSA_KERNEL_TESTS, ids = SIRACUSA_KERNEL_TESTS) +def test_siracusa_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim, + profile_untiled) -> None: + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + cores = SIRACUSA_DEFAULT_CORES, + profile_untiled = profile_untiled, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa +@pytest.mark.models +@pytest.mark.parametrize("test_name", SIRACUSA_MODEL_TESTS, ids = SIRACUSA_MODEL_TESTS) +def test_siracusa_models(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim, + profile_untiled) -> None: + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + cores = 
SIRACUSA_DEFAULT_CORES, + profile_untiled = profile_untiled, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_tiled +@pytest.mark.kernels +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(L2_SINGLEBUFFER_KERNELS, "L2-singlebuffer"), + ids = param_id, +) +def test_siracusa_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_tiled +@pytest.mark.kernels +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(L2_DOUBLEBUFFER_KERNELS, "L2-doublebuffer"), + ids = param_id, +) +def test_siracusa_tiled_kernels_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_tiled +@pytest.mark.models +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(L2_SINGLEBUFFER_MODELS, "L2-singlebuffer"), + ids = 
param_id, +) +def test_siracusa_tiled_models_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_tiled +@pytest.mark.models +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(L2_DOUBLEBUFFER_MODELS, "L2-doublebuffer"), + ids = param_id, +) +def test_siracusa_tiled_models_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_tiled +@pytest.mark.models +@pytest.mark.singlebuffer +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_params", + generate_test_params(L3_SINGLEBUFFER_MODELS, "L3-singlebuffer"), + ids = param_id, +) +def test_siracusa_tiled_models_l3_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + 
toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L3", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_tiled +@pytest.mark.models +@pytest.mark.doublebuffer +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_params", + generate_test_params(L3_DOUBLEBUFFER_MODELS, "L3-doublebuffer"), + ids = param_id, +) +def test_siracusa_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L3", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.chimera +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", CHIMERA_KERNEL_TESTS, ids = CHIMERA_KERNEL_TESTS) +def test_chimera_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["chimera"] + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.softhier +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", SOFTHIER_KERNEL_TESTS, ids = SOFTHIER_KERNEL_TESTS) +def test_softhier_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + 
platform_config = PLATFORM_CONFIGS["softhier"] + + # Add SoftHier-specific CMake args for number of clusters + softhier_cmake_args = cmake_args + [f"num_clusters={platform_config['default_num_clusters']}"] + + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = softhier_cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.snitch +@pytest.mark.kernels +@pytest.mark.parametrize("test_name", SNITCH_KERNEL_TESTS, ids = SNITCH_KERNEL_TESTS) +def test_snitch_kernels(test_name, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, skipgen, skipsim) -> None: + platform_config = PLATFORM_CONFIGS["snitch"] + + # Add Snitch-specific CMake args for number of cores + snitch_cmake_args = cmake_args + [f"NUM_CORES={platform_config['default_num_cores']}"] + + config = create_test_config( + test_name = test_name, + platform = platform_config["platform"], + simulator = platform_config["simulator"], + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = snitch_cmake_args, + tiling = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.snitch_tiled +@pytest.mark.kernels +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(SNITCH_L2_SINGLEBUFFER_KERNELS, "L2-singlebuffer"), + ids = param_id, +) +def test_snitch_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + + # Add Snitch-specific CMake args + snitch_cmake_args = cmake_args + [f"NUM_CORES={SNITCH_DEFAULT_NUM_CORES}"] + + config = create_test_config( + test_name = test_name, + platform = "Snitch", + simulator = 
"gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = snitch_cmake_args, + tiling = True, + cores = SNITCH_DEFAULT_NUM_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_neureka_tiled +@pytest.mark.kernels +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(NEUREKA_L2_SINGLEBUFFER_KERNELS, "L2-singlebuffer"), + ids = param_id, +) +def test_siracusa_neureka_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_neureka", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = NEUREKA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_neureka_tiled +@pytest.mark.kernels +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(NEUREKA_L2_DOUBLEBUFFER_KERNELS, "L2-doublebuffer"), + ids = param_id, +) +def test_siracusa_neureka_tiled_kernels_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_neureka", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = NEUREKA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = True, + ) + 
run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_neureka_tiled +@pytest.mark.models +@pytest.mark.singlebuffer +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_params", + generate_test_params(NEUREKA_L3_SINGLEBUFFER_MODELS, "L3-singlebuffer"), + ids = param_id, +) +def test_siracusa_neureka_tiled_models_l3_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_neureka", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = NEUREKA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L3", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_neureka_tiled +@pytest.mark.models +@pytest.mark.doublebuffer +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_params", + generate_test_params(NEUREKA_L3_DOUBLEBUFFER_MODELS, "L3-doublebuffer"), + ids = param_id, +) +def test_siracusa_neureka_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_neureka", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = NEUREKA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L3", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_neureka_tiled +@pytest.mark.kernels +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.wmem +@pytest.mark.parametrize( + "test_params", + 
generate_test_params(NEUREKA_L2_SINGLEBUFFER_KERNELS_WMEM, "L2-singlebuffer-wmem"), + ids = param_id, +) +def test_siracusa_neureka_tiled_kernels_l2_singlebuffer_wmem(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_neureka", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = NEUREKA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + gen_args = ["--neureka-wmem"], + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_neureka_tiled +@pytest.mark.models +@pytest.mark.doublebuffer +@pytest.mark.l3 +@pytest.mark.wmem +@pytest.mark.parametrize( + "test_params", + generate_test_params(NEUREKA_L3_DOUBLEBUFFER_MODELS_WMEM, "L3-doublebuffer-wmem"), + ids = param_id, +) +def test_siracusa_neureka_tiled_models_l3_doublebuffer_wmem(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_neureka", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = NEUREKA_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L3", + double_buffer = True, + gen_args = ["--neureka-wmem"], + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py new file mode 100644 index 0000000000..0a77d714e8 --- /dev/null +++ b/DeeployTest/test_siracusa_config.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: 
Apache-2.0 + +PLATFORM_NAME = "Siracusa" +SIMULATOR = "gvsoc" +DEFAULT_CORES = 8 + +KERNEL_TESTS = [ + "Kernels/FP32/ReLU", + "Kernels/FP32/Softmax/CrossEntropy", + "Kernels/FP32/Softmax/CrossEntropyGrad", + "Kernels/FP32/Softmax/Grad", + "Kernels/FP32/Softmax/Regular", + "Kernels/FP32/Add/Regular", + "Kernels/FP32/Conv/DW_2D_Bias", + "Kernels/FP32/Conv/DW_2D_NoBias", + "Kernels/FP32/Conv/DW_2D_ZeroValuedBias", + "Kernels/FP32/Conv/Regular_2D_Bias", + "Kernels/FP32/Conv/Regular_2D_NoBias", + "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias", + "Kernels/FP32/GEMM/Regular", + "Kernels/FP32/MatMul", + "Kernels/FP32/MaxPool", + "Kernels/FP32/Mul", + "Kernels/FP32/LayerNorm", + "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean", + "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add", + "Kernels/FP32/ReduceMean/KeepDims/AllAxes", + "Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3", + "Kernels/FP32/ReduceMean/KeepDims/Axes1_3", + "Kernels/FP32/ReduceMean/KeepDims/Axes2_1", + "Kernels/FP32/ReduceMean/KeepDims/Axis0", + "Kernels/FP32/ReduceMean/KeepDims/Axis2", + "Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add", + "Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean", + "Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add", + "Kernels/FP32/ReduceMean/NoKeepDims/AllAxes", + "Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3", + "Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3", + "Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1", + "Kernels/FP32/ReduceMean/NoKeepDims/Axis0", + "Kernels/FP32/ReduceMean/NoKeepDims/Axis2", + "Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add", + "Kernels/FP32/ReduceSum", + "Kernels/FP32/Reshape/SkipConnection", + "Kernels/FP32/Transpose", + "Kernels/Integer/Hardswish/Regular", + "Kernels/Integer/Softmax/Regular", + "Kernels/Integer/Add/MultIO", + "Kernels/Integer/Add/Regular", + "Kernels/Integer/Concat", + "Kernels/Integer/MatMul/Add", + "Kernels/Integer/MatMul/Regular", + "Kernels/Integer/Pad/Regular_1D", + "Kernels/Integer/Pad/Regular_2D", + 
"Kernels/Integer/RMSNorm", + "Models/TinyViT/5M/Layers/FP32/ReduceMean", + "Others/Backtracking", + "Kernels/Mixed/Dequant", + "Kernels/Mixed/Quant", + "Models/Transformer_DeepQuant", + "Kernels/Integer/Conv/Regular_2D_RQ", + "Kernels/Integer/Conv/DW_2D_RQ", + "Kernels/Integer/Hardswish/Regular_RQ", + "Kernels/Integer/TrueIntegerDiv", +] + +MODEL_TESTS = [ + "Kernels/Integer/Attention", + "Models/CCT/FP32/CCT_1_16_16_8", + "Models/CCT/FP32/CCT_2_32_32_128_Opset20", + "Models/miniMobileNet", + "Models/miniMobileNetv2", + "Models/MLPerf/KeywordSpotting", + "Models/MLPerf/ImageClassification", + "Models/MLPerf/AnomalyDetection", + "Models/TinyViT/Demo", + "Models/CNN_Linear2", +] diff --git a/DeeployTest/test_siracusa_neureka_tiled_config.py b/DeeployTest/test_siracusa_neureka_tiled_config.py new file mode 100644 index 0000000000..68bd3dd96e --- /dev/null +++ b/DeeployTest/test_siracusa_neureka_tiled_config.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Siracusa platform with Neureka accelerator (tiled).""" + +# Siracusa + Neureka platform with tiling support +# Default configuration: 8 cores, gvsoc simulator + +DEFAULT_CORES = 8 + +# L2 single-buffer kernel tests +# Format: dict of {test_name: [L1_sizes]} +L2_SINGLEBUFFER_KERNELS = { + "Kernels/Integer/GEMM/Regular_RQPerColumn": [16000], + "Kernels/Integer/Conv/PW_2D": [32000], + "Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ": [32000], + "Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ": [32000], +} + +# L2 double-buffer kernel tests +L2_DOUBLEBUFFER_KERNELS = { + "Kernels/Integer/GEMM/Regular_RQPerColumn": [16000], + "Kernels/Integer/Conv/PW_2D": [32000], + "Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ": [32000], + "Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ": [32000], +} + +# L3 single-buffer model tests +# Format: dict of {test_name: [L1_sizes]} +L3_SINGLEBUFFER_MODELS = { + "Models/miniMobileNet": [2000], + 
"Kernels/Integer/Attention": [2500], + "Models/Transformer": [15000], + "Models/microLlama/microLlama1": [10000], +} + +# L3 double-buffer model tests +L3_DOUBLEBUFFER_MODELS = { + "Models/miniMobileNet": [2000], + "Kernels/Integer/Attention": [5000], + "Models/Transformer": [30000], +} + +# L2 single-buffer kernel tests with weight memory (neureka-wmem) +L2_SINGLEBUFFER_KERNELS_WMEM = { + "Kernels/Integer/GEMM/Regular_RQPerColumn": [16000], + "Kernels/Integer/Conv/PW_2D": [32000], + "Kernels/Integer/Conv/PW_2D_RQ/Regular_RQ": [32000], + "Kernels/Integer/Conv/PW_2D_RQ/Unsigned_RQ": [32000], +} + +# L3 double-buffer model tests with weight memory (neureka-wmem) +L3_DOUBLEBUFFER_MODELS_WMEM = { + "Models/miniMobileNet": [2000], + "Kernels/Integer/Attention": [3500], + "Models/microLlama/microLlama1": [10000], +} diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py new file mode 100644 index 0000000000..1c0bb0315c --- /dev/null +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -0,0 +1,158 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +PLATFORM_NAME = "Siracusa" +SIMULATOR = "gvsoc" +DEFAULT_CORES = 8 +DEFAULT_L2 = 1024000 +DEFAULT_MEM_ALLOC_STRATEGY = "MiniMalloc" +DEFAULT_SEARCH_STRATEGY = "random-max" + +L2_SINGLEBUFFER_KERNELS = { + "Kernels/FP32/ReLU": [2000], + "Kernels/FP32/Softmax/Regular": [4000], + "Kernels/FP32/Add/Large": [220000], + "Kernels/FP32/Conv/DW_2D_Bias": [7200], + "Kernels/FP32/Conv/DW_2D_NoBias": [7200], + "Kernels/FP32/Conv/DW_2D_ZeroValuedBias": [7200], + "Kernels/FP32/Conv/Regular_2D_Bias": [6600], + "Kernels/FP32/Conv/Regular_2D_NoBias": [1600], + "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [6600], + "Kernels/FP32/GEMM/Regular": [8000], + "Kernels/FP32/MatMul": [2000], + "Kernels/FP32/MaxPool": [2000], + "Kernels/FP32/Mul": [2000], + "Kernels/FP32/LayerNorm": [2000], + "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean": 
[8000], + "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add": [8000], + "Kernels/FP32/ReduceMean/KeepDims/AllAxes": [50000], + "Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3": [50000], + "Kernels/FP32/ReduceMean/KeepDims/Axes1_3": [5000, 50000], + "Kernels/FP32/ReduceMean/KeepDims/Axes2_1": [6200, 50000], + "Kernels/FP32/ReduceMean/KeepDims/Axis0": [8400, 50000], + "Kernels/FP32/ReduceMean/KeepDims/Axis2": [8400, 50000], + "Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add": [8000], + "Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean": [8000], + "Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add": [8000], + "Kernels/FP32/ReduceMean/NoKeepDims/AllAxes": [50000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3": [50000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3": [5000, 50000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1": [6200, 50000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axis0": [8400, 50000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axis2": [8400, 50000], + "Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add": [8000], + "Kernels/FP32/Reshape/SkipConnection": [1400], + "Kernels/FP32/Transpose": [2000], + "Kernels/Integer/Hardswish/Regular": [750], + "Kernels/Integer/Softmax/Regular": [800, 500, 300], + "Kernels/Integer/Concat": [32000, 16000, 8000], + "Kernels/Integer/MatMul/Batch": [20000], + "Kernels/Integer/MatMul/Regular": [64000, 32000, 16000], + "Kernels/Integer/RMSNorm": [2048, 1024, 512], + "Kernels/Integer/Conv/Regular_2D_RQ": [8000, 6000, 4000], + "Kernels/Integer/Conv/DW_2D_RQ": [2561], + "Kernels/Integer/Conv/StriddedPadded_2D_RQ": [600], + "Kernels/Integer/GEMM/Batch_RQ": [20000], + "Kernels/Integer/Hardswish/Regular_RQ": [750], +} + +L2_DOUBLEBUFFER_KERNELS = { + "Kernels/FP32/ReLU": [20], + "Kernels/FP32/Softmax/Regular": [8000], + "Kernels/FP32/Conv/DW_2D_Bias": [10000], + "Kernels/FP32/Conv/DW_2D_NoBias": [9800], + "Kernels/FP32/Conv/DW_2D_ZeroValuedBias": [9800], + "Kernels/FP32/Conv/Regular_2D_Bias": [8800], + 
"Kernels/FP32/Conv/Regular_2D_NoBias": [2000], + "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [8800], + "Kernels/FP32/GEMM/Regular": [8000], + "Kernels/FP32/MatMul": [5000], + "Kernels/FP32/MaxPool": [5000], + "Kernels/FP32/Mul": [2000], + "Kernels/FP32/LayerNorm": [2000], + "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean": [8000], + "Kernels/FP32/ReduceMean/KeepDims/Add_ReduceMean_Add": [8000], + "Kernels/FP32/ReduceMean/KeepDims/AllAxes": [100000], + "Kernels/FP32/ReduceMean/KeepDims/Axes1_2_3": [100000], + "Kernels/FP32/ReduceMean/KeepDims/Axes1_3": [10000, 50000], + "Kernels/FP32/ReduceMean/KeepDims/Axes2_1": [13000, 50000], + "Kernels/FP32/ReduceMean/KeepDims/Axis0": [17000, 50000], + "Kernels/FP32/ReduceMean/KeepDims/Axis2": [17000, 50000], + "Kernels/FP32/ReduceMean/KeepDims/ReduceMean_Add": [8000], + "Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean": [8000], + "Kernels/FP32/ReduceMean/NoKeepDims/Add_ReduceMean_Add": [8000], + "Kernels/FP32/ReduceMean/NoKeepDims/AllAxes": [100000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axes1_2_3": [100000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axes1_3": [10000, 50000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axes2_1": [13000, 50000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axis0": [17000, 50000], + "Kernels/FP32/ReduceMean/NoKeepDims/Axis2": [17000, 50000], + "Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add": [8000], + "Kernels/FP32/Reshape/SkipConnection": [2600], + "Kernels/FP32/Transpose": [2000], + "Kernels/Integer/Hardswish/Regular": [750], + "Kernels/Integer/Softmax/Regular": [1600, 1000, 600], + "Kernels/Integer/Concat": [64000, 32000, 16000], + "Kernels/Integer/MatMul/Regular": [64000, 32000, 16000], + "Kernels/Integer/RMSNorm": [4096, 2048, 1024], + "Kernels/Integer/Conv/Regular_2D_RQ": [8000, 6000, 5000], + "Kernels/Integer/Conv/DW_2D_RQ": [5121], + "Kernels/Integer/Hardswish/Regular_RQ": [800], +} + +L2_SINGLEBUFFER_MODELS = { + "Models/CNN_Linear2": [45000, 30000, 15000], + "Models/miniMobileNet": [60000, 
12000, 6000, 3000], + "Models/miniMobileNetv2": [60000, 16000, 12000, 8000], + "Kernels/Integer/Attention": [60000, 10000, 5000], + "Models/microLlama/microLlama1": [60000, 10000, 5000], + "Models/microLlama/microLlama8": [60000, 10000, 5000], + "Models/microLlama/microLlama8_parallel": [60000, 10000, 5000], + "Models/MLPerf/KeywordSpotting": [64000], + "Models/MLPerf/ImageClassification": [64000], + "Models/MLPerf/AnomalyDetection": [64000], + "Models/CCT/FP32/CCT_1_16_16_8": [64000], + "Models/TinyViT/Demo": [4000], +} + +L2_DOUBLEBUFFER_MODELS = { + "Models/CNN_Linear2": [60000, 45000, 30000], + "Models/miniMobileNet": [60000, 24000, 12000, 6000], + "Models/miniMobileNetv2": [60000, 32000, 24000, 16000], + "Kernels/Integer/Attention": [60000, 20000, 10000, 5000], + "Models/microLlama/microLlama1": [60000, 20000, 10000], + "Models/microLlama/microLlama8": [60000, 20000, 10000], + "Models/microLlama/microLlama8_parallel": [60000, 20000, 10000], + "Models/MLPerf/KeywordSpotting": [128000], + "Models/MLPerf/ImageClassification": [128000], + "Models/MLPerf/AnomalyDetection": [128000], + "Models/CCT/FP32/CCT_1_16_16_8": [128000], + "Models/TinyViT/Demo": [8000], +} + +L3_SINGLEBUFFER_MODELS = { + "Models/CNN_Linear2": [45000, 30000, 16000], + "Models/miniMobileNet": [60000, 12000, 6000], + "Models/miniMobileNetv2": [60000, 16000, 12000, 8000], + "Kernels/Integer/Attention": [60000, 10000, 5000, 2500], + "Models/Transformer": [60000, 30000, 15000], + "Models/microLlama/microLlama1": [60000, 10000, 5000], + "Models/CCT/FP32/CCT_2_32_32_128": [128000], + "Models/CCT_Train/CCT2_FT2": [128000], + "Models/TinyViT/Demo": [4000], +} + +L3_DOUBLEBUFFER_MODELS = { + "Models/CNN_Linear2": [60000, 45000, 30000], + "Models/miniMobileNet": [60000, 24000, 12000, 6000], + "Models/miniMobileNetv2": [60000, 32000, 24000, 16000], + "Kernels/Integer/Attention": [60000, 20000, 10000, 5000], + "Models/Transformer": [60000, 30000, 15000], + "Models/microLlama/microLlama1": [60000, 20000, 
10000], + "Models/microLlama/microLlama8": [60000, 20000, 10000], + "Models/microLlama/microLlama8_parallel": [60000, 20000, 10000], + "Models/CCT/FP32/CCT_2_32_32_128": [128000], + "Models/CCT_Train/CCT2_FT2": [128000], + "Models/TinyViT/Demo": [4000], +} diff --git a/DeeployTest/test_snitch_config.py b/DeeployTest/test_snitch_config.py new file mode 100644 index 0000000000..f51b2ede23 --- /dev/null +++ b/DeeployTest/test_snitch_config.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Snitch platform.""" + +# Snitch platform supports gvsoc, banshee, vsim simulators +# Default configuration: 9 cores + +DEFAULT_NUM_CORES = 9 + +KERNEL_TESTS = [ + "Kernels/FP32/Softmax/Regular", + "Kernels/Integer/Add/Large", + "Kernels/Integer/Add/Regular", + "Kernels/Integer/Softmax/Large", + "Kernels/Integer/Softmax/Regular", + "Kernels/Integer/MatMul/Regular", + "Kernels/Integer/iNoNorm", + "Kernels/Integer/GEMM/Regular_RQPerRow", + "Kernels/Integer/Add/Regular_RQ", + "Kernels/Integer/GEMM/TransB_RQ", +] + +MODEL_TESTS = [] diff --git a/DeeployTest/test_snitch_tiled_config.py b/DeeployTest/test_snitch_tiled_config.py new file mode 100644 index 0000000000..3f81239fce --- /dev/null +++ b/DeeployTest/test_snitch_tiled_config.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Snitch platform (tiled).""" + +# Snitch tiled platform supports gvsoc, banshee, vsim simulators +# Default configuration: 9 cores, L2 default memory level + +DEFAULT_NUM_CORES = 9 + +# L2 single-buffer tests with different L1 sizes +# Format: {test_name: [L1_sizes]} +L2_SINGLEBUFFER_KERNELS = { + "Kernels/Integer/Add/Large": [5000, 10000], + "Kernels/Integer/Softmax/Large": [5000, 10000], + "Kernels/FP32/Softmax/Regular": [2000, 5000, 10000], + "Kernels/FP32/GEMM/Regular": [2000, 5000, 10000], + 
"Kernels/FP32/GEMM/TransB": [2000, 5000, 10000], + "Kernels/Integer/iNoNorm": [5000, 10000], + "Kernels/Integer/Add/Regular_RQ": [5000, 10000], + "Kernels/Integer/GEMM/Regular_RQPerRow": [2000, 5000], +} + +L2_SINGLEBUFFER_MODELS = {} + +# Currently no double-buffer configurations in CI +L2_DOUBLEBUFFER_KERNELS = {} +L2_DOUBLEBUFFER_MODELS = {} diff --git a/DeeployTest/test_softhier_config.py b/DeeployTest/test_softhier_config.py new file mode 100644 index 0000000000..18d2f61c7d --- /dev/null +++ b/DeeployTest/test_softhier_config.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for SoftHier platform.""" + +# SoftHier platform uses gvsoc simulator +# Default configuration: 1 cluster + +DEFAULT_NUM_CLUSTERS = 1 + +KERNEL_TESTS = [ + "Kernels/Integer/Add/Regular", +] + +MODEL_TESTS = [] diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 0b4f59accb..0000000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. 
- - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright 2022 deeploy - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/LICENSES/Apache-2.0.txt b/LICENSES/Apache-2.0.txt new file mode 100644 index 0000000000..137069b823 --- /dev/null +++ b/LICENSES/Apache-2.0.txt @@ -0,0 +1,73 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
+ +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 
+ +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + + (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/LICENSES/CC-BY-ND-4.0.txt b/LICENSES/CC-BY-ND-4.0.txt new file mode 100644 index 0000000000..09a21c7358 --- /dev/null +++ b/LICENSES/CC-BY-ND-4.0.txt @@ -0,0 +1,154 @@ +Creative Commons Attribution-NoDerivatives 4.0 International + + Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses. 
+ +Considerations for licensors: Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. More considerations for licensors. + +Considerations for the public: By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. More considerations for the public. + +Creative Commons Attribution-NoDerivatives 4.0 International Public License + +By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NoDerivatives 4.0 International Public License ("Public License"). 
To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. + +Section 1 – Definitions. + + a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. + + b. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. + + c. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. + + d. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. + + e. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. + + f. 
Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. + + g. Licensor means the individual(s) or entity(ies) granting rights under this Public License. + + h. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. + + i. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. + + j. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. + +Section 2 – Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: + + A. reproduce and Share the Licensed Material, in whole or in part; and + + B. produce and reproduce, but not Share, Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. + + 3. Term. The term of this Public License is specified in Section 6(a). + + 4. Media and formats; technical modifications allowed. 
The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. + + 5. Downstream recipients. + + A. Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. + + B. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. + + 6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this Public License. + + 3. 
To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties. + +Section 3 – License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material, You must: + + A. retain the following if it is supplied by the Licensor with the Licensed Material: + + i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of warranties; + + v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable; + + B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and + + C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. + + 2. For the avoidance of doubt, You do not have permission under this Public License to Share Adapted Material. + + 3. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. + + 4. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. + +Section 4 – Sui Generis Database Rights. 
+ +Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database, provided You do not Share Adapted Material; + + b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. +For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. + +Section 5 – Disclaimer of Warranties and Limitation of Liability. + + a. Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. + + b. 
To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. + + c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. + +Section 6 – Term and Termination. + + a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or + + 2. upon express reinstatement by the Licensor. + + c. For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. + + d. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. + + e. Sections 1, 5, 6, 7, and 8 survive termination of this Public License. + +Section 7 – Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. + + b. 
Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. + +Section 8 – Interpretation. + + a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. + + c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. + + d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. + +Creative Commons is not a party to its public licenses. 
Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org/policies, Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses. + +Creative Commons may be contacted at creativecommons.org. diff --git a/LICENSES/MIT.txt b/LICENSES/MIT.txt new file mode 100644 index 0000000000..d817195dad --- /dev/null +++ b/LICENSES/MIT.txt @@ -0,0 +1,18 @@ +MIT License + +Copyright (c) + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the +following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO +EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/Makefile b/Makefile index bcbdc0f783..5299ca817a 100644 --- a/Makefile +++ b/Makefile @@ -1,30 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna # -# File: Makefile -# -# Created: 30.06.2023 -# -# Copyright (C) 2023, ETH Zurich and University of Bologna. -# -# Authors: -# - Moritz Scherer, ETH Zurich -# - Victor Jung, ETH Zurich -# - Philip Wiese, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
SHELL = /usr/bin/env bash ROOT_DIR := $(patsubst %/,%, $(dir $(abspath $(lastword $(MAKEFILE_LIST))))) @@ -66,7 +42,6 @@ CMAKE ?= cmake LLVM_COMMIT_HASH ?= 1ccb97ef1789b8c574e3fcab0de674e11b189b96 PICOLIBC_COMMIT_HASH ?= 31ff1b3601b379e4cab63837f253f59729ce1fef PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4 -BANSHEE_COMMIT_HASH ?= 0e105921e77796e83d01c2aa4f4cadfa2005b4d9 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 SOFTHIER_COMMIT_HASH ?= 0 # bowwang: to be updated @@ -77,7 +52,22 @@ XTL_VERSION ?= 0.7.5 XSIMD_VERSION ?= 13.2.0 XTENSOR_VERSION ?= 0.25.0 -RUSTUP_CARGO ?= $$(rustup which cargo) +OS := $(shell uname -s) +ARCH:= $(shell uname -m) + +ifeq ($(OS),Linux) + ifeq ($(ARCH),x86_64) + TARGET := x86_64-unknown-linux-gnu + else ifeq ($(ARCH),aarch64) + TARGET := aarch64-unknown-linux-gnu + else + $(error unsupported Linux architecture $(ARCH)) + endif +else ifeq ($(OS),Darwin) + TARGET := aarch64-apple-darwin +else + $(error unsupported platform $(OS)) +endif all: toolchain emulators docs echo-bash @@ -460,7 +450,7 @@ ${TOOLCHAIN_DIR}/softhier: rm ${TOOLCHAIN_DIR}/softhier/soft_hier/flex_cluster_sdk/runtime/include/flex_alloc.h && \ rm ${TOOLCHAIN_DIR}/softhier/soft_hier/flex_cluster_sdk/runtime/include/flex_runtime.h && \ mv ${TOOLCHAIN_DIR}/softhier/soft_hier/flex_cluster_sdk/runtime/flex_memory_deeploy.ld ${TOOLCHAIN_DIR}/softhier/soft_hier/flex_cluster_sdk/runtime/flex_memory.ld && \ - cp ${TOOLCHAIN_DIR}/softhier/soft_hier/flex_cluster_sdk/runtime/deeploy_include/* ${TOOLCHAIN_DIR}/softhier/soft_hier/flex_cluster_sdk/runtime/include + cp ${TOOLCHAIN_DIR}/softhier/soft_hier/flex_cluster_sdk/runtime/deeploy_include/* ${TOOLCHAIN_DIR}/softhier/soft_hier/flex_cluster_sdk/runtime/include ${SOFTHIER_INSTALL_DIR}: ${TOOLCHAIN_DIR}/softhier cp -r ${TOOLCHAIN_DIR}/softhier ${SOFTHIER_INSTALL_DIR} && \ @@ -527,11 +517,11 @@ ${TOOLCHAIN_DIR}/banshee: git submodule 
update --init --recursive && \ git apply ${TOOLCHAIN_DIR}/banshee.patch -${BANSHEE_INSTALL_DIR}: ${TOOLCHAIN_DIR}/banshee +${BANSHEE_INSTALL_DIR}: export LLVM_SYS_150_PREFIX=${LLVM_INSTALL_DIR} && \ - cd ${TOOLCHAIN_DIR}/banshee/ && \ - ${RUSTUP_CARGO} clean && \ - ${RUSTUP_CARGO} install --path . -f + mkdir -p ${BANSHEE_INSTALL_DIR} && cd ${BANSHEE_INSTALL_DIR} && \ + curl -LO https://github.com/pulp-platform/banshee/releases/download/v0.5.0-prebuilt/banshee-0.5.0-$(TARGET).tar.gz && \ + tar -xzf banshee-0.5.0-$(TARGET).tar.gz --strip-components=1 -C . banshee: ${BANSHEE_INSTALL_DIR} @@ -566,10 +556,28 @@ chimera-sdk: ${CHIMERA_SDK_INSTALL_DIR} .PHONY: docs clean-docs format format: - python scripts/run_clang_format.py -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format -ir ./ scripts - autoflake -i -r --remove-all-unused-imports --ignore-init-module-imports --exclude "*/third_party/**" ./ - yapf -ipr -e "third_party/" -e "install/" -e "toolchain/" . - isort --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./ + @echo "Formatting all relevant files..." + @echo " - Format Python Files" + @yapf -ipr -e "*/TEST_*/" -e "*/third_party/" -e "install/" -e "toolchain/" . + @echo " - Format Python Imports" + @isort --quiet --sg "**/TEST_*/*" --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./ + @autoflake -i -r --remove-all-unused-imports --ignore-init-module-imports --exclude "**/third_party/*,**/install/*,**/toolchain/*" . + @echo " - Format C/C++ Files" + @python scripts/run_clang_format.py -e "*/TEST_*/*" -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format -ir ./ scripts + +lint: + @echo "Linting all relevant files..." 
+ @echo " - Lint License Headers" + @scripts/reuse_skip_wrapper.py $$(git ls-files '*.py' '*.c' '*.h' '*.html' '*.rst' '*.yml' '*.yaml') + @echo " - Lint Python Files" + @yapf -rpd -e "*/TEST_*/" -e "*/third_party/" -e "install/" -e "toolchain/" . + @echo " - Lint Python Imports" + @isort --quiet --sg "**/TEST_*/*" --sg "**/third_party/*" --sg "install/*" --sg "toolchain/*" ./ -c + @autoflake --quiet -c -r --remove-all-unused-imports --ignore-init-module-imports --exclude "**/third_party/*,**/install/*,**/toolchain/*" . + @echo " - Lint C/C++ Files" + @python scripts/run_clang_format.py -e "*/TEST_*/*" -e "*/third_party/*" -e "*/install/*" -e "*/toolchain/*" -r --clang-format-executable=${LLVM_INSTALL_DIR}/bin/clang-format . scripts + @echo " - Lint YAML files" + @yamllint . docs: make -C docs html diff --git a/README.md b/README.md index 0e00125731..261d46bfec 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,10 @@ # Deeploy: DNN Compiler for Heterogeneous SoCs -[![Documentation Status](https://img.shields.io/github/deployments/pulp-platform/Deeploy/github-pages?logo=readthedocs&logoColor=white&label=Docs +![PyPI](https://img.shields.io/pypi/v/deeploy-pulp)[![Documentation Status](https://img.shields.io/github/deployments/pulp-platform/Deeploy/github-pages?logo=readthedocs&logoColor=white&label=Docs )](https://pulp-platform.github.io/Deeploy/) -![CI](https://github.com/pulp-platform/Deeploy/actions/workflows/CI.yml/badge.svg?branch=devel) -![Deeploy Docker](https://github.com/pulp-platform/Deeploy/actions/workflows/BuildDockerDeeploy.yml/badge.svg) -![Toolchain Docker](https://github.com/pulp-platform/Deeploy/actions/workflows/BuildDockerToolchain.yml/badge.svg) +[![CI](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-deeploy.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-deeploy.yml) +[![Deeploy 
Docker](https://github.com/pulp-platform/Deeploy/actions/workflows/docker-build-deeploy.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/docker-build-deeploy.yml) [![GitHub last commit](https://img.shields.io/github/last-commit/pulp-platform/Deeploy)](#) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) ![](https://img.shields.io/badge/Provided_by_PULP_Platform-24AF4B) @@ -36,7 +35,7 @@ git submodule update --init --recursive Installing Deeploy is as simple as running: ``` -pip install -e . --extra-index-url=https://pypi.ngc.nvidia.com +pip install -e . ``` However, to run the code generated by Deeploy on a certain target, you need the toolchains and the simulators associated with this platform. @@ -51,19 +50,21 @@ docker run -it --name deeploy_main -v $(pwd):/app/Deeploy ghcr.io/pulp-platform/ Install Deeploy inside the container in editable mode: ``` cd Deeploy -pip install -e . --extra-index-url=https://pypi.ngc.nvidia.com +pip install -e . ``` Congratulations, you installed Deeploy and its dependencies! 
Now, to test your installation let's run one simple test on each platform with the following commands: ``` cd DeeployTest -python testRunner_generic.py -t Tests/Adder -python testRunner_cortexm.py -t Tests/Adder -python testRunner_mempool.py -t Tests/Adder -python testRunner_snitch.py -t Tests/Adder/ -python testRunner_siracusa.py -t Tests/Adder --cores=8 -python testRunner_snitch.py -t Tests/Adder --cores=9 -python testRunner_softhier.py -t Tests/Adder --toolchain=GCC -python testRunner_chimera.py -t Tests/Adder +python deeployRunner_generic.py -t ./Tests/Kernels/Integer/Add/Regular +python deeployRunner_cortexm.py -t ./Tests/Kernels/Integer/Add/Regular +python deeployRunner_mempool.py -t ./Tests/Kernels/Integer/Add/Regular +python deeployRunner_snitch.py -t ./Tests/Kernels/Integer/Add/Regular +python deeployRunner_tiled_snitch.py -t ./Tests/Kernels/Integer/Add/Regular +python deeployRunner_siracusa.py -t ./Tests/Kernels/Integer/Add/Regular +python deeployRunner_tiled_siracusa.py -t ./Tests/Kernels/Integer/Add/Regular +python deeployRunner_tiled_siracusa_w_neureka.py -t ./Tests/Kernels/Integer/Add/Regular +python deeployRunner_softhier.py -t ./Tests/Kernels/Integer/Add/Regular --toolchain=GCC +python deeployRunner_chimera.py -t ./Tests/Kernels/Integer/Add/Regular ``` To restart and connect to the container, run: @@ -72,24 +73,23 @@ docker start -i deeploy_main cd Deeploy ``` -You can find the ONNX file in `DeeployTest/Tests/Adder`, to visualize it, you can use [Netron](https://netron.app/). You can also find the generated code for the platform X in `TEST_X` in `DeeployTest` and you should notice that the generated code for the `Adder` test is very simple. However, this gets more complex when you add tiling. Let's generate the code for a single layer but using tiling this time: +You can find the ONNX file in `DeeployTest/Tests/Kernels/Integer/Add/Regular`, to visualize it, you can use [Netron](https://netron.app/). 
You can also find the generated code for the platform X in `TEST_X` in `DeeployTest` and you should notice that the generated code for the `Add` test is very simple. However, this gets more complex when you add tiling. Let's generate the code for a single layer but using tiling this time: ``` -python testRunner_tiled_siracusa.py -t Tests/testMatMul --cores=8 --l1=16000 +python deeployRunner_tiled_siracusa.py -t Tests/Kernels/Integer/MatMul/Regular --cores=8 --l1=16000 ``` Now you can open the generated code in `DeeployTest/TEST_SIRACUSA/Tests/testMatMul/Network.c` and see how we executed a tiled layer. ## Supported Platforms -| **Platform** | **Hardware** | **Simulator** | -| ---------------------- | ------------------------------------------------------------------------------------------------ | -------------------------------------------------------------- | -| **Generic CPU** | Your laptop CPU :) | Host | -| **CortexM Processors** | [Documentation](https://www.arm.com/products/silicon-ip-cpu/cortex-m/cortex-m4) | [QEMU](https://www.qemu.org/) | -| **MemPool + ITA** | [Mempool paper](https://arxiv.org/abs/2303.17742), [ITA paper](https://arxiv.org/abs/2307.03493) | [Banshee](https://github.com/pulp-platform/banshee) | -| **Siracusa** | [Siracusa paper](https://arxiv.org/abs/2312.14750) | [GVSoC](https://github.com/gvsoc/gvsoc) | -| **Snitch Cluster** | [Snitch paper](https://arxiv.org/abs/2002.10143) | [GVSoC](https://github.com/gvsoc/gvsoc) | -| **SoftHier** | [Repo](https://github.com/gvsoc/gvsoc/tree/soft_hier_release) | [GVSoC](https://github.com/gvsoc/gvsoc/tree/soft_hier_release) | -| **Chimera** | [Repo](https://github.com/pulp-platform/chimera) | [GVSoC](https://github.com/gvsoc/gvsoc) | - +| **Platform** | **Hardware** | **Simulator** | **CI Status** +| ---------------------- | ------------------------------------------------------------------------------------------------ | -------------------------------------------------------------- | 
--------------- +| **Generic CPU** | Your laptop CPU :) | Host | [![CI • Generic](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-generic.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-generic.yml) +| **CortexM Processors** | [Documentation](https://www.arm.com/products/silicon-ip-cpu/cortex-m/cortex-m4) | [QEMU](https://www.qemu.org/) | [![CI • Cortex-M](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-cortexm.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-cortexm.yml) +| **MemPool + ITA** | [Mempool paper](https://arxiv.org/abs/2303.17742), [ITA paper](https://arxiv.org/abs/2307.03493) | [Banshee](https://github.com/pulp-platform/banshee) | [![CI • Mempool](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-mempool.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-mempool.yml) +| **Siracusa** | [Siracusa paper](https://arxiv.org/abs/2312.14750) | [GVSoC](https://github.com/gvsoc/gvsoc) | [![CI • Siracusa](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-siracusa.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-siracusa.yml) [![CI • Siracusa (Tiled)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-siracusa-tiled.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-siracusa-tiled.yml) [![CI • Siracusa + Neureka (Tiled)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-siracusa-neureka-tiled.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-siracusa-neureka-tiled.yml) +| **Snitch Cluster** | [Snitch paper](https://arxiv.org/abs/2002.10143) | [GVSoC](https://github.com/gvsoc/gvsoc) | [![CI • 
Snitch](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-snitch.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-snitch.yml) [![CI • Snitch (Tiled)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-snitch-tiled.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-snitch-tiled.yml) +| **SoftHier** | [Repo](https://github.com/gvsoc/gvsoc/tree/soft_hier_release) | [GVSoC](https://github.com/gvsoc/gvsoc/tree/soft_hier_release) | [![CI • SoftHier](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-softhier.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-softhier.yml) +| **Chimera** | [Repo](https://github.com/pulp-platform/chimera) | [GVSoC](https://github.com/gvsoc/gvsoc) | [![CI • Chimera](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-chimera.yml/badge.svg)](https://github.com/pulp-platform/Deeploy/actions/workflows/ci-platform-chimera.yml) ## Publications @@ -128,5 +128,9 @@ The preprint is available on arXiv @ [arXiv:2408.04413](https://arxiv.org/abs/24 The preprint is available on arXiv @ [arXiv:2408.02473](https://arxiv.org/abs/2408.02473). ## License +All licenses used in this repository are listed under the `LICENSES` folder. Unless specified otherwise in the respective file headers, all code checked into this repository is made available under a permissive license. +- Most software sources and tool scripts are licensed under the [Apache 2.0 license](https://opensource.org/licenses/Apache-2.0). +- Some files in the `scripts` directory are licensed under the [MIT license](https://opensource.org/license/mit). +- Markdown, JSON, text files, pictures, and files in the `DeeployTest/Tests` directory are licensed under the [Creative Commons Attribution 4.0 International](https://creativecommons.org/licenses/by/4.0) license (CC BY 4.0). 
-Unless specified otherwise in the respective file headers, all code checked into this repository is made available under a permissive license. All software sources and tool scripts are licensed under Apache 2.0, except for files contained in the `scripts` directory, which are licensed under the MIT license, and files contained in the `DeeployTest/Tests`directory, which are licensed under the [Creative Commons Attribution-NoDerivates 4.0 International](https://creativecommons.org/licenses/by-nd/4.0) license (CC BY-ND 4.0). \ No newline at end of file +To extract license information for all files, you can use the [reuse tool](https://reuse.software/) by running `reuse spdx` in the root directory of this repository. \ No newline at end of file diff --git a/REUSE.toml b/REUSE.toml new file mode 100644 index 0000000000..b37b8dfee0 --- /dev/null +++ b/REUSE.toml @@ -0,0 +1,36 @@ +version = 1 + +[[annotations]] +path = "DeeployTest/Tests/**" +SPDX-FileCopyrightText = "2023 ETH Zurich and University of Bologna" +SPDX-License-Identifier = "CC-BY-ND-4.0" + +[[annotations]] +path = "docs/_static/**" +SPDX-FileCopyrightText = "2023 ETH Zurich and University of Bologna" +SPDX-License-Identifier = "CC-BY-ND-4.0" + +[[annotations]] +path = "**/*.md" +SPDX-FileCopyrightText = "2023 ETH Zurich and University of Bologna" +SPDX-License-Identifier = "CC-BY-ND-4.0" + +[[annotations]] +path = "**/*.txt" +SPDX-FileCopyrightText = "2023 ETH Zurich and University of Bologna" +SPDX-License-Identifier = "CC-BY-ND-4.0" + +[[annotations]] +path = "**/*.patch" +SPDX-FileCopyrightText = "2023 ETH Zurich and University of Bologna" +SPDX-License-Identifier = "CC-BY-ND-4.0" + +[[annotations]] +path = "*.json" +SPDX-FileCopyrightText = "2023 ETH Zurich and University of Bologna" +SPDX-License-Identifier = "CC-BY-ND-4.0" + +[[annotations]] +path = ".vscode/*.json" +SPDX-FileCopyrightText = "2023 ETH Zurich and University of Bologna" +SPDX-License-Identifier = "CC-BY-ND-4.0" \ No newline at end
of file diff --git a/TargetLibraries/CMSIS/CMakeLists.txt b/TargetLibraries/CMSIS/CMakeLists.txt index 2b6b75d569..03f04ffe09 100644 --- a/TargetLibraries/CMSIS/CMakeLists.txt +++ b/TargetLibraries/CMSIS/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + file(GLOB_RECURSE SOURCES "src/**" ) diff --git a/TargetLibraries/CMSIS/inc/DeeployMath.h b/TargetLibraries/CMSIS/inc/DeeployMath.h index 7aa1b1805c..a40ad3345d 100644 --- a/TargetLibraries/CMSIS/inc/DeeployMath.h +++ b/TargetLibraries/CMSIS/inc/DeeployMath.h @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: DeeployMath.h - * Description: - * - * $Date: 30.12.2021 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/CMSIS/src/Util.c b/TargetLibraries/CMSIS/src/Util.c index 257ea9590a..a48c969b71 100644 --- a/TargetLibraries/CMSIS/src/Util.c +++ b/TargetLibraries/CMSIS/src/Util.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Util.c - * Description: - * - * Date: 15.03.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/Chimera/CMakeLists.txt b/TargetLibraries/Chimera/CMakeLists.txt index f3c437c7fd..f69a741a85 100644 --- a/TargetLibraries/Chimera/CMakeLists.txt +++ b/TargetLibraries/Chimera/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + file(GLOB_RECURSE SOURCES "src/**" ) diff --git a/TargetLibraries/Chimera/inc/DeeployChimeraMath.h b/TargetLibraries/Chimera/inc/DeeployChimeraMath.h index b4bbf0b0b8..756b532850 100644 --- a/TargetLibraries/Chimera/inc/DeeployChimeraMath.h +++ b/TargetLibraries/Chimera/inc/DeeployChimeraMath.h @@ -1,9 +1,7 @@ /* - * Copyright 2025 ETH Zurich. 
- * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna * - * Victor Jung + * SPDX-License-Identifier: Apache-2.0 */ #ifndef __DEEPLOY_CHIMERA_MATH_HEADER_ diff --git a/TargetLibraries/Chimera/src/Add.c b/TargetLibraries/Chimera/src/Add.c index d4bab85258..b90c517fff 100644 --- a/TargetLibraries/Chimera/src/Add.c +++ b/TargetLibraries/Chimera/src/Add.c @@ -1,9 +1,7 @@ /* - * Copyright 2025 ETH Zurich. - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna * - * Victor Jung + * SPDX-License-Identifier: Apache-2.0 */ #include "DeeployChimeraMath.h" diff --git a/TargetLibraries/Generic/CMakeLists.txt b/TargetLibraries/Generic/CMakeLists.txt index 55bcde77bc..b129c1c994 100644 --- a/TargetLibraries/Generic/CMakeLists.txt +++ b/TargetLibraries/Generic/CMakeLists.txt @@ -1,10 +1,14 @@ -file(GLOB_RECURSE SOURCES +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +file(GLOB_RECURSE SOURCES "src/**" ) add_deeploy_library(deeploybasic STATIC ${SOURCES}) -target_include_directories(deeploybasic +target_include_directories(deeploybasic PUBLIC ${CMAKE_CURRENT_LIST_DIR}/inc ) diff --git a/TargetLibraries/Generic/inc/DeeployBasicMath.h b/TargetLibraries/Generic/inc/DeeployBasicMath.h index 7f123e9dfc..4fbbd00bf8 100644 --- a/TargetLibraries/Generic/inc/DeeployBasicMath.h +++ b/TargetLibraries/Generic/inc/DeeployBasicMath.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: DeeployBasicMath.h - * Description: - * - * Date: 14.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich - * - Victor Jung, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_BASIC_MATH_HEADER_ @@ -46,6 +22,7 @@ #include #include + #include #include #include @@ -55,6 +32,8 @@ #include "types.h" #include "utils.h" +#include "kernel/BatchNorm.h" +#include "kernel/ConvTranspose1d_fp32.h" #include "kernel/Convolution.h" #include "kernel/DWConvolution.h" #include "kernel/Div.h" @@ -64,11 +43,15 @@ #include "kernel/Layernorm.h" #include "kernel/MatMul.h" #include "kernel/MaxPool.h" +#include "kernel/MaxPool1d.h" +#include "kernel/Pow.h" #include "kernel/RMSNorm.h" #include "kernel/RQDiv.h" #include "kernel/RQGELU.h" #include "kernel/RQHardswish.h" +#include "kernel/Relu.h" #include "kernel/RequantShift.h" #include "kernel/Softmax.h" +#include "kernel/Sqrt.h" #endif //__DEEPLOY_BASIC_MATH_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/BatchNorm.h b/TargetLibraries/Generic/inc/kernel/BatchNorm.h new file mode 100644 index 0000000000..72703f5fe2 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/BatchNorm.h @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef BATCHNORM_H +#define BATCHNORM_H + +#include +#include + +void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, + const 
float32_t *beta, const float32_t *mean, + const float32_t *var, float32_t *output, int N, int C, + int L); + +#endif // BATCHNORM_H diff --git a/TargetLibraries/Generic/inc/kernel/ConvTranspose1d_fp32.h b/TargetLibraries/Generic/inc/kernel/ConvTranspose1d_fp32.h new file mode 100644 index 0000000000..40ef065992 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/ConvTranspose1d_fp32.h @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef CONV_TRANSPOSE1D_FP32_H +#define CONV_TRANSPOSE1D_FP32_H + +#include +#include + +void ConvTranspose1d_fp32(const float32_t *input, uint32_t C_in, uint32_t W_in, + const float32_t *weight, uint32_t C_out, uint32_t K, + uint32_t stride, const float32_t *bias, bool has_bias, + float32_t *output, uint32_t W_out); + +#endif // CONV_TRANSPOSE1D_FP32_H diff --git a/TargetLibraries/Generic/inc/kernel/Convolution.h b/TargetLibraries/Generic/inc/kernel/Convolution.h index b1ee40039b..8c1d2388ba 100644 --- a/TargetLibraries/Generic/inc/kernel/Convolution.h +++ b/TargetLibraries/Generic/inc/kernel/Convolution.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: Convolution.h - * Description: - * - * Date: 12.05.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2025 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich - * - Calin Diaconu, University of Bologna + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_BASIC_MATH_CONVOLUTION_KERNEL_HEADER_ @@ -67,4 +43,13 @@ void Conv2d_fp32_fp32_fp32_NCHW(const float *__restrict__ pSrcA, uint32_t C, uint32_t SQ, const float *__restrict__ pSrcBias, const bool has_bias, float *__restrict__ pDstC); +void Conv1d_fp32_fp32_fp32( + const float32_t *__restrict__ pSrcA, // Input: [C_in, W_in] + uint32_t C_in, uint32_t W_in, + const float32_t *__restrict__ pSrcB, // Weights: [C_out, C_in, K] + uint32_t C_out, uint32_t K, uint32_t stride, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, // Output: [C_out, W_out] + uint32_t W_out); + #endif //__DEEPLOY_BASIC_MATH_CONVOLUTION_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/DWConvolution.h b/TargetLibraries/Generic/inc/kernel/DWConvolution.h index 23c8447928..b1b17e6c24 100644 --- a/TargetLibraries/Generic/inc/kernel/DWConvolution.h +++ b/TargetLibraries/Generic/inc/kernel/DWConvolution.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: DWConvolution.h - * Description: - * - * Date: 05.01.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2025 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich - * - Calin Diaconu, University of Bologna + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_BASIC_MATH_DWCONVOLUTION_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Div.h b/TargetLibraries/Generic/inc/kernel/Div.h index 4929936667..7e3706db73 100644 --- a/TargetLibraries/Generic/inc/kernel/Div.h +++ b/TargetLibraries/Generic/inc/kernel/Div.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: Div.h - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_BASIC_MATH_DIV_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/GELU.h b/TargetLibraries/Generic/inc/kernel/GELU.h index 702c1c09cc..ffa104b617 100644 --- a/TargetLibraries/Generic/inc/kernel/GELU.h +++ b/TargetLibraries/Generic/inc/kernel/GELU.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: GELU.h - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_GELU_KERNEL_HEADER_ @@ -49,4 +25,8 @@ void GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize); void GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, int32_t dataSize); +void GELU_fp32_fp32_sigmoid_grad_chunk(float32_t *grad_in, float32_t *data_in, + float32_t *grad_out, int32_t start_idx, + int32_t end_idx); + #endif //__DEEPLOY_BASIC_MATH_GELU_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Gemm.h b/TargetLibraries/Generic/inc/kernel/Gemm.h index 19e424f6c4..0472c51cfc 100644 --- a/TargetLibraries/Generic/inc/kernel/Gemm.h +++ b/TargetLibraries/Generic/inc/kernel/Gemm.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Gemm.h - * Description: - * - * Date: 05.01.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_GEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Hardswish.h b/TargetLibraries/Generic/inc/kernel/Hardswish.h index 967d2b676a..e0df42efbb 100644 --- a/TargetLibraries/Generic/inc/kernel/Hardswish.h +++ b/TargetLibraries/Generic/inc/kernel/Hardswish.h @@ -1,28 +1,11 @@ -/* ---------------------------------------------------------------------- -# -# File: Hardswish.h -# -# Last edited: 22.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_BASIC_MATH_HARDSWISH_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_HARDSWISH_KERNEL_HEADER_ #include "DeeployBasicMath.h" @@ -32,4 +15,6 @@ void iHardswish_s8_s32(int8_t *input, int32_t *output, int32_t size, int32_t one_over_six, int32_t three, int32_t six, - int32_t input_offset); \ No newline at end of file + int32_t input_offset); + +#endif // __DEEPLOY_BASIC_MATH_HARDSWISH_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/Generic/inc/kernel/Layernorm.h b/TargetLibraries/Generic/inc/kernel/Layernorm.h index 5518a47d72..381f184dd6 100644 --- a/TargetLibraries/Generic/inc/kernel/Layernorm.h +++ b/TargetLibraries/Generic/inc/kernel/Layernorm.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: Layernorm.h - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_LAYERNORM_KERNEL_HEADER_ @@ -49,4 +25,8 @@ void Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, float32_t *scale, float32_t *bias, float32_t epsilon, int32_t size, int32_t lastDimLength); +void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, + float32_t *grad_out, float32_t *scale, + float32_t *bias, float32_t epsilon, int32_t size, + int32_t lastDimLength); #endif //__DEEPLOY_BASIC_MATH_LAYERNORM_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/MatMul.h b/TargetLibraries/Generic/inc/kernel/MatMul.h index 2b75267893..adf1d8459e 100644 --- a/TargetLibraries/Generic/inc/kernel/MatMul.h +++ b/TargetLibraries/Generic/inc/kernel/MatMul.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: MatMul.h - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_MATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/MaxPool.h b/TargetLibraries/Generic/inc/kernel/MaxPool.h index bedffbf269..585b4967fc 100644 --- a/TargetLibraries/Generic/inc/kernel/MaxPool.h +++ b/TargetLibraries/Generic/inc/kernel/MaxPool.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: MaxPool.h - * Description: - * - * Date: 04.01.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_MAXPOOL_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/MaxPool1d.h b/TargetLibraries/Generic/inc/kernel/MaxPool1d.h new file mode 100644 index 0000000000..26d5e8e460 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/MaxPool1d.h @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef __DEEPLOY_BASIC_MATH_MAXPOOL1D_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_MAXPOOL1D_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +void MaxPool1d_fp32_fp32(float32_t const *__restrict__ pSrcA, uint32_t C, + uint32_t W, uint32_t K, uint32_t S, + float32_t *__restrict__ pDstC); + +#endif \ No newline at end of file diff --git a/TargetLibraries/Generic/inc/kernel/Pow.h b/TargetLibraries/Generic/inc/kernel/Pow.h new file mode 100644 index 0000000000..f1d64859ed --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Pow.h @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * This file implements the element-wise binary power operation. 
+ */ + +#ifndef __DEEPLOY_MATH_POW_KERNEL_HEADER_ +#define __DEEPLOY_MATH_POW_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +void Pow_fp32_fp32_fp32(const float32_t *__restrict__ data_in, + const float32_t *__restrict__ exponent, + float32_t *__restrict__ data_out, int32_t size); + +void Pow_fp32_scalar_fp32(const float32_t *__restrict__ data_in, + float32_t exponent, float32_t *__restrict__ data_out, + int32_t size); + +#endif diff --git a/TargetLibraries/Generic/inc/kernel/RMSNorm.h b/TargetLibraries/Generic/inc/kernel/RMSNorm.h index a960b4a21c..576f10cf5e 100644 --- a/TargetLibraries/Generic/inc/kernel/RMSNorm.h +++ b/TargetLibraries/Generic/inc/kernel/RMSNorm.h @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: RMSNorm.h - * Description: - * - * $Date: 20.02.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_RMSNORM_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/RQDiv.h b/TargetLibraries/Generic/inc/kernel/RQDiv.h index 3e79d01a7e..69bcc12dd4 100644 --- a/TargetLibraries/Generic/inc/kernel/RQDiv.h +++ b/TargetLibraries/Generic/inc/kernel/RQDiv.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: RQDiv.h - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_BASIC_MATH_RQDIV_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/RQGELU.h b/TargetLibraries/Generic/inc/kernel/RQGELU.h index 8965a34f8c..3c866950e9 100644 --- a/TargetLibraries/Generic/inc/kernel/RQGELU.h +++ b/TargetLibraries/Generic/inc/kernel/RQGELU.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: RQGELU.h - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_BASIC_MATH_RQGELU_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/RQHardswish.h b/TargetLibraries/Generic/inc/kernel/RQHardswish.h index 6e2ea7eb79..672d312811 100644 --- a/TargetLibraries/Generic/inc/kernel/RQHardswish.h +++ b/TargetLibraries/Generic/inc/kernel/RQHardswish.h @@ -1,28 +1,11 @@ -/* ---------------------------------------------------------------------- -# -# File: RQHardswish.h -# -# Last edited: 23.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_BASIC_MATH_RQIHARDSWISH_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_RQIHARDSWISH_KERNEL_HEADER_ #include "DeeployBasicMath.h" @@ -34,3 +17,5 @@ void RQiHardswish_s8_s8(int8_t *input, int8_t *output, int32_t size, int32_t one_over_six, int32_t three, int32_t six, int32_t input_offset, int32_t output_offset, int32_t mul, int32_t add, int32_t shift); + +#endif // __DEEPLOY_BASIC_MATH_RQIHARDSWISH_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Relu.h b/TargetLibraries/Generic/inc/kernel/Relu.h index c0da5823c9..02e62df95c 100644 --- a/TargetLibraries/Generic/inc/kernel/Relu.h +++ b/TargetLibraries/Generic/inc/kernel/Relu.h @@ -1,33 +1,14 @@ - -/* ===================================================================== - * Title: Relu.h - * Description: - * - * Date: 23.1.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_BASIC_MATH_RELU_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_RELU_KERNEL_HEADER_ + #include "DeeployBasicMath.h" -void Relu_fp32_fp32(float32_t *input, float32_t *output, int32_t size); \ No newline at end of file +void Relu_fp32_fp32(float32_t *input, float32_t *output, int32_t size); + +#endif // __DEEPLOY_BASIC_MATH_RELU_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/Generic/inc/kernel/RequantShift.h b/TargetLibraries/Generic/inc/kernel/RequantShift.h index cc1c1cd36c..fe550fac76 100644 --- a/TargetLibraries/Generic/inc/kernel/RequantShift.h +++ b/TargetLibraries/Generic/inc/kernel/RequantShift.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: RequantShift.h - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_REQUANTSHIFT_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Softmax.h b/TargetLibraries/Generic/inc/kernel/Softmax.h index 56a46f6e0b..e5e817d218 100644 --- a/TargetLibraries/Generic/inc/kernel/Softmax.h +++ b/TargetLibraries/Generic/inc/kernel/Softmax.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: Softmax.h - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_SOFTMAX_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/Sqrt.h b/TargetLibraries/Generic/inc/kernel/Sqrt.h new file mode 100644 index 0000000000..2c14e43bd3 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/Sqrt.h @@ -0,0 +1,22 @@ +/* + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_BASIC_MATH_SQRT_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_SQRT_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +/* + * Square root operation - computes sqrt for each element + */ + +/******************************************************************************/ +/* Sqrt */ +/******************************************************************************/ + +void Sqrt_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size); + +#endif //__DEEPLOY_BASIC_MATH_SQRT_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/macros.h b/TargetLibraries/Generic/inc/macros.h index bbf7a01f8f..d97cfecb7c 100644 --- a/TargetLibraries/Generic/inc/macros.h +++ b/TargetLibraries/Generic/inc/macros.h @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: macros.h - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_BASIC_MATH_MACROS_HEADER_ diff --git a/TargetLibraries/Generic/inc/types.h b/TargetLibraries/Generic/inc/types.h index 60ed361307..be75317984 100644 --- a/TargetLibraries/Generic/inc/types.h +++ b/TargetLibraries/Generic/inc/types.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: types.h - * Description: - * - * Date: 11.11.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2024 ETH Zurich and University of Bologna. - * - * Authors: - * - Francesco Conti, University of Bologna + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_TYPES_HEADER_ diff --git a/TargetLibraries/Generic/inc/utils.h b/TargetLibraries/Generic/inc/utils.h index 53aad633b7..f64642908e 100644 --- a/TargetLibraries/Generic/inc/utils.h +++ b/TargetLibraries/Generic/inc/utils.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: utils.h - * Description: - * - * Date: 06.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_BASIC_MATH_UTIL_HEADER_ diff --git a/TargetLibraries/Generic/src/BatchNorm_fp32.c b/TargetLibraries/Generic/src/BatchNorm_fp32.c new file mode 100644 index 0000000000..9b30a30207 --- /dev/null +++ b/TargetLibraries/Generic/src/BatchNorm_fp32.c @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployBasicMath.h" + +void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, + const float32_t *beta, const float32_t *mean, + const float32_t *var, float32_t *output, int N, int C, + int L) { + const float epsilon = 1e-5f; +#pragma omp parallel for + for (int c = 0; c < C; ++c) { + float32_t c_mean = mean[c]; + float32_t c_var = var[c]; + float32_t c_gamma = gamma[c]; + float32_t c_beta = beta[c]; + float32_t denom = sqrtf(c_var + epsilon); + for (int n = 0; n < N; ++n) { + for (int l = 0; l < L; ++l) { + int index = n * C * L + c * L + l; + float32_t x = input[index]; + float32_t norm = (x - c_mean) / denom; + output[index] = c_gamma * norm + c_beta; + } + } + } +} diff --git a/TargetLibraries/Generic/src/ConvTranspose1d_fp32.c b/TargetLibraries/Generic/src/ConvTranspose1d_fp32.c new file mode 100644 index 0000000000..362058734e --- /dev/null +++ b/TargetLibraries/Generic/src/ConvTranspose1d_fp32.c @@ -0,0 +1,50 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployBasicMath.h" + +void ConvTranspose1d_fp32(const float32_t *input, uint32_t C_in, uint32_t W_in, + const float32_t *weight, uint32_t C_out, uint32_t K, + uint32_t stride, const float32_t *bias, bool has_bias, + float32_t *output, uint32_t W_out) { + /* + input: [C_in, W_in] + weight: [C_in, C_out, K] + output: [C_out, W_out] + bias: [C_out] optionally + + */ + + // Output initialization + for (uint32_t c = 0; c < C_out; ++c) { + for (uint32_t w = 0; w < W_out; ++w) { + output[c * W_out + w] = 0.0f; + } + } 
+ + // For each output channel + for (uint32_t cout = 0; cout < C_out; ++cout) { + // For each input channel + for (uint32_t cin = 0; cin < C_in; ++cin) { + // For each input width + for (uint32_t w_in = 0; w_in < W_in; ++w_in) { + float32_t val = input[cin * W_in + w_in]; + // Transposed convolution: output width is calculated based on stride + for (uint32_t k = 0; k < K; ++k) { + uint32_t w_out = w_in * stride + k; + if (w_out < W_out) { + // weight indexing: weight[cin, cout, k] + float32_t wgt = weight[cin * (C_out * K) + cout * K + k]; + output[cout * W_out + w_out] += val * wgt; + } + } + } + } + if (has_bias) { + for (uint32_t w = 0; w < W_out; ++w) { + output[cout * W_out + w] += bias[cout]; + } + } + } +} diff --git a/TargetLibraries/Generic/src/Convolution_fp32.c b/TargetLibraries/Generic/src/Convolution_fp32.c index 313f2f47e3..e073e18125 100644 --- a/TargetLibraries/Generic/src/Convolution_fp32.c +++ b/TargetLibraries/Generic/src/Convolution_fp32.c @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: Convolution_float32.c - * Description: Float32 version of Conv2D with NCHW format (pre-padded input) - * - * Date: 12.05.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich - * - Calin Diaconu, University of Bologna + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" @@ -90,3 +66,32 @@ void Conv2d_fp32_fp32_fp32_NCHW(const float32_t *__restrict__ pSrcA, uint32_t C, } } } + +void Conv1d_fp32_fp32_fp32( + const float32_t *__restrict__ pSrcA, // Input: [C_in, W_in] + uint32_t C_in, uint32_t W_in, + const float32_t *__restrict__ pSrcB, // Weights: [C_out, C_in, K] + uint32_t C_out, uint32_t K, uint32_t stride, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, // Output: [C_out, W_out] + uint32_t W_out) { + uint32_t c_out, c_in, w_out, k, w_in; + for (c_out = 0; c_out < C_out; ++c_out) { + for (w_out = 0; w_out < W_out; ++w_out) { + float32_t sum = 0.0f; + for (c_in = 0; c_in < C_in; ++c_in) { + for (k = 0; k < K; ++k) { + w_in = w_out * stride + k; + if (w_in < W_in) { + sum += pSrcA[c_in * W_in + w_in] * + pSrcB[c_out * C_in * K + c_in * K + k]; + } + } + } + if (has_bias) { + sum += pSrcBias[c_out]; + } + pDstC[c_out * W_out + w_out] = sum; + } + } +} \ No newline at end of file diff --git a/TargetLibraries/Generic/src/Convolution_s8.c b/TargetLibraries/Generic/src/Convolution_s8.c index a250124a5e..c5c2ffecfa 100644 --- a/TargetLibraries/Generic/src/Convolution_s8.c +++ b/TargetLibraries/Generic/src/Convolution_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Convolution_s8.c - * Description: - * - * Date: 04.01.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/DWConvolution_fp32.c b/TargetLibraries/Generic/src/DWConvolution_fp32.c index 355433d427..b9b5f0b638 100644 --- a/TargetLibraries/Generic/src/DWConvolution_fp32.c +++ b/TargetLibraries/Generic/src/DWConvolution_fp32.c @@ -1,27 +1,7 @@ -/* ===================================================================== - * Title: DWConvolution_float32.c - * Description: Float32 version of Conv2D with NCHW format (pre-padded input) - * - * Date: 12.05.2025 - * - * Copyright (C) 2025 ETH Zurich and University of Bologna. - * - * Authors: - * - Calin Diaconu, University of Bologna +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/DWConvolution_s8.c b/TargetLibraries/Generic/src/DWConvolution_s8.c index 1a7da6aa1d..bf3736fdbd 100644 --- a/TargetLibraries/Generic/src/DWConvolution_s8.c +++ b/TargetLibraries/Generic/src/DWConvolution_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: DWConvolution_s8.c - * Description: - * - * Date: 05.01.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Div_fp32.c b/TargetLibraries/Generic/src/Div_fp32.c index c8f93dae0d..77214e5b84 100644 --- a/TargetLibraries/Generic/src/Div_fp32.c +++ b/TargetLibraries/Generic/src/Div_fp32.c @@ -1,29 +1,7 @@ -/* ===================================================================== - * Title: Div_fp32.c - * Description: - * - * $Date: 23.01.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Div_s32.c b/TargetLibraries/Generic/src/Div_s32.c index c2b66ad3f3..f3bb65b6ba 100644 --- a/TargetLibraries/Generic/src/Div_s32.c +++ b/TargetLibraries/Generic/src/Div_s32.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Div_s32.c - * Description: - * - * $Date: 19.12.2022 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/GELU_fp32.c b/TargetLibraries/Generic/src/GELU_fp32.c index 880d715fc6..6cafed1986 100644 --- a/TargetLibraries/Generic/src/GELU_fp32.c +++ b/TargetLibraries/Generic/src/GELU_fp32.c @@ -1,29 +1,7 @@ -/* ===================================================================== - * Title: GELU_fp32.c - * Description: - * - * $Date: 19.12.2022 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployBasicMath.h" @@ -52,3 +30,24 @@ void GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, data_out[i] = x * sigmoid; } } + +void GELU_fp32_fp32_sigmoid_grad_chunk(float32_t *grad_in, float32_t *data_in, + float32_t *grad_out, int32_t start_idx, + int32_t end_idx) { + // d(Gelu)/dx ≈ sigmoid(1.702 * x) + x * sigmoid(1.702 * x) * (1 - + // sigmoid(1.702 * x)) * 1.702 + const float COEFF = 1.702f; + for (int32_t i = start_idx; i < end_idx; i++) { + float x = data_in[i]; + float upstream_grad = grad_in[i]; + float z = COEFF * x; + float sigmoid_z = 1.0f / (1.0f + expf(-z)); + + // d(Gelu)/dx = sigmoid(1.702*x) + x * sigmoid(1.702*x) * + // (1-sigmoid(1.702*x)) * 1.702 + float sigmoid_derivative = sigmoid_z * (1.0f - sigmoid_z) * COEFF; + float gelu_derivative = sigmoid_z + x * sigmoid_derivative; + + grad_out[i] = upstream_grad * gelu_derivative; + } +} diff --git a/TargetLibraries/Generic/src/GELU_s8.c b/TargetLibraries/Generic/src/GELU_s8.c index 91f42546e6..d9940e9553 100644 --- a/TargetLibraries/Generic/src/GELU_s8.c +++ b/TargetLibraries/Generic/src/GELU_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: GELU_s8.c - * Description: - * - * $Date: 19.12.2022 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Gemm_fp32.c b/TargetLibraries/Generic/src/Gemm_fp32.c index 512969c100..a3c6418508 100644 --- a/TargetLibraries/Generic/src/Gemm_fp32.c +++ b/TargetLibraries/Generic/src/Gemm_fp32.c @@ -1,31 +1,9 @@ -/* ===================================================================== - * Title: GEMM_fp32.c - * Description: - * - * Date: 24.01.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ + #include "DeeployBasicMath.h" void Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, diff --git a/TargetLibraries/Generic/src/Gemm_s8.c b/TargetLibraries/Generic/src/Gemm_s8.c index bd5e807797..8d53e21f2e 100644 --- a/TargetLibraries/Generic/src/Gemm_s8.c +++ b/TargetLibraries/Generic/src/Gemm_s8.c @@ -1,29 +1,7 @@ -/* ===================================================================== - * Title: Gemm_s8.c - * Description: - * - * $Date: 05.01.2023 - * - * ===================================================================== */ /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Hardswish_s8.c b/TargetLibraries/Generic/src/Hardswish_s8.c index fb7c1f9213..384e49c98d 100644 --- a/TargetLibraries/Generic/src/Hardswish_s8.c +++ b/TargetLibraries/Generic/src/Hardswish_s8.c @@ -1,28 +1,8 @@ -/* ---------------------------------------------------------------------- -# -# File: Hardswish_s8.c -# -# Last edited: 22.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Layernorm_fp32.c b/TargetLibraries/Generic/src/Layernorm_fp32.c index 2def4fcb4f..fb68df8dfe 100644 --- a/TargetLibraries/Generic/src/Layernorm_fp32.c +++ b/TargetLibraries/Generic/src/Layernorm_fp32.c @@ -1,29 +1,7 @@ -/* ===================================================================== - * Title: Layernorm_fp32.c - * Description: - * - * $Date: 22.01.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" @@ -58,3 +36,58 @@ void Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, } } } + +void LayernormGrad_fp32_fp32(float32_t *grad_in, float32_t *data_in, + float32_t *grad_out, float32_t *scale, + float32_t *bias, float32_t epsilon, int32_t size, + int32_t lastDimLength) { + float32_t mean, variance, std, inv_std; + float32_t sum_dy, sum_dy_scaled, sum_dy_scaled_centered; + float32_t centered_input; + + for (int i = 0; i < (size / lastDimLength); i++) { + // RW: Step 1: Recompute mean and variance from forward pass + mean = 0.0f; + variance = 0.0f; + + for (int j = 0; j < lastDimLength; j++) { + mean += data_in[j + i * lastDimLength]; + } + mean = mean / lastDimLength; + + for (int j = 0; j < lastDimLength; j++) { + centered_input = data_in[j + i * lastDimLength] - mean; + variance += centered_input * centered_input; + } + variance = variance / lastDimLength; + variance += epsilon; + std = sqrtf(variance); + inv_std = 1.0f / std; + + // RW: Step 2: Compute intermediate values needed for gradient calculation + sum_dy = 0.0f; + sum_dy_scaled_centered = 0.0f; + + // RW: Calculate sum(dy) and sum(dy * scale * (x - mean) / std) + for (int j = 0; j < lastDimLength; j++) { + sum_dy += grad_in[j + i * lastDimLength]; + centered_input = data_in[j + i * lastDimLength] - mean; + sum_dy_scaled_centered += + grad_in[j + i * lastDimLength] * scale[j] * centered_input * inv_std; + } + + // RW: Step 3: Calculate gradients for each element + for (int j = 0; j < lastDimLength; j++) { + centered_input = data_in[j + i * lastDimLength] - mean; + + // Gradient formula: + // dx = (1/std) * scale * (dy - (1/N)*sum(dy) - + // (x-mean)/(N*std^2)*sum(dy*scale*(x-mean)/std)) + grad_out[j + i * lastDimLength] = + inv_std * scale[j] * + (grad_in[j + i * lastDimLength] - (sum_dy / lastDimLength) - + (centered_input * inv_std * inv_std / 
lastDimLength) * + sum_dy_scaled_centered); + } + } +} diff --git a/TargetLibraries/Generic/src/Layernorm_s8.c b/TargetLibraries/Generic/src/Layernorm_s8.c index a7079a9f8a..281cef9d0c 100644 --- a/TargetLibraries/Generic/src/Layernorm_s8.c +++ b/TargetLibraries/Generic/src/Layernorm_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Layernorm_s8.c - * Description: - * - * $Date: 19.12.2022 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/MatMul_fp32.c b/TargetLibraries/Generic/src/MatMul_fp32.c index f7df3e5321..d8226d8fa8 100644 --- a/TargetLibraries/Generic/src/MatMul_fp32.c +++ b/TargetLibraries/Generic/src/MatMul_fp32.c @@ -1,31 +1,9 @@ -/* ===================================================================== - * Title: GEMM_fp32.c - * Description: - * - * Date: 24.01.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ + #include "DeeployBasicMath.h" void MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, diff --git a/TargetLibraries/Generic/src/MatMul_s8.c b/TargetLibraries/Generic/src/MatMul_s8.c index 2b53baa391..20e98b6da5 100644 --- a/TargetLibraries/Generic/src/MatMul_s8.c +++ b/TargetLibraries/Generic/src/MatMul_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: MatMul_s8.c - * Description: - * - * $Date: 19.12.2022 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/MaxPool1D_fp32.c b/TargetLibraries/Generic/src/MaxPool1D_fp32.c new file mode 100644 index 0000000000..a8686503b4 --- /dev/null +++ b/TargetLibraries/Generic/src/MaxPool1D_fp32.c @@ -0,0 +1,27 @@ +// SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployBasicMath.h" +#include <math.h> + +void MaxPool1d_fp32_fp32(float32_t const *__restrict__ pSrcA, uint32_t C, + uint32_t W, uint32_t K, uint32_t S, + float32_t *__restrict__ pDstC) { + uint32_t W_out = (W - K) / S + 1; + for (uint32_t c = 0; c < C; ++c) { + for (uint32_t w_out = 0; w_out < W_out; ++w_out) { + float32_t max = -INFINITY; + for (uint32_t k = 0; k < K; ++k) { + uint32_t w_in = w_out * S + k; + if (w_in >= W) + continue; + float32_t tmp = pSrcA[c * W + w_in]; + if (tmp > max) { + max = tmp; + } + } + pDstC[c * W_out + w_out] = max; + } + } +} diff --git a/TargetLibraries/Generic/src/MaxPool_fp32.c b/TargetLibraries/Generic/src/MaxPool_fp32.c index 82211a4bb7..407f32c8f6 100644 --- a/TargetLibraries/Generic/src/MaxPool_fp32.c +++ b/TargetLibraries/Generic/src/MaxPool_fp32.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: MaxPool_fp32.c - * Description: - * - * Date: 27.01.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/MaxPool_s8.c b/TargetLibraries/Generic/src/MaxPool_s8.c index 3e482f824a..ab46ab8d65 100644 --- a/TargetLibraries/Generic/src/MaxPool_s8.c +++ b/TargetLibraries/Generic/src/MaxPool_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: MaxPool_s8.c - * Description: - * - * Date: 04.01.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Pow_fp32.c b/TargetLibraries/Generic/src/Pow_fp32.c new file mode 100644 index 0000000000..89c07c6bda --- /dev/null +++ b/TargetLibraries/Generic/src/Pow_fp32.c @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployBasicMath.h" +#include <math.h> + +void Pow_fp32_fp32_fp32(const float32_t *__restrict__ data_in, + const float32_t *__restrict__ exponent, + float32_t *__restrict__ data_out, int32_t size) { + for (int i = 0; i < size; i++) { + data_out[i] = powf(data_in[i], exponent[i]); + } +} + +void Pow_fp32_scalar_fp32(const float32_t *__restrict__ data_in, + float32_t exponent, float32_t *__restrict__ data_out, + int32_t size) { + for (int i = 0; i < size; i++) { + data_out[i] = powf(data_in[i], exponent); + } +} diff --git a/TargetLibraries/Generic/src/RQDiv_s8.c b/TargetLibraries/Generic/src/RQDiv_s8.c index 2ac7524969..30f83d5cbe 100644 --- a/TargetLibraries/Generic/src/RQDiv_s8.c +++ b/TargetLibraries/Generic/src/RQDiv_s8.c @@ -1,29 +1,7 @@ -/* ===================================================================== - * Title: RQDiv_s8.c - * Description: - * - * $Date: 19.12.2022 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/RQGELU_s8.c b/TargetLibraries/Generic/src/RQGELU_s8.c index 0e033a26a1..6113e5ddc5 100644 --- a/TargetLibraries/Generic/src/RQGELU_s8.c +++ b/TargetLibraries/Generic/src/RQGELU_s8.c @@ -1,29 +1,7 @@ -/* ===================================================================== - * Title: RQGELU_s8.c - * Description: - * - * $Date: 19.12.2022 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/RQHardswish.c b/TargetLibraries/Generic/src/RQHardswish.c index 6826c2c173..b780b166c9 100644 --- a/TargetLibraries/Generic/src/RQHardswish.c +++ b/TargetLibraries/Generic/src/RQHardswish.c @@ -1,28 +1,8 @@ -/* ---------------------------------------------------------------------- -# -# File: RQHardswish.c -# -# Last edited: 23.02.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Relu_fp32.c b/TargetLibraries/Generic/src/Relu_fp32.c index 775fb7c7f0..81b65f8516 100644 --- a/TargetLibraries/Generic/src/Relu_fp32.c +++ b/TargetLibraries/Generic/src/Relu_fp32.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: Softmax_fp8.c - * Description: - * - * $Date: 22.01.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/RequantShift_s8.c b/TargetLibraries/Generic/src/RequantShift_s8.c index c9c55df32c..56a84260e0 100644 --- a/TargetLibraries/Generic/src/RequantShift_s8.c +++ b/TargetLibraries/Generic/src/RequantShift_s8.c @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: RequantShift_s8.c - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Softmax_fp32.c b/TargetLibraries/Generic/src/Softmax_fp32.c index 64c4ba6f3e..94673a661d 100644 --- a/TargetLibraries/Generic/src/Softmax_fp32.c +++ b/TargetLibraries/Generic/src/Softmax_fp32.c @@ -1,29 +1,9 @@ -/* ===================================================================== - * Title: Softmax_fp32.c - * Description: - * - * $Date: 23.01.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ + #include "DeeployBasicMath.h" #include <math.h> diff --git a/TargetLibraries/Generic/src/Softmax_s8.c b/TargetLibraries/Generic/src/Softmax_s8.c index ae5b40eb55..be073f01d0 100644 --- a/TargetLibraries/Generic/src/Softmax_s8.c +++ b/TargetLibraries/Generic/src/Softmax_s8.c @@ -1,29 +1,7 @@ -/* ===================================================================== - * Title: Softmax_s8.c - * Description: - * - * $Date: 27.03.2023 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna.
- * - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/Sqrt_fp32.c b/TargetLibraries/Generic/src/Sqrt_fp32.c new file mode 100644 index 0000000000..06327fda4e --- /dev/null +++ b/TargetLibraries/Generic/src/Sqrt_fp32.c @@ -0,0 +1,13 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployBasicMath.h" + +void Sqrt_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size) { + for (int i = 0; i < size; i++) { + data_out[i] = sqrtf(data_in[i]); + } +} diff --git a/TargetLibraries/Generic/src/Util.c b/TargetLibraries/Generic/src/Util.c index e576e0114c..e73b6ff1d8 100644 --- a/TargetLibraries/Generic/src/Util.c +++ b/TargetLibraries/Generic/src/Util.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Util.c - * Description: - * - * Date: 06.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/Generic/src/iRMSNorm_s8.c b/TargetLibraries/Generic/src/iRMSNorm_s8.c index e84c4075a0..529935a929 100644 --- a/TargetLibraries/Generic/src/iRMSNorm_s8.c +++ b/TargetLibraries/Generic/src/iRMSNorm_s8.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: iRMSNorm_s8.c - * Description: - * - * $Date: 20.02.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployBasicMath.h" diff --git a/TargetLibraries/MemPool/CMakeLists.txt b/TargetLibraries/MemPool/CMakeLists.txt index d22180cd52..be2f481109 100644 --- a/TargetLibraries/MemPool/CMakeLists.txt +++ b/TargetLibraries/MemPool/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + file(GLOB_RECURSE SOURCES "src/**" ) diff --git a/TargetLibraries/MemPool/cmake/mempool-runtime.cmake b/TargetLibraries/MemPool/cmake/mempool-runtime.cmake index 96cebfd241..aa32e20d27 100644 --- a/TargetLibraries/MemPool/cmake/mempool-runtime.cmake +++ b/TargetLibraries/MemPool/cmake/mempool-runtime.cmake @@ -1,3 +1,7 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + set(MEMPOOL_HOME $ENV{MEMPOOL_HOME}) set(MEMPOOL_RUNTIME_HOME ${MEMPOOL_HOME}/software/runtime) diff --git a/TargetLibraries/MemPool/inc/CycleCounter.h b/TargetLibraries/MemPool/inc/CycleCounter.h index 52e24d2ee2..a1516f471e 100644 --- a/TargetLibraries/MemPool/inc/CycleCounter.h +++ b/TargetLibraries/MemPool/inc/CycleCounter.h @@ -1,51 +1,28 @@ -/* ===================================================================== - * Title: CycleCounter.h - * Description: - * - * Date: 06.12.2022 - * - * ===================================================================== */ - -/* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEEPLOY_MATH_CYCLE_HEADER_ -#define __DEEPLOY_MATH_CYCLE_HEADER_ - -#include <stdint.h> - -// Resets the internal cycle and instruction counter to zero -void ResetTimer(void); - -// Starts the internal cycle and instruction counter -void StartTimer(void); - -// Stops the internal cycle and instruction counter -void StopTimer(void); - -// Returns the current number of cycles according to the internal cycle counter -uint32_t getCycles(void); - -// Returns the current number of instructions according to the internal -// instructions counter -uint32_t getInstr(void); - -#endif //__DEEPLOY_MATH_CYCLE_HEADER_ +/* + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_CYCLE_HEADER_ +#define __DEEPLOY_MATH_CYCLE_HEADER_ + +#include <stdint.h> + +// Resets the internal cycle and instruction counter to zero +void ResetTimer(void); + +// Starts the internal cycle and instruction counter +void StartTimer(void); + +// Stops the internal cycle and instruction counter +void StopTimer(void); + +// Returns the current number of cycles according to the internal cycle counter +uint32_t getCycles(void); + +// Returns the current number of instructions according to the internal +// instructions counter +uint32_t getInstr(void); + +#endif //__DEEPLOY_MATH_CYCLE_HEADER_ diff --git a/TargetLibraries/MemPool/inc/DeeployMath.h b/TargetLibraries/MemPool/inc/DeeployMath.h index 2ff40809bd..7dc9c54a0d 100644 --- a/TargetLibraries/MemPool/inc/DeeployMath.h +++ 
b/TargetLibraries/MemPool/inc/DeeployMath.h @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: DeeployMath.h - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/MemPool/inc/ITA.h b/TargetLibraries/MemPool/inc/ITA.h index 2bdda47bce..042173bb98 100644 --- a/TargetLibraries/MemPool/inc/ITA.h +++ b/TargetLibraries/MemPool/inc/ITA.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: ITA.h - * Description: - * - * Date: 03.03.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_ITA_HEADER_ diff --git a/TargetLibraries/MemPool/inc/constants.h b/TargetLibraries/MemPool/inc/constants.h index 6a3d2cae5c..c282ecd863 100644 --- a/TargetLibraries/MemPool/inc/constants.h +++ b/TargetLibraries/MemPool/inc/constants.h @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: constants.h - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_MATH_CONSTANTS_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/Convolution.h b/TargetLibraries/MemPool/inc/kernel/Convolution.h index 63ac4837c9..83988e566e 100644 --- a/TargetLibraries/MemPool/inc/kernel/Convolution.h +++ b/TargetLibraries/MemPool/inc/kernel/Convolution.h @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: Convolution.h - * Description: - * - * Date: 02.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_CONVOLUTION_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/DWConvolution.h b/TargetLibraries/MemPool/inc/kernel/DWConvolution.h index e05ae7a770..f88259c591 100644 --- a/TargetLibraries/MemPool/inc/kernel/DWConvolution.h +++ b/TargetLibraries/MemPool/inc/kernel/DWConvolution.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: DWConvolution.h - * Description: - * - * Date: 09.01.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_DWCONVOLUTION_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/Gemm.h b/TargetLibraries/MemPool/inc/kernel/Gemm.h index 157a3d221a..aef7e522e4 100644 --- a/TargetLibraries/MemPool/inc/kernel/Gemm.h +++ b/TargetLibraries/MemPool/inc/kernel/Gemm.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Gemm.h - * Description: - * - * Date: 16.05.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/MHSA.h b/TargetLibraries/MemPool/inc/kernel/MHSA.h index a97f4ae601..0b696600e8 100644 --- a/TargetLibraries/MemPool/inc/kernel/MHSA.h +++ b/TargetLibraries/MemPool/inc/kernel/MHSA.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: MHSA.h - * Description: - * - * Date: 08.02.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_MHSA_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/MatMul.h b/TargetLibraries/MemPool/inc/kernel/MatMul.h index ad61bafd4d..bd355856d3 100644 --- a/TargetLibraries/MemPool/inc/kernel/MatMul.h +++ b/TargetLibraries/MemPool/inc/kernel/MatMul.h @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: MatMul.h - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/MaxPool.h b/TargetLibraries/MemPool/inc/kernel/MaxPool.h index d8b02b558b..614db2d770 100644 --- a/TargetLibraries/MemPool/inc/kernel/MaxPool.h +++ b/TargetLibraries/MemPool/inc/kernel/MaxPool.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: MaxPool.h - * Description: - * - * Date: 13.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/RQGemm.h b/TargetLibraries/MemPool/inc/kernel/RQGemm.h index a548122482..65de18fcfb 100644 --- a/TargetLibraries/MemPool/inc/kernel/RQGemm.h +++ b/TargetLibraries/MemPool/inc/kernel/RQGemm.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RQGemm.h - * Description: - * - * Date: 16.05.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_MATH_RQGEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/RQMatMul.h b/TargetLibraries/MemPool/inc/kernel/RQMatMul.h index 3e4fd96e12..a5261fcd74 100644 --- a/TargetLibraries/MemPool/inc/kernel/RQMatMul.h +++ b/TargetLibraries/MemPool/inc/kernel/RQMatMul.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RQMatMul.h - * Description: - * - * Date: 24.04.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_RQMATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/RequantShift.h b/TargetLibraries/MemPool/inc/kernel/RequantShift.h index b995e31392..b0ff3cef03 100644 --- a/TargetLibraries/MemPool/inc/kernel/RequantShift.h +++ b/TargetLibraries/MemPool/inc/kernel/RequantShift.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RequantShift.h - * Description: - * - * Date: 24.04.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_REQUANTSHIFT_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/kernel/Softmax.h b/TargetLibraries/MemPool/inc/kernel/Softmax.h index 4a21a4e2de..920b59a1d3 100644 --- a/TargetLibraries/MemPool/inc/kernel/Softmax.h +++ b/TargetLibraries/MemPool/inc/kernel/Softmax.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Softmax.h - * Description: - * - * Date: 25.04.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ diff --git a/TargetLibraries/MemPool/inc/macros.h b/TargetLibraries/MemPool/inc/macros.h index 2d36e2334a..a6e0b231e0 100644 --- a/TargetLibraries/MemPool/inc/macros.h +++ b/TargetLibraries/MemPool/inc/macros.h @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: macros.h - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_MATH_MACROS_HEADER_ diff --git a/TargetLibraries/MemPool/src/Convolution_s8.c b/TargetLibraries/MemPool/src/Convolution_s8.c index a81673e507..de1954145e 100644 --- a/TargetLibraries/MemPool/src/Convolution_s8.c +++ b/TargetLibraries/MemPool/src/Convolution_s8.c @@ -1,31 +1,7 @@ -/* ===================================================================== - * Title: Convolution_s8.c - * Description: - * - * Date: 02.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/CycleCounter.c b/TargetLibraries/MemPool/src/CycleCounter.c index 6e0be0eb97..b6ec31e09e 100644 --- a/TargetLibraries/MemPool/src/CycleCounter.c +++ b/TargetLibraries/MemPool/src/CycleCounter.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: CycleCounter.c - * Description: - * - * Date: 06.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/DWConvolution_s8.c b/TargetLibraries/MemPool/src/DWConvolution_s8.c index 00b5070fbd..18f8bffc0f 100644 --- a/TargetLibraries/MemPool/src/DWConvolution_s8.c +++ b/TargetLibraries/MemPool/src/DWConvolution_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: DWConvolution_s8.c - * Description: - * - * Date: 09.01.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/Gemm_s8.c b/TargetLibraries/MemPool/src/Gemm_s8.c index e5eba7e73b..73f56203a2 100644 --- a/TargetLibraries/MemPool/src/Gemm_s8.c +++ b/TargetLibraries/MemPool/src/Gemm_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Gemm_s8.c - * Description: - * - * Date: 16.05.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/ITA.c b/TargetLibraries/MemPool/src/ITA.c index ee1a4cbd57..3db7bdaccd 100644 --- a/TargetLibraries/MemPool/src/ITA.c +++ b/TargetLibraries/MemPool/src/ITA.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: ITA.c - * Description: - * - * Date: 5.12.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/MHSA_s8.c b/TargetLibraries/MemPool/src/MHSA_s8.c index 9b3570ec9b..9f071526ca 100644 --- a/TargetLibraries/MemPool/src/MHSA_s8.c +++ b/TargetLibraries/MemPool/src/MHSA_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: M4HSA_s8.c - * Description: - * - * Date: 08.02.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/MatMul_s16.c b/TargetLibraries/MemPool/src/MatMul_s16.c index b28bf002c0..df54e856db 100644 --- a/TargetLibraries/MemPool/src/MatMul_s16.c +++ b/TargetLibraries/MemPool/src/MatMul_s16.c @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: MatMul_s16.c - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/MatMul_s32.c b/TargetLibraries/MemPool/src/MatMul_s32.c index 6b04fc2536..6b78f2d2a9 100644 --- a/TargetLibraries/MemPool/src/MatMul_s32.c +++ b/TargetLibraries/MemPool/src/MatMul_s32.c @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: MatMul_s32.c - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/MatMul_s8.c b/TargetLibraries/MemPool/src/MatMul_s8.c index 81ddcf585c..ca0ac4fb5b 100644 --- a/TargetLibraries/MemPool/src/MatMul_s8.c +++ b/TargetLibraries/MemPool/src/MatMul_s8.c @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: MatMul_s8.c - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/MaxPool_s8.c b/TargetLibraries/MemPool/src/MaxPool_s8.c index 94e3f6175e..97587ec534 100644 --- a/TargetLibraries/MemPool/src/MaxPool_s8.c +++ b/TargetLibraries/MemPool/src/MaxPool_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: MaxPool_s8.c - * Description: - * - * Date: 13.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/RQGemm_s8.c b/TargetLibraries/MemPool/src/RQGemm_s8.c index f983ef587e..4cc395687f 100644 --- a/TargetLibraries/MemPool/src/RQGemm_s8.c +++ b/TargetLibraries/MemPool/src/RQGemm_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RQGemm_s8.c - * Description: - * - * Date: 16.05.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/RQMatMul_s8.c b/TargetLibraries/MemPool/src/RQMatMul_s8.c index 8a499c273b..28fea567df 100644 --- a/TargetLibraries/MemPool/src/RQMatMul_s8.c +++ b/TargetLibraries/MemPool/src/RQMatMul_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RQMatMul_s8.c - * Description: - * - * Date: 24.04.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/RequantShift_s8.c b/TargetLibraries/MemPool/src/RequantShift_s8.c index c71942303e..51273aefac 100644 --- a/TargetLibraries/MemPool/src/RequantShift_s8.c +++ b/TargetLibraries/MemPool/src/RequantShift_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RequantShift_s8.c - * Description: - * - * Date: 24.04.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/Softmax_s8.c b/TargetLibraries/MemPool/src/Softmax_s8.c index cefe3eca22..cf53de43eb 100644 --- a/TargetLibraries/MemPool/src/Softmax_s8.c +++ b/TargetLibraries/MemPool/src/Softmax_s8.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: Softmax_s8.c - * Description: - * - * $Date: 25.04.2023 - * - * ===================================================================== */ /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/MemPool/src/Util.c b/TargetLibraries/MemPool/src/Util.c index 315469c819..ae02076557 100644 --- a/TargetLibraries/MemPool/src/Util.c +++ b/TargetLibraries/MemPool/src/Util.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Util.c - * Description: - * - * Date: 15.03.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployMath.h" diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index 30bc9aa094..0b9a3247e7 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -1,9 +1,13 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + file(GLOB_RECURSE SOURCES "src/**" ) if(NOT DEFINED ENV{PULP_SDK_HOME}) - message(FATAL_ERROR "Environemnt variable PULP_SDK_HOME not set.") + message(FATAL_ERROR "Environment variable PULP_SDK_HOME not set.") endif() if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") @@ -25,6 +29,11 @@ target_compile_options(deeploypulp PRIVATE -Wno-implicit-int-conversion -Wno-sign-conversion -Wno-sign-compare + -Wno-typedef-redefinition + -Wno-unused-parameter + -Wno-unused-function + -Wno-unused-variable + -Wno-uninitialized ) target_include_directories(deeploypulp PUBLIC ${PULP_SDK_INCLUDES}) @@ -48,5 +57,12 @@ if (platform IN_LIST PULP_NNX_PLATFORMS) add_subdirectory(third_party/pulp-nnx) target_include_directories(pulp-nnx PUBLIC ${PULP_SDK_INCLUDES}) target_compile_options(pulp-nnx PUBLIC ${PULP_SDK_COMPILE_FLAGS}) + target_compile_options(pulp-nnx PRIVATE + 
-Wno-implicit-int-conversion + -Wno-sign-conversion + -Wno-typedef-redefinition + -Wno-unused-parameter + -Wno-incompatible-pointer-types-discards-qualifiers + ) target_link_libraries(deeploypulp INTERFACE pulp-nnx) endif() diff --git a/TargetLibraries/PULPOpen/cmake/pulp-sdk-base.cmake b/TargetLibraries/PULPOpen/cmake/pulp-sdk-base.cmake index 8c7109d9c4..64452165ff 100644 --- a/TargetLibraries/PULPOpen/cmake/pulp-sdk-base.cmake +++ b/TargetLibraries/PULPOpen/cmake/pulp-sdk-base.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(PULP_SDK_HOME $ENV{PULP_SDK_HOME}) set(PULP_SDK_BASE_C_SOURCE diff --git a/TargetLibraries/PULPOpen/cmake/pulp-sdk-pulp-open.cmake b/TargetLibraries/PULPOpen/cmake/pulp-sdk-pulp-open.cmake index 3027ddb94e..3ab74d58bc 100644 --- a/TargetLibraries/PULPOpen/cmake/pulp-sdk-pulp-open.cmake +++ b/TargetLibraries/PULPOpen/cmake/pulp-sdk-pulp-open.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + include(cmake/pulp-sdk-base.cmake) set(PULP_SDK_HOME $ENV{PULP_SDK_HOME}) diff --git a/TargetLibraries/PULPOpen/cmake/pulp-sdk-siracusa.cmake b/TargetLibraries/PULPOpen/cmake/pulp-sdk-siracusa.cmake index c11544dd22..45fdd881a7 100644 --- a/TargetLibraries/PULPOpen/cmake/pulp-sdk-siracusa.cmake +++ b/TargetLibraries/PULPOpen/cmake/pulp-sdk-siracusa.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + include(cmake/pulp-sdk-base.cmake) set(PULP_SDK_HOME $ENV{PULP_SDK_HOME}) diff --git a/TargetLibraries/PULPOpen/inc/DeeployMath.h b/TargetLibraries/PULPOpen/inc/DeeployMath.h deleted file mode 100644 index 7aa1b1805c..0000000000 --- a/TargetLibraries/PULPOpen/inc/DeeployMath.h +++ /dev/null @@ -1,45 +0,0 @@ -/* ===================================================================== - * Title: DeeployMath.h - * 
Description: - * - * $Date: 30.12.2021 - * - * ===================================================================== */ -/* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __DEEPLOY_MATH_HEADER_ -#define __DEEPLOY_MATH_HEADER_ - -#include -#include -#include -#include -#include - -#if defined(AM_PART_APOLLO4B) | defined(DAM_PART_APOLLO3) -#include "am_bsp.h" -#include "am_mcu_apollo.h" -#include "am_util.h" -#endif - -#include "DeeployBasicMath.h" - -#endif // __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h index a9d242d1b3..f6e8308c97 100644 --- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: DeeployMath.h - * Description: - * - * $Date: 30.12.2021 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. 
- * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_HEADER_ @@ -36,6 +15,10 @@ #include "types.h" +#define BEGIN_SINGLE_CORE if (pi_core_id() == 0) { +#define END_SINGLE_CORE } +#define SINGLE_CORE if (pi_core_id() == 0) + #include "DeeployBasicMath.h" #include "pmsis.h" @@ -52,4 +35,6 @@ #include "kernel/gemv.h" #include "kernel/iRMSnorm.h" +#define LOG2(x) (__builtin_pulp_fl1(x)) + #endif // __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/dory_dma.h b/TargetLibraries/PULPOpen/inc/dory_dma.h deleted file mode 100644 index 9b2c4259ab..0000000000 --- a/TargetLibraries/PULPOpen/inc/dory_dma.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * dory.h - * Alessio Burrello - * - * Copyright (C) 2019-2020 University of Bologna - * - * SPDX-License-Identifier: Apache-2.0 - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _DORY_DMA_H -#define _DORY_DMA_H - -typedef struct { - void *ext; - void *loc; - unsigned short hwc_to_chw; - unsigned short stride_2d; - unsigned short number_of_2d_copies; - unsigned short stride_1d; - unsigned short number_of_1d_copies; - unsigned int length_1d_copy; - unsigned int mchan_cmd; - int dir; // 0 l1->l2, 1 l2->l1 - int tid; -} DMA_copy; - -void dory_dma_memcpy_hwc_to_chw(DMA_copy *copy); - -void dory_dma_memcpy_1d_async(DMA_copy *copy); - -void dory_dma_memcpy_2d_async(DMA_copy *copy); - -void dory_dma_memcpy_3d_async(DMA_copy *copy); - -void dory_dma_memcpy_async(DMA_copy *copy); - -void dory_dma_free(DMA_copy *copy); - -void dory_dma_barrier(DMA_copy *copy); - -int dory_dma_allocate(); -#endif diff --git a/TargetLibraries/PULPOpen/inc/dory_mem.h b/TargetLibraries/PULPOpen/inc/dory_mem.h index 35a06cfd6c..ccd36a37d6 100644 --- a/TargetLibraries/PULPOpen/inc/dory_mem.h +++ b/TargetLibraries/PULPOpen/inc/dory_mem.h @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: dory_mem.h - * Description: - * - * $Date: 12.12.2023 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __MEM_H__ diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index e63b7e939f..3ebab54a0b 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -1,45 +1,38 @@ - -/* ===================================================================== - * Title: Conv.h - * Description: - * - * $Date: 05.04.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_CONV_KERNEL_HEADER_ +#define __DEEPLOY_MATH_CONV_KERNEL_HEADER_ + #include "DeeployPULPMath.h" -void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right); +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right); void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer); \ No newline at end of file + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer); + +#endif // 
__DEEPLOY_MATH_CONV_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/GELU.h b/TargetLibraries/PULPOpen/inc/kernel/GELU.h index a7cc00f71b..8b2af4ec55 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/GELU.h +++ b/TargetLibraries/PULPOpen/inc/kernel/GELU.h @@ -1,35 +1,18 @@ - -/* ===================================================================== - * Title: GELU.h - * Description: - * - * $Date: 05.06.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_GELU_KERNEL_HEADER_ +#define __DEEPLOY_MATH_GELU_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void PULP_GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize); void PULP_GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, - int32_t dataSize); \ No newline at end of file + int32_t dataSize); + +#endif // __DEEPLOY_MATH_GELU_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h b/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h index 9a63066bd7..43e9c55cf4 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Layernorm.h @@ -1,33 +1,17 @@ -/* ===================================================================== - * Title: Layernorm.h - * Description: - * - * $Date: 05.06.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_LAYERNORM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_LAYERNORM_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void PULP_Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, float32_t *scale, float32_t *bias, float32_t epsilon, uint32_t size, - uint32_t lastDimLength); \ No newline at end of file + uint32_t lastDimLength); + +#endif // __DEEPLOY_MATH_LAYERNORM_KERNEL_HEADER__ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/Matmul.h b/TargetLibraries/PULPOpen/inc/kernel/Matmul.h index 43a0f3b8a3..9176801b58 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Matmul.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Matmul.h @@ -1,35 +1,17 @@ - - -/* ===================================================================== - * Title: Matmul.h - * Description: - * - * $Date: 05.06.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ +#define __DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void PULP_MatMul_fp32_fp32_fp32_unroll1x7(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, float32_t *__restrict__ pDstY, - uint32_t M, uint32_t N, uint32_t O); \ No newline at end of file + uint32_t M, uint32_t N, uint32_t O); + +#endif // __DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h index d7901ba117..b37487439f 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h +++ b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h @@ -1,30 +1,12 @@ -/* ===================================================================== - * Title: Maxpool.h - * Description: - * - * $Date: 05.04.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ +#define __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, @@ -32,4 +14,6 @@ void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP, float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right); \ No newline at end of file + uint32_t pad_left, uint32_t pad_right); + +#endif // __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/RQiHardswish.h b/TargetLibraries/PULPOpen/inc/kernel/RQiHardswish.h index bd528491df..407d97f903 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/RQiHardswish.h +++ b/TargetLibraries/PULPOpen/inc/kernel/RQiHardswish.h @@ -1,32 +1,16 @@ -/* ===================================================================== - * Title: RQiHardswish.h - * Description: - * - * $Date: 15.03.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_RQIHARDSWISH_KERNEL_HEADER_ +#define __DEEPLOY_MATH_RQIHARDSWISH_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void RQiHardswish_s8_s8_plp(int8_t *input, int8_t *output, int32_t size, int32_t one_over_six, int32_t three, int32_t six, int32_t mul, int32_t add, int32_t shift); + +#endif // __DEEPLOY_MATH_RQIHARDSWISH_KERNEL_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/kernel/Relu.h b/TargetLibraries/PULPOpen/inc/kernel/Relu.h index 27dc2d8580..1c49bd1cd8 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Relu.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Relu.h @@ -1,31 +1,14 @@ - -/* ===================================================================== - * Title: Relu.h - * Description: - * - * $Date: 05.06.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_RELU_KERNEL_HEADER_ +#define __DEEPLOY_MATH_RELU_KERNEL_HEADER_ + #include "DeeployPULPMath.h" -void PULP_Relu_fp32_fp32(float32_t *input, float32_t *output, uint32_t size); \ No newline at end of file +void PULP_Relu_fp32_fp32(float32_t *input, float32_t *output, uint32_t size); + +#endif // __DEEPLOY_MATH_RELU_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/RequantShift.h b/TargetLibraries/PULPOpen/inc/kernel/RequantShift.h index 54c38620a2..3e5da558d2 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/RequantShift.h +++ b/TargetLibraries/PULPOpen/inc/kernel/RequantShift.h @@ -1,33 +1,12 @@ -/* ===================================================================== - * Title: RequantShift_s8.c - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_REQUANTSHIFT_KERNEL_HEADER_ +#define __DEEPLOY_MATH_REQUANTSHIFT_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void RequantShift_u8_s8_NHWC(uint8_t *data_in, int32_t size, int32_t *mul, @@ -137,3 +116,5 @@ void RequantShift_s32_u8_NCHW(int32_t *data_in, int32_t size, int32_t *mul, int32_t HW, int32_t input_offset, int32_t output_offset, uint8_t output_min, uint8_t output_max, bool rounding); + +#endif // __DEEPLOY_MATH_REQUANTSHIFT_KERNEL_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/kernel/Softmax.h b/TargetLibraries/PULPOpen/inc/kernel/Softmax.h index 9e000664c0..d696a5c27a 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Softmax.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Softmax.h @@ -1,30 +1,12 @@ -/* ===================================================================== - * Title: iSoftmax.h - * Description: - * - * $Date: 13.11.2023 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ +#define __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void PULPSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, @@ -36,4 +18,6 @@ void PULPSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2); void PULP_Softmax_fp32_fp32(float32_t *input, float32_t *output, uint32_t size, - uint32_t last_dim_length); \ No newline at end of file + uint32_t last_dim_length); + +#endif // __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/UniformRequantShift.h b/TargetLibraries/PULPOpen/inc/kernel/UniformRequantShift.h index 0cbd5c28fe..7f795ee115 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/UniformRequantShift.h +++ b/TargetLibraries/PULPOpen/inc/kernel/UniformRequantShift.h @@ -1,28 +1,11 @@ -/* ---------------------------------------------------------------------- -# -# File: UniformRequantShift.h -# -# Last edited: 12.03.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_UNIFORMREQUANTSHIFT_KERNEL_HEADER_ +#define __DEEPLOY_MATH_UNIFORMREQUANTSHIFT_KERNEL_HEADER_ #include "DeeployPULPMath.h" @@ -48,4 +31,6 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, int32_t log2D, int32_t HW, int32_t input_offset, int32_t output_offset, int8_t output_min, - int8_t output_max, bool rounding); \ No newline at end of file + int8_t output_max, bool rounding); + +#endif // __DEEPLOY_MATH_UNIFORMREQUANTSHIFT_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/gemm.h b/TargetLibraries/PULPOpen/inc/kernel/gemm.h index 95cf0e4800..863bb2102c 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/gemm.h +++ b/TargetLibraries/PULPOpen/inc/kernel/gemm.h @@ -1,31 +1,12 @@ - -/* ===================================================================== - * Title: gemm.h - * Description: - * - * $Date: 05.06.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, @@ -33,4 +14,6 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pDstC, float32_t *__restrict__ pDstY, uint32_t M, uint32_t N, uint32_t O, uint32_t transA, - uint32_t transB); \ No newline at end of file + uint32_t transB); + +#endif // __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/gemv.h b/TargetLibraries/PULPOpen/inc/kernel/gemv.h index 214f8300ad..f40934544b 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/gemv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/gemv.h @@ -1,30 +1,12 @@ -/* ===================================================================== - * Title: vec2mat.h - * Description: - * - * $Date: 15.03.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_GEMV_KERNEL_HEADER_ +#define __DEEPLOY_MATH_GEMV_KERNEL_HEADER_ + #include "stdint.h" void gemv_s8_s8_plp(int8_t *pIn, int8_t *pBias, int8_t *pOut, int8_t *pWeight, @@ -32,3 +14,5 @@ void gemv_s8_s8_plp(int8_t *pIn, int8_t *pBias, int8_t *pOut, int8_t *pWeight, uint16_t out_shift, uint16_t dim_vec, uint16_t num_o_neurons, uint8_t flag_relu, uint8_t flag_batch_norm); + +#endif // __DEEPLOY_MATH_GEMV_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/kernel/iRMSnorm.h b/TargetLibraries/PULPOpen/inc/kernel/iRMSnorm.h index fa1c5e4083..5e2cd3a54c 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/iRMSnorm.h +++ b/TargetLibraries/PULPOpen/inc/kernel/iRMSnorm.h @@ -1,31 +1,15 @@ -/* ===================================================================== - * Title: iRMSnorm.h - * Description: - * - * $Date: 14.03.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ +#ifndef __DEEPLOY_MATH_IRMSNORM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_IRMSNORM_KERNEL_HEADER_ + #include "DeeployPULPMath.h" void iRMSnorm_s8_s8_plp(int8_t *data_in, int8_t *data_out, int32_t *weight, int32_t size, int32_t lastDimLength, int32_t log2D); + +#endif // __DEEPLOY_MATH_IRMSNORM_KERNEL_HEADER__ diff --git a/TargetLibraries/PULPOpen/inc/mchan.h b/TargetLibraries/PULPOpen/inc/mchan.h deleted file mode 100644 index cd7c2ee799..0000000000 --- a/TargetLibraries/PULPOpen/inc/mchan.h +++ /dev/null @@ -1,161 +0,0 @@ -/* ===================================================================== - * Title: mchan.h - * Description: - * - * $Date: 26.07.2024 - * - * ===================================================================== */ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - - * Adopted from PULP-SDK (https://github.com/pulp-platform/pulp-sdk), released - under Apache 2.0 - - */ - -#ifndef _MCHAN_H -#define _MCHAN_H - -// Requires to have MCHAN_BASE_ADDR, MCHAN_EVENT defined outside of header -#ifndef MCHAN_BASE_ADDR -#error "[mchan.h] MCHAN_BASE_ADDR not defined!" -#endif - -#if !defined(MCHAN_EVENT) && !defined(MCHAN_POLLED) -#error "[mchan.h] Nor MCHAN_EVENT nor MCHAN_POLLED defined!" -#endif - -#if defined(MCHAN_EVENT) && !defined(MCHAN_EVENT_BIT) -#error \ - "[mchan.h] MCHAN_EVENT_BIT should be defined when using events as signalization!" 
-#endif - -#include "pmsis.h" - -#define MCHAN_CMD_OFFSET 0 -#define MCHAN_STATUS_OFFSET 4 - -#define MCHAN_CMD_ADDR (MCHAN_BASE_ADDR + MCHAN_CMD_OFFSET) -#define MCHAN_STATUS_ADDR (MCHAN_BASE_ADDR + MCHAN_STATUS_OFFSET) - -#define READ_REG(addr) (*(volatile int *)(addr)) -#define WRITE_REG(addr, value) \ - do { \ - *(volatile int *)(addr) = (int)value; \ - } while (0) - -#define MCHAN_READ_CMD() READ_REG(MCHAN_CMD_ADDR) -#define MCHAN_WRITE_CMD(value) WRITE_REG(MCHAN_CMD_ADDR, value) - -#define MCHAN_READ_STATUS() READ_REG(MCHAN_STATUS_ADDR) -#define MCHAN_WRITE_STATUS(value) WRITE_REG(MCHAN_STATUS_ADDR, value) - -// MCHAN version 7 has 1 more bit for the transfer length, so all the flag -// offsets are shifted by 1. Also, LOC (TCDM) striding is not supported in v6. -#if MCHAN_VERSION == 7 -#define MCHAN_TRANSFER_LEN_SIZE (17) -#else -#define MCHAN_TRANSFER_LEN_SIZE (16) -#endif - -#define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0)) -#define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0)) -#define MCHAN_CMD_FLAG_INCREMENTAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 1)) -#define MCHAN_CMD_FLAG_2D_TRANSFER_EXTERNAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 2)) -#define MCHAN_CMD_FLAG_EVENT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 3)) -#define MCHAN_CMD_FLAG_INTERRUPT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 4)) -#define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5)) -#if MCHAN_VERSION == 7 -#define MCHAN_CMD_FLAG_2D_TRANSFER_LOCAL \ - (1 << (MCHAN_TRANSFER_LEN_SIZE + 6)) // can only be used with MCHAN v7 -#endif -#define MCHAN_CMD_SHIFT_DIRECTION MCHAN_TRANSFER_LEN_SIZE - -#define MCHAN_CMD(len, dir, inc, loc_2d, ext_2d, int_en, event_en, broadcast) \ - (len | dir | inc | loc_2d | ext_2d | broadcast | int_en | event_en) - -typedef enum { - MCHAN_DMA_TRANSFER_DIRECTION_EXT2LOC = MCHAN_CMD_FLAG_DIRECTION_EXT2LOC, - MCHAN_DMA_TRANSFER_DIRECTION_LOC2EXT = MCHAN_CMD_FLAG_DIRECTION_LOC2EXT -} 
mchan_dma_transfer_direction_e; - -typedef struct { - int cmd; - int size; - - void *loc; - int loc_size_1d; - int loc_stride_1d; - - void *ext; - int ext_size_1d; - int ext_stride_1d; -} mchan_transfer_t; - -static int mchan_transfer_get_id() { return MCHAN_READ_CMD(); } - -static void mchan_transfer_push_1d(mchan_transfer_t trans) { - MCHAN_WRITE_CMD(trans.cmd); - MCHAN_WRITE_CMD(trans.loc); - MCHAN_WRITE_CMD(trans.ext); -} - -static void mchan_transfer_push_2d(mchan_transfer_t trans) { - MCHAN_WRITE_CMD(trans.cmd); - MCHAN_WRITE_CMD(trans.loc); - MCHAN_WRITE_CMD(trans.ext); -// MCHAN version 7 takes 2D "count" (length of 1D transfers) and stride in 2 -// steps, v7 takes it in 1 step with the stride shifted to the upper 16 bits. -#if MCHAN_VERSION == 7 - MCHAN_WRITE_CMD(trans.ext_size_1d); - MCHAN_WRITE_CMD(trans.ext_stride_1d); -#else - MCHAN_WRITE_CMD(trans.ext_size_1d | (trans.ext_stride_1d << 16)); -#endif -} - -static void mchan_transfer_push(mchan_transfer_t trans) { - MCHAN_WRITE_CMD(trans.cmd); - MCHAN_WRITE_CMD(trans.loc); - MCHAN_WRITE_CMD(trans.ext); - - if (trans.ext_size_1d < trans.size) { - MCHAN_WRITE_CMD(trans.ext_size_1d); - MCHAN_WRITE_CMD(trans.ext_stride_1d); - } - - if (trans.loc_size_1d < trans.size) { - MCHAN_WRITE_CMD(trans.loc_size_1d); - MCHAN_WRITE_CMD(trans.loc_stride_1d); - } -} - -static void mchan_transfer_free(int tid) { MCHAN_WRITE_STATUS(1 << tid); } - -static int mchan_transfer_busy(int tid) { - return MCHAN_READ_STATUS() & (1 << tid); -} - -static void mchan_transfer_wait(int tid) { -#if defined(MCHAN_EVENT) - while (mchan_transfer_busy(tid)) - eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT); -#elif defined(MCHAN_POLLED) - while (mchan_transfer_busy(tid)) - ; -#endif -} - -#endif diff --git a/TargetLibraries/PULPOpen/inc/mchan_siracusa.h b/TargetLibraries/PULPOpen/inc/mchan_siracusa.h new file mode 100644 index 0000000000..c0ecc02ebb --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/mchan_siracusa.h @@ -0,0 +1,21 @@ +/* + * 
SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +// Default mchan base address +#ifndef MCHAN_BASE_ADDR +#define MCHAN_BASE_ADDR (ARCHI_MCHAN_DEMUX_ADDR) // CLUSTER_MCHAN_ADDR +#endif + +// Default mchan await mode +#if !defined(MCHAN_EVENT) && !defined(MCHAN_POLLED) +#define MCHAN_EVENT +#endif + +#ifdef MCHAN_EVENT +#define MCHAN_EVENT_BIT (ARCHI_CL_EVT_DMA0) // 8 +#endif + +#include "mchan_v7.h" diff --git a/TargetLibraries/PULPOpen/inc/mchan_v6.h b/TargetLibraries/PULPOpen/inc/mchan_v6.h new file mode 100644 index 0000000000..34a42d882e --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/mchan_v6.h @@ -0,0 +1,94 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __MCHAN_V6_H__ +#define __MCHAN_V6_H__ + +#include "assert.h" + +// Requires to have MCHAN_BASE_ADDR, MCHAN_EVENT defined outside of header +#ifndef MCHAN_BASE_ADDR +#error "[mchan_v6.h] MCHAN_BASE_ADDR not defined!" +#endif + +#if !defined(MCHAN_EVENT) && !defined(MCHAN_POLLED) +#error "[mchan_v6.h] Nor MCHAN_EVENT nor MCHAN_POLLED defined!" +#endif + +#if defined(MCHAN_EVENT) && defined(MCHAN_POLLED) +#error "[mchan_v6.h] Define either MCHAN_EVENT or MCHAN_POLLED, not both!" +#endif + +#if defined(MCHAN_EVENT) && !defined(MCHAN_EVENT_BIT) +#error \ + "[mchan_v6.h] MCHAN_EVENT_BIT should be defined when using events as signalization!" +#endif + +#if !defined(MCHAN_VERSION) +#define MCHAN_VERSION 6 +#elif MCHAN_VERSION != 6 +#error "[mchan_v6.h] Illegal MCHAN_VERSION. 
Supported only 6" +#endif + +#include "pmsis.h" + +#define MCHAN_TRANSFER_LEN_SIZE (16) +#define MCHAN_CHANNEL_ID_MAX (15) + +#define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0)) +#define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0)) +#define MCHAN_CMD_FLAG_INCREMENTAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 1)) +#define MCHAN_CMD_FLAG_2D_TRANSFER_EXTERNAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 2)) +#define MCHAN_CMD_FLAG_EVENT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 3)) +#define MCHAN_CMD_FLAG_INTERRUPT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 4)) +#define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5)) + +static volatile uint32_t *const cmd_ptr = + (volatile uint32_t *const)(MCHAN_BASE_ADDR + 0x0); +static volatile uint32_t *const status_ptr = + (volatile uint32_t *const)(MCHAN_BASE_ADDR + 0x4); + +static void mchan_transfer_1d(uint32_t cmd, void *loc, void *ext) { + // TODO: assert flags are set correctly + *cmd_ptr = (uint32_t)cmd; + *cmd_ptr = (uint32_t)loc; + *cmd_ptr = (uint32_t)ext; +} + +static void mchan_transfer_2d_ext_strided(uint32_t cmd, void *loc, void *ext, + uint16_t ext_size_1d, + uint16_t ext_stride_2d) { + // TODO: assert flags are set correctly + *cmd_ptr = (uint32_t)cmd; + *cmd_ptr = (uint32_t)loc; + *cmd_ptr = (uint32_t)ext; + *cmd_ptr = (uint32_t)ext_size_1d | ((uint32_t)ext_stride_2d << 16); +} + +static uint32_t mchan_channel_alloc() { return *cmd_ptr; } + +static void mchan_channel_free(uint32_t channel_id) { + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); + *status_ptr = 1 << channel_id; +} + +static uint32_t mchan_channel_is_busy(uint32_t channel_id) { + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); + return *status_ptr & (1 << channel_id); +} + +static void mchan_channel_wait(uint32_t channel_id) { + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); +#if defined(MCHAN_EVENT) + while (mchan_channel_is_busy(channel_id)) + eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT); +#elif 
defined(MCHAN_POLLED) + while (mchan_channel_is_busy(channel_id)) + ; +#endif +} + +#endif // __MCHAN_V6_H__ diff --git a/TargetLibraries/PULPOpen/inc/mchan_v7.h b/TargetLibraries/PULPOpen/inc/mchan_v7.h new file mode 100644 index 0000000000..32ef836f34 --- /dev/null +++ b/TargetLibraries/PULPOpen/inc/mchan_v7.h @@ -0,0 +1,120 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __MCHAN_V7_H__ +#define __MCHAN_V7_H__ + +#include "assert.h" + +// Requires to have MCHAN_BASE_ADDR, MCHAN_EVENT defined outside of header +#ifndef MCHAN_BASE_ADDR +#error "[mchan_v7.h] MCHAN_BASE_ADDR not defined!" +#endif + +#if !defined(MCHAN_EVENT) && !defined(MCHAN_POLLED) +#error "[mchan_v7.h] Nor MCHAN_EVENT nor MCHAN_POLLED defined!" +#endif + +#if defined(MCHAN_EVENT) && defined(MCHAN_POLLED) +#error "[mchan_v7.h] Define either MCHAN_EVENT or MCHAN_POLLED, not both!" +#endif + +#if defined(MCHAN_EVENT) && !defined(MCHAN_EVENT_BIT) +#error \ + "[mchan_v7.h] MCHAN_EVENT_BIT should be defined when using events as signalization!" +#endif + +#if !defined(MCHAN_VERSION) +#define MCHAN_VERSION 7 +#elif MCHAN_VERSION != 7 +#error "[mchan_v7.h] Illegal MCHAN_VERSION. 
Supported only 7" +#endif + +#include "pmsis.h" + +#define MCHAN_TRANSFER_LEN_SIZE (17) +#define MCHAN_CHANNEL_ID_MAX (15) + +#define MCHAN_CMD_FLAG_DIRECTION_LOC2EXT (0 << (MCHAN_TRANSFER_LEN_SIZE + 0)) +#define MCHAN_CMD_FLAG_DIRECTION_EXT2LOC (1 << (MCHAN_TRANSFER_LEN_SIZE + 0)) +#define MCHAN_CMD_FLAG_INCREMENTAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 1)) +#define MCHAN_CMD_FLAG_2D_TRANSFER_EXTERNAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 2)) +#define MCHAN_CMD_FLAG_EVENT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 3)) +#define MCHAN_CMD_FLAG_INTERRUPT_ENABLE (1 << (MCHAN_TRANSFER_LEN_SIZE + 4)) +#define MCHAN_CMD_FLAG_BROADCAST_FINISH (1 << (MCHAN_TRANSFER_LEN_SIZE + 5)) +#define MCHAN_CMD_FLAG_2D_TRANSFER_LOCAL (1 << (MCHAN_TRANSFER_LEN_SIZE + 6)) + +static volatile uint32_t *const cmd_ptr = + (volatile uint32_t *const)(MCHAN_BASE_ADDR + 0x0); +static volatile uint32_t *const status_ptr = + (volatile uint32_t *const)(MCHAN_BASE_ADDR + 0x4); + +static void mchan_transfer_1d(uint32_t cmd, void *loc, void *ext) { + // TODO: assert flags are set correctly + *cmd_ptr = (uint32_t)cmd; + *cmd_ptr = (uint32_t)loc; + *cmd_ptr = (uint32_t)ext; +} + +static void mchan_transfer_2d_loc_strided(uint32_t cmd, void *loc, void *ext, + uint32_t loc_size_1d, + uint32_t loc_stride_2d) { + // TODO: assert flags are set correctly + *cmd_ptr = (uint32_t)cmd; + *cmd_ptr = (uint32_t)loc; + *cmd_ptr = (uint32_t)ext; + *cmd_ptr = (uint32_t)loc_size_1d; + *cmd_ptr = (uint32_t)loc_stride_2d; +} + +static void mchan_transfer_2d_ext_strided(uint32_t cmd, void *loc, void *ext, + uint32_t ext_size_1d, + uint32_t ext_stride_2d) { + // TODO: assert flags are set correctly + *cmd_ptr = (uint32_t)cmd; + *cmd_ptr = (uint32_t)loc; + *cmd_ptr = (uint32_t)ext; + *cmd_ptr = (uint32_t)ext_size_1d; + *cmd_ptr = (uint32_t)ext_stride_2d; +} + +static void mchan_transfer_2d_loc_strided_ext_strided( + uint32_t cmd, void *loc, void *ext, uint32_t loc_size_1d, + uint32_t loc_stride_2d, uint32_t ext_size_1d, uint32_t 
ext_stride_2d) { + // TODO: assert flags are set correctly + *cmd_ptr = (uint32_t)cmd; + *cmd_ptr = (uint32_t)loc; + *cmd_ptr = (uint32_t)ext; + *cmd_ptr = (uint32_t)ext_size_1d; + *cmd_ptr = (uint32_t)ext_stride_2d; + *cmd_ptr = (uint32_t)loc_size_1d; + *cmd_ptr = (uint32_t)loc_stride_2d; +} + +static uint32_t mchan_channel_alloc() { return *cmd_ptr; } + +static void mchan_channel_free(uint32_t channel_id) { + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); + *status_ptr = 1 << channel_id; +} + +static uint32_t mchan_channel_is_busy(uint32_t channel_id) { + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); + return *status_ptr & (1 << channel_id); +} + +static void mchan_channel_wait(uint32_t channel_id) { + assert(channel_id <= MCHAN_CHANNEL_ID_MAX); +#if defined(MCHAN_EVENT) + while (mchan_channel_is_busy(channel_id)) + eu_evt_maskWaitAndClr(1 << MCHAN_EVENT_BIT); +#elif defined(MCHAN_POLLED) + while (mchan_channel_is_busy(channel_id)) + ; +#endif +} + +#endif // __MCHAN_V7_H__ diff --git a/TargetLibraries/PULPOpen/inc/pulp_core.h b/TargetLibraries/PULPOpen/inc/pulp_core.h index 809a16e299..c6877ed587 100644 --- a/TargetLibraries/PULPOpen/inc/pulp_core.h +++ b/TargetLibraries/PULPOpen/inc/pulp_core.h @@ -1,3 +1,14 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_PULPCORE_HEADER_ +#define __DEEPLOY_MATH_PULPCORE_HEADER_ + #define BEGIN_SINGLE_CORE if (pi_core_id() == 0) { #define END_SINGLE_CORE } -#define SINGLE_CORE if (pi_core_id() == 0) \ No newline at end of file +#define SINGLE_CORE if (pi_core_id() == 0) + +#endif //__DEEPLOY_MATH_PULPCORE_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/inc/types.h b/TargetLibraries/PULPOpen/inc/types.h index 1031a86859..802ea2ea1a 100644 --- a/TargetLibraries/PULPOpen/inc/types.h +++ b/TargetLibraries/PULPOpen/inc/types.h @@ -1,30 +1,7 @@ -/* 
===================================================================== - * Title: types.h - * Description: - * - * Date: 11.11.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2024 ETH Zurich and University of Bologna. - * - * Authors: - * - Francesco Conti, University of Bologna + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_TYPES_HEADER_ diff --git a/TargetLibraries/PULPOpen/src/Convolution_fp32.c b/TargetLibraries/PULPOpen/src/Convolution_fp32.c index 43ec7f4b5d..af21293233 100644 --- a/TargetLibraries/PULPOpen/src/Convolution_fp32.c +++ b/TargetLibraries/PULPOpen/src/Convolution_fp32.c @@ -1,47 +1,25 @@ -/* ===================================================================== - * Title: Conv.c - * Description: Float32 version of Conv2D with NCHW format (pre-padded input) - * - * Date: 05.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeployPULPMath.h" #include "pmsis.h" -void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t H, uint32_t W, uint32_t C, - const float32_t *__restrict__ pSrcB, - uint32_t F_total, uint32_t P, uint32_t Q, - uint32_t SP, uint32_t SQ, - float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, - uint32_t pad_left, uint32_t pad_right) { +void PULP_Conv2d_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right) { + // Compute core int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -52,37 +30,72 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; - for (uint32_t h = 0; h < H_out; ++h) { - for (uint32_t w = 0; w < W_out; ++w) { - for (uint32_t f = 0; f < 
ch_out_count; ++f) { - float32_t sum = 0.0f; + // Compute the output + if (has_bias) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; - for (uint32_t p = 0; p < P; ++p) { - for (uint32_t q = 0; q < Q; ++q) { - for (uint32_t c = 0; c < C; ++c) { - int32_t h_in = h * SP + p - pad_top; - int32_t w_in = w * SQ + q - pad_left; + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; - if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || - w_in >= (int32_t)W) { - continue; - } + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } - uint32_t input_idx = (h_in * W + w_in) * C + c; - uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; - sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } } } + + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum + pSrcBias[f + ch_out_start]; } + } + } + } else { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + for (uint32_t f = 0; f < ch_out_count; ++f) { + float32_t sum = 0.0f; + + for (uint32_t p = 0; p < P; ++p) { + for (uint32_t q = 0; q < Q; ++q) { + for (uint32_t c = 0; c < C; ++c) { + int32_t h_in = h * SP + p - pad_top; + int32_t w_in = w * SQ + q - pad_left; + + if (h_in < 0 || h_in >= (int32_t)H || w_in < 0 || + w_in >= (int32_t)W) { + continue; + } + + uint32_t input_idx = (h_in * W + w_in) * C + c; + uint32_t weight_idx = f * (P * Q * C) + p * (Q * C) + q * C + c; + + sum += pSrcA[input_idx] * weight_ptr[weight_idx]; + } + } + } - uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); 
- pDstC[output_idx] = sum; + uint32_t output_idx = (h * W_out + w) * F_total + (ch_out_start + f); + pDstC[output_idx] = sum; + } } } } @@ -91,12 +104,17 @@ void PULP_Conv2d_fp32_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, - uint32_t Q, uint32_t SP, uint32_t SQ, float32_t *__restrict__ pDstC, - uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, - uint32_t pad_right, float32_t *__restrict__ pContextBuffer) { + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); + // Compute the chunk size for each core uint16_t ch_out_chunk = (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); @@ -107,50 +125,95 @@ void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( return; } + // Pointer to the weights for the current core const float32_t *weight_ptr = pSrcB + ch_out_start * C * P * Q; uint32_t im2col_size_per_core = C * P * Q; float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + // Compute the output dimensions uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; uint32_t kernel_size = P * Q * C; - for (uint32_t h_out = 0; h_out < H_out; h_out++) { - for (uint32_t w_out = 0; w_out < W_out; w_out++) { - int32_t h_in_start = h_out * SP - pad_top; - int32_t w_in_start = w_out * SQ - pad_left; + // Compute the output + if (has_bias) { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out 
< W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c = 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } + } + } + } - for (uint32_t p = 0; p < P; p++) { - int32_t h_in = h_in_start + p; + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - ch_out_start) * kernel_size; - for (uint32_t q = 0; q < Q; q++) { - int32_t w_in = w_in_start + q; + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } + + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; - for (uint32_t c = 0; c < C; c++) { - if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && - w_in < (int32_t)W) { - uint32_t in_idx = (h_in * W + w_in) * C + c; - im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; - } else { - im2col_buffer[p * Q * C + q * C + c] = 0.0f; + pDstC[out_idx] = sum + pSrcBias[f]; + } + } + } + } else { + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + for (uint32_t p = 0; p < P; p++) { + int32_t h_in = h_in_start + p; + + for (uint32_t q = 0; q < Q; q++) { + int32_t w_in = w_in_start + q; + + for (uint32_t c = 0; c < C; c++) { + if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 && + w_in < (int32_t)W) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[p * Q * C + q * C + c] = pSrcA[in_idx]; + } else { + im2col_buffer[p * Q * C + q * C + c] = 0.0f; + } } } } - } - for (uint32_t 
f = 0; f < ch_out_count; f++) { - float32_t sum = 0.0f; - const float32_t *local_weight_ptr = weight_ptr + f * kernel_size; + for (uint32_t f = ch_out_start; f < ch_out_stop; f++) { + float32_t sum = 0.0f; + const float32_t *local_weight_ptr = + weight_ptr + (f - ch_out_start) * kernel_size; - for (uint32_t k = 0; k < kernel_size; k++) { - sum += im2col_buffer[k] * local_weight_ptr[k]; - } + for (uint32_t k = 0; k < kernel_size; k++) { + sum += im2col_buffer[k] * local_weight_ptr[k]; + } + + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; - uint32_t out_idx = - (h_out * W_out + w_out) * F_total + (ch_out_start + f); - pDstC[out_idx] = sum; + pDstC[out_idx] = sum; + } } } } -} \ No newline at end of file +} diff --git a/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c new file mode 100644 index 0000000000..3565fc664d --- /dev/null +++ b/TargetLibraries/PULPOpen/src/DWConvolution_fp32.c @@ -0,0 +1,251 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( + const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, + uint32_t Q, uint32_t SP, uint32_t SQ, + const float32_t *__restrict__ pSrcBias, const bool has_bias, + float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, + uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pContextBuffer) { + + // Compute core information + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + // Compute the chunk size for each core + // (Splitting work along the output channels) + uint16_t ch_out_chunk = + (F_total >> log2Core) + ((F_total & (NUM_CORES - 1)) != 0); + uint16_t ch_out_start = MIN(ch_out_chunk * core_id, F_total); + uint16_t ch_out_stop = MIN(ch_out_start + 
ch_out_chunk, F_total); + uint16_t ch_out_count = ch_out_stop - ch_out_start; + + // If there is no output channel to process, return + // (when F < NUM_CORES and working on a core with id > F) + if (ch_out_count == 0) { + return; + } + + // Move pointer of the weights for the current core + const float32_t *weight_ptr = pSrcB + ch_out_start * P * Q; + + // Move pointer of the im2col buffer for the current core + uint32_t im2col_size_per_core = P * Q; + float32_t *im2col_buffer = pContextBuffer + core_id * im2col_size_per_core; + + // Compute the output dimensions + uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; + uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; + uint32_t kernel_size = P * Q * F_total; + + // Compute the output + if (has_bias) { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored in memory continuously) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for 
(int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // number which corresponds to the "group" parameter in the Conv ONNX + // operator) + for (uint32_t c = ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * + weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum + pSrcBias[f]; 
+ } + } + } + } + } else { + // Work on individual output elements + // (each element depends on a column from the im2col buffer + // and one convolutional filter, stored in memory continuously) + for (uint32_t h_out = 0; h_out < H_out; h_out++) { + for (uint32_t w_out = 0; w_out < W_out; w_out++) { + // Compute height and width starting point + // (depending on stride and padding) + int32_t h_in_start = h_out * SP - pad_top; + int32_t w_in_start = w_out * SQ - pad_left; + + // Initialize the padded part of the im2col buffer with 0 + // Work on the TOP padding + for (int32_t h_in = (int32_t)h_in_start; + h_in < MIN(0, (int32_t)(h_in_start + P)); h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the BOTTOM padding + for (uint32_t h_in = MAX(H, h_in_start); h_in < h_in_start + P; + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < (int32_t)(w_in_start + Q); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining LEFT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (int32_t w_in = (int32_t)w_in_start; + w_in < MIN(0, (int32_t)(w_in_start + Q)); w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Work on the remaining RIGHT padding + for (uint32_t h_in = MAX(0, h_in_start); h_in < MIN(H, h_in_start + P); + h_in++) { + for (uint32_t w_in = MAX(W, w_in_start); w_in < w_in_start + Q; + w_in++) { + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = 0.0f; + } + } + + // Copy input data to im2col buffer + // Input channels depend on the output channels assigned to the core + // (each input channel is associated with F_total / C output channels, + // number which corresponds to the "group" parameter in the Conv ONNX + // operator) + for (uint32_t c = 
ch_out_start / (F_total / C); + c < (ch_out_stop + 1) / (F_total / C); c++) { + // Copy the valid input data to the im2col buffer + for (uint32_t h_in = MAX(0, h_in_start); + h_in < MIN(H, h_in_start + P); h_in++) { + for (uint32_t w_in = MAX(0, w_in_start); + w_in < MIN(W, w_in_start + Q); w_in++) { + uint32_t in_idx = (h_in * W + w_in) * C + c; + im2col_buffer[(h_in - h_in_start) * Q + (w_in - w_in_start)] = + pSrcA[in_idx]; + } + } + + // Compute output channels of interest, based on current input channel + // and core + uint32_t lower_f, upper_f; + + if (c * (F_total / C) < ch_out_start) { + lower_f = ch_out_start; + } else { + lower_f = c * (F_total / C); + } + + if ((c + 1) * (F_total / C) < ch_out_stop) { + upper_f = (c + 1) * (F_total / C); + } else { + upper_f = ch_out_stop; + } + + // Perform convolution for the assigned output channels + for (uint32_t f = lower_f; f < upper_f; f++) { + float32_t sum = 0.0f; + uint32_t out_idx = (h_out * W_out + w_out) * F_total + f; + + for (uint32_t im2col_idx = 0; im2col_idx < P * Q; im2col_idx++) { + sum += + im2col_buffer[im2col_idx] * + weight_ptr[(f - ch_out_start) * P * Q + im2col_idx % (P * Q)]; + } + + // Copy the result to the output tensor + pDstC[out_idx] = sum; + } + } + } + } + } + + return; +} diff --git a/TargetLibraries/PULPOpen/src/GELU.c b/TargetLibraries/PULPOpen/src/GELU.c index a128645638..5a0a4fa3f0 100644 --- a/TargetLibraries/PULPOpen/src/GELU.c +++ b/TargetLibraries/PULPOpen/src/GELU.c @@ -1,35 +1,10 @@ -/* ===================================================================== - * Title: GELU.c - * Description: - * - * $Date: 05.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "pmsis.h" -#include "pulp_nn_kernels.h" -#include "pulp_nn_utils.h" #include "DeeployPULPMath.h" @@ -37,23 +12,21 @@ void PULP_GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize) { + // Get core information int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); - int16_t chunk = (dataSize >> log2Core) + ((dataSize & (NUM_CORES - 1)) != 0); - int16_t chunk_start = MIN(chunk * core_id, dataSize); - int16_t chunk_stop = MIN(chunk_start + chunk, dataSize); - const float32_t sqrt_2_over_pi = 0.7978845608f; // sqrt(2/π) - const float32_t coeff = 0.044715f; + int8_t log2Core = LOG2(NUM_CORES); - for (uint32_t i = chunk_start; i < chunk_stop; i++) { - float32_t x = data_in[i]; - float32_t x_cubed = x * x * x; - float32_t inner = sqrt_2_over_pi * (x + coeff * x_cubed); + // Split into chunks for each core + int32_t chunk = (dataSize >> log2Core) + ((dataSize & (NUM_CORES - 1)) != 0); + int32_t chunk_start = MIN(chunk * core_id, dataSize); + int32_t chunk_stop = MIN(chunk_start + chunk, dataSize); - float32_t exp_2z = expf(2.0f * inner); - float32_t tanh_val = (exp_2z - 1.0f) / (exp_2z + 1.0f); + // Compute GELU on the assigned chunk + for (int32_t i = chunk_start; i < chunk_stop; i++) { + float32_t x = data_in[i]; + float32_t cdf = 
0.5f * (1.0f + tanhf((sqrtf(2.0f / (float)M_PI) * + (x + 0.044715f * powf(x, 3.0f))))); - float32_t cdf = 0.5f * (1.0f + tanh_val); data_out[i] = x * cdf; } } @@ -61,7 +34,7 @@ void PULP_GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, void PULP_GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, int32_t dataSize) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int16_t chunk = (dataSize >> log2Core) + ((dataSize & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, dataSize); int16_t chunk_stop = MIN(chunk_start + chunk, dataSize); diff --git a/TargetLibraries/PULPOpen/src/Gemm.c b/TargetLibraries/PULPOpen/src/Gemm.c index 58d1688a46..a46f8ac6ae 100644 --- a/TargetLibraries/PULPOpen/src/Gemm.c +++ b/TargetLibraries/PULPOpen/src/Gemm.c @@ -1,31 +1,7 @@ - -/* ===================================================================== - * Title: Gemm.c - * Description: - * - * Date: 05.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployPULPMath.h" @@ -39,7 +15,7 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, uint32_t transB) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0); uint32_t M_start = MIN(core_id * M_chunk, M); @@ -50,15 +26,329 @@ void PULP_Gemm_fp32_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA, return; } - for (uint32_t i = M_start; i < M_end; ++i) { - for (uint32_t j = 0; j < O; ++j) { - float32_t sum = 0.0f; - for (uint32_t k = 0; k < N; ++k) { - uint32_t a_idx = transA ? (k * M + i) : (i * N + k); - uint32_t b_idx = transB ? (j * N + k) : (k * O + j); - sum += pSrcA[a_idx] * pSrcB[b_idx]; + const uint32_t has_bias = (pDstC != NULL); + const uint32_t N_unroll = N - (N % 6); + const uint32_t O_unroll = O - (O % 6); + + if (!transA && !transB) { + + for (uint32_t i = M_start; i < M_end; ++i) { + const float32_t *__restrict__ a_row = &pSrcA[i * N]; + float32_t *__restrict__ y_row = &pDstY[i * O]; + const float32_t *__restrict__ c_row = has_bias ? 
&pDstC[i * O] : NULL; + + uint32_t j = 0; + + for (; j < O_unroll; j += 6) { + float32_t sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, + sum4 = 0.0f, sum5 = 0.0f; + + uint32_t k = 0; + + for (; k < N; ++k) { + const float32_t a_val = a_row[k]; + sum0 += a_val * pSrcB[k * O + j]; + sum1 += a_val * pSrcB[k * O + j + 1]; + sum2 += a_val * pSrcB[k * O + j + 2]; + sum3 += a_val * pSrcB[k * O + j + 3]; + sum4 += a_val * pSrcB[k * O + j + 4]; + sum5 += a_val * pSrcB[k * O + j + 5]; + } + + if (has_bias) { + y_row[j] = sum0 + c_row[j]; + y_row[j + 1] = sum1 + c_row[j + 1]; + y_row[j + 2] = sum2 + c_row[j + 2]; + y_row[j + 3] = sum3 + c_row[j + 3]; + y_row[j + 4] = sum4 + c_row[j + 4]; + y_row[j + 5] = sum5 + c_row[j + 5]; + } else { + y_row[j] = sum0; + y_row[j + 1] = sum1; + y_row[j + 2] = sum2; + y_row[j + 3] = sum3; + y_row[j + 4] = sum4; + y_row[j + 5] = sum5; + } + } + + for (; j < O; ++j) { + float32_t sum = 0.0f; + for (uint32_t k = 0; k < N; ++k) { + sum += a_row[k] * pSrcB[k * O + j]; + } + + y_row[j] = has_bias ? sum + c_row[j] : sum; + } + } + } else if (transA && !transB) { + + for (uint32_t i = M_start; i < M_end; ++i) { + float32_t *__restrict__ y_row = &pDstY[i * O]; + const float32_t *__restrict__ c_row = has_bias ? 
&pDstC[i * O] : NULL; + + uint32_t j = 0; + for (; j < O_unroll; j += 6) { + float32_t sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, + sum4 = 0.0f, sum5 = 0.0f; + + uint32_t k = 0; + for (; k < N_unroll; k += 6) { + const float32_t a0 = pSrcA[k * M + i]; + const float32_t a1 = pSrcA[(k + 1) * M + i]; + const float32_t a2 = pSrcA[(k + 2) * M + i]; + const float32_t a3 = pSrcA[(k + 3) * M + i]; + const float32_t a4 = pSrcA[(k + 4) * M + i]; + const float32_t a5 = pSrcA[(k + 5) * M + i]; + + sum0 += a0 * pSrcB[k * O + j] + a1 * pSrcB[(k + 1) * O + j] + + a2 * pSrcB[(k + 2) * O + j] + a3 * pSrcB[(k + 3) * O + j] + + a4 * pSrcB[(k + 4) * O + j] + a5 * pSrcB[(k + 5) * O + j]; + sum1 += a0 * pSrcB[k * O + j + 1] + a1 * pSrcB[(k + 1) * O + j + 1] + + a2 * pSrcB[(k + 2) * O + j + 1] + + a3 * pSrcB[(k + 3) * O + j + 1] + + a4 * pSrcB[(k + 4) * O + j + 1] + + a5 * pSrcB[(k + 5) * O + j + 1]; + sum2 += a0 * pSrcB[k * O + j + 2] + a1 * pSrcB[(k + 1) * O + j + 2] + + a2 * pSrcB[(k + 2) * O + j + 2] + + a3 * pSrcB[(k + 3) * O + j + 2] + + a4 * pSrcB[(k + 4) * O + j + 2] + + a5 * pSrcB[(k + 5) * O + j + 2]; + sum3 += a0 * pSrcB[k * O + j + 3] + a1 * pSrcB[(k + 1) * O + j + 3] + + a2 * pSrcB[(k + 2) * O + j + 3] + + a3 * pSrcB[(k + 3) * O + j + 3] + + a4 * pSrcB[(k + 4) * O + j + 3] + + a5 * pSrcB[(k + 5) * O + j + 3]; + sum4 += a0 * pSrcB[k * O + j + 4] + a1 * pSrcB[(k + 1) * O + j + 4] + + a2 * pSrcB[(k + 2) * O + j + 4] + + a3 * pSrcB[(k + 3) * O + j + 4] + + a4 * pSrcB[(k + 4) * O + j + 4] + + a5 * pSrcB[(k + 5) * O + j + 4]; + sum5 += a0 * pSrcB[k * O + j + 5] + a1 * pSrcB[(k + 1) * O + j + 5] + + a2 * pSrcB[(k + 2) * O + j + 5] + + a3 * pSrcB[(k + 3) * O + j + 5] + + a4 * pSrcB[(k + 4) * O + j + 5] + + a5 * pSrcB[(k + 5) * O + j + 5]; + } + + for (; k < N; ++k) { + const float32_t a_val = pSrcA[k * M + i]; + sum0 += a_val * pSrcB[k * O + j]; + sum1 += a_val * pSrcB[k * O + j + 1]; + sum2 += a_val * pSrcB[k * O + j + 2]; + sum3 += a_val * pSrcB[k * O + j + 3]; + sum4 += 
a_val * pSrcB[k * O + j + 4]; + sum5 += a_val * pSrcB[k * O + j + 5]; + } + + if (has_bias) { + y_row[j] = sum0 + c_row[j]; + y_row[j + 1] = sum1 + c_row[j + 1]; + y_row[j + 2] = sum2 + c_row[j + 2]; + y_row[j + 3] = sum3 + c_row[j + 3]; + y_row[j + 4] = sum4 + c_row[j + 4]; + y_row[j + 5] = sum5 + c_row[j + 5]; + } else { + y_row[j] = sum0; + y_row[j + 1] = sum1; + y_row[j + 2] = sum2; + y_row[j + 3] = sum3; + y_row[j + 4] = sum4; + y_row[j + 5] = sum5; + } + } + + for (; j < O; ++j) { + float32_t sum = 0.0f; + uint32_t k = 0; + for (; k < N_unroll; k += 6) { + sum += pSrcA[k * M + i] * pSrcB[k * O + j] + + pSrcA[(k + 1) * M + i] * pSrcB[(k + 1) * O + j] + + pSrcA[(k + 2) * M + i] * pSrcB[(k + 2) * O + j] + + pSrcA[(k + 3) * M + i] * pSrcB[(k + 3) * O + j] + + pSrcA[(k + 4) * M + i] * pSrcB[(k + 4) * O + j] + + pSrcA[(k + 5) * M + i] * pSrcB[(k + 5) * O + j]; + } + for (; k < N; ++k) { + sum += pSrcA[k * M + i] * pSrcB[k * O + j]; + } + + y_row[j] = has_bias ? sum + c_row[j] : sum; + } + } + } else if (!transA && transB) { + + for (uint32_t i = M_start; i < M_end; ++i) { + const float32_t *__restrict__ a_row = &pSrcA[i * N]; + float32_t *__restrict__ y_row = &pDstY[i * O]; + const float32_t *__restrict__ c_row = has_bias ? 
&pDstC[i * O] : NULL; + + uint32_t j = 0; + for (; j < O_unroll; j += 6) { + const float32_t *__restrict__ b_row0 = &pSrcB[j * N]; + const float32_t *__restrict__ b_row1 = &pSrcB[(j + 1) * N]; + const float32_t *__restrict__ b_row2 = &pSrcB[(j + 2) * N]; + const float32_t *__restrict__ b_row3 = &pSrcB[(j + 3) * N]; + const float32_t *__restrict__ b_row4 = &pSrcB[(j + 4) * N]; + const float32_t *__restrict__ b_row5 = &pSrcB[(j + 5) * N]; + + float32_t sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, + sum4 = 0.0f, sum5 = 0.0f; + + uint32_t k = 0; + for (; k < N_unroll; k += 6) { + const float32_t a0 = a_row[k]; + const float32_t a1 = a_row[k + 1]; + const float32_t a2 = a_row[k + 2]; + const float32_t a3 = a_row[k + 3]; + const float32_t a4 = a_row[k + 4]; + const float32_t a5 = a_row[k + 5]; + + sum0 += a0 * b_row0[k] + a1 * b_row0[k + 1] + a2 * b_row0[k + 2] + + a3 * b_row0[k + 3] + a4 * b_row0[k + 4] + a5 * b_row0[k + 5]; + sum1 += a0 * b_row1[k] + a1 * b_row1[k + 1] + a2 * b_row1[k + 2] + + a3 * b_row1[k + 3] + a4 * b_row1[k + 4] + a5 * b_row1[k + 5]; + sum2 += a0 * b_row2[k] + a1 * b_row2[k + 1] + a2 * b_row2[k + 2] + + a3 * b_row2[k + 3] + a4 * b_row2[k + 4] + a5 * b_row2[k + 5]; + sum3 += a0 * b_row3[k] + a1 * b_row3[k + 1] + a2 * b_row3[k + 2] + + a3 * b_row3[k + 3] + a4 * b_row3[k + 4] + a5 * b_row3[k + 5]; + sum4 += a0 * b_row4[k] + a1 * b_row4[k + 1] + a2 * b_row4[k + 2] + + a3 * b_row4[k + 3] + a4 * b_row4[k + 4] + a5 * b_row4[k + 5]; + sum5 += a0 * b_row5[k] + a1 * b_row5[k + 1] + a2 * b_row5[k + 2] + + a3 * b_row5[k + 3] + a4 * b_row5[k + 4] + a5 * b_row5[k + 5]; + } + + for (; k < N; ++k) { + const float32_t a_val = a_row[k]; + sum0 += a_val * b_row0[k]; + sum1 += a_val * b_row1[k]; + sum2 += a_val * b_row2[k]; + sum3 += a_val * b_row3[k]; + sum4 += a_val * b_row4[k]; + sum5 += a_val * b_row5[k]; + } + + if (has_bias) { + y_row[j] = sum0 + c_row[j]; + y_row[j + 1] = sum1 + c_row[j + 1]; + y_row[j + 2] = sum2 + c_row[j + 2]; + y_row[j + 3] = sum3 + 
c_row[j + 3]; + y_row[j + 4] = sum4 + c_row[j + 4]; + y_row[j + 5] = sum5 + c_row[j + 5]; + } else { + y_row[j] = sum0; + y_row[j + 1] = sum1; + y_row[j + 2] = sum2; + y_row[j + 3] = sum3; + y_row[j + 4] = sum4; + y_row[j + 5] = sum5; + } + } + + for (; j < O; ++j) { + const float32_t *__restrict__ b_row = &pSrcB[j * N]; + float32_t sum = 0.0f; + + uint32_t k = 0; + for (; k < N_unroll; k += 6) { + sum += a_row[k] * b_row[k] + a_row[k + 1] * b_row[k + 1] + + a_row[k + 2] * b_row[k + 2] + a_row[k + 3] * b_row[k + 3] + + a_row[k + 4] * b_row[k + 4] + a_row[k + 5] * b_row[k + 5]; + } + for (; k < N; ++k) { + sum += a_row[k] * b_row[k]; + } + + y_row[j] = has_bias ? sum + c_row[j] : sum; + } + } + } else { + + for (uint32_t i = M_start; i < M_end; ++i) { + float32_t *__restrict__ y_row = &pDstY[i * O]; + const float32_t *__restrict__ c_row = has_bias ? &pDstC[i * O] : NULL; + + uint32_t j = 0; + for (; j < O_unroll; j += 6) { + const float32_t *__restrict__ b_row0 = &pSrcB[j * N]; + const float32_t *__restrict__ b_row1 = &pSrcB[(j + 1) * N]; + const float32_t *__restrict__ b_row2 = &pSrcB[(j + 2) * N]; + const float32_t *__restrict__ b_row3 = &pSrcB[(j + 3) * N]; + const float32_t *__restrict__ b_row4 = &pSrcB[(j + 4) * N]; + const float32_t *__restrict__ b_row5 = &pSrcB[(j + 5) * N]; + + float32_t sum0 = 0.0f, sum1 = 0.0f, sum2 = 0.0f, sum3 = 0.0f, + sum4 = 0.0f, sum5 = 0.0f; + + uint32_t k = 0; + for (; k < N_unroll; k += 6) { + const float32_t a0 = pSrcA[k * M + i]; + const float32_t a1 = pSrcA[(k + 1) * M + i]; + const float32_t a2 = pSrcA[(k + 2) * M + i]; + const float32_t a3 = pSrcA[(k + 3) * M + i]; + const float32_t a4 = pSrcA[(k + 4) * M + i]; + const float32_t a5 = pSrcA[(k + 5) * M + i]; + + sum0 += a0 * b_row0[k] + a1 * b_row0[k + 1] + a2 * b_row0[k + 2] + + a3 * b_row0[k + 3] + a4 * b_row0[k + 4] + a5 * b_row0[k + 5]; + sum1 += a0 * b_row1[k] + a1 * b_row1[k + 1] + a2 * b_row1[k + 2] + + a3 * b_row1[k + 3] + a4 * b_row1[k + 4] + a5 * b_row1[k + 5]; + sum2 
+= a0 * b_row2[k] + a1 * b_row2[k + 1] + a2 * b_row2[k + 2] + + a3 * b_row2[k + 3] + a4 * b_row2[k + 4] + a5 * b_row2[k + 5]; + sum3 += a0 * b_row3[k] + a1 * b_row3[k + 1] + a2 * b_row3[k + 2] + + a3 * b_row3[k + 3] + a4 * b_row3[k + 4] + a5 * b_row3[k + 5]; + sum4 += a0 * b_row4[k] + a1 * b_row4[k + 1] + a2 * b_row4[k + 2] + + a3 * b_row4[k + 3] + a4 * b_row4[k + 4] + a5 * b_row4[k + 5]; + sum5 += a0 * b_row5[k] + a1 * b_row5[k + 1] + a2 * b_row5[k + 2] + + a3 * b_row5[k + 3] + a4 * b_row5[k + 4] + a5 * b_row5[k + 5]; + } + + for (; k < N; ++k) { + const float32_t a_val = pSrcA[k * M + i]; + sum0 += a_val * b_row0[k]; + sum1 += a_val * b_row1[k]; + sum2 += a_val * b_row2[k]; + sum3 += a_val * b_row3[k]; + sum4 += a_val * b_row4[k]; + sum5 += a_val * b_row5[k]; + } + + if (has_bias) { + y_row[j] = sum0 + c_row[j]; + y_row[j + 1] = sum1 + c_row[j + 1]; + y_row[j + 2] = sum2 + c_row[j + 2]; + y_row[j + 3] = sum3 + c_row[j + 3]; + y_row[j + 4] = sum4 + c_row[j + 4]; + y_row[j + 5] = sum5 + c_row[j + 5]; + } else { + y_row[j] = sum0; + y_row[j + 1] = sum1; + y_row[j + 2] = sum2; + y_row[j + 3] = sum3; + y_row[j + 4] = sum4; + y_row[j + 5] = sum5; + } + } + + for (; j < O; ++j) { + const float32_t *__restrict__ b_row = &pSrcB[j * N]; + float32_t sum = 0.0f; + + uint32_t k = 0; + for (; k < N_unroll; k += 6) { + sum += pSrcA[k * M + i] * b_row[k] + + pSrcA[(k + 1) * M + i] * b_row[k + 1] + + pSrcA[(k + 2) * M + i] * b_row[k + 2] + + pSrcA[(k + 3) * M + i] * b_row[k + 3] + + pSrcA[(k + 4) * M + i] * b_row[k + 4] + + pSrcA[(k + 5) * M + i] * b_row[k + 5]; + } + for (; k < N; ++k) { + sum += pSrcA[k * M + i] * b_row[k]; + } + + y_row[j] = has_bias ? 
sum + c_row[j] : sum; } - pDstY[i * O + j] = sum + pDstC[i * O + j]; } } } \ No newline at end of file diff --git a/TargetLibraries/PULPOpen/src/Layernorm.c b/TargetLibraries/PULPOpen/src/Layernorm.c index ca2d977399..f8387ab5e2 100644 --- a/TargetLibraries/PULPOpen/src/Layernorm.c +++ b/TargetLibraries/PULPOpen/src/Layernorm.c @@ -1,36 +1,10 @@ - -/* ===================================================================== - * Title: Layernorm.c - * Description: - * - * $Date: 05.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "pmsis.h" -#include "pulp_nn_kernels.h" -#include "pulp_nn_utils.h" #include "DeeployPULPMath.h" @@ -40,7 +14,7 @@ void PULP_Layernorm_fp32_fp32(float32_t *data_in, float32_t *data_out, uint32_t lastDimLength) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int32_t seq_length = size / lastDimLength; int32_t chunk = diff --git a/TargetLibraries/PULPOpen/src/Matmul.c b/TargetLibraries/PULPOpen/src/Matmul.c index ba2f235851..639da75f45 100644 --- a/TargetLibraries/PULPOpen/src/Matmul.c +++ b/TargetLibraries/PULPOpen/src/Matmul.c @@ -1,36 +1,10 @@ - -/* ===================================================================== - * Title: Matmul.c - * Description: - * - * $Date: 05.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "pmsis.h" -#include "pulp_nn_kernels.h" -#include "pulp_nn_utils.h" #include "DeeployPULPMath.h" @@ -40,7 +14,7 @@ void PULP_MatMul_fp32_fp32_fp32_unroll1x7(const float32_t *__restrict__ pSrcA, uint32_t M, uint32_t N, uint32_t O) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); uint32_t M_chunk = (M >> log2Core) + ((M & (NUM_CORES - 1)) != 0); uint32_t M_start = MIN(core_id * M_chunk, M); diff --git a/TargetLibraries/PULPOpen/src/MaxPool.c b/TargetLibraries/PULPOpen/src/MaxPool.c index 3ccea4cfbd..3b630b97cc 100644 --- a/TargetLibraries/PULPOpen/src/MaxPool.c +++ b/TargetLibraries/PULPOpen/src/MaxPool.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: MaxPool.c - * Description: - * - * Date: 05.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployPULPMath.h" @@ -38,7 +15,7 @@ void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, uint32_t pad_left, uint32_t pad_right) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); uint16_t ch_chunk = (C >> log2Core) + ((C & (NUM_CORES - 1)) != 0); uint16_t ch_start = MIN(ch_chunk * core_id, C); diff --git a/TargetLibraries/PULPOpen/src/RQiHardswish.c b/TargetLibraries/PULPOpen/src/RQiHardswish.c index 8689383c0d..a60247290c 100644 --- a/TargetLibraries/PULPOpen/src/RQiHardswish.c +++ b/TargetLibraries/PULPOpen/src/RQiHardswish.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: RQiHardswish.c - * Description: - * - * $Date: 15.03.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployPULPMath.h" @@ -37,7 +16,7 @@ void RQiHardswish_s8_s8_plp(int8_t *input, int8_t *output, int32_t size, rnd = (1 << (shift - 1)); int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, size); int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); diff --git a/TargetLibraries/PULPOpen/src/Relu.c b/TargetLibraries/PULPOpen/src/Relu.c index a446ac60c5..4e309bc092 100644 --- a/TargetLibraries/PULPOpen/src/Relu.c +++ b/TargetLibraries/PULPOpen/src/Relu.c @@ -1,31 +1,7 @@ - -/* ===================================================================== - * Title: Relu.c - * Description: - * - * Date: 05.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Run Wang, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployPULPMath.h" @@ -34,7 +10,7 @@ void PULP_Relu_fp32_fp32(float32_t *input, float32_t *output, uint32_t size) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int32_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); int32_t start = MIN(chunk * core_id, size); diff --git a/TargetLibraries/PULPOpen/src/RequantShift.c b/TargetLibraries/PULPOpen/src/RequantShift.c index 9343be20f3..3f1f15f32b 100644 --- a/TargetLibraries/PULPOpen/src/RequantShift.c +++ b/TargetLibraries/PULPOpen/src/RequantShift.c @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: RequantShift_s8.c - * Description: - * - * Date: 19.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese, ETH Zurich - * - Victor Jung, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployPULPMath.h" diff --git a/TargetLibraries/PULPOpen/src/Softmax.c b/TargetLibraries/PULPOpen/src/Softmax.c index f6370f35e8..3fd60111fe 100644 --- a/TargetLibraries/PULPOpen/src/Softmax.c +++ b/TargetLibraries/PULPOpen/src/Softmax.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: iSoftmax.c - * Description: - * - * $Date: 13.11.2023 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployPULPMath.h" @@ -127,7 +106,7 @@ void PULP_Softmax_fp32_fp32(float32_t *input, float32_t *output, uint32_t size, uint32_t last_dim_length) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int32_t num_vectors = size / last_dim_length; int32_t chunk = diff --git a/TargetLibraries/PULPOpen/src/UniformRequantShift.c b/TargetLibraries/PULPOpen/src/UniformRequantShift.c index 5507d0ebbc..18a179154b 100644 --- a/TargetLibraries/PULPOpen/src/UniformRequantShift.c +++ b/TargetLibraries/PULPOpen/src/UniformRequantShift.c @@ -1,28 +1,8 @@ -/* ---------------------------------------------------------------------- -# -# File: UniformRequantShift.c -# -# Last edited: 12.03.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ #include "DeeployPULPMath.h" #include "pmsis.h" @@ -34,7 +14,7 @@ void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, int8_t output_max, bool rounding) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, size); int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); @@ -99,7 +79,7 @@ void UniformRequantShift_u8_s8(uint8_t *data_in, int32_t size, int32_t mul, int8_t output_max, bool rounding) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, size); int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); @@ -164,7 +144,7 @@ void UniformRequantShift_s16_s8(int16_t *data_in, int32_t size, int32_t mul, int8_t output_max, bool rounding) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, size); int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); @@ -229,7 +209,7 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int8_t output_max, bool rounding) { int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int16_t chunk = (size >> log2Core) + ((size & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, size); int16_t chunk_stop = MIN(chunk_start + chunk, size + 1); diff --git a/TargetLibraries/PULPOpen/src/Util.c b/TargetLibraries/PULPOpen/src/Util.c index 257ea9590a..034beedf0f 100644 --- a/TargetLibraries/PULPOpen/src/Util.c +++ 
b/TargetLibraries/PULPOpen/src/Util.c @@ -1,33 +1,10 @@ -/* ===================================================================== - * Title: Util.c - * Description: - * - * Date: 15.03.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese, ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ -#include "DeeployMath.h" +#include "DeeployPULPMath.h" #include #include #include diff --git a/TargetLibraries/PULPOpen/src/dory_dma.c b/TargetLibraries/PULPOpen/src/dory_dma.c deleted file mode 100644 index 0aa31dcd17..0000000000 --- a/TargetLibraries/PULPOpen/src/dory_dma.c +++ /dev/null @@ -1,228 +0,0 @@ -/* - * dory_dma.c - * Alessio Burrello - * - * Copyright (C) 2019-2020 University of Bologna - * - * SPDX-License-Identifier: Apache-2.0 - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "dory_dma.h" - -#include "pmsis.h" - -#ifndef MCHAN_BASE_ADDR -// FIXME: For GAP9, this must point to ARCHI_MCHAN_EXT_ADDR!!! -// In PULP-SDK for Kraken, this is fixed. -// GAP8 hardware to be tested... -#define MCHAN_BASE_ADDR (ARCHI_MCHAN_DEMUX_ADDR) // CLUSTER_MCHAN_ADDR -#endif -#define MCHAN_EVENT -// #define MCHAN_POLLED -#ifdef MCHAN_EVENT -#define MCHAN_EVENT_BIT (ARCHI_CL_EVT_DMA0) // 8 -#endif -#include "mchan.h" - -#if defined(MCHAN_POLLED) -#define MCHAN_FLAGS (MCHAN_CMD_FLAG_INCREMENTAL) -#elif defined(MCHAN_EVENT) -#define MCHAN_FLAGS (MCHAN_CMD_FLAG_EVENT_ENABLE | MCHAN_CMD_FLAG_INCREMENTAL) -#elif defined(MCHAN_INTERRUPT) -#define MCHAN_FLAGS \ - (MCHAN_CMD_FLAG_INTERRUPT_ENABLE | MCHAN_CMD_FLAG_INCREMENTAL) -#endif - -#define MCHAN_FLAGS_1D (MCHAN_FLAGS) -#define MCHAN_FLAGS_2D (MCHAN_FLAGS | MCHAN_CMD_FLAG_2D_TRANSFER_EXTERNAL) - -#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) - -void dory_dma_memcpy_hwc_to_chw(DMA_copy *copy) { - int core_id = pi_core_id(); - int Log2Core = log2(NUM_CORES); - int number_of_copies_per_core = - (copy->length_1d_copy >> Log2Core) + - ((copy->length_1d_copy & (NUM_CORES - 1)) != 0); - int start_pixel, stop_pixel; // "pixel" is a misnomer; the CHANNELS are - // divided between the cores - // this function assumes that a DW tile is always as wide as the complete - // feature map (this is enforced by DORY's tiler) - start_pixel = MIN(number_of_copies_per_core * core_id, copy->length_1d_copy); - stop_pixel = - MIN(start_pixel + number_of_copies_per_core, copy->length_1d_copy); - void *ext = copy->ext + start_pixel; - void *loc = copy->loc + copy->number_of_1d_copies * - copy->number_of_2d_copies * start_pixel; - const int size_2d = copy->number_of_1d_copies * copy->number_of_2d_copies; - - for (int i = start_pixel; i < stop_pixel; i++) { - mchan_transfer_t trans = {.cmd = size_2d | - copy->dir << MCHAN_CMD_SHIFT_DIRECTION | - MCHAN_FLAGS_2D, - .size = size_2d, - .ext = ext, - .loc = loc, - .ext_size_1d = 1, // one byte at a time... 
- .ext_stride_1d = copy->stride_1d}; - mchan_transfer_push_2d(trans); -#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board - dory_dma_barrier(copy); -#endif - ext += 1; // next channel - loc += copy->number_of_1d_copies * copy->number_of_2d_copies; - } -} - -void dory_dma_memcpy_1d_async(DMA_copy *copy) { - if (pi_core_id() == 0) { - mchan_transfer_t trans = { - .cmd = copy->length_1d_copy * copy->number_of_1d_copies * - copy->number_of_2d_copies | - (copy->dir << MCHAN_CMD_SHIFT_DIRECTION) | MCHAN_FLAGS_1D, - .size = copy->length_1d_copy * copy->number_of_1d_copies * - copy->number_of_2d_copies, - .ext = copy->ext, - .loc = copy->loc}; - mchan_transfer_push_1d(trans); - } -} - -void dory_dma_memcpy_2d_async(DMA_copy *copy) { - if (pi_core_id() == 0) { - const int size_2d = copy->number_of_1d_copies * copy->length_1d_copy * - copy->number_of_2d_copies; - const int stride = - (copy->number_of_2d_copies == 1) ? copy->stride_1d : copy->stride_2d; - const int size_1d = (copy->number_of_2d_copies == 1) - ? 
copy->length_1d_copy - : copy->length_1d_copy * copy->number_of_1d_copies; - - mchan_transfer_t trans = {.cmd = size_2d | - copy->dir << MCHAN_CMD_SHIFT_DIRECTION | - MCHAN_FLAGS_2D, - .size = size_2d, - .ext = copy->ext, - .loc = copy->loc, - .ext_size_1d = size_1d, - .ext_stride_1d = stride}; - mchan_transfer_push_2d(trans); - } -} - -void dory_dma_memcpy_3d_async(DMA_copy *copy) { - int core_id = pi_core_id(); - if (core_id == 0) { - int Log2Core = log2(1); - int number_of_2d_copies_per_core = (copy->number_of_2d_copies >> Log2Core) + - ((copy->number_of_2d_copies & (0)) != 0); - int start_pixel, stop_pixel; - start_pixel = - MIN(number_of_2d_copies_per_core * core_id, copy->number_of_2d_copies); - stop_pixel = MIN(start_pixel + number_of_2d_copies_per_core, - copy->number_of_2d_copies); - void *ext = copy->ext + copy->stride_2d * start_pixel; - void *loc = copy->loc + - copy->length_1d_copy * copy->number_of_1d_copies * start_pixel; - const int size_2d = copy->number_of_1d_copies * copy->length_1d_copy; - - for (int i = start_pixel; i < stop_pixel; i++) { - mchan_transfer_t trans = {.cmd = size_2d | - copy->dir << MCHAN_CMD_SHIFT_DIRECTION | - MCHAN_FLAGS_2D, - .size = size_2d, - .ext = ext, - .loc = loc, - .ext_size_1d = copy->length_1d_copy, - .ext_stride_1d = copy->stride_1d}; - mchan_transfer_push_2d(trans); -#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board - // dory_dma_barrier(copy); -#endif - loc += size_2d; - ext += copy->stride_2d; - } - } -} - -void dory_dma_memcpy_async(DMA_copy *copy) { - if (copy->hwc_to_chw == 1) { - dory_dma_memcpy_hwc_to_chw(copy); - } else if ((copy->number_of_2d_copies == 1 && - copy->number_of_1d_copies == 1) || - (copy->stride_1d == copy->length_1d_copy && - copy->number_of_1d_copies * copy->length_1d_copy == - copy->stride_2d) || - (copy->number_of_2d_copies == 1 && - copy->length_1d_copy == copy->stride_1d)) { - dory_dma_memcpy_1d_async(copy); - } else if ((copy->number_of_2d_copies == 1) || - 
(copy->length_1d_copy == copy->stride_1d)) { // wrong! - dory_dma_memcpy_2d_async(copy); - } else { - dory_dma_memcpy_3d_async(copy); - } -} - -void dory_dma_memcpy_1d_mindims_async(DMA_copy *copy) { - mchan_transfer_t trans = { - .cmd = copy->mchan_cmd, .ext = copy->ext, .loc = copy->loc}; - mchan_transfer_push_1d(trans); -} - -void dory_dma_memcpy_2d_mindims_async(DMA_copy *copy) { - mchan_transfer_t trans = {.cmd = copy->mchan_cmd, - .ext = copy->ext, - .loc = copy->loc, - .ext_size_1d = copy->length_1d_copy, - .ext_stride_1d = copy->stride_1d}; - mchan_transfer_push_2d(trans); -} - -void dory_dma_memcpy_3d_mindims_async(DMA_copy *copy) { - void *ext = copy->ext; - void *loc = copy->loc; - const int length_2d_copy = - copy->mchan_cmd & ((1 << MCHAN_TRANSFER_LEN_SIZE) - 1); - - for (int i = 0; i < copy->number_of_2d_copies; i++) { - mchan_transfer_t trans = {.cmd = copy->mchan_cmd, - .ext = ext, - .loc = loc, - .ext_size_1d = copy->length_1d_copy, - .ext_stride_1d = copy->stride_1d}; - mchan_transfer_push_2d(trans); - loc += length_2d_copy; - ext += copy->stride_2d; -#ifdef ALWAYS_BLOCK_DMA_TRANSFERS // needed on GAP8 board - // dory_dma_barrier(copy); -#endif - } -} - -void dory_dma_memcpy_mindims_async(DMA_copy *copy) { - if (copy->number_of_2d_copies == 1 && copy->number_of_1d_copies == 1) { - dory_dma_memcpy_1d_mindims_async(copy); - } else if (copy->number_of_2d_copies == 1) { - dory_dma_memcpy_2d_mindims_async(copy); - } else { - dory_dma_memcpy_3d_mindims_async(copy); - } -} - -void dory_dma_free(DMA_copy *copy) { mchan_transfer_free(copy->tid); } - -void dory_dma_barrier(DMA_copy *copy) { mchan_transfer_wait(copy->tid); } - -int dory_dma_allocate() { return mchan_transfer_get_id(); } diff --git a/TargetLibraries/PULPOpen/src/dory_mem.c b/TargetLibraries/PULPOpen/src/dory_mem.c index 8c04f8f7f8..95699a7492 100644 --- a/TargetLibraries/PULPOpen/src/dory_mem.c +++ b/TargetLibraries/PULPOpen/src/dory_mem.c @@ -1,28 +1,7 @@ -/* 
===================================================================== - * Title: dory_mem.c - * Description: - * - * $Date: 12.12.2023 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "dory_mem.h" @@ -101,44 +80,46 @@ void mem_init() { struct pi_device *get_ram_ptr() { return &ram; } void *ram_malloc(size_t size) { - void *ptr = NULL; + uint32_t ptr; pi_ram_alloc(&ram, &ptr, size); - return ptr; + return (void *)ptr; } -void ram_free(void *ptr, size_t size) { pi_ram_free(&ram, ptr, size); } +void ram_free(void *ptr, size_t size) { + pi_ram_free(&ram, (uint32_t)ptr, size); +} void ram_read(void *dest, void *src, const size_t size) { - pi_ram_read(&ram, src, dest, size); + pi_ram_read(&ram, (uint32_t)src, dest, size); } void ram_write(void *dest, void *src, const size_t size) { - pi_ram_write(&ram, dest, src, size); + pi_ram_write(&ram, (uint32_t)dest, src, size); } void *cl_ram_malloc(size_t size) { - int addr; - pi_cl_ram_req_t req; + uint32_t addr; + pi_cl_ram_alloc_req_t req; pi_cl_ram_alloc(&ram, size, &req); pi_cl_ram_alloc_wait(&req, &addr); return (void *)addr; } void cl_ram_free(void *ptr, size_t size) { - pi_cl_ram_req_t req; - pi_cl_ram_free(&ram, ptr, 
size, &req); + pi_cl_ram_free_req_t req; + pi_cl_ram_free(&ram, (uint32_t)ptr, size, &req); pi_cl_ram_free_wait(&req); } void cl_ram_read(void *dest, void *src, const size_t size) { pi_cl_ram_req_t req; - pi_cl_ram_read(&ram, src, dest, size, &req); + pi_cl_ram_read(&ram, (uint32_t)src, dest, size, &req); pi_cl_ram_read_wait(&req); } void cl_ram_write(void *dest, void *src, const size_t size) { pi_cl_ram_req_t req; - pi_cl_ram_write(&ram, dest, src, size, &req); + pi_cl_ram_write(&ram, (uint32_t)dest, src, size, &req); pi_cl_ram_write_wait(&req); } @@ -162,7 +143,7 @@ size_t load_file_to_ram(const void *dest, const char *filename) { pi_cl_fs_req_t req; pi_cl_fs_read(fd, buffer, load_size, &req); pi_cl_fs_wait(&req); - cl_ram_write(dest + offset, buffer, load_size); + cl_ram_write((void *)dest + offset, buffer, load_size); offset += load_size; } while (offset < size); diff --git a/TargetLibraries/PULPOpen/src/gemv.c b/TargetLibraries/PULPOpen/src/gemv.c index c774224d6a..a3c26cf3c1 100644 --- a/TargetLibraries/PULPOpen/src/gemv.c +++ b/TargetLibraries/PULPOpen/src/gemv.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: vec2mat.c - * Description: - * - * $Date: 15.03.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "pmsis.h" @@ -151,5 +130,5 @@ void gemv_s8_s8_plp(int8_t *pIn, int8_t *pBias, int8_t *pOut, int8_t *pWeight, } } } - pi_cl_team_barrier(0); + pi_cl_team_barrier(); } diff --git a/TargetLibraries/PULPOpen/src/iRMSnorm.c b/TargetLibraries/PULPOpen/src/iRMSnorm.c index 78f882e16d..de03d241d2 100644 --- a/TargetLibraries/PULPOpen/src/iRMSnorm.c +++ b/TargetLibraries/PULPOpen/src/iRMSnorm.c @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: iRMSnorm.c - * Description: - * - * $Date: 14.03.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: Moritz Scherer, ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeployPULPMath.h" @@ -67,7 +46,7 @@ void iRMSnorm_s8_s8_plp(int8_t *data_in, int8_t *data_out, int32_t *weight, int32_t intermediate; int8_t core_id = pi_core_id(); - int8_t log2Core = log2(NUM_CORES); + int8_t log2Core = LOG2(NUM_CORES); int16_t chunk = (lastDimLength >> log2Core) + ((lastDimLength & (NUM_CORES - 1)) != 0); int16_t chunk_start = MIN(chunk * core_id, lastDimLength); diff --git a/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed b/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed index b69ec23ec8..a9b4aaf597 160000 --- a/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed +++ b/TargetLibraries/PULPOpen/third_party/pulp-nn-mixed @@ -1 +1 @@ -Subproject commit b69ec23ec81595ebbec694f4a28d84022858af83 +Subproject commit a9b4aaf597c030ce24bf65a00b5f3ec84a1528c4 diff --git a/TargetLibraries/Snitch/CMakeLists.txt b/TargetLibraries/Snitch/CMakeLists.txt index 78a214fea9..a2dd6703c7 100644 --- a/TargetLibraries/Snitch/CMakeLists.txt +++ b/TargetLibraries/Snitch/CMakeLists.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + file(GLOB_RECURSE SOURCES "src/**" ) diff --git a/TargetLibraries/Snitch/cmake/snitch-runtime-precompiled.cmake b/TargetLibraries/Snitch/cmake/snitch-runtime-precompiled.cmake index a9947dc0ab..85a509a83c 100644 --- a/TargetLibraries/Snitch/cmake/snitch-runtime-precompiled.cmake +++ b/TargetLibraries/Snitch/cmake/snitch-runtime-precompiled.cmake @@ -1,3 +1,7 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + set(SNITCH_RUNTIME_BASE_INCLUDE ${SNITCH_RUNTIME_HOME}/src ${SNITCH_RUNTIME_HOME}/api diff --git a/TargetLibraries/Snitch/inc/CycleCounter.h b/TargetLibraries/Snitch/inc/CycleCounter.h index f197055746..a1516f471e 100644 --- a/TargetLibraries/Snitch/inc/CycleCounter.h +++ b/TargetLibraries/Snitch/inc/CycleCounter.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: CycleCounter.h - * Description: - * - * Date: 06.12.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_MATH_CYCLE_HEADER_ diff --git a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h index 193df1132e..e44d3c20c6 100644 --- a/TargetLibraries/Snitch/inc/DeeploySnitchMath.h +++ b/TargetLibraries/Snitch/inc/DeeploySnitchMath.h @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: DeeploySnitchMath.h - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/Snitch/inc/dmaStruct.h b/TargetLibraries/Snitch/inc/dmaStruct.h index bf360747e3..abbfc42b5a 100644 --- a/TargetLibraries/Snitch/inc/dmaStruct.h +++ b/TargetLibraries/Snitch/inc/dmaStruct.h @@ -1,28 +1,11 @@ -/* ---------------------------------------------------------------------- -# -# File: dmaStruct.h -# -# Last edited: 03.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_DMASTRUCT_HEADER_ +#define __DEEPLOY_MATH_DMASTRUCT_HEADER_ #include "snrt.h" @@ -34,4 +17,6 @@ typedef struct { size_t src_stride; size_t repeat; snrt_dma_txid_t tid; -} DMA_copy; \ No newline at end of file +} DMA_copy; + +#endif // __DEEPLOY_MATH_DMASTRUCT_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/Snitch/inc/kernel/Gemm.h b/TargetLibraries/Snitch/inc/kernel/Gemm.h index b1197b7863..d4a25a9d78 100644 --- a/TargetLibraries/Snitch/inc/kernel/Gemm.h +++ b/TargetLibraries/Snitch/inc/kernel/Gemm.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Gemm.h - * Description: - * - * Date: 30.05.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Gemm_fp32.h b/TargetLibraries/Snitch/inc/kernel/Gemm_fp32.h index 8c3e3afa12..2c3507113e 100644 --- a/TargetLibraries/Snitch/inc/kernel/Gemm_fp32.h +++ b/TargetLibraries/Snitch/inc/kernel/Gemm_fp32.h @@ -1,5 +1,11 @@ -#ifndef __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ -#define __DEEPLOY_MATH_GEMM_KERNEL_HEADER_ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_GEMMFP32_KERNEL_HEADER_ +#define __DEEPLOY_MATH_GEMMFP32_KERNEL_HEADER_ #include "DeeploySnitchMath.h" @@ -48,4 +54,4 @@ void gemm_fp32_opt(uint32_t M, uint32_t N, uint32_t K, float32_t *A, uint32_t ldC, float32_t *Y, uint32_t BETA, uint32_t setup_SSR); -#endif //__DEEPLOY_MATH_GEMM_KERNEL_HEADER_ \ No newline at end of file +#endif //__DEEPLOY_MATH_GEMMFP32_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/Snitch/inc/kernel/MatMul.h b/TargetLibraries/Snitch/inc/kernel/MatMul.h index be0cdeac7c..d4b9ba71ca 100644 --- a/TargetLibraries/Snitch/inc/kernel/MatMul.h +++ b/TargetLibraries/Snitch/inc/kernel/MatMul.h @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: MatMul.h - * Description: - * - * Date: 30.05.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_MATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/RQGemm.h b/TargetLibraries/Snitch/inc/kernel/RQGemm.h index b1d77c1e1c..dc72e1b3e8 100644 --- a/TargetLibraries/Snitch/inc/kernel/RQGemm.h +++ b/TargetLibraries/Snitch/inc/kernel/RQGemm.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RQGemm.h - * Description: - * - * Date: 30.05.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_RQGEMM_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/RQMatMul.h b/TargetLibraries/Snitch/inc/kernel/RQMatMul.h index fdac512497..c58e25d516 100644 --- a/TargetLibraries/Snitch/inc/kernel/RQMatMul.h +++ b/TargetLibraries/Snitch/inc/kernel/RQMatMul.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RQMatMul.h - * Description: - * - * Date: 30.05.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_MATH_RQMATMUL_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/kernel/Softmax.h b/TargetLibraries/Snitch/inc/kernel/Softmax.h index fc0bf272f3..c2d7596e7a 100644 --- a/TargetLibraries/Snitch/inc/kernel/Softmax.h +++ b/TargetLibraries/Snitch/inc/kernel/Softmax.h @@ -1,5 +1,16 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ +#define __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ + #include "DeeploySnitchMath.h" void softmax_fp32(float *input, float *output, int32_t ldI, int32_t batch_offset, int32_t batch_size, int32_t seq_len, - int32_t input_samples); \ No newline at end of file + int32_t input_samples); + +#endif // #define __DEEPLOY_MATH_SOFTMAX_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h b/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h index 79887d2322..32358e5fb8 100644 --- a/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h +++ b/TargetLibraries/Snitch/inc/kernel/UniformRequantShift.h @@ -1,32 +1,12 @@ /* - * ---------------------------------------------------------------------- + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * - * File: UniformRequantShift.h - * - * Last edited: 30.05.2024 - * - * Copyright (C) 2024, ETH Zurich and University of Bologna. - * - * Authors: - * - Victor Jung, (jungvi@iis.ee.ethz.ch), ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich - * - * ---------------------------------------------------------------------- * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ +#ifndef __DEEPLOY_MATH_UNIFORMREQUANTSHIFT_KERNEL_HEADER_ +#define __DEEPLOY_MATH_UNIFORMREQUANTSHIFT_KERNEL_HEADER_ + #include "DeeploySnitchMath.h" void UniformRequantShift_s8_s8(int8_t *data_in, int32_t size, int32_t mul, @@ -51,4 +31,6 @@ void UniformRequantShift_s32_s8(int32_t *data_in, int32_t size, int32_t mul, int32_t add, int8_t *data_out, int32_t log2D, int32_t HW, int32_t input_offset, int32_t output_offset, int8_t output_min, - int8_t output_max, bool rounding); \ No newline at end of file + int8_t output_max, bool rounding); + +#endif // __DEEPLOY_MATH_UNIFORMREQUANTSHIFT_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/Snitch/inc/kernel/iNoNorm.h b/TargetLibraries/Snitch/inc/kernel/iNoNorm.h index 56b58e0959..694df5be29 100644 --- a/TargetLibraries/Snitch/inc/kernel/iNoNorm.h +++ b/TargetLibraries/Snitch/inc/kernel/iNoNorm.h @@ -1,31 +1,16 @@ -/* ---------------------------------------------------------------------- -# -# File: iNoNorm.h -# -# Last edited: 06.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_MATH_INONORM_KERNEL_HEADER_ +#define __DEEPLOY_MATH_INONORM_KERNEL_HEADER_ #include "DeeploySnitchMath.h" void SnitchiNoNorm_s8_s8(int8_t *data_in, int8_t *data_out, int8_t *weights, int32_t *bias, uint32_t size, int32_t mul, int32_t log2D); + +#endif // __DEEPLOY_MATH_INONORM_KERNEL_HEADER_ \ No newline at end of file diff --git a/TargetLibraries/Snitch/inc/kernel/iSoftmax.h b/TargetLibraries/Snitch/inc/kernel/iSoftmax.h index a59c54e9be..fb68d04adf 100644 --- a/TargetLibraries/Snitch/inc/kernel/iSoftmax.h +++ b/TargetLibraries/Snitch/inc/kernel/iSoftmax.h @@ -1,32 +1,12 @@ -/* ===================================================================== - * Title: iSoftmax.h - * Description: - * - * $Date: 30.05.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ +#ifndef __DEEPLOY_MATH_ISOFTMAX_KERNEL_HEADER_ +#define __DEEPLOY_MATH_ISOFTMAX_KERNEL_HEADER_ + #include "DeeploySnitchMath.h" void SnitchSoftmax_u8_u8(uint8_t *data_in, uint8_t *data_out, @@ -37,3 +17,5 @@ void StnichSoftmax_i8_u8(int8_t *data_in, uint8_t *data_out, uint32_t *lastDimBuffer, uint32_t size, uint32_t lastDimLength, int32_t coeffB, int32_t coeffC, int32_t log2); + +#endif // __DEEPLOY_MATH_ISOFTMAX_KERNEL_HEADER_ diff --git a/TargetLibraries/Snitch/inc/macros.h b/TargetLibraries/Snitch/inc/macros.h index a54bc24d89..bc1191d25a 100644 --- a/TargetLibraries/Snitch/inc/macros.h +++ b/TargetLibraries/Snitch/inc/macros.h @@ -1,29 +1,7 @@ /* - * ---------------------------------------------------------------------- + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * - * File: macros.h - * - * Last edited: 30.05.2024 - * - * Copyright (C) 2024, ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich - * - * ---------------------------------------------------------------------- * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_MATH_MACROS_HEADER_ diff --git a/TargetLibraries/Snitch/src/Add.c b/TargetLibraries/Snitch/src/Add.c index 094739cf1b..40b216f138 100644 --- a/TargetLibraries/Snitch/src/Add.c +++ b/TargetLibraries/Snitch/src/Add.c @@ -1,28 +1,8 @@ -/* ---------------------------------------------------------------------- -# -# File: Add.c -# -# Last edited: 11.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/CycleCounter.c b/TargetLibraries/Snitch/src/CycleCounter.c index 1a3ccd133b..3861c421c1 100644 --- a/TargetLibraries/Snitch/src/CycleCounter.c +++ b/TargetLibraries/Snitch/src/CycleCounter.c @@ -1,29 +1,7 @@ /* - * ---------------------------------------------------------------------- + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * - * File: CycleCounter.c - * - * Last edited: 23.04.2024 - * - * Copyright (C) 2024, ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich - * - * ---------------------------------------------------------------------- * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeploySnitchMath.h" @@ -36,7 +14,7 @@ static uint32_t instr_end[NUM_CORES] __attribute__((section(".l1"))); static uint32_t running[NUM_CORES] __attribute__((section(".l1"))); void ResetTimer() { - // snrt_reset_perf_counter(SNRT_PERF_CNT0); + snrt_reset_perf_counter(SNRT_PERF_CNT0); uint32_t const core_id = snrt_global_core_idx(); uint32_t _timer_init = read_csr(mcycle); uint32_t _instr_init = read_csr(minstret); @@ -48,7 +26,9 @@ void ResetTimer() { } void StartTimer() { - // snrt_start_perf_counter(SNRT_PERF_CNT0, SNRT_PERF_CNT_CYCLES, 0); + if (snrt_is_dm_core()) { + snrt_start_perf_counter(SNRT_PERF_CNT0, SNRT_PERF_CNT_CYCLES, 0); + } uint32_t const core_id = snrt_global_core_idx(); timer_init[core_id] = read_csr(mcycle); instr_init[core_id] = read_csr(minstret); @@ -56,17 +36,16 @@ void StartTimer() { } void StopTimer() { - // if (!snrt_is_dm_core()) { - // snrt_stop_perf_counter(SNRT_PERF_CNT0); - // } + if (snrt_is_dm_core()) { + snrt_stop_perf_counter(SNRT_PERF_CNT0); + } uint32_t const core_id = snrt_global_core_idx(); timer_end[core_id] = read_csr(mcycle); - timer_end[core_id] = read_csr(minstret); + instr_end[core_id] = read_csr(minstret); running[core_id] = 0; } uint32_t getCycles() { - // return 
snrt_get_perf_counter(SNRT_PERF_CNT0); uint32_t const core_id = snrt_global_core_idx(); if (running[core_id]) { return read_csr(mcycle) - timer_init[core_id]; diff --git a/TargetLibraries/Snitch/src/Gemm_fp32.c b/TargetLibraries/Snitch/src/Gemm_fp32.c index b9ff58850e..9a79538e12 100644 --- a/TargetLibraries/Snitch/src/Gemm_fp32.c +++ b/TargetLibraries/Snitch/src/Gemm_fp32.c @@ -1,3 +1,9 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + #include "DeeploySnitchMath.h" #include "Gemm.h" diff --git a/TargetLibraries/Snitch/src/Gemm_s8.c b/TargetLibraries/Snitch/src/Gemm_s8.c index eefd407394..c216815505 100644 --- a/TargetLibraries/Snitch/src/Gemm_s8.c +++ b/TargetLibraries/Snitch/src/Gemm_s8.c @@ -1,3 +1,9 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + #include "DeeploySnitchMath.h" #include "Gemm.h" diff --git a/TargetLibraries/Snitch/src/MatMul_s16.c b/TargetLibraries/Snitch/src/MatMul_s16.c index f4c56ba14e..6ec2ac8502 100644 --- a/TargetLibraries/Snitch/src/MatMul_s16.c +++ b/TargetLibraries/Snitch/src/MatMul_s16.c @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: MatMul_s16.c - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/MatMul_s32.c b/TargetLibraries/Snitch/src/MatMul_s32.c index 4ba81a7442..3c4a8d3837 100644 --- a/TargetLibraries/Snitch/src/MatMul_s32.c +++ b/TargetLibraries/Snitch/src/MatMul_s32.c @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: MatMul_s32.c - * Description: - * - * Date: 29.11.2022 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/MatMul_s8.c b/TargetLibraries/Snitch/src/MatMul_s8.c index e9ad77f349..d20cf2a756 100644 --- a/TargetLibraries/Snitch/src/MatMul_s8.c +++ b/TargetLibraries/Snitch/src/MatMul_s8.c @@ -1,32 +1,7 @@ -/* ===================================================================== - * Title: MatMul_s8.c - * Description: - * - * Date: 30.05.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Samuel Riedel, ETH Zurich - * - Sergio Mazzola, ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/RQGemm_s8.c b/TargetLibraries/Snitch/src/RQGemm_s8.c index cfbc867bd7..d4e0d4c660 100644 --- a/TargetLibraries/Snitch/src/RQGemm_s8.c +++ b/TargetLibraries/Snitch/src/RQGemm_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RQGemm_s8.c - * Description: - * - * Date: 30.05.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2023 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/RQMatMul_s8.c b/TargetLibraries/Snitch/src/RQMatMul_s8.c index 083141505e..1f20ecf8c0 100644 --- a/TargetLibraries/Snitch/src/RQMatMul_s8.c +++ b/TargetLibraries/Snitch/src/RQMatMul_s8.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: RQMatMul_s8.c - * Description: - * - * Date: 30.05.2024 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/Softmax_fp32.c b/TargetLibraries/Snitch/src/Softmax_fp32.c index 45fb960ffb..b8abb27845 100644 --- a/TargetLibraries/Snitch/src/Softmax_fp32.c +++ b/TargetLibraries/Snitch/src/Softmax_fp32.c @@ -1,3 +1,9 @@ +/* + * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + #include "DeeploySnitchMath.h" void Softmax_fp32(float32_t *input, float32_t *output, int32_t ldI, diff --git a/TargetLibraries/Snitch/src/UniformRequantShift.c b/TargetLibraries/Snitch/src/UniformRequantShift.c index a45a0b7e33..c326f0f769 100644 --- a/TargetLibraries/Snitch/src/UniformRequantShift.c +++ b/TargetLibraries/Snitch/src/UniformRequantShift.c @@ -1,30 +1,7 @@ /* - * ---------------------------------------------------------------------- + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * - * File: UniformRequantShift.c - * - * Last edited: 30.05.2024 - * - * Copyright (C) 2024, ETH Zurich and University of Bologna. - * - * Authors: - * - Victor Jung, (jungvi@iis.ee.ethz.ch), ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich - * - * ---------------------------------------------------------------------- * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/Util.c b/TargetLibraries/Snitch/src/Util.c index 7325e66047..35ef97ea59 100644 --- a/TargetLibraries/Snitch/src/Util.c +++ b/TargetLibraries/Snitch/src/Util.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Util.c - * Description: - * - * Date: 15.03.2023 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. - * - * Authors: - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except pSrcA compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to pSrcA writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/iNoNorm.c b/TargetLibraries/Snitch/src/iNoNorm.c index 30b3c68492..ee39c08f1d 100644 --- a/TargetLibraries/Snitch/src/iNoNorm.c +++ b/TargetLibraries/Snitch/src/iNoNorm.c @@ -1,28 +1,8 @@ -/* ---------------------------------------------------------------------- -# -# File: iNoNorm.c -# -# Last edited: 06.06.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. 
-# -# Author: -# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich -# -# ---------------------------------------------------------------------- -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -*/ +/* + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/iSoftmax.c b/TargetLibraries/Snitch/src/iSoftmax.c index 41194a969f..9cd6c0f641 100644 --- a/TargetLibraries/Snitch/src/iSoftmax.c +++ b/TargetLibraries/Snitch/src/iSoftmax.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: iSoftmax.c - * Description: - * - * $Date: 30.05.2024 - * - * ===================================================================== */ /* - * Copyright (C) 2020 ETH Zurich and University of Bologna. - * - * Author: - * - Moritz Scherer, ETH Zurich - * - Philip Wiese (wiesep@iis.ee.ethz.ch), ETH Zurich + * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/Snitch/src/snitch_nn_add_i8_i8_i8.c b/TargetLibraries/Snitch/src/snitch_nn_add_i8_i8_i8.c index f83e3bac88..b735e60704 100644 --- a/TargetLibraries/Snitch/src/snitch_nn_add_i8_i8_i8.c +++ b/TargetLibraries/Snitch/src/snitch_nn_add_i8_i8_i8.c @@ -1,21 +1,7 @@ /* - * pulp_nn_add_i8_i8_i8.c - * Georg Rutishauser - * Victor Jung + * SPDX-FileCopyrightText: 2018 ETH Zurich and University of Bologna * - * Copyright (C) 2018-2020 University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
+ * SPDX-License-Identifier: Apache-2.0 */ #include "DeeploySnitchMath.h" diff --git a/TargetLibraries/SoftHier/CMakeLists.txt b/TargetLibraries/SoftHier/CMakeLists.txt index d7cab9c559..1ae3a51f59 100644 --- a/TargetLibraries/SoftHier/CMakeLists.txt +++ b/TargetLibraries/SoftHier/CMakeLists.txt @@ -1,17 +1,21 @@ -file(GLOB_RECURSE SOURCES +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +file(GLOB_RECURSE SOURCES "src/**" ) include(cmake/softhier-runtime.cmake) add_deeploy_library(deeploysofthier STATIC ${SOURCES}) -target_include_directories(deeploysofthier +target_include_directories(deeploysofthier PUBLIC ${CMAKE_CURRENT_LIST_DIR}/inc ) target_compile_options(deeploysofthier PRIVATE - -DDEEPLOY_SOFTHIER_PLATFORM + -DDEEPLOY_SOFTHIER_PLATFORM -Wno-implicit-function-declaration -Wno-implicit-int-conversion -Wno-sign-conversion diff --git a/TargetLibraries/SoftHier/cmake/softhier-runtime.cmake b/TargetLibraries/SoftHier/cmake/softhier-runtime.cmake index a0ce19e6c4..dd900b3829 100644 --- a/TargetLibraries/SoftHier/cmake/softhier-runtime.cmake +++ b/TargetLibraries/SoftHier/cmake/softhier-runtime.cmake @@ -1,3 +1,7 @@ +# Copyright 2025 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + # SoftHier runtime should have structure: # runtime/ # ├── include/ @@ -23,7 +27,7 @@ set_source_files_properties( -add_library(softhier-sdk OBJECT +add_library(softhier-sdk OBJECT ${SOFTHIER_RUNTIME_ASM_SOURCE} ) @@ -32,7 +36,7 @@ target_compile_options(softhier-sdk PRIVATE ) target_include_directories(softhier-sdk SYSTEM PUBLIC ${SOFTHIER_INCLUDES}) -target_compile_options(softhier-sdk PRIVATE +target_compile_options(softhier-sdk PRIVATE -Wno-sign-conversion -Wno-unused-function -Wno-unused-parameter diff --git a/TargetLibraries/SoftHier/inc/DeeploySoftHierMath.h b/TargetLibraries/SoftHier/inc/DeeploySoftHierMath.h index 5a77b83a49..e54b3be47d 100644 --- a/TargetLibraries/SoftHier/inc/DeeploySoftHierMath.h +++ b/TargetLibraries/SoftHier/inc/DeeploySoftHierMath.h @@ -1,28 +1,7 @@ -/* ===================================================================== - * Title: DeeployMath.h - * Description: - * - * $Date: 07.06.2025 - * - * ===================================================================== */ /* - * Copyright (C) 2025 ETH Zurich and University of Bologna. - * - * Author: Bowen Wang , ETH Zurich + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #ifndef __DEEPLOY_MATH_HEADER_ diff --git a/TargetLibraries/SoftHier/inc/types.h b/TargetLibraries/SoftHier/inc/types.h index 4d57747402..04720e14b4 100644 --- a/TargetLibraries/SoftHier/inc/types.h +++ b/TargetLibraries/SoftHier/inc/types.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: types.h - * Description: - * - * Date: 07.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2025 ETH Zurich and University of Bologna. - * - * Authors: - * - Bowen Wang , ETH Zurich + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_SOFTHIER_MATH_TYPES_HEADER_ diff --git a/TargetLibraries/SoftHier/inc/util.h b/TargetLibraries/SoftHier/inc/util.h index 1997e1ffc7..8474e9fd0b 100644 --- a/TargetLibraries/SoftHier/inc/util.h +++ b/TargetLibraries/SoftHier/inc/util.h @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: util.h - * Description: - * - * Date: 07.06.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2022 ETH Zurich and University of Bologna. 
- * - * Authors: - * - Bowen Wang , ETH Zurich + * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ #ifndef __DEEPLOY_BASIC_MATH_UTIL_HEADER_ diff --git a/TargetLibraries/SoftHier/src/Util.c b/TargetLibraries/SoftHier/src/Util.c index 3e1120459a..156953801b 100644 --- a/TargetLibraries/SoftHier/src/Util.c +++ b/TargetLibraries/SoftHier/src/Util.c @@ -1,30 +1,7 @@ -/* ===================================================================== - * Title: Util.c - * Description: - * - * Date: 06.05.2025 - * - * ===================================================================== */ - /* - * Copyright (C) 2025 ETH Zurich and University of Bologna. - * - * Authors: - * - Bowen Wang , ETH Zurich + * SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna * * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ #include "DeeploySoftHierMath.h" diff --git a/cmake/Util.cmake b/cmake/Util.cmake index 88bc48007b..1e54dc680b 100644 --- a/cmake/Util.cmake +++ b/cmake/Util.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + macro(add_deeploy_library name) add_library(${ARGV}) add_custom_command( diff --git a/cmake/chimera/chimera-sdk.cmake b/cmake/chimera/chimera-sdk.cmake index c54b4573dd..b01b03b28d 100644 --- a/cmake/chimera/chimera-sdk.cmake +++ b/cmake/chimera/chimera-sdk.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + add_compile_definitions( DEEPLOY_CHIMERA_PLATFORM ) diff --git a/cmake/chimera/toolchain_llvm.cmake b/cmake/chimera/toolchain_llvm.cmake index cbb6d84bfd..6e613c0cb9 100644 --- a/cmake/chimera/toolchain_llvm.cmake +++ b/cmake/chimera/toolchain_llvm.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) set(CMAKE_SYSTEM_NAME Generic) @@ -13,7 +17,7 @@ set(CMAKE_LINKER ${TOOLCHAIN_PREFIX}/ld.lld) set(CMAKE_EXECUTABLE_SUFFIX ".elf") # Enable WHOLE_ARCHIVE feature -set(CMAKE_LINK_LIBRARY_USING_WHOLE_ARCHIVE +set(CMAKE_LINK_LIBRARY_USING_WHOLE_ARCHIVE "-Wl,--whole-archive -Wl,--no-whole-archive" ) set(CMAKE_LINK_LIBRARY_USING_WHOLE_ARCHIVE_SUPPORTED True) diff --git a/cmake/cmsis/cmsis.cmake b/cmake/cmsis/cmsis.cmake index f10892c87b..adcb7c7a36 100644 --- a/cmake/cmsis/cmsis.cmake +++ b/cmake/cmsis/cmsis.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + add_compile_definitions( DEEPLOY_CMSIS_PLATFORM ) diff --git a/cmake/cmsis/qemu.cmake b/cmake/cmsis/qemu.cmake index d8b8e04512..1c2bf650e8 100644 --- a/cmake/cmsis/qemu.cmake +++ b/cmake/cmsis/qemu.cmake @@ -1,3 +1,7 @@ +# 
SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TARGET_CPU "cortex-m4" CACHE STRING "Target CPU") set(CPU cortex-m4) set(FPU fpv4-sp-d16) diff --git a/cmake/cmsis/toolchain_gcc.cmake b/cmake/cmsis/toolchain_gcc.cmake index c99245b93e..aabf5c591b 100644 --- a/cmake/cmsis/toolchain_gcc.cmake +++ b/cmake/cmsis/toolchain_gcc.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX arm-none-eabi) set(CMAKE_SYSTEM_NAME Generic) diff --git a/cmake/cmsis/toolchain_llvm.cmake b/cmake/cmsis/toolchain_llvm.cmake index 8a329b1aaa..6c747b1d9d 100644 --- a/cmake/cmsis/toolchain_llvm.cmake +++ b/cmake/cmsis/toolchain_llvm.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) set(CMAKE_SYSTEM_NAME Generic) diff --git a/cmake/common.cmake b/cmake/common.cmake index c090edf525..18437219d5 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) set(CMAKE_EXPORT_COMPILE_COMMANDS TRUE) diff --git a/cmake/generic/generic.cmake b/cmake/generic/generic.cmake index 63c17b1024..a1d2ba807b 100644 --- a/cmake/generic/generic.cmake +++ b/cmake/generic/generic.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + add_compile_definitions( DEEPLOY_GENERIC_PLATFORM ) diff --git a/cmake/generic/toolchain_llvm.cmake b/cmake/generic/toolchain_llvm.cmake index fbe119d717..2dafa53857 100644 --- a/cmake/generic/toolchain_llvm.cmake +++ b/cmake/generic/toolchain_llvm.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna 
+# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) set(CMAKE_SYSTEM_NAME Generic) diff --git a/cmake/mempool/mempool.cmake b/cmake/mempool/mempool.cmake index 9ba4070c58..77053c1e06 100644 --- a/cmake/mempool/mempool.cmake +++ b/cmake/mempool/mempool.cmake @@ -1,7 +1,10 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + ############################# ## Address configuration ## ############################# - # Boot address (in dec) set(boot_addr 2684354560 CACHE STRING "Boot address (in dec)") # A0000000 diff --git a/cmake/mempool/mempool.yaml b/cmake/mempool/mempool.yaml index b9e776dc35..f940a961b5 100644 --- a/cmake/mempool/mempool.yaml +++ b/cmake/mempool/mempool.yaml @@ -1,5 +1,5 @@ -# Copyright 2021 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna +# # SPDX-License-Identifier: Apache-2.0 --- diff --git a/cmake/mempool/mempool_ita.cmake b/cmake/mempool/mempool_ita.cmake index bc3631c6d2..e19be8366c 100644 --- a/cmake/mempool/mempool_ita.cmake +++ b/cmake/mempool/mempool_ita.cmake @@ -1,7 +1,10 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + ############################# ## Address configuration ## ############################# - # Boot address (in dec) set(boot_addr 2684354560 CACHE STRING "Boot address (in dec)") # A0000000 diff --git a/cmake/mempool/mempool_ita.yaml b/cmake/mempool/mempool_ita.yaml index 3c26e30095..aa1504c637 100644 --- a/cmake/mempool/mempool_ita.yaml +++ b/cmake/mempool/mempool_ita.yaml @@ -1,5 +1,5 @@ -# Copyright 2021 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna +# # SPDX-License-Identifier: Apache-2.0 --- diff --git a/cmake/mempool/minpool.cmake b/cmake/mempool/minpool.cmake index 3c688504ae..664c4a8f27 100644 --- a/cmake/mempool/minpool.cmake +++ b/cmake/mempool/minpool.cmake @@ -1,7 +1,10 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + ############################# ## Address configuration ## ############################# - # Boot address (in dec) set(boot_addr 2684354560 CACHE STRING "Boot address (in dec)") # A0000000 diff --git a/cmake/mempool/minpool.yaml b/cmake/mempool/minpool.yaml index c80e9964df..436e0c5eec 100644 --- a/cmake/mempool/minpool.yaml +++ b/cmake/mempool/minpool.yaml @@ -1,5 +1,5 @@ -# Copyright 2021 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna +# # SPDX-License-Identifier: Apache-2.0 --- diff --git a/cmake/mempool/toolchain_gcc.cmake b/cmake/mempool/toolchain_gcc.cmake index fa0636f72d..58c38049bf 100644 --- a/cmake/mempool/toolchain_gcc.cmake +++ b/cmake/mempool/toolchain_gcc.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin/riscv32-unknown-elf) set(CMAKE_SYSTEM_NAME Generic) diff --git a/cmake/mempool/toolchain_llvm.cmake b/cmake/mempool/toolchain_llvm.cmake index fe66233360..3340daf460 100644 --- a/cmake/mempool/toolchain_llvm.cmake +++ b/cmake/mempool/toolchain_llvm.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) set(CMAKE_SYSTEM_NAME Generic) diff --git a/cmake/pulp/pulp-open/pulp-open.cmake b/cmake/pulp/pulp-open/pulp-open.cmake index 
c8d7454f11..a58a1bd8e8 100644 --- a/cmake/pulp/pulp-open/pulp-open.cmake +++ b/cmake/pulp/pulp-open/pulp-open.cmake @@ -1,2 +1,6 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(PULPNNVERSION XPULPV2) set(PULPNNBITWIDTH 32) diff --git a/cmake/pulp/pulp.cmake b/cmake/pulp/pulp.cmake index 242e798e29..8518804315 100644 --- a/cmake/pulp/pulp.cmake +++ b/cmake/pulp/pulp.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + add_compile_definitions( DEEPLOY_PULP_PLATFORM ) diff --git a/cmake/pulp/siracusa/siracusa.cmake b/cmake/pulp/siracusa/siracusa.cmake index c8d7454f11..a58a1bd8e8 100644 --- a/cmake/pulp/siracusa/siracusa.cmake +++ b/cmake/pulp/siracusa/siracusa.cmake @@ -1,2 +1,6 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(PULPNNVERSION XPULPV2) set(PULPNNBITWIDTH 32) diff --git a/cmake/pulp/toolchain_gcc.cmake b/cmake/pulp/toolchain_gcc.cmake index a5681a319a..18014a98b9 100644 --- a/cmake/pulp/toolchain_gcc.cmake +++ b/cmake/pulp/toolchain_gcc.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin/riscv32-unknown-elf) set(CMAKE_SYSTEM_NAME Generic) diff --git a/cmake/pulp/toolchain_llvm.cmake b/cmake/pulp/toolchain_llvm.cmake index c0c6952e52..cf3fb80d6c 100644 --- a/cmake/pulp/toolchain_llvm.cmake +++ b/cmake/pulp/toolchain_llvm.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) set(CMAKE_SYSTEM_NAME Generic) diff --git a/cmake/simulation.cmake b/cmake/simulation.cmake index 96bb45af9e..8d68a0ad05 100644 --- a/cmake/simulation.cmake +++ b/cmake/simulation.cmake @@ -1,3 +1,7 @@ +# 
SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + ######################### ## Simulation Config ## ######################### @@ -38,9 +42,13 @@ macro(print_simulation_config) endmacro() macro(add_banshee_simulation name) + if(NOT DEFINED ENV{BANSHEE_INSTALL_DIR}) + message(FATAL_ERROR "Environment variable BANSHEE_INSTALL_DIR not set") + endif() + set(BANSHEE_EXECUTABLE "$ENV{BANSHEE_INSTALL_DIR}/banshee") add_custom_target(banshee_${name} DEPENDS ${name} - COMMAND RUST_MIN_STACK=${banshee_stack_size} banshee + COMMAND RUST_MIN_STACK=${banshee_stack_size} ${BANSHEE_EXECUTABLE} --num-cores=${num_threads} --num-clusters=1 --latency diff --git a/cmake/snitch/snitch.cmake b/cmake/snitch/snitch.cmake index 4170a99aab..a67f3ddc54 100644 --- a/cmake/snitch/snitch.cmake +++ b/cmake/snitch/snitch.cmake @@ -1,3 +1,11 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +if(NOT DEFINED ENV{SNITCH_HOME}) + message(FATAL_ERROR "Environment variable SNITCH_HOME not set.") +endif() + set(SNITCH_HOME $ENV{SNITCH_HOME}) set(SNITCH_RUNTIME_HOME ${SNITCH_HOME}/sw/snRuntime) diff --git a/cmake/snitch/snitch_cluster/snitch_cluster.cmake b/cmake/snitch/snitch_cluster/snitch_cluster.cmake index 38b0b07b73..91d9392bb0 100644 --- a/cmake/snitch/snitch_cluster/snitch_cluster.cmake +++ b/cmake/snitch/snitch_cluster/snitch_cluster.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(SNITCH_CLUSTER_HOME ${SNITCH_HOME}/target/snitch_cluster) set(BANSHEE_CONFIG ${SNITCH_CLUSTER_HOME}/src/banshee.yaml CACHE INTERNAL "source_list") \ No newline at end of file diff --git a/cmake/snitch/toolchain_llvm.cmake b/cmake/snitch/toolchain_llvm.cmake index b6c6f23d22..09f64b9817 100644 --- a/cmake/snitch/toolchain_llvm.cmake +++ b/cmake/snitch/toolchain_llvm.cmake @@ -1,3 +1,7 @@ +# 
SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + set(TOOLCHAIN_PREFIX ${TOOLCHAIN_INSTALL_DIR}/bin) set(CMAKE_SYSTEM_NAME Generic) diff --git a/cmake/softhier/softhier_gvsoc.cmake b/cmake/softhier/softhier_gvsoc.cmake index 49dc6ec121..198649f1ee 100644 --- a/cmake/softhier/softhier_gvsoc.cmake +++ b/cmake/softhier/softhier_gvsoc.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + macro(add_gvsoc_emulation name) set(BINARY_PATH ${CMAKE_BINARY_DIR}/bin/${name}) diff --git a/cmake/softhier/toolchain_gcc.cmake b/cmake/softhier/toolchain_gcc.cmake index ef1cee12b0..16ff0e5234 100644 --- a/cmake/softhier/toolchain_gcc.cmake +++ b/cmake/softhier/toolchain_gcc.cmake @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + add_compile_definitions( DEEPLOY_SOFTHIER_PLATFORM ) @@ -20,15 +24,15 @@ set(ISA rv32imafdv_zfh) set(CMAKE_EXECUTABLE_SUFFIX ".elf") add_compile_options( - -mabi=ilp32d + -mabi=ilp32d -mcmodel=medlow -march=${ISA} - -g - -O3 + -g + -O3 -ffast-math - -fno-builtin - -fno-tree-vectorize - -fno-common + -fno-builtin + -fno-tree-vectorize + -fno-common -ffunction-sections -fno-strict-aliasing ) @@ -36,7 +40,7 @@ add_compile_options( add_link_options( -march=${ISA} -nostartfiles - -Wl,--gc-sections + -Wl,--gc-sections ) link_libraries( diff --git a/docs/Makefile b/docs/Makefile index 22e9a64c15..0009e947eb 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,8 +1,6 @@ -# Copyright 2024 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
-# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# Moritz Scherer +# SPDX-License-Identifier: Apache-2.0 SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build diff --git a/docs/_templates/custom-class-template.rst b/docs/_templates/custom-class-template.rst index aa311868fb..8f52d154cb 100644 --- a/docs/_templates/custom-class-template.rst +++ b/docs/_templates/custom-class-template.rst @@ -1,3 +1,7 @@ +.. SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +.. +.. SPDX-License-Identifier: Apache-2.0 + {{ fullname | escape | underline}} .. currentmodule:: {{ module }} diff --git a/docs/_templates/custom-module-template.rst b/docs/_templates/custom-module-template.rst index 6adfe405db..edd7d130e0 100644 --- a/docs/_templates/custom-module-template.rst +++ b/docs/_templates/custom-module-template.rst @@ -1,3 +1,7 @@ +.. SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +.. +.. SPDX-License-Identifier: Apache-2.0 + {{ fullname | escape | underline}} .. automodule:: {{ fullname }} diff --git a/docs/_templates/versions.html b/docs/_templates/versions.html index 137438b750..f4f59a671d 100644 --- a/docs/_templates/versions.html +++ b/docs/_templates/versions.html @@ -1,3 +1,9 @@ + +
Version: {{ current_version }} diff --git a/docs/apidocs.rst b/docs/apidocs.rst index a7da36f9e5..29b5495a13 100644 --- a/docs/apidocs.rst +++ b/docs/apidocs.rst @@ -1,3 +1,7 @@ +.. SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +.. +.. SPDX-License-Identifier: Apache-2.0 + API Reference ************* diff --git a/docs/conf.py b/docs/conf.py index e8f7ce201c..6ca3d33c6f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,27 +1,6 @@ -# ---------------------------------------------------------------------- +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# File: conf.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich -# -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. import os import subprocess diff --git a/docs/index.rst b/docs/index.rst index 23bd5db472..dc32dbbce3 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,3 +1,7 @@ +.. SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +.. +.. 
SPDX-License-Identifier: Apache-2.0 + Deeploy Documentation ===================== @@ -20,3 +24,4 @@ Deeploy is developed as part of the PULP project, a joint effort between ETH Zur tutorials/overview structure apidocs + releasing diff --git a/docs/install.md b/docs/install.md index 4499ba1ef0..ca85e1acea 100644 --- a/docs/install.md +++ b/docs/install.md @@ -47,14 +47,6 @@ pip install -e . ## Testing Framework Installation -Please make sure to use a Rust version that is compatible with LLVM 15, like 1.63.0: - -``` -sudo snap install rustup --classic -rustup install 1.63.0 -rustup default 1.63.0 -``` - The Makefile expects the environemt variable `CMAKE` to be defined. In case you have no strong preferences, you may run ``` @@ -84,7 +76,5 @@ For example, you can run ``` cd DeeployTest -python testRunner_generic.py -t Tests/simpleRegression +python testRunner_generic.py -t ./Tests/Kernels/Integer/Add/Regular ``` - -to run the `simpleRegression` test on your workstation. Various other tests are available and compatibility between tests and platforms is tested in the `.gitlab-ci.yml` file. diff --git a/docs/make.bat b/docs/make.bat index 32bb24529f..bb6fa61b30 100644 --- a/docs/make.bat +++ b/docs/make.bat @@ -1,3 +1,7 @@ +@REM SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +@REM +@REM SPDX-License-Identifier: Apache-2.0 + @ECHO OFF pushd %~dp0 diff --git a/docs/releasing.md b/docs/releasing.md new file mode 100644 index 0000000000..62dca8dc63 --- /dev/null +++ b/docs/releasing.md @@ -0,0 +1,18 @@ +# Deeploy Release Guide + +This guide explains how to prepare and publish a Deeploy release to PyPI/TestPyPI using `uv`. + +## Prepare the release +1. **Update the changelog** – Add a section for the new version under [CHANGELOG.md](../CHANGELOG.md) with the release date and noteworthy entries. +2. 
**Bump the version** – Use `uv` so the version in [pyproject.toml](../pyproject.toml) stays authoritative: + ```bash + uv version --bump major/minor/patch + ``` +3. **Verify the build locally** – This is optional but highly recommended: + ```bash + uv build + uv run --isolated --no-project --with dist/*.whl python -c "import Deeploy" + uv run --isolated --no-project --with dist/*.tar.gz python -c "import Deeploy" + ``` +4. **Commit and merge the changes** – Include the updated version and changelog in the commit. Once your commit reaches the `main` branch, it will be tagged. +5. **Deployment** – The publish workflow triggers on tags that start with `v` and match the version. Your package is now published — congratulations! diff --git a/docs/tutorials/debugging.rst b/docs/tutorials/debugging.rst index 0e0dd5a885..5145ae79c8 100644 --- a/docs/tutorials/debugging.rst +++ b/docs/tutorials/debugging.rst @@ -1,3 +1,7 @@ +.. SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +.. +.. SPDX-License-Identifier: Apache-2.0 + Debugging ========= diff --git a/docs/tutorials/introduction.md b/docs/tutorials/introduction.md index e2a54f9f6f..1e638bd8a8 100644 --- a/docs/tutorials/introduction.md +++ b/docs/tutorials/introduction.md @@ -38,11 +38,11 @@ From the `DeeployTest` folder, you can use the `testRunner` to compile ONNXs and To validate your installation, you can run a simple Add node on each platform: ``` -python testRunner_generic.py -t Tests/Adder -python testRunner_cortexm.py -t Tests/Adder -python testRunner_mempool.py -t Tests/Adder -python testRunner_snitch.py -t Tests/Adder/ -python testRunner_siracusa.py -t Tests/Adder --cores=8 +python testRunner_generic.py -t Tests/IntKernels/Add/Regular +python testRunner_cortexm.py -t Tests/IntKernels/Add/Regular +python testRunner_mempool.py -t Tests/IntKernels/Add/Regular +python testRunner_snitch.py -t Tests/IntKernels/Add/Regular/ +python testRunner_siracusa.py -t Tests/IntKernels/Add/Regular --cores=8 ``` Once all 
these basic tests are passed, we can jump into the basics of Deeploy. @@ -67,9 +67,9 @@ The figure below gives an overview of the deployment stack. As you can see, ther You can visualize the ONNX graphs using [Netron](https://netron.app/). Either use the web interface or install the python package with `pip install netron`. -> ✅ **Task:** Visualize the ONNX graph of the `Adder`, `MobileNetv2`, and `Transformer` +> ✅ **Task:** Visualize the ONNX graph of the `IntKernels/Add/Regular`, `Models/MobileNetv2`, and `Others/Transformer` -The ONNX graphs are in `DeeployTest/Tests//network.onnx`. The networks are increasing in complexity, `Adder` is a single node network for unit testing, while `MobileNetv2` is a simple sequential network mostly made of convolutions. Finally, the `Transformer` network showcases a typical transformer block used in Encoder and Decoder networks. If you want to peek at a complex network, you can visualize `microLlama/microLlama128`. +The ONNX graphs are in `DeeployTest/Tests//network.onnx`. The networks are increasing in complexity, `IntKernels/Add/Regular` is a single node network for unit testing, while `Models/MobileNetv2` is a simple sequential network mostly made of convolutions. Finally, the `Others/Transformer` network showcases a typical transformer block used in Encoder and Decoder networks. If you want to peek at a complex network, you can visualize `Models/microLlama/microLlama128`. Now that we understand Deeploy's input, let's check the output-generated code! @@ -77,15 +77,15 @@ Now that we understand Deeploy's input, let's check the output-generated code! The generated code is located in the following directory: `DeeployTest/TEST_/Tests`, and the `Network.c` file is the interesting one. -The generated code is trivial for the `Adder` graph; we simply use the template for the `Add` node of the Generic platform. You can find the template declaration in `Deeploy/Targets/Generic/Templates/AddTemplate.py`. 
+The generated code is trivial for the `IntKernels/Add/Regular` graph; we simply use the template for the `Add` node of the Generic platform. You can find the template declaration in `Deeploy/Targets/Generic/Templates/AddTemplate.py`. -Now, if you want to look at something a bit more complex, run `python testRunner_generic.py -t ./Tests/miniMobileNetv2` (from `DeeployTest`) and look at the generated code. There are two interesting points you can notice: +Now, if you want to look at something a bit more complex, run `python testRunner_generic.py -t ./Tests/Models/miniMobileNetv2` (from `DeeployTest`) and look at the generated code. There are two interesting points you can notice: - We hoist the constants at the top of the file. -- In the `RunNetwork` function, we sequentially have node templates to execute the operands and malloc/free to manage the memory. You can open the ONNX graph of `miniMobileNetv2` on the side to try to match the nodes of the graph with their generated code. +- In the `RunNetwork` function, we sequentially have node templates to execute the operands and malloc/free to manage the memory. You can open the ONNX graph of `Models/miniMobileNetv2` on the side to try to match the nodes of the graph with their generated code. > ✅ **Task:** Visualize the effect of passes on the ONNX graph for the Siracusa platform. -Deeploy applies passes on the ONNX graph to transform its topology and optimize its execution. Let's visualize the effect of the passes used in the Siracusa Platform. First, let's execute our `miniMobileNetv2` on Siracusa with `python testRunner_siracusa.py -t ./Tests/miniMobileNetv2`. You can find the original ONNX graph at `DeeployTest/Tests/miniMobileNetv2/network.onnx`, and the transformed ONNX graph at `DeeployTest/TEST_SIRACUSA/Tests/miniMobileNetv2/deeployStates/backend_post_binding.onnx`. Open both ONNX graphs side by side to compare them. 
+Deeploy applies passes on the ONNX graph to transform its topology and optimize its execution. Let's visualize the effect of the passes used in the Siracusa Platform. First, let's execute our `miniMobileNetv2` on Siracusa with `python testRunner_siracusa.py -t ./Tests/Models/miniMobileNetv2`. You can find the original ONNX graph at `DeeployTest/Tests/Models/miniMobileNetv2/network.onnx`, and the transformed ONNX graph at `DeeployTest/TEST_SIRACUSA/Tests/Models/miniMobileNetv2/deeployStates/backend_post_binding.onnx`. Open both ONNX graphs side by side to compare them. You can notice the effect of two passes on the graph: - One pass fuses the `Conv` and `RequantShift` nodes. This is a common technique named [Operator Fusion](https://medium.com/data-science/how-pytorch-2-0-accelerates-deep-learning-with-operator-fusion-and-cpu-gpu-code-generation-35132a85bd26) and used in many DNN compilers. @@ -140,7 +140,7 @@ Now that you understand the hardware and the kind of workload we want to execute
Solution - > If you run `python testRunner_siracusa.py -t Tests/microLlama/microLlama128 --cores=1` and then `python testRunner_siracusa.py -t Tests/microLlama/microLlama128 --cores=8`, you should measure a runtime of ~16,1M cycles for 1 core and 3.1M cycles for 8 cores. + > If you run `python testRunner_siracusa.py -t Tests/Models/microLlama/microLlama128 --cores=1` and then `python testRunner_siracusa.py -t Tests/Models/microLlama/microLlama128 --cores=8`, you should measure a runtime of ~16,1M cycles for 1 core and 3.1M cycles for 8 cores. > > The speedup ratio is obtained via $\frac{\text{Runtime 1 cores}}{\text{Runtime 8 cores}} = 5.2$. Hence, using 8 cores instead of 1 leads to a 5.2 times speedup. > @@ -162,9 +162,9 @@ The good news is that Deeploy can already do that! So, let's generate and run so
Solution - > Bad configuration: `python testRunner_tiled_siracusa.py -t Tests/microLlama/microLlama64_parallel --cores=8 --l1 8000 --defaultMemLevel=L2` -> Runtime: 47.5 MCycles + > Bad configuration: `python testRunner_tiled_siracusa.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 8000 --defaultMemLevel=L2` -> Runtime: 47.5 MCycles > - > Good configuration `python testRunner_tiled_siracusa.py -t Tests/microLlama/microLlama64_parallel --cores=8 --l1 64000 --defaultMemLevel=L2`: -> Runtime: 35.3 MCycles + > Good configuration `python testRunner_tiled_siracusa.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 64000 --defaultMemLevel=L2`: -> Runtime: 35.3 MCycles > > Justification: As the size of the L1 memory gets smaller, tiles also get smaller and smaller. Smaller tiles usually mean that it's harder to keep the core properly utilized. @@ -199,7 +199,7 @@ To use the NPU, you can use the `testRunner_tiled_siracusa_w_neureka.py`. The Li > The runtime in parallel mode with NPU is obtained with: > >` - python testRunner_tiled_siracusa_w_neureka.py -t Tests/microLlama/microLlama64_parallel --cores=8 --l1 64000 --defaultMemLevel=L2 + python testRunner_tiled_siracusa_w_neureka.py -t Tests/Models/microLlama/microLlama64_parallel --cores=8 --l1 64000 --defaultMemLevel=L2 ` > > And returns 28.6 MCycles of runtime. The runtime without NPU was measured above and is 35.3 MCycles. Hence, the speedup is ~1.23 times. diff --git a/docs/tutorials/overview.rst b/docs/tutorials/overview.rst index a76bb1e1e3..0b3d97c761 100644 --- a/docs/tutorials/overview.rst +++ b/docs/tutorials/overview.rst @@ -1,3 +1,7 @@ +.. SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +.. +.. 
SPDX-License-Identifier: Apache-2.0 + Tutorials ********* diff --git a/pyproject.toml b/pyproject.toml index 752603f21b..3a924a22b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,10 @@ +# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + [project] -name = "PULP-Deeploy" -version = '0.2.0' +name = "deeploy-pulp" +version = "0.2.1" description = "Deeploy - DNN Compiler for Heterogeneous SoCs" authors = [ { name="Victor Jung", email="jungvi@iis.ee.ethz.ch" }, @@ -18,13 +22,15 @@ dependencies = [ 'numpy<2.0.0', 'onnx', 'onnxruntime', -'onnx-graphsurgeon==0.3.20', +'onnx-graphsurgeon>=0.5.8', 'mako', 'argparse', 'toml', -'pytest', 'ortools', 'plotly', +'coloredlogs', +'pytest', +'pytest-xdist', ] [project.urls] diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000000..f3567107f9 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +[pytest] +# Pytest configuration for Deeploy tests + +# Test discovery patterns +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Minimum version +minversion = 6.0 + +# Add current directory to Python path +pythonpath = . 
+ +# Default options +addopts = + -ra + --strict-markers + --strict-config + --showlocals + +# Test output +console_output_style = progress + +# Logging +log_cli = false +log_cli_level = INFO +log_cli_format = %(levelname)s %(message)s + +# Warnings +filterwarnings = + error + ignore::DeprecationWarning + ignore::PendingDeprecationWarning + +# VJUNG: Restrict test discovery to the DeeployTest dir +testpaths = DeeployTest diff --git a/requirements-dev.txt b/requirements-dev.txt index ed846e9b09..6d047b4957 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + # Quality of life netron debugpy @@ -15,7 +19,9 @@ pyserial yapf==0.33.0 isort==5.12.0 autoflake==2.3.0 +yamllint==1.37.1 clang-format +reuse # Documentation sphinx diff --git a/scripts/gen_changelog.py b/scripts/gen_changelog.py index 8e981071ef..474eff405e 100644 --- a/scripts/gen_changelog.py +++ b/scripts/gen_changelog.py @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + import re import subprocess from collections import defaultdict diff --git a/scripts/generate_test_matrix.py b/scripts/generate_test_matrix.py new file mode 100755 index 0000000000..c25fe534f0 --- /dev/null +++ b/scripts/generate_test_matrix.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +""" +Generate GitHub Actions test matrix from Python test configuration. + +This script reads test configurations from DeeployTest config files and outputs +JSON arrays suitable for GitHub Actions matrix strategies. 
+""" + +import json +import sys +from pathlib import Path + +# Add DeeployTest to path to import config +sys.path.insert(0, str(Path(__file__).parent.parent / "DeeployTest")) + +from test_siracusa_tiled_config import L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_MODELS, L3_DOUBLEBUFFER_MODELS, \ + L3_SINGLEBUFFER_MODELS + + +def main(): + if len(sys.argv) != 2: + print("Usage: generate_test_matrix.py ", file = sys.stderr) + print("config-key must be one of:", file = sys.stderr) + print(" l2-singlebuffer-models", file = sys.stderr) + print(" l2-doublebuffer-models", file = sys.stderr) + print(" l3-singlebuffer-models", file = sys.stderr) + print(" l3-doublebuffer-models", file = sys.stderr) + sys.exit(1) + + config_key = sys.argv[1] + + # Map config keys to Python dictionaries + config_map = { + "l2-singlebuffer-models": L2_SINGLEBUFFER_MODELS, + "l2-doublebuffer-models": L2_DOUBLEBUFFER_MODELS, + "l3-singlebuffer-models": L3_SINGLEBUFFER_MODELS, + "l3-doublebuffer-models": L3_DOUBLEBUFFER_MODELS, + } + + if config_key not in config_map: + print(f"Error: Unknown config-key '{config_key}'", file = sys.stderr) + sys.exit(1) + + # Extract test names from the dictionary keys + test_dict = config_map[config_key] + test_names = list(test_dict.keys()) + + # Output as JSON array + print(json.dumps(test_names)) + + +if __name__ == "__main__": + main() diff --git a/scripts/reuse_skip_wrapper.py b/scripts/reuse_skip_wrapper.py new file mode 100755 index 0000000000..74d3c79696 --- /dev/null +++ b/scripts/reuse_skip_wrapper.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +import shutil +import subprocess + + +def skip(file: str) -> bool: + # Skip license directory + if "LICENSES" in os.path.split(file): + return True + + return False + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(prog = "license-fix.py", + usage = "%(prog)s 
[OPTIONS] FILE ...", + description = "Helper script to fix the licenses with pre-commit") + parser.add_argument("files", nargs = "+") + + args = parser.parse_args() + + files = [f for f in args.files if not skip(f)] + + reuse_path = shutil.which("reuse") + if not reuse_path: + reuse_path = "python -m reuse" + try: + subprocess.run(f"{reuse_path} lint-file {' '.join(files)}", shell = True, check = True) + except subprocess.CalledProcessError: + exit(1) diff --git a/setup.py b/setup.py index bd6d90c570..b89383d40a 100644 --- a/setup.py +++ b/setup.py @@ -1,27 +1,8 @@ -# ---------------------------------------------------------------------- -# -# File: setup.py -# -# Last edited: 26.07.2024 -# -# Copyright (C) 2024, ETH Zurich and University of Bologna. -# -# Author: Moritz Scherer, ETH Zurich +#!/usr/bin/env python + +# SPDX-FileCopyrightText: 2024 ETH Zurich and University of Bologna # -# ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
import setuptools diff --git a/toolchain/banshee.patch b/toolchain/banshee.patch deleted file mode 100644 index 6e982afcb8..0000000000 --- a/toolchain/banshee.patch +++ /dev/null @@ -1,98 +0,0 @@ -diff --git a/Cargo.toml b/Cargo.toml -index d406357..eb5ce8e 100644 ---- a/Cargo.toml -+++ b/Cargo.toml -@@ -11,30 +11,32 @@ edition = "2018" - build = "build/build.rs" - - [dependencies] --anyhow = "1" --binread = "2.2.0" --bytebuffer = "0.2.1" --byteorder = "1.4.3" --clap = "2" -+anyhow = "=1" -+binread = "=2.2.0" -+bytebuffer = "=0.2.1" -+byteorder = "=1.4.3" -+unicode-width = "=0.1.13" -+clap = "=2" - crossbeam-utils = "0.8" --csv = "1.0.0-beta.2" --elf = "0.0.10" -+csv = "=1.0.0-beta.2" -+elf = "=0.0.10" - flexfloat = { path = "flexfloat" } --itertools = "0.9" --llvm-sys = "120" -+itertools = "=0.9" -+llvm-sys = "150" - log = { version = "0.4", features = ["release_max_level_info"] } --pest = "2.1.3" --pest_derive = "2.1.0" --ndarray = "0.13" --pretty_env_logger = "0.4" -+pest = "=2.1.3" -+pest_derive = "=2.1.0" -+ndarray = "=0.13" -+pretty_env_logger = "=0.4" - regex = "~1.9.6" --rev_slice = "0.1.5" -+rev_slice = "=0.1.5" - serde = { version = "1.0.123", features = ["derive"] } --serde_json = "1.0.63" --serde_yaml = "0.8" --termion = "2.0.3" --thiserror = "1.0.21" --to-binary = "0.4.0" -+serde_json = "=1.0.63" -+serde_yaml = "=0.8" -+termion = "=2.0.3" -+thiserror = "=1.0.21" -+to-binary = "=0.4.0" -+libc = "0.2" - - [build-dependencies] - cc = "1.0" -diff --git a/build/runtime.rs b/build/runtime.rs -index 04f80b8..c03f248 100644 ---- a/build/runtime.rs -+++ b/build/runtime.rs -@@ -22,8 +22,7 @@ pub fn build() { - "--crate-type=staticlib", - "-Copt-level=3", - "-Cdebuginfo=0", -- "-Cpanic=abort", -- "-Cllvm-args=-opaque-pointers=0", -+ "-Cpanic=abort" - ]) - .status() - .unwrap(); -diff --git a/src/engine.rs b/src/engine.rs -index 216996b..e5abe38 100644 ---- a/src/engine.rs -+++ b/src/engine.rs -@@ -281,7 +281,6 @@ impl Engine { - - 
LLVMPassManagerBuilderPopulateFunctionPassManager(builder, func_passes); - LLVMAddAnalysisPasses(tm, module_passes); -- LLVMPassManagerBuilderPopulateLTOPassManager(builder, module_passes, 0, 1); - LLVMPassManagerBuilderPopulateModulePassManager(builder, module_passes); - - // Create and run the function pass manager. -diff --git a/src/tran.rs b/src/tran.rs -index 1054744..ae5ae78 100644 ---- a/src/tran.rs -+++ b/src/tran.rs -@@ -18,9 +18,10 @@ use std::{ - collections::{BTreeSet, HashMap}, - ffi::CString, - }; -+use libc; - extern crate flexfloat; - --static NONAME: &'static i8 = unsafe { std::mem::transmute("\0".as_ptr()) }; -+static NONAME: &'static libc::c_char = unsafe { std::mem::transmute("\0".as_ptr()) }; - - /// Base address of the stream semantic regsiters - static SSR_BASE: u64 = 0x204800;