Skip to content

ACL: INT8 3x3 slower than F32 on small-channel conv (Neoverse-V2, SVE2) #1225

@allnes

Description

@allnes

Hi, @milpuz01

I am observing a performance degradation (details and a standalone reproducer below).

Environment

  • HW: ARM Neoverse-V2 (flags: sve, sve2, i8mm, bf16, dotprod)
  • ACL build (scons):
    scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=1
    benchmark_tests=0 validation_tests=0 asserts=0
    arch=arm64-v8.2-a-sve2
    (compiles with -march=armv8.2-a+sve2+fp16+dotprod; SVE2 enabled)

Minimal reproducer (standalone)

Single NEConvolutionLayer benchmark, parameters via CLI.

Build (save code below as repro_conv.cpp):

g++ -std=c++14 -O3 -march=armv8.2-a+sve2+fp16+dotprod
-I<ACL_ROOT> -I<ACL_ROOT>/include
repro_conv.cpp -L<ACL_ROOT>/build -larm_compute -pthread
-Wl,-rpath,<ACL_ROOT>/build -o conv_repro

Runs (NCHW, N=1, threads=4):

  1. Problem case: 3x3, H=W=75, C=32->32, stride=1, pad=1

./conv_repro --prec=i8 --h=75 --w=75 --in_c=32 --out_c=32 --k=3 --stride=1 --pad=1 --iters=50 --warmup=5 --threads=4

avg_ms ≈ 0.471

./conv_repro --prec=f32 --h=75 --w=75 --in_c=32 --out_c=32 --k=3 --stride=1 --pad=1 --iters=50 --warmup=5 --threads=4

avg_ms ≈ 0.276

INT8 ~1.7x slower than F32

  2. Reference case where INT8 is faster (first 7x7 layer)

./conv_repro --prec=i8 --iters=50 --warmup=5 --threads=4 # avg_ms ≈ 1.192 (H=W=300, C:3->32, k=7, s=2, p=3)
./conv_repro --prec=f32 --iters=50 --warmup=5 --threads=4 # avg_ms ≈ 1.882

Source (repro_conv.cpp)

#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/Scheduler.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include <chrono>
#include <iostream>
#include <random>
#include <string>

using namespace arm_compute;

/// Command-line configuration for the convolution reproducer.
/// Defaults reproduce the first-layer reference case
/// (H=W=300, C 3->32, k=7, stride=2, pad=3).
struct Options {
    std::string prec = "i8"; // precision: "i8" (QASYMM8_SIGNED) | "f32"
    unsigned iters = 50, warmup = 5, threads = 4;
    unsigned h = 300, w = 300, in_c = 3, out_c = 32;
    unsigned k = 7, stride = 2, pad = 3;
};

/// Parse "--key=value" style arguments into an Options struct.
/// Unrecognized flags are ignored; recognized flags without a '=' yield an
/// empty value string (std::stoul then throws std::invalid_argument).
/// @param argc  argument count as given to main()
/// @param argv  argument vector as given to main()
/// @return Options with defaults overridden by any recognized flags.
Options parse(int argc, char** argv) {
    Options o;
    for (int i = 1; i < argc; ++i) {
        const std::string a = argv[i];
        // Extract the text after the first '='.
        // Bug fix: the original lambda declared a parameter but never used it,
        // silently reading the captured `a` instead — it worked only by
        // coincidence because every call site passed `a`.
        auto val = [](const std::string& s) {
            const auto p = s.find('=');
            return p == std::string::npos ? std::string() : s.substr(p + 1);
        };
        if      (a.rfind("--prec=",0)==0) o.prec    = val(a);
        else if (a.rfind("--iters=",0)==0)o.iters   = std::stoul(val(a));
        else if (a.rfind("--warmup=",0)==0)o.warmup = std::stoul(val(a));
        else if (a.rfind("--threads=",0)==0)o.threads=std::stoul(val(a));
        else if (a.rfind("--h=",0)==0)    o.h      = std::stoul(val(a));
        else if (a.rfind("--w=",0)==0)    o.w      = std::stoul(val(a));
        else if (a.rfind("--in_c=",0)==0) o.in_c   = std::stoul(val(a));
        else if (a.rfind("--out_c=",0)==0)o.out_c  = std::stoul(val(a));
        else if (a.rfind("--k=",0)==0)    o.k      = std::stoul(val(a));
        else if (a.rfind("--stride=",0)==0)o.stride= std::stoul(val(a));
        else if (a.rfind("--pad=",0)==0)  o.pad    = std::stoul(val(a));
    }
    return o;
}

// Fill a tensor with deterministic pseudo-random data. The generator is
// seeded with 0 so every run (and every tensor) receives reproducible
// contents. Quantized tensors get int8 values in [-128, 127]; everything
// else is written as float in [-1, 1).
void fill_random(Tensor& t) {
    std::mt19937 rng(0);
    std::uniform_int_distribution<int> int_dist(-128, 127);
    std::uniform_real_distribution<float> float_dist(-1.f, 1.f);

    const auto type = t.info()->data_type();
    // Hoist the type test out of the per-element loop; it is invariant.
    const bool quantized = (type == DataType::QASYMM8) || (type == DataType::QASYMM8_SIGNED);

    Window window;
    window.use_tensor_dimensions(t.info()->tensor_shape());
    Iterator iter(&t, window);
    execute_window_loop(window, [&](const Coordinates&) {
        if (quantized) {
            *reinterpret_cast<int8_t*>(iter.ptr()) = static_cast<int8_t>(int_dist(rng));
        } else {
            *reinterpret_cast<float*>(iter.ptr()) = float_dist(rng);
        }
    }, iter);
}

int main(int argc, char** argv) {
    Options opt = parse(argc, argv);
    Scheduler::get().set_num_threads(opt.threads);

    TensorShape src_shape(opt.w, opt.h, opt.in_c, 1U);
    TensorShape wei_shape(opt.k, opt.k, opt.in_c, opt.out_c);
    unsigned out_w = (opt.w + 2 * opt.pad - opt.k) / opt.stride + 1;
    unsigned out_h = (opt.h + 2 * opt.pad - opt.k) / opt.stride + 1;
    TensorShape dst_shape(out_w, out_h, opt.out_c, 1U);
    PadStrideInfo conv_info(opt.stride, opt.stride, opt.pad, opt.pad);

    DataType dt = (opt.prec == "f32") ? DataType::F32 : DataType::QASYMM8_SIGNED;
    QuantizationInfo qi(1.f, 0);
    Tensor src, wei, dst;
    src.allocator()->init(TensorInfo(src_shape, 1, dt, qi));
    wei.allocator()->init(TensorInfo(wei_shape, 1, dt, qi));
    dst.allocator()->init(TensorInfo(dst_shape, 1, dt, qi));

    NEConvolutionLayer conv;
    conv.configure(&src, &wei, nullptr, &dst,
                   conv_info, WeightsInfo(), Size2D(1U, 1U),
                   ActivationLayerInfo(), false, 1);

    src.allocator()->allocate();
    wei.allocator()->allocate();
    dst.allocator()->allocate();
    fill_random(src);
    fill_random(wei);

    for (unsigned i = 0; i < opt.warmup; ++i) conv.run();
    auto t0 = std::chrono::high_resolution_clock::now();
    for (unsigned i = 0; i < opt.iters; ++i) conv.run();
    auto t1 = std::chrono::high_resolution_clock::now();

    double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
    std::cout << "prec=" << opt.prec
              << " h=" << opt.h << " w=" << opt.w
              << " in_c=" << opt.in_c << " out_c=" << opt.out_c
              << " k=" << opt.k << " s=" << opt.stride << " p=" << opt.pad
              << " threads=" << opt.threads
              << " iters=" << opt.iters
              << " avg_ms=" << ms / opt.iters
              << std::endl;
    return 0;
}

Observation

  • On Neoverse-V2 with SVE2 build, INT8 3×3 (75×75, C=32) is still ~1.7× slower than F32. Large 7×7 remains faster in INT8. Please investigate INT8 kernel selection/perf for this pattern.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions