Description
Hi @milpuz01,
I'm running into a performance degradation: an INT8 (QASYMM8_SIGNED) convolution is noticeably slower than its F32 equivalent for one of the shapes I benchmark. Details and a standalone reproducer below.
Environment
- HW: ARM Neoverse-V2 (flags: sve, sve2, i8mm, bf16, dotprod)
- ACL build (scons):

  ```sh
  scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=1 \
        benchmark_tests=0 validation_tests=0 asserts=0 \
        arch=arm64-v8.2-a-sve2
  ```

  (compiles with -march=armv8.2-a+sve2+fp16+dotprod; SVE2 enabled)
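For completeness, I also checked which CPU features ACL itself reports at runtime, since kernel selection depends on them. This is only a small sanity-check sketch and assumes a recent ACL where CPUInfo exposes has_sve()/has_sve2()/has_dotprod():

```cpp
// Sanity-check sketch: print the CPU features ACL detects at runtime.
// Assumes a recent ACL where CPUInfo exposes has_sve()/has_sve2()/has_dotprod().
#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/runtime/Scheduler.h"
#include <iostream>

int main() {
    const arm_compute::CPUInfo& ci = arm_compute::Scheduler::get().cpu_info();
    std::cout << "sve:     " << ci.has_sve()     << "\n"
              << "sve2:    " << ci.has_sve2()    << "\n"
              << "dotprod: " << ci.has_dotprod() << std::endl;
    return 0;
}
```

All three flags should print 1 on this core if detection works as expected.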
Minimal reproducer (standalone)
Single NEConvolutionLayer benchmark, parameters via CLI.
Build (save the code below as repro_conv.cpp):

```sh
g++ -std=c++14 -O3 -march=armv8.2-a+sve2+fp16+dotprod \
    -I<ACL_ROOT> -I<ACL_ROOT>/include \
    repro_conv.cpp -L<ACL_ROOT>/build -larm_compute -pthread \
    -Wl,-rpath,<ACL_ROOT>/build -o conv_repro
```
Runs (NCHW, N=1, threads=4):
- Problem case: 3x3, H=W=75, C=32->32, stride=1, pad=1

  ```sh
  ./conv_repro --prec=i8  --h=75 --w=75 --in_c=32 --out_c=32 --k=3 --stride=1 --pad=1 --iters=50 --warmup=5 --threads=4   # avg_ms ≈ 0.471
  ./conv_repro --prec=f32 --h=75 --w=75 --in_c=32 --out_c=32 --k=3 --stride=1 --pad=1 --iters=50 --warmup=5 --threads=4   # avg_ms ≈ 0.276
  ```

  INT8 is ~1.7x slower than F32.

- Reference case where INT8 is faster (the first 7x7 layer, which is the reproducer's default configuration: H=W=300, C=3->32, k=7, stride=2, pad=3)

  ```sh
  ./conv_repro --prec=i8  --iters=50 --warmup=5 --threads=4   # avg_ms ≈ 1.192
  ./conv_repro --prec=f32 --iters=50 --warmup=5 --threads=4   # avg_ms ≈ 1.882
  ```

A back-of-the-envelope throughput estimate derived from these timings is sketched right after this list.
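For context, here is my own rough effective-throughput estimate from the avg_ms values above (not an ACL measurement), using MAC count = out_h * out_w * in_c * out_c * k * k:

```cpp
// Back-of-the-envelope throughput implied by the reported avg_ms values above.
// This is my own estimate for context, not something produced by ACL.
#include <cstdio>

int main() {
    struct Case { const char* name; double out_h, out_w, in_c, out_c, k, avg_ms; };
    const Case cases[] = {
        {"3x3 i8 ", 75, 75, 32, 32, 3, 0.471},   // 75x75 output (stride 1, pad 1)
        {"3x3 f32", 75, 75, 32, 32, 3, 0.276},
        {"7x7 i8 ", 150, 150, 3, 32, 7, 1.192},  // 150x150 output (stride 2, pad 3)
        {"7x7 f32", 150, 150, 3, 32, 7, 1.882},
    };
    for (const auto& c : cases) {
        const double macs = c.out_h * c.out_w * c.in_c * c.out_c * c.k * c.k;
        std::printf("%s: %6.1f MMAC, %6.1f GMAC/s\n", c.name, macs / 1e6, macs / (c.avg_ms * 1e-3) / 1e9);
    }
    return 0;
}
```

That works out to roughly 110 GMAC/s (INT8) vs 188 GMAC/s (F32) for the 3x3 case, and roughly 89 vs 56 GMAC/s for the 7x7 case, i.e. the quantized path underperforms specifically on the 3x3 shape.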
Source (repro_conv.cpp)
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/Scheduler.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include <chrono>
#include <iostream>
#include <random>
#include <string>
using namespace arm_compute;
struct Options {
std::string prec = "i8"; // i8 | f32
unsigned iters = 50, warmup = 5, threads = 4;
unsigned h = 300, w = 300, in_c = 3, out_c = 32;
unsigned k = 7, stride = 2, pad = 3;
};
Options parse(int argc, char** argv) {
Options o;
for (int i = 1; i < argc; ++i) {
std::string a = argv[i];
auto val = [&](const std::string&) { auto p = a.find('='); return p == std::string::npos ? std::string() : a.substr(p + 1); };
if (a.rfind("--prec=",0)==0) o.prec = val(a);
else if (a.rfind("--iters=",0)==0)o.iters = std::stoul(val(a));
else if (a.rfind("--warmup=",0)==0)o.warmup = std::stoul(val(a));
else if (a.rfind("--threads=",0)==0)o.threads=std::stoul(val(a));
else if (a.rfind("--h=",0)==0) o.h = std::stoul(val(a));
else if (a.rfind("--w=",0)==0) o.w = std::stoul(val(a));
else if (a.rfind("--in_c=",0)==0) o.in_c = std::stoul(val(a));
else if (a.rfind("--out_c=",0)==0)o.out_c = std::stoul(val(a));
else if (a.rfind("--k=",0)==0) o.k = std::stoul(val(a));
else if (a.rfind("--stride=",0)==0)o.stride= std::stoul(val(a));
else if (a.rfind("--pad=",0)==0) o.pad = std::stoul(val(a));
}
return o;
}
void fill_random(Tensor& t) {
std::mt19937 gen(0);
std::uniform_int_distribution<int> dist_i(-128, 127);
std::uniform_real_distribution<float> dist_f(-1.f, 1.f);
auto dt = t.info()->data_type();
Window win; win.use_tensor_dimensions(t.info()->tensor_shape());
Iterator it(&t, win);
execute_window_loop(win, [&](const Coordinates&) {
if (dt == DataType::QASYMM8 || dt == DataType::QASYMM8_SIGNED)
*reinterpret_cast<int8_t*>(it.ptr()) = static_cast<int8_t>(dist_i(gen));
else
*reinterpret_cast<float*>(it.ptr()) = dist_f(gen);
}, it);
}
int main(int argc, char** argv) {
Options opt = parse(argc, argv);
Scheduler::get().set_num_threads(opt.threads);
TensorShape src_shape(opt.w, opt.h, opt.in_c, 1U);
TensorShape wei_shape(opt.k, opt.k, opt.in_c, opt.out_c);
unsigned out_w = (opt.w + 2 * opt.pad - opt.k) / opt.stride + 1;
unsigned out_h = (opt.h + 2 * opt.pad - opt.k) / opt.stride + 1;
TensorShape dst_shape(out_w, out_h, opt.out_c, 1U);
PadStrideInfo conv_info(opt.stride, opt.stride, opt.pad, opt.pad);
DataType dt = (opt.prec == "f32") ? DataType::F32 : DataType::QASYMM8_SIGNED;
QuantizationInfo qi(1.f, 0);
Tensor src, wei, dst;
src.allocator()->init(TensorInfo(src_shape, 1, dt, qi));
wei.allocator()->init(TensorInfo(wei_shape, 1, dt, qi));
dst.allocator()->init(TensorInfo(dst_shape, 1, dt, qi));
NEConvolutionLayer conv;
conv.configure(&src, &wei, nullptr, &dst,
conv_info, WeightsInfo(), Size2D(1U, 1U),
ActivationLayerInfo(), false, 1);
src.allocator()->allocate();
wei.allocator()->allocate();
dst.allocator()->allocate();
fill_random(src);
fill_random(wei);
for (unsigned i = 0; i < opt.warmup; ++i) conv.run();
auto t0 = std::chrono::high_resolution_clock::now();
for (unsigned i = 0; i < opt.iters; ++i) conv.run();
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "prec=" << opt.prec
<< " h=" << opt.h << " w=" << opt.w
<< " in_c=" << opt.in_c << " out_c=" << opt.out_c
<< " k=" << opt.k << " s=" << opt.stride << " p=" << opt.pad
<< " threads=" << opt.threads
<< " iters=" << opt.iters
<< " avg_ms=" << ms / opt.iters
<< std::endl;
return 0;
}Observation
- On Neoverse-V2 with the SVE2 build, the INT8 3x3 case (75x75, C=32->32) is still ~1.7x slower than F32, while the large 7x7 case remains faster in INT8. Please investigate the INT8 kernel selection/performance for this shape.
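To help narrow down whether this is a method-selection issue or a kernel-level one, here is a small sketch of how the chosen convolution method can be queried for the problem shape in both precisions. It assumes the public static helper NEConvolutionLayer::get_convolution_method() is available in the ACL version used here:

```cpp
// Sketch: ask which convolution method NEConvolutionLayer would pick for the
// 3x3, 75x75, C=32->32 shape, assuming the static helper
// NEConvolutionLayer::get_convolution_method() exists in this ACL version.
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include <iostream>

using namespace arm_compute;

static const char* to_str(ConvolutionMethod m) {
    switch (m) {
        case ConvolutionMethod::GEMM:        return "GEMM";
        case ConvolutionMethod::GEMM_CONV2D: return "GEMM_CONV2D";
        case ConvolutionMethod::DIRECT:      return "DIRECT";
        case ConvolutionMethod::WINOGRAD:    return "WINOGRAD";
        default:                             return "OTHER";
    }
}

int main() {
    const PadStrideInfo conv_info(1, 1, 1, 1); // 3x3, stride 1, pad 1
    const QuantizationInfo qi(1.f, 0);
    for (DataType dt : {DataType::QASYMM8_SIGNED, DataType::F32}) {
        TensorInfo src(TensorShape(75U, 75U, 32U, 1U), 1, dt, qi);
        TensorInfo wei(TensorShape(3U, 3U, 32U, 32U), 1, dt, qi);
        TensorInfo dst(TensorShape(75U, 75U, 32U, 1U), 1, dt, qi);
        const auto m = NEConvolutionLayer::get_convolution_method(&src, &wei, &dst, conv_info);
        std::cout << (dt == DataType::F32 ? "f32: " : "i8 : ") << to_str(m) << std::endl;
    }
    return 0;
}
```

If both precisions resolve to the same method for this shape, the regression is more likely in the quantized kernel/assembly dispatch itself than in the method heuristic.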