Commit 88214f9

nihui committed Feb 5, 2025
1 parent ac6cc3e commit 88214f9
Showing 2 changed files with 354 additions and 32 deletions.
193 changes: 177 additions & 16 deletions src/layer/loongarch/quantize_loongarch.cpp
@@ -83,18 +83,89 @@ static void quantize(const float* ptr, signed char* s8ptr, const Mat& scale_data
    }
}

#if __loongarch_sx
static void quantize_pack4to8(const float* ptr0, const float* ptr1, signed char* s8ptr, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to8 %d %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    __m128 _scale0 = (__m128)__lsx_vreplfr2vr_s(scale);
    __m128 _scale1 = _scale0;
    if (scale_data_size > 1)
    {
        _scale0 = (__m128)__lsx_vld((const float*)scale_data, 0);
        _scale1 = (__m128)__lsx_vld((const float*)scale_data + 4, 0);
    }

    int i = 0;
    for (; i < elemcount; i++)
    {
        __m128 _v0 = (__m128)__lsx_vld(ptr0, 0);
        __m128 _v1 = (__m128)__lsx_vld(ptr1, 0);
        _v0 = __lsx_vfmul_s(_v0, _scale0);
        _v1 = __lsx_vfmul_s(_v1, _scale1);
        *((int64_t*)s8ptr) = float2int8(_v0, _v1);
        ptr0 += 4;
        ptr1 += 4;
        s8ptr += 8;
    }
}
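
// Reference only: a minimal scalar sketch of the pack4-to-pack8 repacking above,
// assuming a scalar float2int8(float) that rounds to nearest and saturates to the
// int8 range, and that the vector helper packs _v0's four lanes before _v1's
// (scale0[k] / scale1[k] stand in for the per-lane values of _scale0 / _scale1):
//
//   for (int i = 0; i < elemcount; i++)
//   {
//       for (int k = 0; k < 4; k++)
//       {
//           s8ptr[k] = float2int8(ptr0[k] * scale0[k]);     // output channels 0-3
//           s8ptr[4 + k] = float2int8(ptr1[k] * scale1[k]); // output channels 4-7
//       }
//       ptr0 += 4;
//       ptr1 += 4;
//       s8ptr += 8;
//   }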

static void quantize_pack4to1(const float* ptr, signed char* s8ptr0, signed char* s8ptr1, signed char* s8ptr2, signed char* s8ptr3, const Mat& scale_data, int elemcount)
{
    const int scale_data_size = scale_data.w;

    // NCNN_LOGE("quantize_pack4to1 %d %d", scale_data_size, elemcount);

    float scale = scale_data[0];
    __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale);
    if (scale_data_size > 1)
    {
        _scale = (__m128)__lsx_vld((const float*)scale_data, 0);
    }

    int i = 0;
    for (; i < elemcount; i++)
    {
        __m128 _v = (__m128)__lsx_vld(ptr, 0);
        _v = __lsx_vfmul_s(_v, _scale);
        int64_t v = float2int8(_v, _v);
        const signed char* v8 = (const signed char*)&v;
        s8ptr0[0] = v8[0];
        s8ptr1[0] = v8[1];
        s8ptr2[0] = v8[2];
        s8ptr3[0] = v8[3];
        ptr += 4;
        s8ptr0 += 1;
        s8ptr1 += 1;
        s8ptr2 += 1;
        s8ptr3 += 1;
    }
}
#endif // __loongarch_sx
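
// Reference only: a scalar view of the pack4-to-pack1 scatter in quantize_pack4to1,
// assuming the low four bytes of float2int8(_v, _v) hold the quantized lanes of _v
// in order (scale[k] stands in for the per-lane values of _scale):
//
//   for (int i = 0; i < elemcount; i++)
//   {
//       s8ptr0[i] = float2int8(ptr[4 * i + 0] * scale[0]);
//       s8ptr1[i] = float2int8(ptr[4 * i + 1] * scale[1]);
//       s8ptr2[i] = float2int8(ptr[4 * i + 2] * scale[2]);
//       s8ptr3[i] = float2int8(ptr[4 * i + 3] * scale[3]);
//   }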

int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    const int dims = bottom_blob.dims;
    const int w = bottom_blob.w;
    const int h = bottom_blob.h;
    const int channels = bottom_blob.c;
    const int elempack = bottom_blob.elempack;

    if (dims == 1)
    {
        int out_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            out_elempack = w * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outw = w * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;
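
        // layout sketch (assuming use_packing_layout): e.g. w = 6, elempack = 4 gives
        // w * elempack = 24, which is divisible by 8, so out_elempack = 8, outw = 3,
        // and out_elemsize = 8 bytes since int8 data is 1 byte per element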

@@ -119,37 +190,127 @@ int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Opt

    if (dims == 2)
    {
        int out_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            out_elempack = h * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outh = h * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __loongarch_sx
        if (elempack == 4 && out_elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const float* ptr0 = bottom_blob.row(i * 2);
                const float* ptr1 = bottom_blob.row(i * 2 + 1);
                signed char* s8ptr = top_blob.row<signed char>(i);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_i, w);
            }
        }
        if (elempack == 4 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* ptr = bottom_blob.row(i);
                signed char* s8ptr0 = top_blob.row<signed char>(i * 4);
                signed char* s8ptr1 = top_blob.row<signed char>(i * 4 + 1);
                signed char* s8ptr2 = top_blob.row<signed char>(i * 4 + 2);
                signed char* s8ptr3 = top_blob.row<signed char>(i * 4 + 3);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;

                quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_i, w);
            }
        }
#endif // __loongarch_sx
        if (elempack == out_elempack)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* ptr = bottom_blob.row(i);
                signed char* s8ptr = top_blob.row<signed char>(i);

                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;

                quantize(ptr, s8ptr, scale_data_i, w, elempack);
            }
        }
    }
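
    // row repacking sketch: with h = 4, elempack = 4, h * elempack = 16 is divisible
    // by 8, so out_elempack = 8 and outh = 2; pack-4 rows i*2 and i*2+1 provide
    // channels 0-3 and 4-7 of pack-8 output row i (assuming use_packing_layout)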

    if (dims == 3)
    {
        int out_elempack = 1;
#if __loongarch_sx
        if (opt.use_packing_layout)
        {
            out_elempack = channels * elempack % 8 == 0 ? 8 : 1;
        }
#endif
        const int outc = channels * elempack / out_elempack;
        const size_t out_elemsize = out_elempack * 1u;

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __loongarch_sx
        if (elempack == 4 && out_elempack == 8)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const float* ptr0 = bottom_blob.channel(q * 2);
                const float* ptr1 = bottom_blob.channel(q * 2 + 1);
                signed char* s8ptr = top_blob.channel(q);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data;

                quantize_pack4to8(ptr0, ptr1, s8ptr, scale_data_q, w * h);
            }
        }
        if (elempack == 4 && out_elempack == 1)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                signed char* s8ptr0 = top_blob.channel(q * 4);
                signed char* s8ptr1 = top_blob.channel(q * 4 + 1);
                signed char* s8ptr2 = top_blob.channel(q * 4 + 2);
                signed char* s8ptr3 = top_blob.channel(q * 4 + 3);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;

                quantize_pack4to1(ptr, s8ptr0, s8ptr1, s8ptr2, s8ptr3, scale_data_q, w * h);
            }
        }
#endif // __loongarch_sx
        if (elempack == out_elempack)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* ptr = bottom_blob.channel(q);
                signed char* s8ptr = top_blob.channel(q);

                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;

                quantize(ptr, s8ptr, scale_data_q, w * h, elempack);
            }
        }
    }
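
    // per-channel scale slicing: the pack4-to-pack8 path takes out_elempack (8) scale
    // values per output channel (q * out_elempack), while the pack4-to-pack1 and
    // same-pack paths take elempack values per input channel or row (q * elempack)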
