    NEON Optimization: A log10 Function Optimization Case Study

    Background


    When processing audio samples, amplitude values frequently have to be converted to the logarithmic domain, which involves a large number of log operations. This article summarizes some optimization techniques for speeding up the log computation.

    log10 Function Description

    • Input: x > 0, single-precision float
    • Output: log10(x), single-precision float
    • Peak Error: ~0.000465339053%
    • RMS Error: ~0.000008%

    Optimization Approach


    The basic idea behind the NEON optimization of log10 is as follows. By the IEEE-754 single-precision format, any x > 0 can be written as x = m * 2^e with m in [1, 2), so log10(x) = log10(m) + e * log10(2). The exponent e can be pulled directly out of the bit pattern, and log10(m) on the narrow interval [1, 2) can be approximated by a polynomial (a Taylor-style expansion), which turns the whole computation into a table lookup plus multiply-add operations and makes it much faster.
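
    For reference, the same decomposition can be written as a minimal scalar sketch using frexpf and the standard log10f (the helper name is illustrative; frexpf normalizes the mantissa to [0.5, 1) rather than [1, 2), but the identity is the same). The optimized versions below replace frexpf with direct bit manipulation and log10f(m) with a polynomial:

    #include <math.h>

    // Reference only: same decomposition as the optimized code, but with libm
    // doing the actual work. Not intended to be fast.
    static float Log10FloatReference(float x)   // illustrative helper name
    {
        int e;
        float m = frexpf(x, &e);                     // x = m * 2^e, m in [0.5, 1)
        return log10f(m) + (float)e * 0.3010299957f; // log10(2) ~= 0.30103
    }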

    The polynomial coefficients are stored in the following table (note the interleaved order, which is what the Estrin-style evaluation below expects):

    const float LOG10_TABLE[8] = {  // 8 coefficients, stored interleaved as a0 a4 a2 a6 a1 a5 a3 a7
        -0.99697286229624F, // a0
        -1.07301643912502F, // a4
        -2.46980061535534F, // a2
        -0.07176870463131F, // a6
        2.247870219989470F, // a1
        0.366547581117400F, // a5
        1.991005185100089F, // a3
        0.006135635201050F, // a7
    };
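
    As a sanity check on the table and its interleaved ordering, one can undo the interleaving and evaluate the polynomial naively against log10f over [1, 2). The following is a rough sketch (function name and step size are arbitrary):

    #include <math.h>
    #include <stdio.h>

    // Reorder the interleaved table into natural order a0..a7 (coefficient of x^i)
    // and compare a straightforward Horner evaluation with log10f on [1, 2).
    static void CheckLog10Table(void)
    {
        const float a[8] = {
            LOG10_TABLE[0], LOG10_TABLE[4], LOG10_TABLE[2], LOG10_TABLE[6],
            LOG10_TABLE[1], LOG10_TABLE[5], LOG10_TABLE[3], LOG10_TABLE[7],
        };
        float maxErr = 0.0f;
        for (float m = 1.0f; m < 2.0f; m += 0.001f) {
            float p = 0.0f;
            for (int i = 7; i >= 0; i--) {   // Horner evaluation of the polynomial
                p = p * m + a[i];
            }
            float err = fabsf(p - log10f(m));
            maxErr = (err > maxErr) ? err : maxErr;
        }
        printf("max abs error on [1,2): %g\n", maxErr);
    }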
    

    Optimization Version 1

    The optimization converts the log operation into exponent extraction plus polynomial fitting. Breaking the computation down (a_i denotes the coefficient of x^i, matching the comments in the table above):

    • step1
      • A = a1 * x + a0
      • B = a3 * x + a2
      • C = a5 * x + a4
      • D = a7 * x + a6
    • step2
      • A := A + B * x^2
      • C := C + D * x^2
    • step3
      • F = A + C * x^4 = a0 + a1 * x + a2 * x^2 + a3 * x^3 + a4 * x^4 + a5 * x^5 + a6 * x^6 + a7 * x^7
    • Shared subexpressions: x, x^2, x^4

    Going from the principle to an implementation, the code is as follows:

    float Log10FloatNeonOpt(float fVarin)
    {
        float fTmpA, fTmpB, fTmpC, fTmpD, fTmpxx;
        int32_t iTmpM;
    
        union {
            float fTmp;
            int32_t iTmp;
        } aVarTmp1;
    
        // extract exponent
        aVarTmp1.fTmp = fVarin;
        iTmpM = (int32_t)((uint32_t)aVarTmp1.iTmp >> 23);                 // shift out the 23 mantissa bits, keeping the biased exponent (sign bit is 0 since x > 0)
        iTmpM = iTmpM - 127;                                              // remove the IEEE-754 exponent bias (2^7 - 1 = 127)
        aVarTmp1.iTmp = aVarTmp1.iTmp - (int32_t)((uint32_t)iTmpM << 23); // clear the unbiased exponent, leaving the mantissa as a float in [1, 2)
    
        // Taylor Polynomial (Estrins)
        fTmpxx = aVarTmp1.fTmp * aVarTmp1.fTmp;
        fTmpA = (LOG10_TABLE[4] * aVarTmp1.fTmp) + (LOG10_TABLE[0]); // A = a1 * x + a0
        fTmpB = (LOG10_TABLE[6] * aVarTmp1.fTmp) + (LOG10_TABLE[2]); // B = a3 * x + a2
        fTmpC = (LOG10_TABLE[5] * aVarTmp1.fTmp) + (LOG10_TABLE[1]); // C = a5 * x + a4
        fTmpD = (LOG10_TABLE[7] * aVarTmp1.fTmp) + (LOG10_TABLE[3]); // D = a7 * x + a6
        fTmpA = fTmpA + fTmpB * fTmpxx;                               // A + B * x^2
        fTmpC = fTmpC + fTmpD * fTmpxx;                               // C + D * x^2
        fTmpxx = fTmpxx * fTmpxx;                                     // x^4
        aVarTmp1.fTmp = fTmpA + fTmpC * fTmpxx;                       // F = A + C * x^4
    
        // add exponent
        aVarTmp1.fTmp = aVarTmp1.fTmp + ((float)iTmpM) * (0.3010299957f); // + e * log10(2)
    
        return aVarTmp1.fTmp;
    }
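
    Since this version is plain C, its accuracy can be checked on any host. Below is a rough sketch of such a sweep; the range, step and error metric are arbitrary choices, so the result need not match the error figures quoted above exactly:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    // Sweep a range of positive inputs and report the maximum absolute error
    // of Log10FloatNeonOpt() against double-precision log10().
    int main(void)
    {
        double maxAbsErr = 0.0;
        for (float x = 1.0e-3f; x < 1.0e6f; x *= 1.0009f) {
            double err = fabs((double)Log10FloatNeonOpt(x) - log10((double)x));
            maxAbsErr = (err > maxAbsErr) ? err : maxAbsErr;
        }
        printf("max absolute error: %.10f\n", maxAbsErr);
        return 0;
    }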
    

    Optimization Version 2

    The idea here is to perform the inner multiply-add operations of Version 1 in parallel using NEON intrinsics.

    float Log10FloatNeonOpt(float fVarin)
    {
        float fTmpxx;
        int32_t iTmpM;
    
        union {
            float fTmp;
            int32_t iTmp;
        } aVarTmp1;
    
        // extract exponent
        aVarTmp1.fTmp = fVarin;
        iTmpM = (int32_t)((uint32_t)aVarTmp1.iTmp >> 23);                 // shift out the 23 mantissa bits, keeping the biased exponent (sign bit is 0 since x > 0)
        iTmpM = iTmpM - 127;                                              // remove the IEEE-754 exponent bias (2^7 - 1 = 127)
        aVarTmp1.iTmp = aVarTmp1.iTmp - (int32_t)((uint32_t)iTmpM << 23); // clear the unbiased exponent, leaving the mantissa as a float in [1, 2)
    
        // Taylor Polynomial (Estrins)
        fTmpxx = aVarTmp1.fTmp * aVarTmp1.fTmp;
        /* original scalar code
        fTmpA = (LOG10_TABLE[4] * aVarTmp1.fTmp) + (LOG10_TABLE[0]); // A = a1 * x + a0
        fTmpB = (LOG10_TABLE[6] * aVarTmp1.fTmp) + (LOG10_TABLE[2]); // B = a3 * x + a2
        fTmpC = (LOG10_TABLE[5] * aVarTmp1.fTmp) + (LOG10_TABLE[1]); // C = a5 * x + a4
        fTmpD = (LOG10_TABLE[7] * aVarTmp1.fTmp) + (LOG10_TABLE[3]); // D = a7 * x + a6
        fTmpA = fTmpA + fTmpB * fTmpxx;
        fTmpC = fTmpC + fTmpD * fTmpxx;
        */
        // optimized code: one vector holding {A, C, B, D}
        float32x4_t vf32x4fTmpACBD, vf32x4fTmp1, vf32x4fTmp2;
        vf32x4fTmp1 = vld1q_f32(&LOG10_TABLE[0]);                              // constant terms {a0, a4, a2, a6}
        vf32x4fTmp2 = vld1q_f32(&LOG10_TABLE[4]);                              // linear terms   {a1, a5, a3, a7}
        vf32x4fTmpACBD = vmlaq_n_f32(vf32x4fTmp1, vf32x4fTmp2, aVarTmp1.fTmp); // {A, C, B, D} = fTmp1 + fTmp2 * x
        float32x2_t vf32x2fTmpAC, vf32x2fTmpBD;
        vf32x2fTmpAC = vget_low_f32(vf32x4fTmpACBD);
        vf32x2fTmpBD = vget_high_f32(vf32x4fTmpACBD);
        vf32x2fTmpAC = vmla_n_f32(vf32x2fTmpAC, vf32x2fTmpBD, fTmpxx); // {A + B * x^2, C + D * x^2}
        float tmpAC[2];                                                // the two partial sums A and C
        vst1_f32(tmpAC, vf32x2fTmpAC);
    
        fTmpxx = fTmpxx * fTmpxx;                                      // x^4
        aVarTmp1.fTmp = tmpAC[0] + tmpAC[1] * fTmpxx;                  // F = A + C * x^4
        // add exponent
        aVarTmp1.fTmp = aVarTmp1.fTmp + ((float)iTmpM) * (0.3010299957f); // + e * log10(2)
    
        return aVarTmp1.fTmp;
    }
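
    Two practical notes. Versions 2 and 3 rely on NEON intrinsics, so the source file needs #include <arm_neon.h> and a NEON-enabled build (NEON is always present on AArch64; 32-bit Arm GCC builds typically need a flag such as -mfpu=neon). Also, the function requires x > 0, so callers normally clamp the input first, as in this sketch of an amplitude-to-dB helper (the name and the 1e-12f floor are arbitrary choices):

    // out = 20 * log10(|amp|), with a small floor to keep the x > 0 precondition
    static float AmplitudeToDb(float fAmp)
    {
        float fMag = (fAmp < 0.0f) ? -fAmp : fAmp;
        if (fMag < 1e-12f) {
            fMag = 1e-12f;   // illustrative floor, roughly -240 dB
        }
        return 20.0f * Log10FloatNeonOpt(fMag);
    }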
    

    Optimization Version 3

    This version extends the interface of Log10FloatNeonOpt() so that the caller can pass in four x values at once and obtain four log10 results.

    The code assumes that both input and output are read from and written to NEON registers, each holding four float values.

    #define PARALLEL_NUM       (4)
    #define EXP_SHIFT_NUM      (23)
    float32x4_t Log10FloatX4NeonOpt(float32x4_t vf32x4Varin)
    {
        int32x4_t vs32x4VarTmpI = vreinterpretq_s32_f32(vf32x4Varin);     // aVarTmp1.iTmp
        uint32x4_t vu32x4VarTmpU = vreinterpretq_u32_s32(vs32x4VarTmpI);  // (uint32_t)aVarTmp1.iTmp
        vu32x4VarTmpU = vshrq_n_u32(vu32x4VarTmpU, EXP_SHIFT_NUM);        // (uint32_t)aVarTmp1.iTmp >> 23
        int32x4_t vs32x4TmpM = vreinterpretq_s32_u32(vu32x4VarTmpU);      // (int32_t)((uint32_t)aVarTmp1.iTmp >> 23)
        int32x4_t vs32x4RevsCode = vdupq_n_s32(-127);                     // IEEE-754 exponent bias
        vs32x4TmpM = vaddq_s32(vs32x4TmpM, vs32x4RevsCode);               // iTmpM = iTmpM - 127, used later
    
        vu32x4VarTmpU = vreinterpretq_u32_s32(vs32x4TmpM);
        vu32x4VarTmpU = vshlq_n_u32(vu32x4VarTmpU, EXP_SHIFT_NUM);        // (uint32_t)iTmpM << 23
        int32x4_t vs32x4SubVal = vreinterpretq_s32_u32(vu32x4VarTmpU);
        vs32x4VarTmpI = vsubq_s32(vs32x4VarTmpI, vs32x4SubVal);           // aVarTmp1.iTmp-(int32_t)((uint32_t)iTmpM<<23)
    
        float32x4_t vf32x4TVarTmpF = vreinterpretq_f32_s32(vs32x4VarTmpI);   // four mantissas in [1, 2)
        float32x4_t vf32x4TmpXX = vmulq_f32(vf32x4TVarTmpF, vf32x4TVarTmpF); // x^2
        float32x4_t vf32x4TmpXXXX = vmulq_f32(vf32x4TmpXX, vf32x4TmpXX);     // x^4
    
        // input: vf32x4TVarTmpF vs32x4TmpM vf32x4TmpXX vf32x4TmpXXXX
        float32x4x4_t vf32x4x4fTmpACBD;
        float32x4_t vf32x4fTmp1, vf32x4fTmp2;
        float fTmp[PARALLEL_NUM];
        vst1q_f32(fTmp, vf32x4TVarTmpF);
        vf32x4fTmp1 = vld1q_f32(&LOG10_TABLE[0]);                                 // constant terms {a0, a4, a2, a6}
        vf32x4fTmp2 = vld1q_f32(&LOG10_TABLE[4]);                                 // linear terms   {a1, a5, a3, a7}
        vf32x4x4fTmpACBD.val[0] = vmlaq_n_f32(vf32x4fTmp1, vf32x4fTmp2, fTmp[0]); // {A, C, B, D} for input 0
        vf32x4x4fTmpACBD.val[1] = vmlaq_n_f32(vf32x4fTmp1, vf32x4fTmp2, fTmp[1]); // {A, C, B, D} for input 1
        vf32x4x4fTmpACBD.val[2] = vmlaq_n_f32(vf32x4fTmp1, vf32x4fTmp2, fTmp[2]); // {A, C, B, D} for input 2
        vf32x4x4fTmpACBD.val[3] = vmlaq_n_f32(vf32x4fTmp1, vf32x4fTmp2, fTmp[3]); // {A, C, B, D} for input 3
    
        // transpose the 4x4 block so each register holds one of A/C/B/D across the four inputs
        // A0 C0 B0 D0 => A0 A1 A2 A3
        // A1 C1 B1 D1 => C0 C1 C2 C3
        // A2 C2 B2 D2 => B0 B1 B2 B3
        // A3 C3 B3 D3 => D0 D1 D2 D3
        float32x4x2_t vf32x4x2fTmpACBD01 = vtrnq_f32(vf32x4x4fTmpACBD.val[0], vf32x4x4fTmpACBD.val[1]);
        float32x4x2_t vf32x4x2fTmpACBD23 = vtrnq_f32(vf32x4x4fTmpACBD.val[2], vf32x4x4fTmpACBD.val[3]);
        float32x4x2_t vf32x4x2fTmpACBD02 = vuzpq_f32(vf32x4x2fTmpACBD01.val[0], vf32x4x2fTmpACBD23.val[0]);
        float32x4x2_t vf32x4x2fTmpACBD13 = vuzpq_f32(vf32x4x2fTmpACBD01.val[1], vf32x4x2fTmpACBD23.val[1]);
        vf32x4x2fTmpACBD02 = vtrnq_f32(vf32x4x2fTmpACBD02.val[0], vf32x4x2fTmpACBD02.val[1]);
        vf32x4x2fTmpACBD13 = vtrnq_f32(vf32x4x2fTmpACBD13.val[0], vf32x4x2fTmpACBD13.val[1]);
        vf32x4x4fTmpACBD.val[0] = vf32x4x2fTmpACBD02.val[0]; // A0 A1 A2 A3
        vf32x4x4fTmpACBD.val[2] = vf32x4x2fTmpACBD02.val[1]; // B0 B1 B2 B3
        vf32x4x4fTmpACBD.val[1] = vf32x4x2fTmpACBD13.val[0]; // C0 C1 C2 C3
        vf32x4x4fTmpACBD.val[3] = vf32x4x2fTmpACBD13.val[1]; // D0 D1 D2 D3
    
        // A = A + B * x^2, C = C + D * x^2
        float32x4_t vf32x4fTmpA = vmlaq_f32(vf32x4x4fTmpACBD.val[0], vf32x4x4fTmpACBD.val[2], vf32x4TmpXX); // row A + row B * x^2
        float32x4_t vf32x4fTmpC = vmlaq_f32(vf32x4x4fTmpACBD.val[1], vf32x4x4fTmpACBD.val[3], vf32x4TmpXX); // row C + row D * x^2
        vf32x4TVarTmpF = vmlaq_f32(vf32x4fTmpA, vf32x4fTmpC, vf32x4TmpXXXX);                                // F = A + C * x^4
    
        // add exponent
        float32x4_t vf32x4fTmpM = vcvtq_f32_s32(vs32x4TmpM);
        vf32x4TVarTmpF = vmlaq_n_f32(vf32x4TVarTmpF, vf32x4fTmpM, (0.3010299957f));                         // + e * log10(2)
    
        return vf32x4TVarTmpF;
    }
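
    A typical way to call the four-wide version over an audio buffer, sketched with illustrative names and assuming the length is a multiple of 4 and all samples are already positive:

    #include <arm_neon.h>

    // out[i] = 20 * log10(in[i]); len must be a multiple of 4 and every in[i] > 0,
    // matching the precondition of the log10 approximation.
    void AmplitudeToDbX4(const float *in, float *out, int len)
    {
        const float32x4_t vScale = vdupq_n_f32(20.0f);
        for (int i = 0; i < len; i += 4) {
            float32x4_t vIn  = vld1q_f32(&in[i]);            // load 4 samples
            float32x4_t vLog = Log10FloatX4NeonOpt(vIn);     // 4 log10 values
            vst1q_f32(&out[i], vmulq_f32(vLog, vScale));     // scale to dB and store
        }
    }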
    

    Summary


    Using the NEON optimization of the log10 function as an example, this article summarized three different optimization approaches, which can be combined according to the concrete scenario. The in-register 4x4 matrix transpose uses the method described in the earlier post 《NEON优化3:矩阵转置的指令优化案例》.

    Note that log10 and log2 cost exactly the same to compute this way: the principle is identical, only the coefficients in the lookup table differ. If a log2 result is needed, it can also be obtained from log10 via the change-of-base formula, as sketched below.
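
    A minimal sketch of that conversion (the wrapper name is illustrative): since log2(x) = log10(x) / log10(2), a single multiply by a precomputed constant is enough:

    // log2(x) = log10(x) / log10(2); 1 / log10(2) ~= 3.3219280949
    static inline float Log2FloatNeonOpt(float fVarin)
    {
        return Log10FloatNeonOpt(fVarin) * 3.3219280949f;
    }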

    To take the idea one step further, the same approach can also be used to optimize powf, since powf(x, y) for x > 0 can be evaluated as 2^(y * log2(x)), i.e. a fast log of this style combined with a fast exp2-style approximation.

  • Original article: https://blog.csdn.net/qq_17256689/article/details/125600826