x265 传递残差计算

一、传递残差的作用

传递残差（propagate cost）最终会直接累加到当前帧的 Cost 上，从而影响最终的码率控制。本文逐行分析传递残差的计算过程。

二、代码详细分析

传递残差迭代过程


/* Propagate the cost of frame `b` back onto the propagateCost arrays of its
 * two reference frames, one row of 8x8 CUs at a time.
 *
 *   frames          - frame array indexed by p0 / p1 / b
 *   averageDuration - average frame duration, used to derive the fps factor
 *   p0              - index of the list-0 reference (distance b - p0)
 *   p1              - index of the list-1 reference (distance p1 - b)
 *   b               - index of the frame being processed
 *   referenced      - non-zero when frame b is itself used as a reference
 */
void Lookahead::estimateCUPropagate(Lowres **frames, double averageDuration, int p0, int p1, int b, int referenced)
{
    /* Destination accumulators: [0] belongs to p0, [1] belongs to p1. */
    uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost };
    /* Rounded position of b between p0 and p1, scaled by 256. */
    int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0);
    /* Without weighted bi-prediction both lists get an even 32/64 share. */
    int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32;
    int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight }; /* the two weights always sum to 64 */
    int listDist[2] = { b - p0, p1 - b };

    memset(m_scratch, 0, m_8x8Width * sizeof(int));

    uint16_t *propagateCost = frames[b]->propagateCost; /* incoming propagate cost of frame b */

    s265_emms();
    /* Ratio of the clipped nominal frame duration to the clipped average duration. */
    double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration);

    /* For non-referred frames the source costs are always zero, so just memset one row and re-use it. */
    if (!referenced)
        memset(frames[b]->propagateCost, 0, m_8x8Width * sizeof(uint16_t));

    int32_t strideInCU = m_8x8Width; /* row stride, in CUs */
    for (uint16_t blocky = 0; blocky < m_8x8Height; blocky++)
    {
        int cuIndex = blocky * strideInCU; /* index of the first CU in this row */
        /* Fill m_scratch with the per-CU propagate amount for this row; the
         * two branches differ only in which inverse-qscale table is passed. */
        if (m_param->rc.qgSize == 8)
            primitives.propagateCost(m_scratch, propagateCost,
                       frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
                       frames[b]->invQscaleFactor8x8 + cuIndex, &fpsFactor, m_8x8Width);
        else
            primitives.propagateCost(m_scratch, propagateCost,
                       frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex,
                       frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width);

        /* Advance to the next input row only for referenced frames; otherwise
         * the single zeroed row set up above is re-used for every row. */
        if (referenced)
            propagateCost += m_8x8Width;

        for (uint16_t blockx = 0; blockx < m_8x8Width; blockx++, cuIndex++)
        {
            int32_t propagate_amount = m_scratch[blockx]; /* amount to hand to the references */
            /* Don't propagate for an intra block. */
            if (propagate_amount > 0)
            {
                /* Access width-2 bitfield. */
                /* Each lowresCosts entry packs the cost in its low bits and the
                 * list-usage flags above LOWRES_COST_SHIFT: bit 0 selects
                 * list 0, bit 1 selects list 1. */
                int32_t lists_used = frames[b]->lowresCosts[b - p0][p1 - b][cuIndex] >> LOWRES_COST_SHIFT;
                /* Follow the MVs to the previous frame(s). */
                for (uint16_t list = 0; list < 2; list++)
                {
                    if ((lists_used >> list) & 1) /* this list was used for the CU */
                    {
/* Saturating add into a uint16_t accumulator; the result is capped at 65535. */
#define CLIP_ADD(s, x) (s) = (uint16_t)S265_MIN((s) + (x), (1 << 16) - 1)
                        int32_t listamount = propagate_amount;
                        /* Apply bipred weighting. */
                        if (lists_used == 3) /* both lists used: split by weight, rounding in Q6 */
                            listamount = (listamount * bipredWeights[list] + 32) >> 6;
                        MV *mvs = frames[b]->lowresMvs[list][listDist[list]]; /* MV field for this list and distance */
                        /* Early termination for simple case of mv 0. */
                        if (!mvs[cuIndex].word)
                        {
                            /* Zero MV: the whole amount lands on the co-located CU. */
                            CLIP_ADD(refCosts[list][cuIndex], listamount);
                            continue;
                        }
                        int32_t x = mvs[cuIndex].x;
                        int32_t y = mvs[cuIndex].y;
                        /* Whole-CU part of the MV selects the top-left target CU. */
                        int32_t cux = (x >> 5) + blockx;
                        int32_t cuy = (y >> 5) + blocky;
                        int32_t idx0 = cux + cuy * strideInCU;     /* top-left target CU */
                        int32_t idx1 = idx0 + 1;                   /* right neighbour */
                        int32_t idx2 = idx0 + strideInCU;          /* neighbour below */
                        int32_t idx3 = idx0 + strideInCU + 1;      /* below-right neighbour */
                        /*
                           idx0  idx1
                           idx2  idx3
                        */
                        /* Fractional part (0..31) gives bilinear weights that sum
                         * to 32 * 32 = 1024, hence the >> 10 below. */
                        x &= 31;
                        y &= 31;
                        int32_t idx0weight = (32 - y) * (32 - x);
                        int32_t idx1weight = (32 - y) * x;
                        int32_t idx2weight = y * (32 - x);
                        int32_t idx3weight = y * x;
                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
                         * be counted. */
                        if (cux < m_8x8Width - 1 && cuy < m_8x8Height - 1 && cux >= 0 && cuy >= 0)
                        {
                            /* All four target CUs are inside the frame. */
                            CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
                            CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
                        }
                        else /* Check offsets individually so that no out-of-frame index is written. */
                        {
                            if (cux < m_8x8Width && cuy < m_8x8Height && cux >= 0 && cuy >= 0)
                                CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10);
                            if (cux + 1 < m_8x8Width && cuy < m_8x8Height && cux + 1 >= 0 && cuy >= 0)
                                CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10);
                            if (cux < m_8x8Width && cuy + 1 < m_8x8Height && cux >= 0 && cuy + 1 >= 0)
                                CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10);
                            if (cux + 1 < m_8x8Width && cuy + 1 < m_8x8Height && cux + 1 >= 0 && cuy + 1 >= 0)
                                CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10);
                        }
                    }
                }
            }
        }
    }

    /* With VBV and lookahead enabled, finish cuTree for referenced frames;
     * the last argument is b - p0 when b is its own list-1 reference, else 0. */
    if (m_param->rc.vbvBufferSize && m_param->lookaheadDepth && referenced)
        cuTreeFinish(frames[b], averageDuration, b == p1 ? b - p0 : 0);
}

三、传递残差具体计算


/* Estimate the total amount of influence on future quality that could be had if we
 * were to improve the reference samples used to inter predict any given CU.
 *
 *   dst         - output: propagate amount for each of the `len` CUs
 *   propagateIn - propagate cost already accumulated on each CU
 *   intraCosts  - intra cost of each CU
 *   interCosts  - packed inter cost entries; only the LOWRES_COST_MASK bits are used
 *   invQscales  - per-CU inverse qscale factor
 *   fpsFactor   - pointer to the duration factor computed by the caller
 *   len         - number of CUs to process
 */
static void estimateCUPropagateCost(
    int* dst,
    const uint16_t* propagateIn,
    const int32_t* intraCosts,
    const uint16_t* interCosts,
    const int32_t* invQscales,
    const double* fpsFactor,
    int len)
{
    /* Fold the /256 that undoes the invQscales fixed-point scale into the fps factor. */
    const double scaledFps = *fpsFactor / 256;

    for (int cu = 0; cu < len; ++cu)
    {
        const int intra = intraCosts[cu];
        /* Masked inter cost, capped so that it never exceeds the intra cost. */
        const int inter = S265_MIN(intraCosts[cu], interCosts[cu] & LOWRES_COST_MASK);

        /* Integer product first, then widened: Q16 x Q8.8 = Q24.8. */
        const double intraTerm = intra * invQscales[cu];
        /* Cost already propagated onto this CU plus its own scaled intra cost. */
        const double amount = (double)propagateIn[cu] + intraTerm * scaledFps;
        /* What inter prediction saves over intra (intra - inter, never negative). */
        const double saved = (double)(intra - inter);
        const double denom = (double)intra;

        /* Share of the amount attributable to the references, rounded to nearest. */
        dst[cu] = (int)(amount * saved / denom + 0.5);
    }
}

以上就是cuTree影响帧Cost的全过程

相关阅读:
聚类算法的先验基础知识
 jupyter notebook闪退解决，安美解决
 洛科威多功能岩棉板助力节能减碳战略，推动碳达峰目标实现
 C++设计模式（Design Patterns）
算法——动态规划
 windows内网渗透正向代理
 leetcode 移除链表元素
 wagtail的使用
 【Spring（二）】java对象属性的配置（Bean的配置）
Excel比较两列数据，并找出不同
原文地址：https://blog.csdn.net/fantasy_ARM9/article/details/126506607