|
Analysis::compressIntraCU
|
输入 64×64 的 CTU,按四叉树递归划分为更小的 CU,并对每个 CU 做帧内预测的率失真优化(RDO)模式选择;函数返回当前 CU 在最优划分/模式下的 RD Cost。
|
|
Search::checkIntra()
|
调用 estIntraPredQT 和 estIntraPredChromaQT 分别选出当前 CU 的亮度最优预测模式和色度最优预测模式,然后计算编码当前 CU 所需的 RD Cost。
|
|
Search::estIntraPredQT()
|
选出当前CU的亮度最优预测模式, H.265的亮度预测模式包括DC、Planar和33种角度模式共35种预测模式
|
|
Search::estIntraPredChromaQT()
|
选出当前CU的色度最优预测模式
|
|
Search::codeIntraLumaQT()
|
对 CU 递归划分 TU(变换单元),使用上层传进来的预测模式对 TU 进行预测,然后进行变换、量化、反量化、反变换、重建,计算 RD Cost。
|
- /*
- * Recursive intra analysis of one CU: evaluates intra prediction at this depth, recurses into the four child CUs when a split is possible, keeps the cheaper result in m_modeDepth[depth].bestMode, copies it to the picture, and returns its rdCost. (Author's note: the top-level call receives the 64x64 CTU.)
- * */
- uint64_t Analysis::compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
- {
- std::cout << "encoder/analysis.cpp Analysis::compressIntraCU" << std::endl; // debug trace, printed on every (recursive) call
- uint32_t depth = cuGeom.depth;// CU depth in the quadtree (author's note: 0..3)
- ModeDepth& md = m_modeDepth[depth];
- md.bestMode = NULL;
- bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);// true when the CU is NOT flagged LEAF, i.e. it may still be split
- bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);// true when a split is not mandatory
- bool bAlreadyDecided = m_param->intraRefine != 4 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] != (uint8_t)ALL_IDX && !(m_param->bAnalysisType == HEVC_INFO);
- bool bDecidedDepth = m_param->intraRefine != 4 && parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;
- int split = 0;
- if (m_param->intraRefine && m_param->intraRefine != 4) // intra refinement is on (intraRefine is non-zero and not 4)
- {
- split = m_param->scaleFactor && bDecidedDepth && (!mightNotSplit ||
- ((cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1))));
- if (cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize]) && !bDecidedDepth)
- bAlreadyDecided = false;
- }
- if (bAlreadyDecided) // an intra mode is already available in parentCTU for this CU
- {
- if (bDecidedDepth && mightNotSplit) // the stored depth equals this depth and a split is not forced
- {
- Mode& mode = md.pred[0];
- md.bestMode = &mode;
- mode.cu.initSubCU(parentCTU, cuGeom, qp);
- bool reuseModes = !((m_param->intraRefine == 3) ||
- (m_param->intraRefine == 2 && parentCTU.m_lumaIntraDir[cuGeom.absPartIdx] > DC_IDX));
- if (reuseModes)
- {
- memcpy(mode.cu.m_lumaIntraDir, parentCTU.m_lumaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
- memcpy(mode.cu.m_chromaIntraDir, parentCTU.m_chromaIntraDir + cuGeom.absPartIdx, cuGeom.numPartitions);
- }
- checkIntra(mode, cuGeom, (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]);
- if (m_bTryLossless)
- tryLossless(cuGeom);
- if (mightSplit)
- addSplitFlagCost(*md.bestMode, cuGeom.depth);
- }
- }
- // Otherwise, when the CU size differs from MAX_LOG2_CU_SIZE and a split is not forced, run intra mode RDO at this depth
- else if (cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && mightNotSplit)
- {
- md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);
- checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N);
- checkBestMode(md.pred[PRED_INTRA], depth);// update md.bestMode if this mode is better (checkBestMode is defined elsewhere)
- 
- 
- // For an 8x8 CU (log2CUSize == 3) whose minimum TU log2 size is below 3, also evaluate the NxN partition (four 4x4 PUs)
- if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3)
- {
- md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom, qp);
- // Intra mode RDO for the NxN partition; computes its rd-cost
- // Only reached once the recursion has arrived at an 8x8 CU
- checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN);
- checkBestMode(md.pred[PRED_INTRA_NxN], depth);// update md.bestMode if this mode is better
- }
- if (m_bTryLossless)
- tryLossless(cuGeom);
- if (mightSplit)
- addSplitFlagCost(*md.bestMode, cuGeom.depth);
- }
- // Do not split when both mode and depth were already decided, unless `split` forces it
- mightSplit &= !(bAlreadyDecided && bDecidedDepth) || split;
- if (mightSplit)
- {
- // Splitting is still possible: recurse into the child CUs
- Mode* splitPred = &md.pred[PRED_SPLIT];
- splitPred->initCosts();
- CUData* splitCU = &splitPred->cu;
- splitCU->initSubCU(parentCTU, cuGeom, qp);
- uint32_t nextDepth = depth + 1;
- ModeDepth& nd = m_modeDepth[nextDepth];
- invalidateContexts(nextDepth);
- Entropy* nextContext = &m_rqt[depth].cur;
- int32_t nextQP = qp;
- uint64_t curCost = 0;
- int skipSplitCheck = 0;
- // Analyse each of the four child CUs the same way and accumulate their costs
- for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++)
- {
- const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);
- if (childGeom.flags & CUGeom::PRESENT)
- {
- m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);
- m_rqt[nextDepth].cur.load(*nextContext);
- if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)
- nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));
- if (m_param->bEnableSplitRdSkip)
- {
- curCost += compressIntraCU(parentCTU, childGeom, nextQP);
- // If the accumulated child cost already exceeds the best cost at this depth, the split cannot win: stop evaluating it
- if (m_modeDepth[depth].bestMode && curCost > m_modeDepth[depth].bestMode->rdCost)
- {
- skipSplitCheck = 1;
- break;
- }
- }
- else
- compressIntraCU(parentCTU, childGeom, nextQP);
- // Save the child's best result (CU data, costs, reconstruction) into the split candidate
- splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);
- splitPred->addSubCosts(*nd.bestMode);
- nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);
- nextContext = &nd.bestMode->contexts;
- }
- else
- {
- /* record the depth of this non-present sub-CU */
- splitCU->setEmptyPart(childGeom, subPartIdx);
- /* Set depth of non-present CU to 0 to ensure that correct CU is fetched as reference to code deltaQP */
- if (bAlreadyDecided)
- memset(parentCTU.m_cuDepth + childGeom.absPartIdx, 0, childGeom.numPartitions);
- }
- }
- if (!skipSplitCheck)
- {
- nextContext->store(splitPred->contexts);
- if (mightNotSplit)
- addSplitFlagCost(*splitPred, cuGeom.depth);
- else
- updateModeCost(*splitPred);
- checkDQPForSplitPred(*splitPred, cuGeom);
- checkBestMode(*splitPred, depth);
- }
- }
- // RD refinement: cache the best cost of this CU (author's note: enabled at rd levels 5 and 6)
- if (m_param->bEnableRdRefine && depth <= m_slice->m_pps->maxCuDQPDepth)
- {
- int cuIdx = (cuGeom.childOffset - 1) / 3;
- cacheCost[cuIdx] = md.bestMode->rdCost;
- }
- if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)
- {
- CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);
- int8_t maxTUDepth = -1;
- for (uint32_t i = 0; i < cuGeom.numPartitions; i++)
- maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);
- ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;
- }
- /* Copy best data to encData CTU and recon */
- md.bestMode->cu.copyToPic(depth);
- if (md.bestMode != &md.pred[PRED_SPLIT])
- md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx);
- // Return the rd-cost of the best mode chosen for this CU (split or not)
- return md.bestMode->rdCost;
- }
- /*
- * Evaluate intra coding of the CU in intraMode with the given partSize: selects luma (and, unless I400, chroma) prediction modes, counts the bits to code the CU, and fills in intraMode's distortion, bit counts, energy terms and rd-cost.
- * */
- void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize)
- {
- std::cout << "encoder/serach.cpp Search::checkIntra()" << std::endl; // debug trace, printed on every call
- CUData& cu = intraMode.cu;
- cu.setPartSizeSubParts(partSize);// set the partition size (callers in this file pass SIZE_2Nx2N, SIZE_NxN or a stored part size)
- cu.setPredModeSubParts(MODE_INTRA);// mark the CU as intra-predicted
- uint32_t tuDepthRange[2];// TU size range, filled by getIntraTUQtDepthRange
- cu.getIntraTUQtDepthRange(tuDepthRange, 0);
- intraMode.initCosts();// reset the cost fields
- // Choose the best luma intra mode(s) for the CU; the call returns the luma distortion
- intraMode.lumaDistortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange);
- // X265_CSP_I400 has no chroma planes; for every other format the chroma mode is searched as well
- if (m_csp != X265_CSP_I400)
- {
- // Choose the best chroma intra mode for the CU; the call returns the chroma distortion
- intraMode.chromaDistortion += estIntraPredChromaQT(intraMode, cuGeom);
- intraMode.distortion += intraMode.lumaDistortion + intraMode.chromaDistortion;
- }
- else
- intraMode.distortion += intraMode.lumaDistortion;
- cu.m_distortion[0] = intraMode.distortion;
- m_entropyCoder.resetBits();
- if (m_slice->m_pps->bTransquantBypassEnabled)
- m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
- int skipFlagBits = 0;
- if (!m_slice->isIntra())
- {
- m_entropyCoder.codeSkipFlag(cu, 0);
- skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();
- m_entropyCoder.codePredMode(cu.m_predMode[0]);
- }
- // Code partition size and prediction info; bits written so far, minus skipFlagBits, are stored as mvBits
- m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
- m_entropyCoder.codePredInfo(cu, 0);
- intraMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;
- bool bCodeDQP = m_slice->m_pps->bUseDQP;
- // Code the residual coefficients
- m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);
- // Save the entropy coder state into the mode
- m_entropyCoder.store(intraMode.contexts);
- // Total bits needed to code this CU
- intraMode.totalBits = m_entropyCoder.getNumberOfWrittenBits();
- // Bits spent on coefficients = total - mvBits - skipFlagBits
- intraMode.coeffBits = intraMode.totalBits - intraMode.mvBits - skipFlagBits;
- const Yuv* fencYuv = intraMode.fencYuv;
- // Compute the psy or ssim energy term (luma, source vs. reconstruction) when the corresponding RD mode is on
- if (m_rdCost.m_psyRd)
- intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size);
- else if(m_rdCost.m_ssimRd)
- intraMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size, cuGeom.log2CUSize, TEXT_LUMA, 0);
- intraMode.resEnergy = primitives.cu[cuGeom.log2CUSize - 2].sse_pp(intraMode.fencYuv->m_buf[0], intraMode.fencYuv->m_size, intraMode.predYuv.m_buf[0], intraMode.predYuv.m_size);
- // Update the mode's rd-cost (author's note: rdcost = distortion(fenc, recon) + lambda * all_bits)
- updateModeCost(intraMode);
- checkDQP(intraMode, cuGeom);
- }
- // Selects the best luma intra direction for every PU of the CU (1 PU for 2Nx2N, 4 for NxN), codes each PU with it, and returns the summed distortion. depthRange is passed through to codeIntraLumaQT.
- sse_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2])
- {
- CUData& cu = intraMode.cu;
- // Source (fenc), prediction and reconstruction buffers
- const Yuv* fencYuv = intraMode.fencYuv;
- Yuv* predYuv = &intraMode.predYuv;
- Yuv* reconYuv = &intraMode.reconYuv;
- uint32_t depth = cuGeom.depth; // CU depth
- uint32_t initTuDepth = cu.m_partSize[0] != SIZE_2Nx2N; // initial TU depth: 0 for SIZE_2Nx2N, 1 otherwise (NxN)
- uint32_t numPU = 1 << (2 * initTuDepth); // number of PUs: 1 for 2Nx2N, 4 for NxN
- uint32_t log2TrSize = cuGeom.log2CUSize - initTuDepth;// log2 of the TU size
- uint32_t tuSize = 1 << log2TrSize; // TU size in pixels
- uint32_t qNumParts = cuGeom.numPartitions >> 2;
- uint32_t sizeIdx = log2TrSize - 2;
- uint32_t absPartIdx = 0;
- sse_t totalDistortion = 0;
- // Evaluate transform-skip only when the PPS enables it, the CU is not transquant-bypass, and the partition is not 2Nx2N
- int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && !cu.m_tqBypass[0] && cu.m_partSize[0] != SIZE_2Nx2N;
- // loop over partitions (all PUs)
- for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
- {
- uint32_t bmode = 0;
- // If an intra direction is already specified (not ALL_IDX), reuse it and skip the direction search
- if (intraMode.cu.m_lumaIntraDir[puIdx] != (uint8_t)ALL_IDX)
- bmode = intraMode.cu.m_lumaIntraDir[puIdx];
- // Otherwise search for the best intra direction
- else
- {
- uint64_t candCostList[MAX_RD_INTRA_MODES];
- uint32_t rdModeList[MAX_RD_INTRA_MODES];
- uint64_t bcost;
- int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1);
- {
- ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis);
- // Reference sample smoothing
- IntraNeighbors intraNeighbors;
- // Gather availability information for the neighbouring reference samples
- initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors);
- // Fill the neighbour sample buffers and apply smoothing (ALL_IDX: prepare for all directions)
- initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX);
- // determine set of modes to be tested (using prediction signal only)
- // Source luma pointer for this PU, and the stride taken from predYuv
- const pixel* fenc = fencYuv->getLumaAddr(absPartIdx);
- uint32_t stride = predYuv->m_size;
- int scaleTuSize = tuSize;
- int scaleStride = stride;
- int costShift = 0;
- // Load luma intra-direction entropy state from m_rqt[depth].cur (NOTE(review): exact contents loaded are not visible here)
- m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur);
- /* there are three cost tiers for intra modes:
- * pred[0] - mode probable, least cost
- * pred[1], pred[2] - less probable, slightly more cost
- * non-mpm modes - all cost the same (rbits) */
- uint64_t mpms; // bitmask of MPM modes; bit `mode` is tested below for modes 0..34
- uint32_t mpmModes[3]; // the three most probable modes
- // Fill mpmModes/mpms and get the bit cost used for a mode that is not an MPM
- uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms);
- // sa8d function pointer for this block size
- pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d;
- // Estimated cost of each of the 35 intra modes
- uint64_t modeCosts[35];
- /* DC prediction: compute its bits, sa8d distortion and cost, and initialise bcost with it */
- primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16));
- // Bits depend on whether the mode is an MPM; these bits cover only signalling the intra direction
- uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits;
- // sa8d distortion (the variable is named sad but holds an sa8d value)
- uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
- // Cost from distortion and bits
- modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits);
- /* PLANAR prediction: compute its bits, sa8d distortion and cost, and update bcost */
- // For tuSize in [8, 32] use the filtered reference samples, otherwise the unfiltered ones
- pixel* planar = intraNeighbourBuf[0];
- if (tuSize >= 8 && tuSize <= 32)
- planar = intraNeighbourBuf[1];
- // PLANAR prediction
- primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0);
- // bit cost
- bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits;
- // distortion
- sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift;
- // cost
- modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits);
- // Keep the lower cost in bcost
- COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]);
- /* Angular modes 2..34: compute bits, sa8d distortion and cost for each, and update bcost.
- intra_pred_allangs computes all 33 angular predictions in one call */
- // Path used when an intra_pred_allangs primitive is available
- if (primitives.cu[sizeIdx].intra_pred_allangs)
- {
- /* Transpose the source block into m_fencTransposed
- (author's note: modes 2..17 and modes 19..34 are transposes of each other) */
- primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride);
- // Predict angular modes 2..34 at once; all 33 results are written to m_intraPredAngs
- primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16));
- // Iterate over angular modes 2..34
- for (int mode = 2; mode < 35; mode++)
- {
- // Bit cost of signalling this intra direction
- bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
- // Modes 2..17: compare against the transposed source block
- if (mode < 18)
- sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
- // Modes 18..34: compare against the original source block
- else
- sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift;
- // cost
- modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
- // Keep the lower cost in bcost
- COPY1_IF_LT(bcost, modeCosts[mode]);
- }
- }
- // Path used when no intra_pred_allangs primitive is available
- else
- {
- // Iterate over angular modes 2..34
- for (int mode = 2; mode < 35; mode++)
- {
- // bit cost
- bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits;
- // Whether this mode uses the filtered reference samples at this TU size
- int filter = !!(g_intraFilterFlags[mode] & scaleTuSize);
- // Predict with direction `mode`
- primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16);
- // sa8d distortion
- sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift;
- // cost
- modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits);
- // Keep the lower cost in bcost
- COPY1_IF_LT(bcost, modeCosts[mode]);
- }
- }
- /* At this point the costs are only the rough estimate
- cost = sa8d + lambda * intra_mode_bits
- which gives the lowest estimated cost bcost
- and the estimated cost of each of the 35 modes in modeCosts[35].
- These are useful but not exact; bcost is used below to narrow the set of candidates,
- and the exact best direction is then found by full RD evaluation */
- // Initialise every candCostList entry to MAX_INT64
- for (int i = 0; i < maxCandCount; i++)
- candCostList[i] = MAX_INT64;
- // Threshold is 1.25 times bcost
- uint64_t paddedBcost = bcost + (bcost >> 2); // bcost * 1.25
- // Scan all 35 modes and keep up to maxCandCount qualifying candidates in rdModeList/candCostList
- for (int mode = 0; mode < 35; mode++)
- // A mode qualifies if its estimated cost is below the 1.25x threshold or it equals mpmModes[0]
- if ((modeCosts[mode] < paddedBcost) || ((uint32_t)mode == mpmModes[0]))
- /* choose for R-D analysis only if this mode passes cost threshold or matches MPM[0] */
- updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList);
- }
- /* measure best candidates using simple RDO (no TU splits) */
- bcost = MAX_INT64;
- // Evaluate every candidate with a full RD cost computation
- for (int i = 0; i < maxCandCount; i++)
- {
- // An entry still at MAX_INT64 is unused: stop, no further candidates
- if (candCostList[i] == MAX_INT64)
- break;
- ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
- // Restore the entropy coder state
- m_entropyCoder.load(m_rqt[depth].cur);
- // Set the candidate intra direction on the PU
- cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth);
- Cost icosts;
- /* For the given intra direction, compute the exact cost
- (author's note: rdcost = sse(fenc, recon) + lambda * all_bits);
- bAllowSplit is false here, and rdcost, bits, distortion and energy are returned in icosts */
- if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
- else
- codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, false, icosts, depthRange);
- // Keep the lowest rdcost in bcost and its mode in bmode
- COPY2_IF_LT(bcost, icosts.rdcost, bmode, rdModeList[i]);
- }
- /*
- bmode now holds the best intra direction by exact RD cost, and bcost its cost
- */
- }
- ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]);
- /* remeasure best mode, allowing TU splits */
- // Set the chosen best intra direction on the PU again
- cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth);
- // Restore the entropy coder state
- m_entropyCoder.load(m_rqt[depth].cur);
- // Code the PU once more with the final mode
- Cost icosts;
- // This time bAllowSplit is true, so TU splits are considered; icosts receives distortion, bits, rdcost and energy
- if (checkTransformSkip)
- codeIntraLumaTSkip(intraMode, cuGeom, initTuDepth, absPartIdx, icosts);
- else
- codeIntraLumaQT(intraMode, cuGeom, initTuDepth, absPartIdx, true, icosts, depthRange);
- 
- // Accumulate this PU's distortion
- totalDistortion += icosts.distortion;
- // Extract and store the coefficients and the reconstructed samples
- extractIntraResultQT(cu, *reconYuv, initTuDepth, absPartIdx);
- // set reconstruction for next intra prediction blocks
- // Unless this is the last PU, copy its reconstruction into the recon picture so the next PU can reference it
- if (puIdx != numPU - 1)
- {
- PicYuv* reconPic = m_frame->m_reconPic;
- pixel* dst = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
- uint32_t dststride = reconPic->m_stride;
- const pixel* src = reconYuv->getLumaAddr(absPartIdx);
- uint32_t srcstride = reconYuv->m_size;
- primitives.cu[log2TrSize - 2].copy_pp(dst, dststride, src, srcstride);
- }
- }// end of for (uint32_t puIdx = 0; puIdx < numPU; puIdx++, absPartIdx += qNumParts)
- 
- // When the CU has more than one PU (i.e. four)
- if (numPU > 1)
- {
- uint32_t combCbfY = 0;
- // OR together the luma cbf of the four PUs
- for (uint32_t qIdx = 0, qPartIdx = 0; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
- combCbfY |= cu.getCbf(qPartIdx, TEXT_LUMA, 1);
- // Record the combined cbf in m_cbf[plane][absPartIdx] for luma at index 0
- cu.m_cbf[0][0] |= combCbfY;
- }
- // TODO: remove this (restores the entropy coder state)
- m_entropyCoder.load(m_rqt[depth].cur);
- return totalDistortion;
- }
- void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
- {
- // Purpose: fill intraNeighbourBuf[0] with the reference samples of the PU and, when filtering applies for dirMode and the TU size, write a filtered copy to intraNeighbourBuf[1]. tuSize is the TU size in pixels
- int tuSize = 1 << intraNeighbors.log2TrSize;
- // Twice the TU size
- int tuSize2 = tuSize << 1;
- // Reconstructed picture and the address of the PU's top-left luma sample in it
- PicYuv* reconPic = cu.m_encData->m_reconPic;
- pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
- intptr_t picStride = reconPic->m_stride;
- // Fill the reference samples (substituting unavailable neighbours) into intraNeighbourBuf[0]
- fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
- pixel* refBuf = intraNeighbourBuf[0]; // unfiltered reference samples
- pixel* fltBuf = intraNeighbourBuf[1]; // filtered reference samples
- // Top-left sample, last sample of the top row, last sample of the left column
- pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
- /* Filter when dirMode is ALL_IDX and tuSize is one of 8/16/32,
- or when g_intraFilterFlags[dirMode] has the bit for this tuSize set;
- otherwise no filtered buffer is produced */
- if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
- {
- // generate filtered intra prediction samples
- // Strong intra smoothing is considered only when the SPS enables it and tuSize is 32
- if (cu.m_slice->m_sps->bUseStrongIntraSmoothing && tuSize == 32)
- {
- // Flatness threshold, scaled with bit depth
- const int threshold = 1 << (X265_DEPTH - 5);
- // Middle sample of the top row and middle sample of the left column
- pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
- /* If |first + last - 2 * middle| of the top row is below the threshold,
- and the same holds for the left column,
- use the strong bilinear interpolation filter */
- if (abs(topLeft + topLast - (topMiddle << 1)) < threshold &&
- abs(topLeft + leftLast - (leftMiddle << 1)) < threshold)
- {
- // "strong" bilinear interpolation
- const int shift = 5 + 1;
- int init = (topLeft << shift) + tuSize;
- int deltaL, deltaR;
- // deltaL = last left sample - top-left, deltaR = last top sample - top-left
- deltaL = leftLast - topLeft; deltaR = topLast - topLeft;
- // The top-left corner sample is copied unfiltered
- fltBuf[0] = topLeft;
- // Walk indices 1 .. tuSize2 - 1 of both borders
- for (int i = 1; i < tuSize2; i++)
- {
- // Interpolated left-column sample, written to fltBuf
- fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
- // Interpolated top-row sample, written to fltBuf
- fltBuf[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering
- }
- // The last top-row sample is copied unfiltered
- fltBuf[tuSize2] = topLast;
- // The last left-column sample is copied unfiltered
- fltBuf[tuSize2 + tuSize2] = leftLast;
- return;
- }
- }
- // Regular smoothing filter: reads refBuf, writes fltBuf
- primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf);
- }
- }
- const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1));//无参考像素时的固定预测值
- int numIntraNeighbor = intraNeighbors.numIntraNeighbor;
- int totalUnits = intraNeighbors.totalUnits;
- uint32_t tuSize = 1 << intraNeighbors.log2TrSize;
- uint32_t refSize = tuSize * 2 + 1;
- // Nothing is available, perform DC prediction.
- if (numIntraNeighbor == 0)//所有像素都不可用时,使用固定值填充,对于8比特像素,预测值为128,10bit像素,预测值为512
- {
- // Fill top border with DC value
- for (uint32_t i = 0; i < refSize; i++)
- dst[i] = dcValue
-
- // Fill left border with DC value
- for (uint32_t i = 0; i < refSize - 1; i++)
- dst[i + refSize] = dcValue;
- }
- else if (numIntraNeighbor == totalUnits)//所有参考像素均可用,以像素块的形式直接复制
- {
- // Fill top border with rec. samples
- const pixel* adiTemp = adiOrigin - picStride - 1;
- memcpy(dst, adiTemp, refSize * sizeof(pixel));
- // Fill left border with rec. samples
- adiTemp = adiOrigin - 1;
- for (uint32_t i = 0; i < refSize - 1; i++)
- {
- dst[i + refSize] = adiTemp[0];
- adiTemp += picStride;
- }
- }
- else // reference samples are partially available 部分可用时,要对每个区域依次判断,参考像素不存在时,使用存在的最临近参考像素填充
- {
- ......
- if (!bNeighborFlags[0])//第一个像素不可用时,找到第一个可用的像素作为真实的起点,并将起点前所有参考像素填充该值
- {
- // very bottom unit of bottom-left; at least one unit will be valid.
- while (next < totalUnits && !bNeighborFlags[next])
- next++;
- pixel* pAdiLineNext = adiLineBuffer + ((next < leftUnits) ? (next * unitHeight) : (pAdiLineTopRowOffset + (next * unitWidth)));
- const pixel refSample = *pAdiLineNext;
- ......
- }
- // pad all other reference samples.起点后的值,如果不可用,则复制前一个值,以实现最临近参考像素填充
- while (curr < totalUnits)
- {
- if (!bNeighborFlags[curr]) // samples not available
- {
- int numSamplesInCurrUnit = (curr >= leftUnits) ? unitWidth : unitHeight;
- const pixel refSample = *(adi - 1);
- for (int i = 0; i < numSamplesInCurrUnit; i++)
- adi[i] = refSample;
- adi += numSamplesInCurrUnit;
- curr++;
- }
- else
- {
- adi += (curr >= leftUnits) ? unitWidth : unitHeight;
- curr++;
- }
- }
- ......
- }
- void Search::codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& outCost, const uint32_t depthRange[2])
- {
- // Purpose: evaluate the luma TU at absPartIdx/tuDepth coded as one transform (fullCost) and/or as four sub-TUs via recursion (splitCost), keep the cheaper option, and add its rdcost/distortion/bits/energy to outCost
- CUData& cu = mode.cu;
- // fullDepth = CU depth + TU depth
- uint32_t fullDepth = cuGeom.depth + tuDepth;
- // log2 of the TU size
- uint32_t log2TrSize = cuGeom.log2CUSize - tuDepth;
- uint32_t qtLayer = log2TrSize - 2;
- uint32_t sizeIdx = log2TrSize - 2;
- // The TU may be coded unsplit as long as its size does not exceed the upper bound depthRange[1]
- bool mightNotSplit = log2TrSize <= depthRange[1];
- // The TU may be split when its size is above the lower bound depthRange[0] and either splitting is allowed or coding unsplit is impossible
- bool mightSplit = (log2TrSize > depthRange[0]) && (bAllowSplit || !mightNotSplit);
- bool bEnableRDOQ = !!m_param->rdoqLevel;
- /* If maximum RD penalty, force splits at TU size 32x32 if SPS allows TUs of 16x16
- i.e. rdPenalty == 2, not an I slice, log2TrSize == 5 and depthRange[0] <= 4: the split is mandatory */
- if (m_param->rdPenalty == 2 && m_slice->m_sliceType != I_SLICE && log2TrSize == 5 && depthRange[0] <= 4)
- {
- mightNotSplit = false;
- mightSplit = true;
- }
- /* fullCost is the cost of coding this TU without splitting it;
- splitCost (below) is the cost of coding it as four sub-TUs */
- Cost fullCost;
- // coded block flag value for the unsplit TU
- uint32_t bCBF = 0;
- // Reconstruction buffer of this quadtree layer
- pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getLumaAddr(absPartIdx);
- uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_size;
- /*
- If the TU may be coded unsplit, compute that cost (fullCost)
- */
- if (mightNotSplit)
- {
- // If a split may also be tried, save the current entropy state in rqtRoot so the split evaluation starts from the same state
- if (mightSplit)
- m_entropyCoder.store(m_rqt[fullDepth].rqtRoot);
- // Source samples
- const pixel* fenc = mode.fencYuv->getLumaAddr(absPartIdx);
- // Prediction buffer
- pixel* pred = mode.predYuv.getLumaAddr(absPartIdx);
- // Residual buffer
- int16_t* residual = m_rqt[cuGeom.depth].tmpResiYuv.getLumaAddr(absPartIdx);
- uint32_t stride = mode.fencYuv->m_size;
- // init availability pattern
- uint32_t lumaPredMode = cu.m_lumaIntraDir[absPartIdx];
- IntraNeighbors intraNeighbors;
- // Gather availability information for the neighbouring reference samples
- initIntraNeighbors(cu, absPartIdx, tuDepth, true, &intraNeighbors);
- // Fill the neighbour sample buffers and apply smoothing for lumaPredMode
- initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, lumaPredMode);
- // get prediction signal: predict with lumaPredMode, output written to pred
- predIntraLumaAng(lumaPredMode, pred, stride, log2TrSize);
- // Mark transform-skip as off for this TU
- cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- // Record tuDepth as the TU depth for the parts covered at fullDepth
- cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
- uint32_t coeffOffsetY = absPartIdx << (LOG2_UNIT_SIZE * 2);
- coeff_t* coeffY = m_rqt[qtLayer].coeffRQT[0] + coeffOffsetY;
- // When RDOQ is enabled, call estBit on m_estBitsSbac for this TU size (upstream comment: "store original entropy coding status"; NOTE(review): estBit is defined elsewhere)
- if (bEnableRDOQ)
- m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true);
- // residual = fenc - pred
- primitives.cu[sizeIdx].calcresidual[stride % 64 == 0](fenc, pred, residual, stride);
- // Transform (and quantise) the residual into coeffY; numSig is the number of non-zero coefficients
- uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false);
- /* Build the reconstruction */
- if (numSig) // there are non-zero coefficients
- {
- // Inverse transform back into residual
- m_quant.invtransformNxN(cu, residual, stride, coeffY, log2TrSize, TEXT_LUMA, true, false, numSig);
- bool reconQtYuvAlign = m_rqt[qtLayer].reconQtYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
- bool predAlign = mode.predYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
- bool residualAlign = m_rqt[cuGeom.depth].tmpResiYuv.getAddrOffset(absPartIdx, mode.predYuv.m_size) % 64 == 0;
- bool bufferAlignCheck = (reconQtStride % 64 == 0) && (stride % 64 == 0) && reconQtYuvAlign && predAlign && residualAlign;
- // recon = pred + residual (primitive variant selected by the alignment check)
- primitives.cu[sizeIdx].add_ps[bufferAlignCheck](reconQt, reconQtStride, pred, residual, stride, stride);
- }
- else
- // no coded residual, recon = pred: copy pred into the reconstruction buffer
- primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, pred, stride);
- // cbf bit for this TU depth
- bCBF = !!numSig << tuDepth;
- // Store the cbf on the CU
- cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
- // SSE distortion between fenc and the reconstruction
- fullCost.distortion = primitives.cu[sizeIdx].sse_pp(reconQt, reconQtStride, fenc, stride);
- /*
- The exact distortion(fenc, recon) is now known
- */
- // Reset the bit counter
- m_entropyCoder.resetBits();
- if (!absPartIdx)
- {
- // Not an intra slice
- if (!cu.m_slice->isIntra())
- {
- // Code the transquant-bypass flag when the PPS enables it
- if (cu.m_slice->m_pps->bTransquantBypassEnabled)
- m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);
- // Code the skip flag
- m_entropyCoder.codeSkipFlag(cu, 0);
- // Code the prediction mode (cu.m_predMode[0])
- m_entropyCoder.codePredMode(cu.m_predMode[0]);
- }
- // Code the partition size
- m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);
- }
- /* Code the luma intra direction(s) */
- // SIZE_2Nx2N: a single direction, coded only for the first part
- if (cu.m_partSize[0] == SIZE_2Nx2N)
- {
- if (!absPartIdx)
- m_entropyCoder.codeIntraDirLumaAng(cu, 0, false);
- }
- // Not SIZE_2Nx2N: one direction per PU (the original author noted this branch was not fully understood)
- else
- {
- uint32_t qNumParts = cuGeom.numPartitions >> 2;
- // tuDepth == 0: code the directions of all four PUs
- if (!tuDepth)
- {
- for (uint32_t qIdx = 0; qIdx < 4; ++qIdx)
- m_entropyCoder.codeIntraDirLumaAng(cu, qIdx * qNumParts, false);
- }
- else if (!(absPartIdx & (qNumParts - 1)))
- m_entropyCoder.codeIntraDirLumaAng(cu, absPartIdx, false);
- }
- // Unless the TU is already at the minimum size depthRange[0], code the subdivision flag as 0 (not split)
- if (log2TrSize != depthRange[0])
- m_entropyCoder.codeTransformSubdivFlag(0, 5 - log2TrSize);
- // Code the luma cbf
- m_entropyCoder.codeQtCbfLuma(!!numSig, tuDepth);
- // If the cbf is set (there is a residual), code the coefficients
- if (cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth))
- m_entropyCoder.codeCoeffNxN(cu, coeffY, absPartIdx, log2TrSize, TEXT_LUMA);
- // Total bits written since resetBits
- fullCost.bits = m_entropyCoder.getNumberOfWrittenBits();
- // With rdPenalty on, a 32x32 TU in a non-I slice has its bits multiplied by 4
- if (m_param->rdPenalty && log2TrSize == 5 && m_slice->m_sliceType != I_SLICE)
- fullCost.bits *= 4;
- // Compute rdcost (and energy for psy/ssim RD) from the distortion and the total bits
- if (m_rdCost.m_psyRd)
- {
- fullCost.energy = m_rdCost.psyCost(sizeIdx, fenc, mode.fencYuv->m_size, reconQt, reconQtStride);
- fullCost.rdcost = m_rdCost.calcPsyRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
- }
- else if(m_rdCost.m_ssimRd)
- {
- fullCost.energy = m_quant.ssimDistortion(cu, fenc, stride, reconQt, reconQtStride, log2TrSize, TEXT_LUMA, absPartIdx);
- fullCost.rdcost = m_rdCost.calcSsimRdCost(fullCost.distortion, fullCost.bits, fullCost.energy);
- }
- else
- fullCost.rdcost = m_rdCost.calcRdCost(fullCost.distortion, fullCost.bits);
- }
- // !mightNotSplit: the TU must be split, so the unsplit cost is set to the maximum
- else
- fullCost.rdcost = MAX_INT64;
- /*
- If the TU may be split, compute the split cost (splitCost)
- */
- if (mightSplit)
- {
- // If the unsplit option was evaluated, save its resulting entropy state and restore the state from before it
- if (mightNotSplit)
- {
- // Save the entropy state into rqtTest
- m_entropyCoder.store(m_rqt[fullDepth].rqtTest); // save state after full TU encode
- // Reload the entropy state saved in rqtRoot
- m_entropyCoder.load(m_rqt[fullDepth].rqtRoot); // prep state of split encode
- }
- /* code split block */
- uint32_t qNumParts = 1 << (log2TrSize - 1 - LOG2_UNIT_SIZE) * 2;
- // Whether to evaluate transform-skip for the sub-TUs
- int checkTransformSkip = m_slice->m_pps->bTransformSkipEnabled && (log2TrSize - 1) <= MAX_LOG2_TS_SIZE && !cu.m_tqBypass[0];
- if (m_param->bEnableTSkipFast)
- checkTransformSkip &= cu.m_partSize[0] != SIZE_2Nx2N;
- Cost splitCost;
- uint32_t cbf = 0;
- // Iterate over the four sub-TUs
- for (uint32_t qIdx = 0, qPartIdx = absPartIdx; qIdx < 4; ++qIdx, qPartIdx += qNumParts)
- {
- // Code each sub-TU at tuDepth + 1 (recursively, or through the transform-skip path); costs accumulate in splitCost
- if (checkTransformSkip)
- codeIntraLumaTSkip(mode, cuGeom, tuDepth + 1, qPartIdx, splitCost);
- else
- codeIntraLumaQT(mode, cuGeom, tuDepth + 1, qPartIdx, bAllowSplit, splitCost, depthRange);
- // OR together the cbf of the four sub-TUs
- cbf |= cu.getCbf(qPartIdx, TEXT_LUMA, tuDepth + 1);
- }
- // Store the combined cbf in cbf[plane][absPartIdx] at this TU depth
- cu.m_cbf[0][absPartIdx] |= (cbf << tuDepth);
- // If the unsplit option existed and the TU is not at the minimum size
- if (mightNotSplit && log2TrSize != depthRange[0])
- {
- /* If we could have coded this TU depth, include cost of subdiv flag */
- // Reset the bit counter
- m_entropyCoder.resetBits();
- // Code the subdivision flag as 1 (split)
- m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize);
- // Add the subdivision flag bits
- splitCost.bits += m_entropyCoder.getNumberOfWrittenBits();
- // Recompute the rdcost with the extra bits
- if (m_rdCost.m_psyRd)
- splitCost.rdcost = m_rdCost.calcPsyRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
- else if(m_rdCost.m_ssimRd)
- splitCost.rdcost = m_rdCost.calcSsimRdCost(splitCost.distortion, splitCost.bits, splitCost.energy);
- else
- splitCost.rdcost = m_rdCost.calcRdCost(splitCost.distortion, splitCost.bits);
- } //end of if (mightNotSplit && log2TrSize != depthRange[0])
- /*
- Compare the unsplit and split costs and keep the better one:
- its rdcost, distortion, bits and energy go to outCost, and for the unsplit case the TU depth, cbf and transform-skip values are restored
- */
- // Split is cheaper: add its costs to outCost and return
- if (splitCost.rdcost < fullCost.rdcost)
- {
- outCost.rdcost += splitCost.rdcost;
- outCost.distortion += splitCost.distortion;
- outCost.bits += splitCost.bits;
- outCost.energy += splitCost.energy;
- return;
- }
- // Unsplit is not more expensive than split
- else
- {
- // recover entropy state of full-size TU encode
- m_entropyCoder.load(m_rqt[fullDepth].rqtTest);
- // recover transform index and Cbf values
- cu.setTUDepthSubParts(tuDepth, absPartIdx, fullDepth);
- cu.setCbfSubParts(bCBF, TEXT_LUMA, absPartIdx, fullDepth);
- cu.setTransformSkipSubParts(0, TEXT_LUMA, absPartIdx, fullDepth);
- }
- } //end of if (mightSplit)
- /* set reconstruction for next intra prediction blocks if full TU prediction won
- (not reached when the split won): copy the unsplit reconstruction into the recon picture */
- PicYuv* reconPic = m_frame->m_reconPic;
- pixel* picReconY = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx);
- intptr_t picStride = reconPic->m_stride;
- primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride);
- // Add the unsplit costs to outCost
- outCost.rdcost += fullCost.rdcost;
- outCost.distortion += fullCost.distortion;
- outCost.bits += fullCost.bits;
- outCost.energy += fullCost.energy;
- }
-