2D convolution in TensorFlow relies mainly on external libraries such as cuDNN, cuBLAS, ROCm, and hfp/libxsmm; only DeepConv2D is implemented inside TensorFlow itself (and only for float on the CPU).
InitConv2DParameters reads the attributes from OpKernelConstruction into Conv2DParameters and validates them.
CudnnUseAutotune indicates whether autotuning is enabled (it reads the TF_CUDNN_USE_AUTOTUNE environment variable, which defaults to true).
template <typename Device, typename T>
class Conv2DOp : public BinaryOp<T> {
public:
explicit Conv2DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
OP_REQUIRES_OK(context, InitConv2DParameters(context, &params_));
OP_REQUIRES_OK(context, context->GetAttr("use_cudnn_on_gpu", &use_cudnn_));
cudnn_use_autotune_ = CudnnUseAutotune();
}
ComputeConv2DDimension checks and fills in the dimension information for the 2D convolution.
ShapeFromFormat builds the output shape for the given data format.
OpKernelContext::allocate_output allocates the output tensor with that shape.
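As a rough standalone sketch (not TensorFlow's actual implementation), ShapeFromFormat simply orders the batch/height/width/depth extents according to the data format; the names below are illustrative only:
#include <array>
#include <cstdint>

enum class Format { NHWC, NCHW };

// Minimal stand-in for ShapeFromFormat: place (N, H, W, C) in the order
// dictated by the data format. The real function returns a TensorShape.
std::array<int64_t, 4> MakeShape(Format f, int64_t n, int64_t h, int64_t w,
                                 int64_t c) {
  return f == Format::NHWC ? std::array<int64_t, 4>{n, h, w, c}
                           : std::array<int64_t, 4>{n, c, h, w};
}

// MakeShape(Format::NCHW, 32, 224, 224, 3) -> {32, 3, 224, 224}
// MakeShape(Format::NHWC, 32, 224, 224, 3) -> {32, 224, 224, 3}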
void Compute(OpKernelContext* context) override {
// Input tensor is of the following dimensions:
// [ batch, in_rows, in_cols, in_depth ]
const Tensor& input = context->input(0);
// Input filter is of the following dimensions:
// [ filter_rows, filter_cols, in_depth, out_depth]
const Tensor& filter = context->input(1);
Conv2DDimensions dimensions;
OP_REQUIRES_OK(context,
ComputeConv2DDimension(params_, input, filter, &dimensions));
TensorShape out_shape = ShapeFromFormat(
params_.data_format, dimensions.batch, dimensions.out_rows,
dimensions.out_cols, dimensions.out_depth);
// Output tensor is of the following dimensions:
// [ in_batch, out_rows, out_cols, out_depth ]
Tensor* output = nullptr;
OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
VLOG(2) << "Conv2D: in_depth = " << dimensions.in_depth
<< ", patch_depth = " << dimensions.patch_depth
<< ", input_cols = " << dimensions.input_cols
<< ", filter_cols = " << dimensions.filter_cols
<< ", input_rows = " << dimensions.input_rows
<< ", filter_rows = " << dimensions.filter_rows
<< ", stride_rows = " << dimensions.stride_rows
<< ", stride_cols = " << dimensions.stride_cols
<< ", dilation_rows = " << dimensions.dilation_rows
<< ", dilation_cols = " << dimensions.dilation_cols
<< ", out_depth = " << dimensions.out_depth;
// If there is nothing to compute, return.
if (out_shape.num_elements() == 0) {
return;
}
If the hfp/libxsmm library is enabled, LaunchXsmmConvOp is tried first for non-EXPLICIT padding.
It is only effective in CPU mode; the GPU specialization returns false.
#ifdef TENSORFLOW_USE_LIBXSMM_CONVOLUTIONS
if (params_.padding != EXPLICIT &&
LaunchXsmmConvOp<Device, T>::Run(
context, input, filter, dimensions.batch, dimensions.input_rows,
dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
dimensions.filter_cols, dimensions.pad_rows_before,
dimensions.pad_cols_before, dimensions.out_rows,
dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
dimensions.dilation_cols, dimensions.stride_rows,
dimensions.stride_cols, output, params_.data_format)) {
return;
}
#endif
For non-EXPLICIT padding, LaunchDeepConvOp::Run is tried next. LaunchDeepConvOp returns false by default; only the CPU float specialization is implemented.
if (params_.padding != EXPLICIT &&
LaunchDeepConvOp<Device, T>::Run(
context, input, filter, dimensions.batch, dimensions.input_rows,
dimensions.input_cols, dimensions.in_depth, dimensions.filter_rows,
dimensions.filter_cols, dimensions.pad_rows_before,
dimensions.pad_cols_before, dimensions.out_rows,
dimensions.out_cols, dimensions.out_depth, dimensions.dilation_rows,
dimensions.dilation_cols, dimensions.stride_rows,
dimensions.stride_cols, output, params_.data_format)) {
return;
}
Otherwise the work is handed to the LaunchConv2DOp functor:
launcher_(context, use_cudnn_, cudnn_use_autotune_, input, filter,
dimensions.dilation_rows, dimensions.dilation_cols,
dimensions.stride_rows, dimensions.stride_cols, params_.padding,
params_.explicit_paddings, output, params_.data_format);
}
The LaunchConv2DOp object is the actual executor.
private:
Conv2DParameters params_;
bool use_cudnn_;
bool cudnn_use_autotune_;
LaunchConv2DOp<Device, T> launcher_;
TF_DISALLOW_COPY_AND_ASSIGN(Conv2DOp);
};
InitConv2DParameters reads the attributes from OpKernelConstruction into Conv2DParameters and validates them.
GetTensorDim fetches a dimension from an attribute vector by its character label.
CheckValidPadding validates the padding values.
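A minimal standalone sketch of what GetTensorDim does conceptually (illustrative names and types, not the real TensorFlow signature): the dimension character is mapped to an index according to the data format, and that index is used to read the attribute vector.
#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

enum class Format { NHWC, NCHW };

// Index of dimension character 'N', 'H', 'W' or 'C' within a 4-element
// attribute vector such as strides or dilations.
int DimIndex(Format f, char dim) {
  const std::string order = (f == Format::NHWC) ? "NHWC" : "NCHW";
  const auto pos = order.find(dim);
  if (pos == std::string::npos) throw std::invalid_argument("bad dim");
  return static_cast<int>(pos);
}

int64_t GetDim(const std::vector<int64_t>& attr, Format f, char dim) {
  return attr.at(DimIndex(f, dim));
}

// Example: strides {1, 2, 2, 1} in NHWC gives stride_h = 2 and stride_c = 1.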
Status InitConv2DParameters(const OpKernelConstruction* context,
Conv2DParameters* params) {
TF_RETURN_IF_ERROR(context->GetAttr("dilations", ¶ms->dilations));
TF_RETURN_IF_ERROR(context->GetAttr("strides", ¶ms->strides));
TF_RETURN_IF_ERROR(context->GetAttr("padding", ¶ms->padding));
if (context->HasAttr("explicit_paddings")) {
TF_RETURN_IF_ERROR(
context->GetAttr("explicit_paddings", ¶ms->explicit_paddings));
}
string data_format_string;
TF_RETURN_IF_ERROR(context->GetAttr("data_format", &data_format_string));
TF_REQUIRES(FormatFromString(data_format_string, &params->data_format),
errors::InvalidArgument("Invalid data format"));
const auto& strides = params->strides;
const auto& dilations = params->dilations;
const auto& data_format = params->data_format;
TF_REQUIRES(dilations.size() == 4,
errors::InvalidArgument("Sliding window dilations field must "
"specify 4 dimensions"));
TF_REQUIRES(strides.size() == 4,
errors::InvalidArgument("Sliding window strides field must "
"specify 4 dimensions"));
const int64_t stride_n = GetTensorDim(strides, data_format, 'N');
const int64_t stride_c = GetTensorDim(strides, data_format, 'C');
const int64_t stride_h = GetTensorDim(strides, data_format, 'H');
const int64_t stride_w = GetTensorDim(strides, data_format, 'W');
TF_REQUIRES(
stride_n == 1 && stride_c == 1,
errors::Unimplemented("Current implementation does not yet support "
"strides in the batch and depth dimensions."));
TF_REQUIRES(stride_h > 0 && stride_w > 0,
errors::InvalidArgument(
"Row and column strides should be larger than 0."));
const int64_t dilation_n = GetTensorDim(dilations, data_format, 'N');
const int64_t dilation_c = GetTensorDim(dilations, data_format, 'C');
const int64_t dilation_h = GetTensorDim(dilations, data_format, 'H');
const int64_t dilation_w = GetTensorDim(dilations, data_format, 'W');
TF_REQUIRES(
dilation_n == 1 && dilation_c == 1,
errors::Unimplemented("Current implementation does not yet support "
"dilations in the batch and depth dimensions."));
TF_REQUIRES(
dilation_h > 0 && dilation_w > 0,
errors::InvalidArgument("Dilated rates should be larger than 0."));
TF_RETURN_IF_ERROR(CheckValidPadding(params->padding,
params->explicit_paddings,
/*num_dims=*/4, data_format));
return Status::OK();
}
Both the CUDA and ROCm paths use LaunchConv2DOp.
template <typename Device, typename T>
struct LaunchConv2DOp {
void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
const Tensor& input, const Tensor& filter, int row_dilation,
int col_dilation, int row_stride, int col_stride,
const Padding& padding,
const std::vector<int64_t>& explicit_paddings, Tensor* output,
TensorFormat data_format);
};
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
template <typename T>
struct LaunchConv2DOp<Eigen::GpuDevice, T> {
void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
const Tensor& input, const Tensor& filter, int row_dilation,
int col_dilation, int row_stride, int col_stride,
const Padding& padding,
const std::vector<int64_t>& explicit_paddings, Tensor* output,
TensorFormat data_format);
};
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
The GPU specialization is implemented on top of cuDNN.
template <typename T>
void LaunchConv2DOp<GPUDevice, T>::operator()(
OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
const Tensor& input_param, const Tensor& filter, int row_dilation,
int col_dilation, int row_stride, int col_stride, const Padding& padding,
const std::vector<int64_t>& explicit_paddings, Tensor* output,
TensorFormat data_format) {
using se::dnn::AlgorithmConfig;
using se::dnn::AlgorithmDesc;
using se::dnn::ProfileResult;
auto* stream = ctx->op_device_context()->stream();
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
if (!use_cudnn) {
ctx->SetStatus(
errors::Unimplemented("Conv2D for GPU is not currently supported "
"without cudnn"));
return;
}
The input dimension information is read as int64.
Tensor input = input_param;
const int64_t in_batch = GetTensorDim(input, data_format, 'N');
int64_t in_rows = GetTensorDim(input, data_format, 'H');
int64_t in_cols = GetTensorDim(input, data_format, 'W');
const int64_t in_depths = GetTensorDim(input, data_format, 'C');
const int64_t patch_rows = filter.dim_size(0);
const int64_t patch_cols = filter.dim_size(1);
const int64_t patch_depths = filter.dim_size(2);
OP_REQUIRES(
ctx, filter.NumElements() > 0,
errors::InvalidArgument("filter must not have zero elements "
"(i.e. all dimensions must be non-zero)"));
If the filter depth patch_depths is 1 and smaller than the input depth, this is a depthwise convolution; more generally, if the filter depth divides the input depth but is smaller than it, this is a grouped convolution.
For a 1x1 convolution with NHWC data format, Stream::ThenBlasGemm is called directly.
AsDeviceMemory maps a tensor to a DeviceMemory object wrapping a buffer of the given type.
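Why the 1x1 case reduces to a single GEMM: with NHWC layout, stride 1 and a 1x1 filter, each output pixel is the input pixel's channel vector multiplied by the (in_depth x out_depth) filter matrix, so the whole op is one matrix product of shape (batch*rows*cols, in_depth) x (in_depth, out_depth). A plain-CPU standalone sketch of that equivalence (not the cuBLAS call used here):
#include <cstdint>
#include <vector>

// C = A(m x k) * B(k x n), all row-major.
void Gemm(const std::vector<float>& a, const std::vector<float>& b,
          std::vector<float>& c, int64_t m, int64_t k, int64_t n) {
  for (int64_t i = 0; i < m; ++i)
    for (int64_t j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int64_t p = 0; p < k; ++p) acc += a[i * k + p] * b[p * n + j];
      c[i * n + j] = acc;
    }
}

// 1x1 convolution in NHWC: input  [batch, rows, cols, in_depth]
//                          filter [1, 1, in_depth, out_depth]
//                          output [batch, rows, cols, out_depth]
// Flattening (batch, rows, cols) into m makes it exactly the GEMM above,
// with m = batch*rows*cols, k = in_depth, n = out_depth.
void Conv1x1Nhwc(const std::vector<float>& input,
                 const std::vector<float>& filter, std::vector<float>& output,
                 int64_t batch, int64_t rows, int64_t cols, int64_t in_depth,
                 int64_t out_depth) {
  Gemm(input, filter, output, batch * rows * cols, in_depth, out_depth);
}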
// If the filter in-depth (patch_depths) is 1 and smaller than the input
// depth, it's a depthwise convolution. More generally, if the filter in-depth
// divides but is smaller than the input depth, it is a grouped convolution.
bool is_grouped_convolution = patch_depths != in_depths;
if (patch_rows == 1 && patch_cols == 1 && !is_grouped_convolution &&
row_dilation == 1 && col_dilation == 1 && row_stride == 1 &&
col_stride == 1 && data_format == FORMAT_NHWC &&
(padding == VALID || padding == SAME)) {
// 1x1 filter, so call cublas directly.
const uint64 m = in_batch * in_rows * in_cols;
const uint64 k = patch_depths;
const uint64 n = filter.dim_size(3);
auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
filter.template flat<T>().size());
auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
output->template flat<T>().size());
auto no_transpose = se::blas::Transpose::kNoTranspose;
OP_REQUIRES_OK(
ctx, stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, b_ptr, n,
a_ptr, k, &c_ptr, n,
se::blas::kDefaultComputePrecision));
return;
If the filter has exactly the same height/width as the input, the padding is VALID, and the data format is NHWC, Stream::ThenBlasGemm is likewise called directly.
} else if (patch_rows == in_rows && patch_cols == in_cols &&
!is_grouped_convolution && row_dilation == 1 &&
col_dilation == 1 && padding == VALID &&
data_format == FORMAT_NHWC) {
// The input data and filter have the same height/width, so call cublas
// directly.
const uint64 m = in_batch;
const uint64 k = patch_rows * patch_cols * patch_depths;
const uint64 n = filter.dim_size(3);
auto a_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto b_ptr = AsDeviceMemory(filter.template flat<T>().data(),
filter.template flat<T>().size());
auto c_ptr = AsDeviceMemory(output->template flat<T>().data(),
output->template flat<T>().size());
auto no_transpose = se::blas::Transpose::kNoTranspose;
OP_REQUIRES_OK(
ctx, stream->ThenBlasGemm(no_transpose, no_transpose, n, m, k, b_ptr, n,
a_ptr, k, &c_ptr, n,
se::blas::kDefaultComputePrecision));
return;
}
ComputeInNhwcEnabled decides based on the data type, the GPU compute capability, and the cuDNN version.
Tensor Cores support efficient convolution in the NHWC layout for fp16 on NVIDIA Volta+ GPUs and for tf32 on Ampere+ GPUs; in all other configurations, computing in the NCHW data format is more efficient.
#if GOOGLE_CUDA
const bool compute_in_nhwc = ComputeInNhwcEnabled(DataTypeToEnum<T>::value,
stream, /*is_conv2d=*/true);
#else
// fast NHWC implementation is a CUDA only feature
const bool compute_in_nhwc = false;
#endif
// We only do one directional conversion: NHWC->NCHW. We never convert in the
// other direction. Grappler layout optimizer selects preferred layout and
// adds necessary annotations to the graph.
// TODO(ezhulenev): Convert in other direction for fp16?
const TensorFormat compute_data_format =
(compute_in_nhwc && data_format == FORMAT_NHWC) ? FORMAT_NHWC
: FORMAT_NCHW;
VLOG(3) << "Compute Conv2D with cuDNN:"
<< " data_format=" << ToString(data_format)
<< " compute_data_format=" << ToString(compute_data_format);
Get the output dimension information.
GetExplicitPaddingForDim reads the padding values from explicit_paddings.
GetWindowedOutputSizeVerboseV2 computes the output size and padding values.
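This follows the usual convolution output-size arithmetic. A simplified standalone sketch covering only SAME and VALID (the real function also handles EXPLICIT padding and error checking):
#include <algorithm>
#include <cstdint>

enum class Pad { VALID, SAME };

// Output size and before/after padding for one spatial dimension.
void WindowedOutputSize(int64_t in, int64_t filter, int64_t dilation,
                        int64_t stride, Pad padding, int64_t* out,
                        int64_t* pad_before, int64_t* pad_after) {
  const int64_t effective_filter = (filter - 1) * dilation + 1;
  if (padding == Pad::VALID) {
    *out = (in - effective_filter + stride) / stride;
    *pad_before = *pad_after = 0;
  } else {  // SAME
    *out = (in + stride - 1) / stride;
    const int64_t needed =
        std::max<int64_t>(0, (*out - 1) * stride + effective_filter - in);
    *pad_before = needed / 2;  // the extra pixel, if any, goes after
    *pad_after = needed - *pad_before;
  }
}

// Example: in=224, filter=3, dilation=1, stride=2, SAME
//   -> out=112, pad_before=0, pad_after=1 (asymmetric padding).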
const int64_t out_batch = GetTensorDim(*output, data_format, 'N');
const int64_t out_rows = GetTensorDim(*output, data_format, 'H');
const int64_t out_cols = GetTensorDim(*output, data_format, 'W');
const int64_t out_depths = GetTensorDim(*output, data_format, 'C');
int64_t padding_top = -1, padding_bottom = -1;
int64_t padding_left = -1, padding_right = -1;
if (padding == EXPLICIT) {
GetExplicitPaddingForDim(explicit_paddings, data_format, 'H', &padding_top,
&padding_bottom);
GetExplicitPaddingForDim(explicit_paddings, data_format, 'W', &padding_left,
&padding_right);
}
int64_t out_rows_check, out_cols_check;
Status status = GetWindowedOutputSizeVerboseV2(
in_rows, patch_rows, row_dilation, row_stride, padding, &out_rows_check,
&padding_top, &padding_bottom);
// The status is guaranteed to be OK because we checked the output and padding
// was valid earlier.
TF_CHECK_OK(status);
DCHECK_EQ(out_rows, out_rows_check);
status = GetWindowedOutputSizeVerboseV2(in_cols, patch_cols, col_dilation,
col_stride, padding, &out_cols_check,
&padding_left, &padding_right);
TF_CHECK_OK(status);
DCHECK_EQ(out_cols, out_cols_check);
cuDNN only supports symmetric padding, so when the padding is asymmetric, OpKernelContext::allocate_temp allocates a temporary buffer transformed_input.
The extra amounts to pad on the four sides, input_pad_top, input_pad_bottom, input_pad_left and input_pad_right, are computed so that the remaining padding is symmetric.
PadInput then pads input_param to produce transformed_input.
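A small standalone numeric sketch of this symmetric/asymmetric split: the common (symmetric) part is handed to cuDNN, and only the difference is padded into the input manually (using the pad_before=0, pad_after=1 SAME example from the sketch above):
#include <algorithm>
#include <cstdint>
#include <cstdlib>

// For one dimension with in_rows = 224, padding_top = 0, padding_bottom = 1:
//   common_padding_rows = std::min(0, 1)        = 0   (given to cuDNN)
//   input_pad_top       = 0 - common            = 0
//   input_pad_bottom    = 1 - common            = 1   (padded manually)
//   new_in_rows         = 224 + std::abs(1 - 0) = 225
inline int64_t CommonPadding(int64_t before, int64_t after) {
  return std::min(before, after);
}
inline int64_t PaddedExtent(int64_t in, int64_t before, int64_t after) {
  return in + std::abs(after - before);
}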
const int64_t common_padding_rows = std::min(padding_top, padding_bottom);
const int64_t common_padding_cols = std::min(padding_left, padding_right);
if (padding_top != padding_bottom || padding_left != padding_right) {
// cuDNN only supports padding the same amount on the left and right sides,
// and on the top and bottom sides. So we manually create a new padded
// input tensor such that we can pass it to cuDNN.
VLOG(4) << "Pad input tensor:"
<< " padding_top=" << padding_top
<< " padding_bottom=" << padding_bottom
<< " padding_left=" << padding_left
<< " padding_right=" << padding_right;
// TODO(reedwm): In some cases, we can avoid an allocation even if the two
// padding sides are different. For example, if the input is 2x2, the filter
// is 1x1, the stride is 2, and the padding is (1, 0, 1, 0), the result is
// equivalent to as if the padding is (1, 1, 1, 1). Changing the padding in
// such a way would allow us to avoid the allocation.
Tensor transformed_input;
const int64_t padding_rows_diff = std::abs(padding_bottom - padding_top);
const int64_t padding_cols_diff = std::abs(padding_right - padding_left);
const int64_t new_in_rows = in_rows + padding_rows_diff;
const int64_t new_in_cols = in_cols + padding_cols_diff;
OP_REQUIRES_OK(ctx, ctx->allocate_temp(
DataTypeToEnum<T>::value,
ShapeFromFormat(data_format, in_batch, new_in_rows,
new_in_cols, in_depths),
&transformed_input));
const int64_t input_pad_top = padding_top - common_padding_rows;
const int64_t input_pad_bottom = padding_bottom - common_padding_rows;
const int64_t input_pad_left = padding_left - common_padding_cols;
const int64_t input_pad_right = padding_right - common_padding_cols;
bool in_bounds =
FastBoundsCheck(input_pad_top, std::numeric_limits<int>::max()) &&
FastBoundsCheck(input_pad_bottom, std::numeric_limits<int>::max()) &&
FastBoundsCheck(input_pad_left, std::numeric_limits<int>::max()) &&
FastBoundsCheck(input_pad_right, std::numeric_limits<int>::max());
if (!in_bounds) {
ctx->SetStatus(errors::InvalidArgument("Padding is too large."));
return;
}
functor::PadInput<GPUDevice, T, int, 4>()(
ctx->eigen_device<GPUDevice>(), To32Bit(input_param.tensor<T, 4>()),
{{static_cast<int>(input_pad_top), static_cast<int>(input_pad_left)}},
{{static_cast<int>(input_pad_bottom),
static_cast<int>(input_pad_right)}},
To32Bit(transformed_input.tensor<T, 4>()), data_format, T{});
input = transformed_input;
in_rows = new_in_rows;
in_cols = new_in_cols;
}
If the input format is NHWC but the compute format is NCHW, NHWCToNCHW converts input.
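The conversion is just a permutation of the tensor's axes. A plain-CPU standalone sketch of the index mapping (the real functor performs this shuffle on the GPU through Eigen):
#include <cstdint>
#include <vector>

// Convert a dense NHWC buffer to NCHW layout.
std::vector<float> NhwcToNchw(const std::vector<float>& in, int64_t n,
                              int64_t h, int64_t w, int64_t c) {
  std::vector<float> out(in.size());
  for (int64_t b = 0; b < n; ++b)
    for (int64_t y = 0; y < h; ++y)
      for (int64_t x = 0; x < w; ++x)
        for (int64_t ch = 0; ch < c; ++ch) {
          const int64_t src = ((b * h + y) * w + x) * c + ch;  // NHWC index
          const int64_t dst = ((b * c + ch) * h + y) * w + x;  // NCHW index
          out[dst] = in[src];
        }
  return out;
}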
if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
VLOG(4) << "Convert the input tensor from NHWC to NCHW.";
TensorShape nchw_shape =
ShapeFromFormat(FORMAT_NCHW, in_batch, in_rows, in_cols, in_depths);
if (in_depths > 1) {
Tensor transformed_input;
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
nchw_shape, &transformed_input));
functor::NHWCToNCHW<GPUDevice, T, 4>()(
ctx->eigen_device<GPUDevice>(),
const_cast<const Tensor&>(input).tensor<T, 4>(),
transformed_input.tensor<T, 4>());
input = transformed_input;
} else {
// If depth <= 1, then just reshape.
CHECK(input.CopyFrom(input, nchw_shape));
}
} else {
CHECK(data_format == compute_data_format) // Crash OK
<< "Illegal data and compute format pair:"
<< " data_format=" << ToString(data_format)
<< " compute_data_format=" << ToString(compute_data_format);
}
CHECK(common_padding_rows >= 0 && common_padding_cols >= 0) // Crash OK
<< "Negative row or col paddings: (" << common_padding_rows << ", "
<< common_padding_cols << ")";
constexpr auto kComputeInNHWC =
std::make_tuple(se::dnn::DataLayout::kBatchYXDepth,
se::dnn::FilterLayout::kOutputYXInput);
constexpr auto kComputeInNCHW =
std::make_tuple(se::dnn::DataLayout::kBatchDepthYX,
se::dnn::FilterLayout::kOutputInputYX);
se::dnn::DataLayout compute_data_layout;
se::dnn::FilterLayout filter_layout;
std::tie(compute_data_layout, filter_layout) =
compute_data_format == FORMAT_NHWC ? kComputeInNHWC : kComputeInNCHW;
se::dnn::BatchDescriptor input_desc;
input_desc.set_count(in_batch)
.set_feature_map_count(in_depths)
.set_height(in_rows)
.set_width(in_cols)
.set_layout(compute_data_layout);
se::dnn::BatchDescriptor output_desc;
output_desc.set_count(out_batch)
.set_height(out_rows)
.set_width(out_cols)
.set_feature_map_count(out_depths)
.set_layout(compute_data_layout);
se::dnn::FilterDescriptor filter_desc;
filter_desc.set_input_filter_height(patch_rows)
.set_input_filter_width(patch_cols)
.set_input_feature_map_count(patch_depths)
.set_output_feature_map_count(filter.dim_size(3))
.set_layout(filter_layout);
se::dnn::ConvolutionDescriptor conv_desc;
conv_desc.set_vertical_dilation_rate(row_dilation)
.set_horizontal_dilation_rate(col_dilation)
.set_vertical_filter_stride(row_stride)
.set_horizontal_filter_stride(col_stride)
.set_zero_padding_height(common_padding_rows)
.set_zero_padding_width(common_padding_cols)
.set_group_count(in_depths / patch_depths);
The filter tensor is transformed as well.
Tensor transformed_filter;
const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
<< " to " << ToString(dst_format);
TensorShape dst_shape =
dst_format == FORMAT_OIHW
? TensorShape({filter.dim_size(3), filter.dim_size(2),
filter.dim_size(0), filter.dim_size(1)})
: TensorShape({filter.dim_size(3), filter.dim_size(0),
filter.dim_size(1), filter.dim_size(2)});
TF_RETURN_IF_ERROR(ctx->allocate_temp(DataTypeToEnum<T>::value, dst_shape,
&transformed_filter));
functor::TransformFilter<GPUDevice, T, int, 4>()(
ctx->eigen_device<GPUDevice>(), dst_format,
To32Bit(filter.tensor<T, 4>()),
To32Bit(transformed_filter.tensor<T, 4>()));
return Status::OK();
};
if (compute_data_format == FORMAT_NCHW) {
OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OIHW));
} else if (compute_data_format == FORMAT_NHWC) {
OP_REQUIRES_OK(ctx, transform_filter(FORMAT_OHWI));
} else {
ctx->SetStatus(errors::InvalidArgument("Invalid compute data format: ",
ToString(compute_data_format)));
return;
}
If the output format differs from the compute format, a temporary buffer transformed_output is allocated.
Tensor transformed_output;
if (data_format != compute_data_format) {
VLOG(4) << "Allocate temporary memory for output in compute data format";
OP_REQUIRES_OK(
ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
ShapeFromFormat(compute_data_format, out_batch,
out_rows, out_cols, out_depths),
&transformed_output));
} else {
transformed_output = *output;
}
GetDnnWorkspaceLimit reads the workspace size limit from an environment variable.
ConvAutotuneMap is an AutotuneSingleton; AutotuneSingleton::GetInstance returns the AutotuneMap object.
AutotuneUnfusedConv uses cuDNN autotuning to obtain an AutotuneEntry.
LaunchAutotunedConv executes the ConvRunner.
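As a hedged sketch of what GetDnnWorkspaceLimit is assumed to do: read a megabyte value from the environment and fall back to a default given in bytes (the comment at the call site below notes this unit mismatch). The helper name here is illustrative:
#include <cstdint>
#include <cstdlib>

// Read the scratch-space limit. The environment variable is interpreted as
// megabytes, while the fallback default is already in bytes.
int64_t DnnWorkspaceLimitSketch(const char* env_var, int64_t default_bytes) {
  if (const char* value = std::getenv(env_var)) {
    char* end = nullptr;
    const long long mb = std::strtoll(value, &end, 10);
    if (end != value && mb >= 0) return mb * (int64_t{1} << 20);
  }
  return default_bytes;
}

// Usage mirroring the call below: 4 GB default unless overridden.
//   int64_t limit = DnnWorkspaceLimitSketch("TF_CUDNN_WORKSPACE_LIMIT_IN_MB",
//                                           1LL << 32);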
auto input_ptr = AsDeviceMemory(input.template flat<T>().data(),
input.template flat<T>().size());
auto filter_ptr =
AsDeviceMemory(transformed_filter.template flat<T>().data(),
transformed_filter.template flat<T>().size());
auto output_ptr =
AsDeviceMemory(transformed_output.template flat<T>().data(),
transformed_output.template flat<T>().size());
static int64_t ConvolveScratchSize = GetDnnWorkspaceLimit(
// default value is in bytes despite the name of the environment variable
"TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 // 4GB
);
int device_id = stream->parent()->device_ordinal();
DataType dtype = input.dtype();
ConvParameters conv_parameters = {in_batch, // batch
in_depths, // in_depths
{{in_rows, // in_rows
in_cols}}, // in_cols
compute_data_format, // compute_data_format
out_depths, // out_depths
{{patch_rows, // filter_rows
patch_cols, // filter_cols
patch_depths}}, // filter_depths
{{row_dilation, // dilation_rows
col_dilation}}, // dilation_cols
{{row_stride, // stride_rows
col_stride}}, // stride_cols
{{common_padding_rows, // padding_rows
common_padding_cols}}, // padding_cols
dtype, // tensor datatype
device_id, // device_id
conv_desc.group_count()};
auto entry_or = AutotuneUnfusedConv(
cudnn_use_autotune, ConvAutotuneMap::GetInstance(), conv_parameters, ctx,
se::dnn::ConvolutionKind::FORWARD, input_desc, input_ptr, filter_desc,
filter_ptr, conv_desc, output_desc, output_ptr, ConvolveScratchSize);
OP_REQUIRES_OK(ctx, entry_or.status());
auto autotune_entry = entry_or.ConsumeValueOrDie();
DnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
Status cudnn_launch_status = LaunchAutotunedConv(
autotune_entry, &scratch_allocator, se::dnn::ConvolutionKind::FORWARD,
stream, input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
output_desc, output_ptr);
if (!cudnn_launch_status.ok()) {
ctx->SetStatus(cudnn_launch_status);
return;
}
if (data_format == FORMAT_NHWC && compute_data_format == FORMAT_NCHW) {
VLOG(4) << "Convert the output tensor back from NCHW to NHWC.";
functor::NCHWToNHWC<GPUDevice, T, 4>()(
ctx->eigen_device<GPUDevice>(),
const_cast<const Tensor&>(transformed_output).tensor<T, 4>(),
output->tensor<T, 4>());
}
}
se::dnn::ConvOp implements the concept that LazyOpRunner requires for a ConvRunner.
AutotuneMap is a helper class that looks up the best autotuned config from parameters.
BatchDescriptor describes the dimensions that a layer consumes/produces.
FilterDescriptor describes the filter in the same way.
AutotuneEntry is an autotuning map entry that supports the cuDNN frontend API; ROCm still uses the legacy API and needs an AlgorithmConfig.
ConvParameters uniquely identifies a convolution operation run on a particular device model.
AutotuneMap::Find looks up a config keyed by the parameters.
ScopedAnnotation annotates all activities within the instance's lifetime via the currently registered TraceCollector.
template <typename T>
StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
bool cudnn_use_autotune,
AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
const ConvParameters& conv_parameters, OpKernelContext* ctx,
se::dnn::ConvolutionKind kind, const se::dnn::BatchDescriptor& input_desc,
se::DeviceMemory<T> input_ptr, const se::dnn::FilterDescriptor& filter_desc,
se::DeviceMemory<T> filter_ptr,
const se::dnn::ConvolutionDescriptor& conv_desc,
const se::dnn::BatchDescriptor& output_desc, se::DeviceMemory<T> output_ptr,
int64_t scratch_size_limit) {
AutotuneEntry<se::dnn::ConvOp> autotune_entry;
auto* stream = ctx->op_device_context()->stream();
if (!autotune_map->Find(conv_parameters, &autotune_entry)) {
profiler::ScopedAnnotation annotation("cudnn_autotuning");
TfAllocatorAdapter is an adapter class that wraps a TensorFlow allocator.
RedzoneAllocator allocates a little extra memory at the start/end of every allocation and can later check that this memory was not modified.
WrapRedzoneBestEffort calls RedzoneAllocator::AllocateBytes to allocate the DeviceMemory.
#if GOOGLE_CUDA
se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
stream);
se::RedzoneAllocator rz_allocator(stream, &tf_allocator_adapter,
se::GpuAsmOpts());
// TODO(awpr): second-guess whether it's okay that this profiles
// convolutions on uninitialized memory.
switch (kind) {
case se::dnn::ConvolutionKind::FORWARD:
case se::dnn::ConvolutionKind::FORWARD_BIAS_ACTIVATION:
output_ptr = se::DeviceMemory<T>(
WrapRedzoneBestEffort(&rz_allocator, output_ptr));
break;
case se::dnn::ConvolutionKind::BACKWARD_DATA:
input_ptr = se::DeviceMemory<T>(
WrapRedzoneBestEffort(&rz_allocator, input_ptr));
break;
case se::dnn::ConvolutionKind::BACKWARD_FILTER:
filter_ptr = se::DeviceMemory<T>(
WrapRedzoneBestEffort(&rz_allocator, filter_ptr));
break;
default:
return errors::InvalidArgument(
absl::StrFormat("Unknown ConvolutionKind %d", kind));
}
The launch_func lambda invokes the se::dnn::ConvRunner backend.
AutotuneConvImpl runs the given launch_func and produces a set of AutotuneResults.
LogConvAutotuneResults records them in an AutotuningLog.
const auto element_type = se::dnn::ToDataType<T>::value;
std::vector<std::unique_ptr<const se::dnn::ConvRunner>> runners;
TF_RETURN_IF_ERROR(stream->parent()->GetConvolveRunners(
CudnnUseFrontend(), kind, element_type, element_type, stream,
input_desc, input_ptr, filter_desc, filter_ptr, output_desc, output_ptr,
conv_desc, /*use_fallback=*/false, &rz_allocator, &runners));
auto launch_func =
[&](se::ScratchAllocator* allocator_used,
const std::unique_ptr<const se::dnn::ConvRunner>& runner,
se::dnn::ProfileResult* profile_result) -> Status {
TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes(
runner->GetWorkspaceSize()));
return (*runner)(stream, profile_result, scratch, input_ptr, filter_ptr,
output_ptr);
};
SE_ASSIGN_OR_RETURN(
auto results,
AutotuneConvImpl(ctx, runners, cudnn_use_autotune, launch_func,
scratch_size_limit, rz_allocator));
LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
filter_ptr, output_ptr, input_desc, filter_desc,
output_desc, conv_desc, stream->parent(), results);
Two-level autotuning: the cuDNN frontend supports two engine lists, heuristics and fallback. Heuristics engines are normally faster, so to reduce autotuning time the fallback engines are evaluated only when none of the heuristics engines work.
BestCudnnConvAlgorithm builds an AutotuneEntry from the best result.
// Two-level autotuning: Cudnn frontend supports two engine lists:
// heuristics and fallback. Heuristics engines are normally faster.
// To reduce autotuning time, we evaluate the fallback engines only when
// none of the heuristics engines work.
bool found_working_engine = false;
for (auto& result : results) {
if (!result.has_failure()) {
found_working_engine = true;
break;
}
}
if (!CudnnUseFrontend() || found_working_engine) {
SE_ASSIGN_OR_RETURN(
autotune_entry,
BestCudnnConvAlgorithm<se::dnn::ConvOp>(results, std::move(runners)));
} else {
LOG(WARNING)
<< "None of the algorithms provided by cuDNN frontend heuristics "
"worked; trying fallback algorithms. Conv: "
<< conv_parameters.ToString();
std::vector<std::unique_ptr<const se::dnn::ConvRunner>> fallback_runners;
TF_RETURN_IF_ERROR(stream->parent()->GetConvolveRunners(
CudnnUseFrontend(), kind, element_type, element_type, stream,
input_desc, input_ptr, filter_desc, filter_ptr, output_desc,
output_ptr, conv_desc, /*use_fallback=*/true, &rz_allocator,
&fallback_runners));
SE_ASSIGN_OR_RETURN(
auto fallback_results,
AutotuneConvImpl(ctx, fallback_runners, cudnn_use_autotune,
launch_func, scratch_size_limit, rz_allocator));
LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
filter_ptr, output_ptr, input_desc, filter_desc,
output_desc, conv_desc, stream->parent(),
fallback_results);
SE_ASSIGN_OR_RETURN(autotune_entry,
BestCudnnConvAlgorithm<se::dnn::ConvOp>(
fallback_results, std::move(fallback_runners)));
}
The ROCm implementation:
#elif TENSORFLOW_USE_ROCM
DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);
std::vector<se::dnn::ProfileResult> algorithms;
if (!stream->parent()->GetMIOpenConvolveAlgorithms(
kind, se::dnn::ToDataType<T>::value, stream, input_desc, input_ptr,
filter_desc, filter_ptr, output_desc, output_ptr, conv_desc,
&scratch_allocator, &algorithms)) {
return errors::Unknown(
"Failed to get convolution algorithm. This is probably "
"because MIOpen failed to initialize, so try looking to "
"see if a warning log message was printed above.");
}
std::vector<tensorflow::AutotuneResult> results;
if (algorithms.size() == 1) {
auto profile_result = algorithms[0];
results.emplace_back();
auto& result = results.back();
*result.mutable_algorithm() = profile_result.algorithm().ToProto();
result.set_scratch_bytes(profile_result.scratch_size());
*result.mutable_run_time() = proto_utils::ToDurationProto(
absl::Milliseconds(profile_result.elapsed_time_in_ms()));
} else {
for (auto miopen_algorithm : algorithms) {
auto profile_algorithm = miopen_algorithm.algorithm();
se::dnn::ProfileResult profile_result;
auto miopen_launch_status = stream->ConvolveWithAlgorithm(
kind, input_desc, input_ptr, filter_desc, filter_ptr, output_desc,
output_ptr, conv_desc, &scratch_allocator,
se::dnn::AlgorithmConfig(profile_algorithm,
miopen_algorithm.scratch_size()),
&profile_result);
if (miopen_launch_status.ok() && profile_result.is_valid()) {
results.emplace_back();
auto& result = results.back();
*result.mutable_algorithm() = profile_algorithm.ToProto();
result.set_scratch_bytes(scratch_allocator.TotalByteSize());
*result.mutable_run_time() = proto_utils::ToDurationProto(
absl::Milliseconds(profile_result.elapsed_time_in_ms()));
}
}
}
LogConvAutotuneResults(kind, se::dnn::ToDataType<T>::value, input_ptr,
filter_ptr, output_ptr, input_desc, filter_desc,
output_desc, conv_desc, stream->parent(), results);
SE_ASSIGN_OR_RETURN(auto algo_desc, BestCudnnConvAlgorithm(results));
autotune_entry = AutotuneEntry<se::dnn::ConvOp>(algo_desc);
#endif
AutotuneMap::Insert inserts the convolution parameters and the corresponding AutotuneEntry into the AutotuneMap.
autotune_map->Insert(conv_parameters, autotune_entry);
}
return autotune_entry;
}
// A helper class that looks up the best autotuned config from parameters.
// Due to the noisy nature of autotune, especially with multiple devices, it
// only accepts a config if its margin exceeds a threshold.
// For the same shape configs, if a new best config matches the previous best,
// they get promoted; otherwise, the winner gets demoted. This process stops
// when the winner's score exceeds the threshold.
// In a bad case when two configs are very close to each other and flips
// back and forth randomly, the expected number of experiments before autotune
// settles is O(threshold ^ 2). So we recommend that number of warmup runs
// for any benchmarks.
template <typename Parameters, typename Config>
class AutotuneMap {
private:
// Retrieves the hash code of Parameters class.
struct Hasher {
std::size_t operator()(const Parameters& parameter) const {
return parameter.hash();
}
};
If the score is below the minimum threshold and the maximum autotune count has not been reached, the lookup fails. Under this scheme the number of autotuning runs is therefore not fixed.
public:
bool Find(const Parameters& params, Config* config) const {
mutex_lock lock(mu_);
auto iter = params_config_map_.find(params);
if (iter == params_config_map_.end() ||
(iter->second.score < min_score_threshold_ &&
iter->second.count <= max_autotune_count_)) {
return false;
}
*config = iter->second.config;
return true;
}
The score is an internally defined mechanism: a new entry starts with a score of 1, and min_score_threshold_ defaults to 1, which seems to mean only the old config is kept?
Insert first checks whether the parameters already exist in the params_config_map_ dictionary.
If not, a new entry is created with a default score of 1; otherwise, if the existing score is below the minimum threshold and the autotune count has not reached its limit, the score is decremented when the two configs differ and incremented when they match.
With min_score_threshold_ set to 2, only stable configs are kept.
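To make the promote/demote dynamics concrete, here is a small standalone simulation of just the scoring rule described above (illustrative only, with ints standing in for configs and a single parameter key):
#include <cstdio>
#include <map>

struct Value { int config; int score; int count; };

int main() {
  const int min_score_threshold = 2;  // e.g. TF_AUTOTUNE_THRESHOLD=2
  const int max_autotune_count = 20;
  std::map<int, Value> table;         // params -> value, single key 0

  auto insert = [&](int config) {
    auto it = table.find(0);
    if (it == table.end()) {
      table[0] = {config, 1, 1};                       // create, score 1
    } else if (it->second.score < min_score_threshold &&
               it->second.count <= max_autotune_count) {
      if (it->second.config != config) {
        ++it->second.count;
        if (--it->second.score <= 0) table.erase(it);  // demote / erase
      } else {
        ++it->second.score;                            // promote
        ++it->second.count;
      }
    }
  };

  for (int c : {7, 7, 5, 7}) insert(c);
  std::printf("winner=%d score=%d\n", table[0].config, table[0].score);
  // 7 is created (score 1) and then promoted to 2; the later 5 and 7 no
  // longer change it because its score already reached the threshold.
}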
void Insert(const Parameters& params, const Config& config) {
mutex_lock lock(mu_);
auto iter = params_config_map_.find(params);
int new_score = 0;
if (iter == params_config_map_.end()) {
// Create a new entry if params is new.
VLOG(1) << GetActionSummary("creates", params, config);
params_config_map_.insert(
std::make_pair(params, ValueType{config, 1, 1}));
new_score = 1;
} else if (iter->second.score < min_score_threshold_ &&
iter->second.count <= max_autotune_count_) {
DCHECK_GT(iter->second.score, 0);
if (iter->second.config != config) {
// If it is different from the current winner, demotes the winner.
VLOG(1) << GetActionSummary("demotes", params, config);
new_score = --iter->second.score;
++iter->second.count;
if (new_score <= 0) {
VLOG(1) << GetActionSummary("erases", params, config);
params_config_map_.erase(iter);
}
} else {
// If it is the same as the current winner, promotes the winner.
VLOG(1) << GetActionSummary("promotes", params, config);
new_score = ++iter->second.score;
++iter->second.count;
}
}
If new_score is below the minimum threshold but the global autotune count has exceeded its limit, the config already in the map (or, if absent, the current one) is accepted and its score is set to min_score_threshold_.
if (new_score >= min_score_threshold_) {
VLOG(1) << GetActionSummary("accepts", params, config);
} else if (autotune_global_count_ >= max_autotune_global_count_) {
// The autotuning exceeds the max iteration threshold and we accept the
// the winner if it exists in the map, otherwise we accept the current
// winner.
auto winner = params_config_map_.find(params);
if (winner == params_config_map_.end()) {
VLOG(1) << GetActionSummary("creates", params, config);
for (int i = 0; i < min_score_threshold_; ++i) {
VLOG(1) << GetActionSummary("promotes", params, config);
}
params_config_map_.insert(
std::make_pair(params, ValueType{config, min_score_threshold_, 1}));
} else {
int promotes_times = min_score_threshold_ - winner->second.score;
for (int i = 0; i < promotes_times; ++i) {
VLOG(1) << GetActionSummary("promotes", params, config);
}
winner->second.score = min_score_threshold_;
}
VLOG(1) << GetActionSummary("accepts", params, config);
}
autotune_global_count_++;
}
std::unordered_map<Parameters, Config, Hasher> GetMap() const {
mutex_lock lock(mu_);
std::unordered_map<Parameters, Config, Hasher> map;
for (const auto& entry : params_config_map_) {
map.insert(std::make_pair(entry.first, entry.second.config));
}
return map;
}
// Only for testing
void ClearMap() {
mutex_lock lock(mu_);
params_config_map_.clear();
}
private:
// Underlying data structure of values in the map.
struct ValueType {
Config config;
int32 score;
int32 count;
};
If min_score_threshold_ is left at its default, max_autotune_count_ is at least min_warmup_iterations, and max_autotune_global_count_ is twice max_autotune_count_. For example, with the default threshold of 1 and 10 warmup iterations, max_autotune_count_ = max(5*1*1, 10) = 10 and max_autotune_global_count_ = 20.
AutotuneMap(const std::string& name) : name_(name) {
min_score_threshold_ = 1;
int min_warmup_iterations = 10;
const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
if (threshold_str != nullptr) {
VLOG(1) << "TF_AUTOTUNE_THRESHOLD = " << threshold_str;
strings::safe_strto32(threshold_str, &min_score_threshold_);
}
const char* min_warmup_iteration_str =
getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
if (min_warmup_iteration_str != nullptr) {
strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
}
min_score_threshold_ = std::max(min_score_threshold_, 1);
max_autotune_count_ = std::max(
5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
max_autotune_global_count_ = 2 * max_autotune_count_;
autotune_global_count_ = 0;
}
template <class Group, class Params, class Cfg>
friend class AutotuneSingleton;
std::string GetActionSummary(StringPiece action, const Parameters& params,
const Config& config) {
return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
string(action).c_str(), params.ToString().c_str(),
config.ToString().c_str());
}
mutable mutex mu_;
std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
TF_GUARDED_BY(mu_);
std::string name_;
int32 min_score_threshold_;
int32 max_autotune_count_;
int32 max_autotune_global_count_;
int32 autotune_global_count_;
TF_DISALLOW_COPY_AND_ASSIGN(AutotuneMap);
};
A device may subclass DeviceContext to pass device-specific context to OpKernels.
se::TfAllocatorAdapter is an adapter class that wraps a TensorFlow allocator.
template <typename LaunchFunc, typename Sig>
StatusOr<std::vector<tensorflow::AutotuneResult>> AutotuneConvImpl(
OpKernelContext* ctx,
std::vector<std::unique_ptr<const se::dnn::OpRunner<Sig>>>& runners,
bool actually_do_autotune, const LaunchFunc& launch_func,
size_t scratch_size_limit, const se::RedzoneAllocator& rz_allocator) {
auto* stream = ctx->op_device_context()->stream();
se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
stream);
se::dnn::OpRunner is an abstract class that owns cached state for a particular op (configuration); the main motivation is that recreating a cuDNN backend ExecutionPlan is expensive. Every OpRunner must outlive its parent Stream.
RedzoneAllocator is the redzone-checking allocator described above.
CudnnLegacyConvRunner::ToAlgorithmDesc calls CudnnLegacyConvRunner::MakeAlgorithmDesc to create a dnn::AlgorithmDesc object.
se::dnn::ProfileResult describes the result of a perf experiment.
If actual autotuning is requested, launch_func is called; otherwise the values in profile_result are set manually.
std::vector<tensorflow::AutotuneResult> results;
// TODO(reedwm): Warn if determinism is enabled after autotune is run
for (auto& runner : runners) {
// TODO(zhengxq): profile each algorithm multiple times to better
// accuracy.
se::RedzoneAllocator rz_scratch_allocator(
stream, &tf_allocator_adapter, se::GpuAsmOpts(),
/*memory_limit=*/scratch_size_limit);
DnnScratchAllocator scratch_allocator(scratch_size_limit, ctx);
se::ScratchAllocator* allocator_used =
!RedzoneCheckDisabled()
? static_cast<se::ScratchAllocator*>(&rz_scratch_allocator)
: static_cast<se::ScratchAllocator*>(&scratch_allocator);
SE_ASSIGN_OR_RETURN(auto desc, runner->ToAlgorithmDesc());
se::dnn::ProfileResult profile_result;
Status cudnn_launch_status =
actually_do_autotune
? launch_func(allocator_used, runner, &profile_result)
: OkStatus();
if (!actually_do_autotune) {
// Make the result valid according to `is_valid`.
profile_result.set_algorithm(desc);
profile_result.set_elapsed_time_in_ms(0);
}
Can a runner fail to run? ProfileResult::is_valid checks that an AlgorithmDesc is present and the elapsed time is sane.
RedzoneCheckDisabled reads the TF_DISABLE_RZ_CHECK environment variable.
RedzoneAllocator::TotalAllocatedBytesExcludingRedzones returns the number of bytes allocated.
DnnScratchAllocator::TotalByteSize returns the total scratch bytes allocated.
proto_utils::ToDurationProto converts an absl::Duration to a google::protobuf::Duration.
// We need to make sure the profiling results are one-to-one with the
// "runners". So, we insert dummy results when the execution fails.
results.emplace_back();
auto& result = results.back();
*result.mutable_algorithm() = desc.ToProto();
if (cudnn_launch_status.ok() && profile_result.is_valid()) {
result.set_scratch_bytes(
!RedzoneCheckDisabled()
? rz_scratch_allocator.TotalAllocatedBytesExcludingRedzones()
: scratch_allocator.TotalByteSize());
*result.mutable_run_time() = proto_utils::ToDurationProto(
absl::Milliseconds(profile_result.elapsed_time_in_ms()));
CheckRedzones(rz_scratch_allocator, &result);
CheckRedzones(rz_allocator, &result);
} else {
result.mutable_failure()->set_kind(AutotuneResult::UNKNOWN);
result.mutable_failure()->set_msg(
absl::StrCat("Profiling failure on CUDNN engine ", desc.ToString(),
": ", cudnn_launch_status.ToString()));
}
}
return results;
}
AutotuningLog contains the AutotuneResults together with software and hardware information.
ConvolutionProto records the convolution information.
void LogConvAutotuneResults(se::dnn::ConvolutionKind kind,
se::dnn::DataType element_type,
se::DeviceMemoryBase input_buffer,
se::DeviceMemoryBase filter_buffer,
se::DeviceMemoryBase output_buffer,
const se::dnn::BatchDescriptor& input_desc,
const se::dnn::FilterDescriptor& filter_desc,
const se::dnn::BatchDescriptor& output_desc,
const se::dnn::ConvolutionDescriptor& conv_desc,
se::StreamExecutor* stream_exec,
absl::Span<const AutotuneResult> results) {
AutotuningLog log;
{
ConvolutionProto instr;
instr.set_kind(kind);
*instr.mutable_input() = input_desc.ToProto(element_type);
*instr.mutable_filter() = filter_desc.ToProto(element_type);
*instr.mutable_output() = output_desc.ToProto(element_type);
*instr.mutable_conv_desc() = conv_desc.ToProto();
instr.set_conv_scale(1);
instr.set_side_value_scale(0);
instr.set_input_address(reinterpret_cast<uint64>(input_buffer.opaque()));
instr.set_filter_address(reinterpret_cast<uint64>(filter_buffer.opaque()));
instr.set_output_address(reinterpret_cast<uint64>(output_buffer.opaque()));
log.mutable_instr()->PackFrom(std::move(instr));
}
GetCudnnVersion returns the cuDNN version and GetComputeCapability the GPU compute capability, both queried from the StreamExecutor.
*log.mutable_cudnn_version() = GetCudnnVersion(stream_exec);
*log.mutable_compute_capability() = GetComputeCapability(stream_exec);
log.set_device_pci_bus_id(stream_exec->GetDeviceDescription().pci_bus_id());
{
string blas_version;
if (auto* blas = stream_exec->AsBlas()) {
if (blas->GetVersion(&blas_version).ok()) {
log.set_blas_version(blas_version);
}
}
}
for (const auto& result : results) {
*log.add_results() = result;
}
VLOG(2) << log.DebugString();
Logger::GetSingleton()->LogProto(log);
}
BestCudnnConvAlgorithmIndices finds the index of the fastest algorithm.
AutotuneEntry::FromOpRunners initializes the entry from pre-cached OpRunners, e.g. during autotuning.
TF_ASSIGN_OR_RETURN evaluates an expression and, on success, assigns the value to the variable; otherwise it returns the error status.
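A toy standalone sketch of the control flow behind TF_ASSIGN_OR_RETURN (simplified stand-in types; the real macro works with tensorflow::StatusOr and uses a uniquely named temporary):
#include <optional>
#include <string>
#include <utility>

struct Status {
  bool ok_ = true;
  std::string msg;
  bool ok() const { return ok_; }
};

template <typename T>
struct StatusOr {
  Status status_;
  std::optional<T> value_;
  bool ok() const { return status_.ok(); }
  const Status& status() const { return status_; }
  T value() && { return std::move(*value_); }
};

// Roughly what the macro expands to: evaluate, early-return on error,
// otherwise move the value into the declared variable.
#define ASSIGN_OR_RETURN(lhs, expr)                      \
  auto statusor_tmp = (expr);                            \
  if (!statusor_tmp.ok()) return statusor_tmp.status();  \
  lhs = std::move(statusor_tmp).value();

StatusOr<int> FastestIndex() { return StatusOr<int>{{}, 3}; }

Status Use() {
  ASSIGN_OR_RETURN(int idx, FastestIndex());  // idx == 3 on success
  (void)idx;
  return Status{};
}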
template <typename Op>
StatusOr<AutotuneEntry<Op>> BestCudnnConvAlgorithm(
absl::Span<const AutotuneResult> results,
std::vector<
std::unique_ptr<const se::dnn::OpRunner<typename Op::Signature>>>
runners) {
if (runners.size() != results.size()) {
return errors::Internal(
"Mismatched size of autotune results and runners vectors.");
}
int idx;
int idx_no_scratch;
TF_ASSIGN_OR_RETURN(std::tie(idx, idx_no_scratch),
BestCudnnConvAlgorithmIndices(results));
VLOG(2) << "fastest algorithm: "
<< proto_utils::FromDurationProto(results[idx].run_time())
<< " with algo " << runners[idx]->ToString() << ", workspace bytes "
<< results[idx].scratch_bytes();
return AutotuneEntry<Op>::FromOpRunners(
std::move(runners[idx]), idx_no_scratch == -1 || idx_no_scratch == idx
? nullptr
: std::move(runners[idx_no_scratch]));
}
compare_run_times compares the run times of two results.
StatusOr<std::tuple<int, int>> BestCudnnConvAlgorithmIndices(
absl::Span<const AutotuneResult> results) {
auto compare_run_times = [](const AutotuneResult& lhs,
const AutotuneResult& rhs) {
return proto_utils::FromDurationProto(lhs.run_time()) <
proto_utils::FromDurationProto(rhs.run_time());
};
Iterate over all results and find the index of the one with the smallest run time.
int idx = -1;
int idx_no_scratch = -1;
for (int i = 0; i < results.size(); i++) {
if (!results[i].has_failure()) {
if (OpDeterminismRequired()) {
// When determinism is enabled, choose first working algorithm, and
// don't choose a no_scratch algorithm.
idx = i;
break;
}
if (idx == -1 || compare_run_times(results[i], results[idx])) {
idx = i;
}
if (results[i].scratch_bytes() == 0 &&
(idx_no_scratch == -1 ||
compare_run_times(results[i], results[idx_no_scratch]))) {
idx_no_scratch = i;
}
}
}
If none is found, return an error.
if (idx == -1) {
std::ostringstream msg;
msg << "No algorithm worked! Error messages:";
// TODO(awpr): identify the algorithm as part of this error message, too.
for (const auto& result : results) {
msg << "\n " << result.failure().msg();
}
return errors::NotFound(msg.str());
}
return std::make_tuple(idx, idx_no_scratch);
}
AutotuneEntry::is_algorithm_config checks whether an AlgorithmConfig is used.
AutotuneEntry::GetOpRunners returns the AutotuneEntry::OpRunners struct.
se::dnn::ConvOp::Config holds the convolution's data types and descriptors.
LazyOpRunner::GetOrCreateRunner returns an already-initialized OpRunner if one is available, or creates one.
AllocateScratchOrFallback returns a pointer to the primary OpRunner of runners, together with scratch memory if it can be allocated; otherwise a pointer to the no-scratch fallback runner together with an empty DeviceMemoryBase.
ConvRunner is the runner signature with three inputs.
CudnnExecutionPlanRunner::operator() invokes the cuDNN frontend operation.
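A simplified standalone sketch of the fallback pattern that AllocateScratchOrFallback implements, using toy stand-in types (the real code works with se::ScratchAllocator, se::dnn runners and StatusOr):
#include <cstddef>
#include <optional>
#include <tuple>

struct DeviceMemoryBase { void* ptr = nullptr; size_t size = 0; };
struct Runner { size_t workspace_bytes = 0; };

struct ScratchAllocator {
  size_t limit = 0;
  std::optional<DeviceMemoryBase> Allocate(size_t bytes) {
    if (bytes > limit) return std::nullopt;  // allocation failed
    return DeviceMemoryBase{nullptr, bytes};
  }
};

// Primary runner plus its workspace if the allocation succeeds; otherwise
// the no-scratch fallback (may be null) with an empty buffer.
std::tuple<const Runner*, DeviceMemoryBase> ScratchOrFallback(
    ScratchAllocator* allocator, const Runner* primary,
    const Runner* no_scratch_fallback) {
  if (auto scratch = allocator->Allocate(primary->workspace_bytes)) {
    return {primary, *scratch};
  }
  return {no_scratch_fallback, DeviceMemoryBase{}};
}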
template <typename T>
Status LaunchAutotunedConv(const AutotuneEntry<se::dnn::ConvOp>& autotune_entry,
DnnScratchAllocator* scratch_allocator,
se::dnn::ConvolutionKind kind, se::Stream* stream,
const se::dnn::BatchDescriptor& input_desc,
se::DeviceMemory<T> in_ptr,
const se::dnn::FilterDescriptor& filter_desc,
se::DeviceMemory<T> filter_ptr,
const se::dnn::ConvolutionDescriptor& conv_desc,
const se::dnn::BatchDescriptor& output_desc,
se::DeviceMemory<T> out_ptr) {
if (!autotune_entry.is_algorithm_config()) {
const auto& runners = autotune_entry.GetOpRunners();
se::dnn::DataType element_type = se::dnn::ToDataType<T>::value;
se::dnn::ConvOp::Config config{kind, element_type, element_type,
input_desc, filter_desc, output_desc,
conv_desc};
TF_ASSIGN_OR_RETURN(auto* primary,
runners.primary->GetOrCreateRunner(config, stream));
const se::dnn::ConvRunner* no_scratch_fallback = nullptr;
if (runners.no_scratch_fallback) {
TF_ASSIGN_OR_RETURN(
no_scratch_fallback,
runners.no_scratch_fallback->GetOrCreateRunner(config, stream));
}
TF_ASSIGN_OR_RETURN(auto runner_and_scratch,
AllocateScratchOrFallback<se::dnn::ConvOp::Signature>(
scratch_allocator, primary, no_scratch_fallback));
auto& runner = *std::get<const se::dnn::ConvRunner*>(runner_and_scratch);
return runner(stream, nullptr,
std::get<se::DeviceMemoryBase>(runner_and_scratch), in_ptr,
filter_ptr, out_ptr);
Otherwise Stream::ConvolveWithAlgorithm is called:
} else {
return stream->ConvolveWithAlgorithm(
kind, input_desc, in_ptr, filter_desc, filter_ptr, output_desc, out_ptr,
conv_desc, scratch_allocator, autotune_entry.GetAlgorithmConfig(),
nullptr);
}
}