Skip to content

Commit

Permalink
[GPU] Added convolution_gpu_b_fs_zyx_fsv16_imad shape agnostic kernel (
Browse files Browse the repository at this point in the history
  • Loading branch information
Lyamin-Roman authored Nov 6, 2024
1 parent b6fe65f commit d8befa1
Show file tree
Hide file tree
Showing 9 changed files with 233 additions and 47 deletions.
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ struct convolution_impl : typed_primitive_impl_ocl<convolution> {
if (is_dynamic()) {
auto& kernel_selector = kernel_selector_t::Instance();
auto kernel_impl = kernel_selector.GetImplementation(_kernel_data.kernelName);

const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
_kernel_data.params = std::make_shared<kernel_params_t>(get_kernel_params(*impl_params, true));

kernel_impl->GetUpdateDispatchDataFunc(_kernel_data);
}
}
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/ocl/convolution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ struct ConvolutionImplementationManager : public ImplementationManager {
static const std::vector<format::type> supported_dyn_formats = {
format::bfyx,
format::bfzyx,
format::b_fs_yx_fsv16
format::b_fs_yx_fsv16,
format::b_fs_zyx_fsv16
};

if (!one_of(input_fmt.value, supported_dyn_formats) || !one_of(output_fmt.value, supported_dyn_formats))
Expand Down
25 changes: 7 additions & 18 deletions src/plugins/intel_gpu/src/graph/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1796,15 +1796,9 @@ void program::save(cldnn::BinaryOutputBuffer& ob) const {
for (auto& impl_id : impl_ids) {
std::string type_name = get_node_ptr(impl_id)->get_selected_impl()->m_manager->get_type_info().name;
ob << type_name;
if (get_node_ptr(impl_id)->get_selected_impl()->is_onednn()) {
ob << true;
auto params = get_node_ptr(impl_id)->get_kernel_impl_params();
ob.setKernelImplParams(params.get());
ob << get_node_ptr(impl_id)->selected_impl;
} else {
ob << false;
ob << get_node_ptr(impl_id)->selected_impl;
}
auto params = get_node_ptr(impl_id)->get_kernel_impl_params();
ob.setKernelImplParams(params.get());
ob << get_node_ptr(impl_id)->selected_impl;
ob << get_node_ptr(impl_id)->get_selected_impl()->get_cached_kernel_ids(kernels_cache);
}
}
Expand Down Expand Up @@ -1930,15 +1924,10 @@ void program::load(cldnn::BinaryInputBuffer& ib) {
ib >> type_name;
ov::DiscreteTypeInfo type(type_name.c_str());
auto impl_manager = p_node.type()->get(type);
bool is_onednn;
ib >> is_onednn;
if (is_onednn) {
auto params = p_node.get_kernel_impl_params();
ib.setKernelImplParams(params.get());
ib >> p_node.selected_impl;
} else {
ib >> p_node.selected_impl;
}

auto params = p_node.get_kernel_impl_params();
ib.setKernelImplParams(params.get());
ib >> p_node.selected_impl;

p_node.selected_impl->m_manager = impl_manager.get();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
REQD_SUB_GROUP_SIZE(SIMD)
__attribute__((reqd_work_group_size(1, 1, FEATURE_SLM_SPLIT * SIMD)))
KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE *conv_input,
__global OUTPUT_TYPE *output,
const __global FILTER_TYPE *weights
Expand Down Expand Up @@ -606,11 +607,15 @@ KERNEL(convolution_gpu_b_fs_zyx_fsv16_imad)(
__attribute__((opencl_unroll_hint(OUT_BLOCK_WIDTH)))
for (uint ow = 0; ow < OUT_BLOCK_WIDTH; ow++) {

#if !IS_DYNAMIC
#if OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0
if (out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
break;
#endif

#else
if (OUTPUT_SIZE_X % OUT_BLOCK_WIDTH != 0 && out_x + OUT_BLOCK_WIDTH > OUTPUT_SIZE_X && ow >= OUTPUT_SIZE_X % OUT_BLOCK_WIDTH)
break;
#endif
if (out_f_g < FILTER_OFM_NUM) {
output[dst_index + ow * FSV + oh * OUTPUT_Y_PITCH * FSV + od * OUTPUT_Z_PITCH * FSV] = result[ofb][od][oh][ow];
}
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/kernel_selector/jitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
if (_tensor.GetLayout() == DataLayout::bf || _tensor.GetLayout() == DataLayout::bfyx ||
_tensor.GetLayout() == DataLayout::bfzyx || _tensor.GetLayout() == DataLayout::bfwzyx ||
_tensor.GetLayout() == DataLayout::bfuwzyx || _tensor.GetLayout() == DataLayout::bfvuwzyx ||
_tensor.GetLayout() == DataLayout::b_fs_yx_fsv16 || _tensor.GetLayout() == DataLayout::b_fs_yx_fsv32) {
_tensor.GetLayout() == DataLayout::b_fs_yx_fsv16 || _tensor.GetLayout() == DataLayout::b_fs_yx_fsv32 ||
_tensor.GetLayout() == DataLayout::b_fs_zyx_fsv16) {
definitions.push_back({_name + "_X_PITCH", "1"});
definitions.push_back({_name + "_Y_PITCH", dims_padded.x()});
definitions.push_back({_name + "_Z_PITCH", toVectorMulString({dims_padded.x(), dims_padded.y()})});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,10 @@ namespace kernel_selector {

Convolution_kernel_b_fs_zyx_fsv16_imad::BlockParams
Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params& params) const {
size_t max_block_width = getOutBlock_X(params.outputs[0].X().v, params.stride.x, params.filterSize.x, params.dilation.x);
size_t max_block_width = 1;
if (!params.outputs[0].X().is_dynamic) {
max_block_width = getOutBlock_X(params.outputs[0].X().v, params.stride.x, params.filterSize.x, params.dilation.x);
}
size_t max_in_block_width = (max_block_width - 1) * params.stride.x + (params.filterSize.x - 1) * params.dilation.x + 1;

size_t block_width = max_block_width;
Expand Down Expand Up @@ -90,7 +93,9 @@ Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params&
size_t max_slm_split = params.engineInfo.maxWorkGroupSize / simd;

// TGLU exceptions related to SLM usage
if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.computeUnitsCount == 96) {
if (params.is_shape_agnostic) {
max_slm_split = 2;
} else if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.computeUnitsCount == 96) {
bool split_exception_1 = params.outputs[0].X().v == 3 && params.outputs[0].Y().v == 3 && params.outputs[0].Z().v == 1
&& params.outputs[0].Feature().v == 512;
bool split_exception_2 = params.outputs[0].X().v == 5 && params.outputs[0].Y().v == 5 && params.outputs[0].Z().v == 1
Expand Down Expand Up @@ -118,13 +123,16 @@ Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params&
}
}

size_t max_d = params.outputs[0].Z().is_dynamic ? 1 : 16;
size_t max_h = params.outputs[0].Y().is_dynamic ? 1 : 16;

for (size_t split = 1; split <= max_slm_split; split *= 2) {
for (size_t temp_block_features = simd; temp_block_features <= simd * 2; temp_block_features += simd) {
for (size_t d = 1; d < 16; ++d) {
if (params.outputs[0].Z().v % d)
for (size_t d = 1; d < max_d; ++d) {
if (d != 1 && params.outputs[0].Z().v % d)
continue;
for (size_t h = 1; h < 16; ++h) {
if (params.outputs[0].Y().v % h)
for (size_t h = 1; h < max_h; ++h) {
if (h != 1 && params.outputs[0].Y().v % h)
continue;

bool c_ifm_mul = CeilDiv(params.weights.IFM().v, fsv) % split == 0;
Expand Down Expand Up @@ -174,6 +182,10 @@ Convolution_kernel_b_fs_zyx_fsv16_imad::GetBlockParams(const convolution_params&
}

float Convolution_kernel_b_fs_zyx_fsv16_imad::EstimateBlockParamsRatio(const convolution_params& params, const BlockParams& block) const {
if (params.has_dynamic_outputs()) {
return -10.f;
}

float occupancy_by_logic_size = static_cast<float>(params.outputs[0].LogicalSize() / static_cast<size_t>(params.engineInfo.maxThreadsPerDevice));
bool increase_max_reg_pressure = occupancy_by_logic_size >= 595.f;
bool twice_increase_max_reg_pressure = occupancy_by_logic_size >= 595.f * 2.f;
Expand Down Expand Up @@ -373,6 +385,7 @@ ParamsKey Convolution_kernel_b_fs_zyx_fsv16_imad::GetSupportedKey() const {
k.EnableQuantization(QuantizationType::SYMMETRIC);
k.EnableQuantization(QuantizationType::ASYMMETRIC_DATA);
k.EnableDilation();
k.EnableDynamicShapesSupport();
return k;
}

Expand Down Expand Up @@ -450,10 +463,15 @@ JitConstants Convolution_kernel_b_fs_zyx_fsv16_imad::GetJitConstants(const convo

// Builds the default dispatch data for this kernel: block parameters are selected by
// GetBlockParams() and the work sizes are then derived from them by
// CalcDispatchDataWithBlockParams(). The second (auto-tune index) argument is ignored.
ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_zyx_fsv16_imad::SetDefault(const convolution_params& params,
                                                                                       int /*autoTuneIndex*/) const {
    return CalcDispatchDataWithBlockParams(params, GetBlockParams(params));
}

ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_zyx_fsv16_imad::CalcDispatchDataWithBlockParams(const convolution_params& params,
const BlockParams& block_params) const {
DispatchData dispatchData;
const auto& output = params.outputs[0];
const auto& weights = params.weights;
auto block_params = GetBlockParams(params);

dispatchData.gws[0] = CeilDiv(output.X().v, block_params.output_block_width);
dispatchData.gws[1] = CeilDiv(output.Y().v, block_params.output_block_height) * CeilDiv(output.Z().v, block_params.output_block_depth);
Expand All @@ -466,17 +484,24 @@ ConvolutionKernelBase::DispatchData Convolution_kernel_b_fs_zyx_fsv16_imad::SetD

dispatchData.cldnnStyle = {0, 0, 0, 0, 0};
dispatchData.gemmStyle = {0, 0, 0, 0, 0, 0};

dispatchData.blockParams = { block_params.output_block_width, block_params.output_block_height,
block_params.output_block_depth, block_params.output_block_features,
block_params.input_block_width, block_params.input_block_height,
block_params.input_block_depth, block_params.feature_slm_split };
return dispatchData;
} // SetDefault
}

KernelsPriority Convolution_kernel_b_fs_zyx_fsv16_imad::GetKernelsPriority(const Params& params) const {
const auto& p = static_cast<const convolution_params&>(params);

if (static_cast<float>(p.weights.IFM().v) / static_cast<float>(Align(p.weights.IFM().v, fsv)) < 0.5f)
if (!p.is_shape_agnostic) {
if (static_cast<float>(p.weights.IFM().v) / static_cast<float>(Align(p.weights.IFM().v, fsv)) < 0.5f)
return FORCE_PRIORITY_4;
else
return FORCE_PRIORITY_2;
} else {
return FORCE_PRIORITY_4;
else
return FORCE_PRIORITY_2;
}
}

bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params) const {
Expand Down Expand Up @@ -507,4 +532,23 @@ bool Convolution_kernel_b_fs_zyx_fsv16_imad::Validate(const Params& params) cons

return true;
}

// Installs the dispatch-data updater on `kd`.
//
// Block parameters are computed once, here, from kd.params (through SetDefault) and copied
// into the stored callback. Each invocation of the callback keeps those block parameters
// fixed and only recomputes the global/local work sizes, the skip-execution flag and the
// internal buffer description for the parameters it is given.
//
// The callback captures `this`, so this kernel object has to outlive the callback.
void Convolution_kernel_b_fs_zyx_fsv16_imad::GetUpdateDispatchDataFunc(KernelData& kd) const {
    const auto& build_params = static_cast<const convolution_params&>(*kd.params);
    const BlockParams cached_blocks = SetDefault(build_params).blockParams;

    kd.update_dispatch_data_func = [this, cached_blocks](const Params& params, KernelData& kernel_data) {
        const auto& runtime_params = static_cast<const convolution_params&>(params);
        const DispatchData updated = CalcDispatchDataWithBlockParams(runtime_params, cached_blocks);
        OPENVINO_ASSERT(kernel_data.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func");

        auto& kernel = kernel_data.kernels[0];
        kernel.params.workGroups.global = updated.gws;
        kernel.params.workGroups.local = updated.lws;
        kernel.skip_execution = KernelData::SkipKernelExecution(runtime_params);

        // Reset the internal buffer list to a single entry sized like input 0.
        // NOTE(review): presumably the scratch buffer for the padded input copy
        // (NeedPaddedInput() returns true for this kernel) - confirm.
        kernel_data.internalBufferSizes.clear();
        kernel_data.internalBufferSizes.push_back(runtime_params.inputs[0].PhysicalSizeInBytes());
        kernel_data.internalBufferDataType = runtime_params.inputs[0].GetDType();
    };
}

} // namespace kernel_selector
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ namespace kernel_selector {
class Convolution_kernel_b_fs_zyx_fsv16_imad : public ConvolutionKernelBase {
public:
using Parent = ConvolutionKernelBase;
using BlockParams = DispatchData::BlockParams;
Convolution_kernel_b_fs_zyx_fsv16_imad() : ConvolutionKernelBase("convolution_gpu_b_fs_zyx_fsv16_imad") {}
virtual ~Convolution_kernel_b_fs_zyx_fsv16_imad() {}

Expand All @@ -24,6 +25,7 @@ class Convolution_kernel_b_fs_zyx_fsv16_imad : public ConvolutionKernelBase {
bool Validate(const Params& params) const override;
JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override;
DispatchData SetDefault(const convolution_params& params, int autoTuneIndex = -1) const override;
void GetUpdateDispatchDataFunc(KernelData& kd) const override;
bool NeedPaddedInput() const override { return true; }
WeightsLayout GetPreferredWeightsLayout(const convolution_params& p) const override {
return p.groups > 1 ? WeightsLayout::g_os_is_zyx_osv16_isv16 : WeightsLayout::os_is_zyx_osv16_isv16;
Expand All @@ -35,24 +37,11 @@ class Convolution_kernel_b_fs_zyx_fsv16_imad : public ConvolutionKernelBase {
FusedOpType::ACTIVATION };
}

struct BlockParams {
size_t output_block_width;
size_t output_block_height;
size_t output_block_depth;

size_t output_block_features;

size_t input_block_width;
size_t input_block_height;
size_t input_block_depth;

size_t feature_slm_split;
};

BlockParams GetBlockParams(const convolution_params& params) const;
float EstimateBlockParamsRatio(const convolution_params& params, const BlockParams& block) const;
float EstimateRegPressure(const convolution_params& params, const BlockParams& block) const;
float EstimateOccupancy(const convolution_params& params, const BlockParams& block) const;
float EstimateSLMUsage(const convolution_params& params, const BlockParams& block) const;
DispatchData CalcDispatchDataWithBlockParams(const convolution_params& params, const BlockParams& block_params) const;
};
} // namespace kernel_selector
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,23 @@ class ConvolutionKernelBase : public WeightBiasKernelBase {
size_t globalWorkSizeDZ;
};

// Blocking configuration stored alongside the dispatch sizes in DispatchData.
//
// Convolution_kernel_b_fs_zyx_fsv16_imad fills this positionally with a braced
// initializer list (width, height, depth, features, input width/height/depth,
// slm split), so the member order below must not be changed.
struct BlockParams {
// Output block extents; that kernel divides the output X / Y / Z sizes by these
// (rounding up) when it computes the global work size.
size_t output_block_width;
size_t output_block_height;
size_t output_block_depth;

// NOTE(review): presumably the number of output features handled per block - confirm.
size_t output_block_features;

// NOTE(review): presumably the input extents needed to produce one output block - confirm.
size_t input_block_width;
size_t input_block_height;
size_t input_block_depth;

// NOTE(review): presumably the split factor behind the kernel's FEATURE_SLM_SPLIT - confirm.
size_t feature_slm_split;
};

CLDNNStyle cldnnStyle;
GEMMStyle gemmStyle;
BlockParams blockParams;
};

std::string GetAutoTuneOptions(int autoTuneIndex) const;
Expand Down
Loading

0 comments on commit d8befa1

Please sign in to comment.