Skip to content

Commit

Permalink
[XPU] add new kernels, refine conv2d and bug fix (#4817) (#4926)
Browse files — browse the repository at this point in the history
  • Loading branch information
zhupengyang authored Dec 9, 2020
1 parent f7bdb71 commit 68e64e0
Show file tree
Hide file tree
Showing 18 changed files with 1,082 additions and 167 deletions.
398 changes: 355 additions & 43 deletions lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions lite/kernels/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ else()
add_kernel(transpose_compute_xpu XPU basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps})
add_kernel(density_prior_box_compute XPU basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps})
add_kernel(prior_box_compute_xpu XPU basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps})
add_kernel(interpolate_compute_xpu XPU basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps})
add_kernel(box_coder_compute XPU basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps})
add_kernel(split_compute_xpu XPU basic SRCS split_compute.cc DEPS ${lite_kernel_deps})

# extra
add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
Expand All @@ -45,6 +48,7 @@ else()
add_kernel(sequence_unpad_compute_xpu XPU extra SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps})
add_kernel(lrn_compute_xpu XPU extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps})
add_kernel(topk_compute_xpu XPU extra SRCS topk_compute.cc DEPS ${lite_kernel_deps})
add_kernel(im2sequence_compute_xpu XPU extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps})
add_kernel(unstack_compute_xpu XPU extra SRCS unstack_compute.cc DEPS ${lite_kernel_deps})

# extra(fused kernel)
Expand Down
82 changes: 33 additions & 49 deletions lite/kernels/xpu/__xpu__conv2d_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "lite/kernels/xpu/__xpu__conv2d_compute.h"
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

Expand All @@ -27,70 +28,53 @@ void XPUConv2dCompute::Run() {
auto& ctx = this->ctx_->As<XPUContext>();

auto& input_dims = param.Input->dims();
auto& filter_dims = param.Filter->dims();
auto& filter_dims = param.filter_dims;
int batch = static_cast<int>(input_dims[0]);
int img_c = static_cast<int>(input_dims[1]);
int img_h = static_cast<int>(input_dims[2]);
int img_w = static_cast<int>(input_dims[3]);
int filter_num = static_cast<int>(filter_dims[0]);
int win_h = static_cast<int>(filter_dims[2]);
int win_w = static_cast<int>(filter_dims[3]);

auto paddings = *param.paddings;
auto dilations = *param.dilations;
int stride_h = param.strides[0];
int stride_w = param.strides[1];
int paddings_h = paddings[0];
int paddings_w = paddings[1];
int dilations_h = dilations[0];
int dilations_w = dilations[1];

std::string filter_type = param.filter_type;
int groups = param.groups;

int act_type = (param.act_type == "relu")
? xdnn::Activation_t::RELU
: xdnn::Activation_t::LINEAR; // -1 means not init
int act_type = param.act_type;
float* output_max = param.OutputMax->mutable_data<float>(TARGET(kXPU));
float* output = param.Output->mutable_data<float>(TARGET(kXPU));
const auto* bias = param.Bias ? param.Bias->data<float>() : nullptr;
const auto* branch = param.Branch ? param.Branch->data<float>() : nullptr;
const float* input_max =
param.InputMax ? param.InputMax->data<float>() : nullptr;
float* output_max = param.OutputMax
? param.OutputMax->mutable_data<float>(TARGET(kXPU))
: nullptr;
float* output = param.Output->mutable_data<float>(TARGET(kXPU));

// TODO(luohang): now support for resnet50 first
CHECK_EQ(groups, 1);
CHECK_EQ(filter_type, "int16");

xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type);
int r = xdnn::conv2d_forward_int16<float, int16_t, float, float>(
ctx.GetRawContext(), /* context */
batch, /* batch */
img_c, /* input_c */
img_h, /* input_h */
img_w, /* input_w */
filter_num, /* num_filter */
win_h, /* kernel_h */
win_w, /* kernel_w */
stride_h, /* stride_h */
stride_w, /* stride_w */
paddings_h, /* pad_h */
paddings_w, /* pad_w */
dilations_h, /* dilation_h */
dilations_w, /* dilation_w */
groups, /* group */
param.Input->data<float>(), /* input bottom */
param.Filter->data<int16_t>(), /* filter weight */
output, /* output top */
bias, /* bias */
branch, /* branch */
act, /* act type */
input_max, /* max_image_ptr */
param.FilterMax->data<float>(), /* max_filter_ptr */
output_max /* max_result_ptr */);

if (act_type == 5) {
act.leaky_alpha = param.act_param;
CHECK(act.leaky_alpha >= 0.0001 && act.leaky_alpha <= 10);
} else if (act_type == 15) {
act.hard_sigmoid_slope = param.act_param;
}
int r = xdnn::conv2d_fusion<float, int16_t, float, int16_t>(
ctx.GetRawContext(),
param.Input->data<float>(),
param.Filter->data<int16_t>(),
output,
batch,
img_c,
img_h,
img_w,
filter_num,
std::vector<int>{win_h, win_w},
param.strides,
paddings,
dilations,
groups,
input_max,
param.FilterMax->data<float>(),
output_max,
true,
bias,
branch,
act);
CHECK_EQ(r, 0);
}

Expand Down
33 changes: 19 additions & 14 deletions lite/kernels/xpu/batch_norm_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include "lite/kernels/xpu/batch_norm_compute.h"
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

Expand All @@ -24,23 +25,27 @@ namespace xpu {
void BatchNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();

float epsilon = param.epsilon;
auto& x_dims = param.x->dims();
CHECK_LE(x_dims.size(), 4);
std::vector<int> x_shape(4, 1);
for (int i = 0; i < x_dims.size(); i++) {
x_shape[i] = x_dims[i];
}

int r = xdnn::batch_norm_infer_forward(
ctx.GetRawContext(), /* context */
epsilon, /* epsilon */
x_dims[0], /* img_n */
x_dims[1], /* img_c */
x_dims[2], /* img_h */
x_dims[3], /* img_w */
param.x->data<float>(), /* img_gm */
param.y->mutable_data<float>(TARGET(kXPU)), /* out_gm */
param.scale->data<float>(), /* scale_gm */
param.bias->data<float>(), /* bias_gm */
param.mean->data<float>(), /* mean_gm */
param.variance->data<float>() /* var__gm */);
int r =
xdnn::batch_norm_infer_forward(ctx.GetRawContext(),
epsilon,
x_shape[0],
x_shape[1],
x_shape[2],
x_shape[3],
param.x->data<float>(),
param.y->mutable_data<float>(TARGET(kXPU)),
param.scale->data<float>(),
param.bias->data<float>(),
param.mean->data<float>(),
param.variance->data<float>());
CHECK_EQ(r, 0);
}

Expand Down
101 changes: 101 additions & 0 deletions lite/kernels/xpu/box_coder_compute.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/box_coder_compute.h"
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

void BoxCoderCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();

auto prior_box_var_size = 0;
auto* prior_box = param.prior_box;
auto* prior_box_var = param.prior_box_var;
if (prior_box_var) {
prior_box_var_size = prior_box_var->dims().size();
}
auto* target_box = param.target_box;
auto* output_box = param.proposals;
std::vector<float> variance = param.variance;
const int axis = param.axis;
std::string code_type = param.code_type;
bool normalized = param.box_normalized;

auto row = target_box->dims()[0];
auto col = prior_box->dims()[0];
if (code_type == "decode_center_size") {
col = target_box->dims()[1];
}
auto len = prior_box->dims()[1];
output_box->Resize({row, col, len});
auto* output = output_box->mutable_data<float>(TARGET(kXPU));

if (code_type == "encode_center_size") {
int r = xdnn::box_coder_encode(ctx.GetRawContext(),
prior_box->data<float>(),
prior_box_var->data<float>(),
target_box->data<float>(),
row,
col,
len,
normalized,
prior_box_var_size,
variance.data(),
variance.size(),
output);
CHECK_EQ(r, 0);
} else if (code_type == "decode_center_size") {
int r = xdnn::box_coder_decode(ctx.GetRawContext(),
prior_box->data<float>(),
prior_box_var->data<float>(),
target_box->data<float>(),
row,
col,
len,
normalized,
prior_box_var_size,
variance.data(),
variance.size(),
axis,
output);
CHECK_EQ(r, 0);
} else {
LOG(FATAL) << "box_coder don't support this code_type: " << code_type;
}
}

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle

// Register the XPU float/NCHW implementation of the `box_coder` op.
// PriorBoxVar is optional at runtime even though it is bound here.
REGISTER_LITE_KERNEL(box_coder,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::BoxCoderCompute,
def)
.BindInput("PriorBox", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("PriorBoxVar", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("TargetBox", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("OutputBox", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
36 changes: 36 additions & 0 deletions lite/kernels/xpu/box_coder_compute.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

// XPU kernel for the `box_coder` op: encodes target boxes against prior
// (anchor) boxes, or decodes deltas back into boxes, via xdnn.
class BoxCoderCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::BoxCoderParam;

// Dispatches to xdnn::box_coder_encode/decode based on param.code_type.
virtual void Run();

virtual ~BoxCoderCompute() = default;
};

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
22 changes: 14 additions & 8 deletions lite/kernels/xpu/dropout_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,20 @@ namespace xpu {
void DropoutCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  // Inference-time dropout is a pure scaling:
  //   "upscale_in_train"   -> identity (scaling already applied in training)
  //   "downgrade_in_infer" -> multiply by (1 - dropout_prob)
  float scale = 1.0f;
  if (param.dropout_implementation == "upscale_in_train") {
    scale = 1.0f;
  } else {
    scale = 1.0f - param.dropout_prob;
  }
  // NOTE(review): trailing args are (beta, bias_after_scale) per xdnn::scale
  // usage elsewhere — confirm against the xdnn header.
  int r =
      xdnn::scale(ctx.GetRawContext(), /* context */
                  param.x->numel(),
                  scale,
                  0.0f,
                  0,
                  param.x->data<float>(), /* src */
                  param.output->mutable_data<float>(TARGET(kXPU))); /* dst */
  CHECK_EQ(r, 0);
}

Expand Down
Loading

0 comments on commit 68e64e0

Please sign in to comment.