Skip to content

Commit

Permalink
[XPU] add new kernels, refine conv2d and bug fix (#4817) (#4926)
Browse files — browse the repository at this point in the history
  • Loading branch information
zhupengyang authored Dec 9, 2020
1 parent f7bdb71 commit 68e64e0
Show file tree
Hide file tree
Showing 18 changed files with 1,082 additions and 167 deletions.
398 changes: 355 additions & 43 deletions lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions lite/kernels/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ else()
add_kernel(transpose_compute_xpu XPU basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps})
add_kernel(density_prior_box_compute XPU basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps})
add_kernel(prior_box_compute_xpu XPU basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps})
add_kernel(interpolate_compute_xpu XPU basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps})
add_kernel(box_coder_compute XPU basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps})
add_kernel(split_compute_xpu XPU basic SRCS split_compute.cc DEPS ${lite_kernel_deps})

# extra
add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
Expand All @@ -45,6 +48,7 @@ else()
add_kernel(sequence_unpad_compute_xpu XPU extra SRCS sequence_unpad_compute.cc DEPS ${lite_kernel_deps})
add_kernel(lrn_compute_xpu XPU extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps})
add_kernel(topk_compute_xpu XPU extra SRCS topk_compute.cc DEPS ${lite_kernel_deps})
add_kernel(im2sequence_compute_xpu XPU extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps})
add_kernel(unstack_compute_xpu XPU extra SRCS unstack_compute.cc DEPS ${lite_kernel_deps})

# extra(fused kernel)
Expand Down
82 changes: 33 additions & 49 deletions lite/kernels/xpu/__xpu__conv2d_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "lite/kernels/xpu/__xpu__conv2d_compute.h"
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

Expand All @@ -27,70 +28,53 @@ void XPUConv2dCompute::Run() {
auto& ctx = this->ctx_->As<XPUContext>();

auto& input_dims = param.Input->dims();
auto& filter_dims = param.Filter->dims();
auto& filter_dims = param.filter_dims;
int batch = static_cast<int>(input_dims[0]);
int img_c = static_cast<int>(input_dims[1]);
int img_h = static_cast<int>(input_dims[2]);
int img_w = static_cast<int>(input_dims[3]);
int filter_num = static_cast<int>(filter_dims[0]);
int win_h = static_cast<int>(filter_dims[2]);
int win_w = static_cast<int>(filter_dims[3]);

auto paddings = *param.paddings;
auto dilations = *param.dilations;
int stride_h = param.strides[0];
int stride_w = param.strides[1];
int paddings_h = paddings[0];
int paddings_w = paddings[1];
int dilations_h = dilations[0];
int dilations_w = dilations[1];

std::string filter_type = param.filter_type;
int groups = param.groups;

int act_type = (param.act_type == "relu")
? xdnn::Activation_t::RELU
: xdnn::Activation_t::LINEAR; // -1 means not init
int act_type = param.act_type;
float* output_max = param.OutputMax->mutable_data<float>(TARGET(kXPU));
float* output = param.Output->mutable_data<float>(TARGET(kXPU));
const auto* bias = param.Bias ? param.Bias->data<float>() : nullptr;
const auto* branch = param.Branch ? param.Branch->data<float>() : nullptr;
const float* input_max =
param.InputMax ? param.InputMax->data<float>() : nullptr;
float* output_max = param.OutputMax
? param.OutputMax->mutable_data<float>(TARGET(kXPU))
: nullptr;
float* output = param.Output->mutable_data<float>(TARGET(kXPU));

// TODO(luohang): now support for resnet50 first
CHECK_EQ(groups, 1);
CHECK_EQ(filter_type, "int16");

xdnn::Activation_t act((xdnn::Activation_t::act_enum)act_type);
int r = xdnn::conv2d_forward_int16<float, int16_t, float, float>(
ctx.GetRawContext(), /* context */
batch, /* batch */
img_c, /* input_c */
img_h, /* input_h */
img_w, /* input_w */
filter_num, /* num_filter */
win_h, /* kernel_h */
win_w, /* kernel_w */
stride_h, /* stride_h */
stride_w, /* stride_w */
paddings_h, /* pad_h */
paddings_w, /* pad_w */
dilations_h, /* dilation_h */
dilations_w, /* dilation_w */
groups, /* group */
param.Input->data<float>(), /* input bottom */
param.Filter->data<int16_t>(), /* filter weight */
output, /* output top */
bias, /* bias */
branch, /* branch */
act, /* act type */
input_max, /* max_image_ptr */
param.FilterMax->data<float>(), /* max_filter_ptr */
output_max /* max_result_ptr */);

if (act_type == 5) {
act.leaky_alpha = param.act_param;
CHECK(act.leaky_alpha >= 0.0001 && act.leaky_alpha <= 10);
} else if (act_type == 15) {
act.hard_sigmoid_slope = param.act_param;
}
int r = xdnn::conv2d_fusion<float, int16_t, float, int16_t>(
ctx.GetRawContext(),
param.Input->data<float>(),
param.Filter->data<int16_t>(),
output,
batch,
img_c,
img_h,
img_w,
filter_num,
std::vector<int>{win_h, win_w},
param.strides,
paddings,
dilations,
groups,
input_max,
param.FilterMax->data<float>(),
output_max,
true,
bias,
branch,
act);
CHECK_EQ(r, 0);
}

Expand Down
33 changes: 19 additions & 14 deletions lite/kernels/xpu/batch_norm_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include "lite/kernels/xpu/batch_norm_compute.h"
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

Expand All @@ -24,23 +25,27 @@ namespace xpu {
void BatchNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();

float epsilon = param.epsilon;
auto& x_dims = param.x->dims();
CHECK_LE(x_dims.size(), 4);
std::vector<int> x_shape(4, 1);
for (int i = 0; i < x_dims.size(); i++) {
x_shape[i] = x_dims[i];
}

int r = xdnn::batch_norm_infer_forward(
ctx.GetRawContext(), /* context */
epsilon, /* epsilon */
x_dims[0], /* img_n */
x_dims[1], /* img_c */
x_dims[2], /* img_h */
x_dims[3], /* img_w */
param.x->data<float>(), /* img_gm */
param.y->mutable_data<float>(TARGET(kXPU)), /* out_gm */
param.scale->data<float>(), /* scale_gm */
param.bias->data<float>(), /* bias_gm */
param.mean->data<float>(), /* mean_gm */
param.variance->data<float>() /* var__gm */);
int r =
xdnn::batch_norm_infer_forward(ctx.GetRawContext(),
epsilon,
x_shape[0],
x_shape[1],
x_shape[2],
x_shape[3],
param.x->data<float>(),
param.y->mutable_data<float>(TARGET(kXPU)),
param.scale->data<float>(),
param.bias->data<float>(),
param.mean->data<float>(),
param.variance->data<float>());
CHECK_EQ(r, 0);
}

Expand Down
101 changes: 101 additions & 0 deletions lite/kernels/xpu/box_coder_compute.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/box_coder_compute.h"
#include <string>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

void BoxCoderCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();

auto prior_box_var_size = 0;
auto* prior_box = param.prior_box;
auto* prior_box_var = param.prior_box_var;
if (prior_box_var) {
prior_box_var_size = prior_box_var->dims().size();
}
auto* target_box = param.target_box;
auto* output_box = param.proposals;
std::vector<float> variance = param.variance;
const int axis = param.axis;
std::string code_type = param.code_type;
bool normalized = param.box_normalized;

auto row = target_box->dims()[0];
auto col = prior_box->dims()[0];
if (code_type == "decode_center_size") {
col = target_box->dims()[1];
}
auto len = prior_box->dims()[1];
output_box->Resize({row, col, len});
auto* output = output_box->mutable_data<float>(TARGET(kXPU));

if (code_type == "encode_center_size") {
int r = xdnn::box_coder_encode(ctx.GetRawContext(),
prior_box->data<float>(),
prior_box_var->data<float>(),
target_box->data<float>(),
row,
col,
len,
normalized,
prior_box_var_size,
variance.data(),
variance.size(),
output);
CHECK_EQ(r, 0);
} else if (code_type == "decode_center_size") {
int r = xdnn::box_coder_decode(ctx.GetRawContext(),
prior_box->data<float>(),
prior_box_var->data<float>(),
target_box->data<float>(),
row,
col,
len,
normalized,
prior_box_var_size,
variance.data(),
variance.size(),
axis,
output);
CHECK_EQ(r, 0);
} else {
LOG(FATAL) << "box_coder don't support this code_type: " << code_type;
}
}

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle

// Register the XPU float/NCHW implementation of the `box_coder` op.
// PriorBoxVar is optional at runtime even though it is bound here.
REGISTER_LITE_KERNEL(box_coder,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::BoxCoderCompute,
def)
.BindInput("PriorBox", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("PriorBoxVar", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindInput("TargetBox", {LiteType::GetTensorTy(TARGET(kXPU))})
.BindOutput("OutputBox", {LiteType::GetTensorTy(TARGET(kXPU))})
.Finalize();
36 changes: 36 additions & 0 deletions lite/kernels/xpu/box_coder_compute.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

// XPU kernel for the `box_coder` op: encodes target boxes against prior
// (anchor) boxes, or decodes deltas back into boxes, via xdnn.
class BoxCoderCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
public:
using param_t = operators::BoxCoderParam;

// Dispatches to xdnn::box_coder_encode/decode based on param.code_type.
virtual void Run();

virtual ~BoxCoderCompute() = default;
};

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
22 changes: 14 additions & 8 deletions lite/kernels/xpu/dropout_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,20 @@ namespace xpu {
void DropoutCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  // Inference-time dropout is a pure scaling:
  //   "upscale_in_train"   -> identity (scaling already applied in training)
  //   "downgrade_in_infer" -> multiply by (1 - dropout_prob)
  float scale = 1.0f;
  if (param.dropout_implementation == "upscale_in_train") {
    scale = 1.0f;
  } else {
    scale = 1.0f - param.dropout_prob;
  }
  // NOTE(review): trailing args are (beta, bias_after_scale) per xdnn::scale
  // usage elsewhere — confirm against the xdnn header.
  int r =
      xdnn::scale(ctx.GetRawContext(), /* context */
                  param.x->numel(),
                  scale,
                  0.0f,
                  0,
                  param.x->data<float>(), /* src */
                  param.output->mutable_data<float>(TARGET(kXPU))); /* dst */
  CHECK_EQ(r, 0);
}

Expand Down
Loading

0 comments on commit 68e64e0

Please sign in to comment.