[NeoML] Add MultiheadAttentionPerformerLayer
Signed-off-by: Kirill Golikov <[email protected]>
favorart committed Jan 18, 2024
1 parent 7a71670 commit 3b1282b
Showing 11 changed files with 1,272 additions and 15 deletions.
97 changes: 97 additions & 0 deletions NeoML/include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h
@@ -0,0 +1,97 @@
/* Copyright © 2023-2024 ABBYY
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>

namespace NeoML {

struct CFavorAttentionDesc;

// Computes FAVOR normalized self-attention.
// https://arxiv.org/pdf/2009.14794.pdf.
//
// Inputs: query, key, value
// Approximates the equation: Output ~~ softmax( query * ( key )^T / normalizer ) * value
//
// output
// ^
// |
// +---------------+
// | F A V O R | <-- projection matrix
// | Attention | (random features)
// +---------------+
// ^ ^ ^
// | | |
// query key value
//
class NEOML_API CFavorAttentionPerformerLayer : public CBaseLayer {
NEOML_DNN_LAYER( CFavorAttentionPerformerLayer )
public:
// Possible activation kernel transformations
enum class TAKernel { SoftMax = 0, ReLU = 1 };
// Layer inputs enumeration
enum TInput { TI_Q = 0, TI_K = 1, TI_V = 2 };
// The method used to construct the random projection matrix Q
enum class TRandomMaxrixStructMode {
QMatrix, // QR-factorization of a random 2D-tensor
GivensRotations // Givens random rotations
};
static constexpr TRandomMaxrixStructMode StructMode = TRandomMaxrixStructMode::GivensRotations;
// To normalize the random matrix Q, use either the sum of its rows' norms or simply sqrt(dim)
static constexpr bool Scaling = false;

// Constructor
CFavorAttentionPerformerLayer( IMathEngine& mathEngine, const char* name = nullptr );

// The number of columns in the projection matrix, if it is used, or 0 otherwise
// Set to 0 if the projection matrix should not be used
int GetRandomFeaturesCount() const { return randomFeaturesCount; }
void SetRandomFeaturesCount( int randomFeaturesCount );
// The activation kernel transformation to use
int GetActivationKernel() const { return static_cast<int>( activation ); }
void SetActivationKernel( int activation );
// Whether auto-regressive (causal) attention is used
bool GetCausal() const { return causal; }
void SetCausal( bool causal );

void Serialize( CArchive& archive ) override;

protected:
~CFavorAttentionPerformerLayer();

// Create output blobs using the input blobs
void Reshape() override;
// One step of a forward pass
void RunOnce() override;
// One step of a backward pass
void BackwardOnce() override;

private:
// The number of random features to be used
// For SoftMax it must be > 0, so that the random projection matrix is applied
int randomFeaturesCount = 0;
TAKernel activation = TAKernel::SoftMax; // Activation Kernel type
bool causal = false; // Auto-regressive attention or not
CFavorAttentionDesc* desc = nullptr; // Favor attention description

void destroyFavorAttentionDesc();
};

NEOML_API CLayerWrapper<CFavorAttentionPerformerLayer> FavorAttentionPerformer(
int randomFeaturesCount, int activation, bool causal );

} // namespace NeoML
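The layer above approximates softmax attention with random features (FAVOR+), i.e. Output ≈ softmax( Q * K^T / normalizer ) * V, without materializing the full attention matrix. Below is a minimal usage sketch; the math engine call, the Source/Sink helpers and all layer names are illustrative assumptions, not part of this commit.

// Minimal sketch (assumed setup): wiring CFavorAttentionPerformerLayer into a CDnn
IMathEngine& mathEngine = GetDefaultCpuMathEngine(); // assumed way to obtain a math engine
CRandom random( 0x123 );
CDnn dnn( random, mathEngine );

// The three inputs of the layer: query, key and value
CPtr<CSourceLayer> query = Source( dnn, "query" );
CPtr<CSourceLayer> key = Source( dnn, "key" );
CPtr<CSourceLayer> value = Source( dnn, "value" );

CPtr<CFavorAttentionPerformerLayer> favor =
	new CFavorAttentionPerformerLayer( mathEngine, "favor" );
favor->SetRandomFeaturesCount( 64 ); // > 0 enables the random projection matrix (required for SoftMax)
favor->SetActivationKernel( 0 );     // 0 == TAKernel::SoftMax, 1 == TAKernel::ReLU
favor->SetCausal( false );           // no auto-regressive masking
dnn.AddLayer( *favor );
favor->Connect( CFavorAttentionPerformerLayer::TI_Q, *query );
favor->Connect( CFavorAttentionPerformerLayer::TI_K, *key );
favor->Connect( CFavorAttentionPerformerLayer::TI_V, *value );

CPtr<CSinkLayer> result = Sink( favor.Ptr(), "result" );

Once the query/key/value blobs are set on the source layers, dnn.RunOnce() computes the approximate attention and the output can be read from the sink layer.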
101 changes: 101 additions & 0 deletions NeoML/include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h
@@ -0,0 +1,101 @@
/* Copyright © 2023-2024 ABBYY
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>
#include <NeoML/Dnn/Layers/CompositeLayer.h>

namespace NeoML {

// Multihead Self Attention Performer
// https://arxiv.org/pdf/2009.14794.pdf
// Implementation of multiheaded FAVOR-attention & FAVOR-self-attention layers.
//
// +----------------------+--------+-------------------------------------------------------
// | Parameter | Type | Description
// +----------------------+--------+-------------------------------------------------------
// | HiddenSize | int | size of trainable matrices, output dim of hidden layer
// | HeadCount | int | number of heads to repeat the same attention structure
// | OutputSize | int | size of the output
// | ActivationKernel | int | activation (ReLU or SoftMax) kernel transformation
// | RandomFeaturesCount | int | projection matrix columns count, or 0 if it isn't used
// | Casual | bool | whether auto-regressive attention is used
// +----------------------+--------+-------------------------------------------------------
class NEOML_API CMultiheadAttentionPerformerLayer : public CCompositeLayer {
NEOML_DNN_LAYER( CMultiheadAttentionPerformerLayer )
public:
explicit CMultiheadAttentionPerformerLayer( IMathEngine& mathEngine );

// Activation kernel type: SoftMax(=0), ReLU(=1)
// SoftMax by default
int GetActivationKernel() const { return activationKernel; }
void SetActivationKernel( int activationKernel, int randomFeaturesCount, bool casual );
int GetRandomFeaturesCount() const { return randomFeaturesCount; }
bool GetCasual() const { return casual; }

// The number of heads in attention
// GetHiddenSize() must be a multiple of this value
// By default attention consists of 1 head
int GetHeadCount() const { return headCount; }
void SetHeadCount( int headCount );

// The size of trainable matrices
// Must be a multiple of GetHeadCount()
int GetHiddenSize() const { return hiddenSize; }
void SetHiddenSize( int hiddenSize );

// The size of output
int GetOutputSize() const { return outputSize; }
void SetOutputSize( int outputSize );

void Serialize( CArchive& archive ) override;

// Recreates the layer if forceRebuild is true or it doesn't contain sublayers
void Rebuild( bool forceRebuild );

protected:
void Reshape() override;

private:
// FAVOR+ attention settings
int activationKernel; // Activation kernel transformation
int randomFeaturesCount; // Projection matrix columns count, used if > 0
bool casual; // Whether attention is auto-regressive

// The number of heads
int headCount;
// The size of the trainable matrices
int hiddenSize;
// Output size
int outputSize;

// Layer inputs enumeration
enum TInputs { I_Q = 0, I_K = 1, I_V = 2 };

bool isCreated() const { return HasLayer( "Q" ); }
void create();

CBaseLayer* multiplyInputByMatrixWeights( int size, const char* name, TInputs input );
CBaseLayer* multiplyByMatrixWeights( CBaseLayer* input, int width );
CBaseLayer* prepareQ( CBaseLayer* input );
CBaseLayer* prepareKV( CBaseLayer* input, bool isK );
CBaseLayer* prepareOutput( CBaseLayer* input );
};

NEOML_API CLayerWrapper<CMultiheadAttentionPerformerLayer> MultiheadAttentionPerformer(
int headCount, int hiddenSize, int outputSize, int activationKernel, int randomFeaturesCount, bool casual );

} // namespace NeoML
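CMultiheadAttentionPerformerLayer is a composite layer: it builds the Q/K/V projections, the FAVOR attention block and the output projection internally. A configuration sketch, reusing the assumed dnn, mathEngine and query/key/value layers from the previous sketch (sizes and names are illustrative):

// Minimal sketch (assumed setup): configuring the multi-head Performer attention
CPtr<CMultiheadAttentionPerformerLayer> attention =
	new CMultiheadAttentionPerformerLayer( mathEngine );
attention->SetName( "attention" );
attention->SetHeadCount( 8 );
attention->SetHiddenSize( 512 ); // must be a multiple of the head count
attention->SetOutputSize( 512 );
// kernel: 0 == SoftMax, 1 == ReLU; 64 random features; no auto-regressive masking
attention->SetActivationKernel( 0, 64, false );
dnn.AddLayer( *attention );

// Three inputs: query (I_Q = 0), key (I_K = 1), value (I_V = 2)
attention->Connect( 0, *query );
attention->Connect( 1, *key );
attention->Connect( 2, *value );

The MultiheadAttentionPerformer() wrapper declared at the end of the header builds the same layer in a single call, taking the parameters listed in the table above.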
2 changes: 2 additions & 0 deletions NeoML/include/NeoML/NeoML.h
@@ -116,6 +116,7 @@ limitations under the License.
#include <NeoML/Dnn/Layers/DepthToSpaceLayer.h>
#include <NeoML/Dnn/Layers/DotProductLayer.h>
#include <NeoML/Dnn/Layers/EnumBinarizationLayer.h>
#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>
#include <NeoML/Dnn/Layers/FocalLossLayer.h>
#include <NeoML/Dnn/Layers/FullyConnectedSourceLayer.h>
#include <NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h>
@@ -131,6 +132,7 @@ limitations under the License.
#include <NeoML/Dnn/Layers/LrnLayer.h>
#include <NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h>
#include <NeoML/Dnn/Layers/ModelWrapperLayer.h>
#include <NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h>
#include <NeoML/Dnn/Layers/MultiHingeLossLayer.h>
#include <NeoML/Dnn/Layers/PositionalEmbeddingLayer.h>
#include <NeoML/Dnn/Layers/PrecisionRecallLayer.h>
4 changes: 4 additions & 0 deletions NeoML/src/CMakeLists.txt
@@ -118,6 +118,7 @@ set(NeoML_SOURCES
Dnn/Layers/DotProductLayer.cpp
Dnn/Layers/EnumBinarizationLayer.cpp
Dnn/Layers/FocalLossLayer.cpp
Dnn/Layers/FavorAttentionPerformerLayer.cpp
Dnn/Layers/FullyConnectedSourceLayer.cpp
Dnn/Layers/GlobalMaxPoolingLayer.cpp
Dnn/Layers/GlobalSumPoolingLayer.cpp
@@ -133,6 +134,7 @@
Dnn/Layers/MaxOverTimePoolingLayer.cpp
Dnn/Layers/MobileNetV3BlockLayer.cpp
Dnn/Layers/ModelWrapperLayer.cpp
Dnn/Layers/MultiheadAttentionPerformerLayer.cpp
Dnn/Layers/ObjectNormalizationLayer.cpp
Dnn/Layers/Onnx/OnnxEltwiseLayer.cpp
Dnn/Layers/Onnx/OnnxCastLayer.cpp
@@ -379,6 +381,7 @@ set(NeoML_HEADERS
../include/NeoML/Dnn/Layers/DotProductLayer.h
../include/NeoML/Dnn/Layers/EnumBinarizationLayer.h
../include/NeoML/Dnn/Layers/FocalLossLayer.h
../include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h
../include/NeoML/Dnn/Layers/FullyConnectedSourceLayer.h
../include/NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h
../include/NeoML/Dnn/Layers/GlobalSumPoolingLayer.h
@@ -394,6 +397,7 @@
../include/NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h
../include/NeoML/Dnn/Layers/MobileNetV3BlockLayer.h
../include/NeoML/Dnn/Layers/ModelWrapperLayer.h
../include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h
../include/NeoML/Dnn/Layers/MultiHingeLossLayer.h
../include/NeoML/Dnn/Layers/ObjectNormalizationLayer.h
../include/NeoML/Dnn/Layers/Onnx/OnnxEltwiseLayer.h
6 changes: 5 additions & 1 deletion NeoML/src/Dnn/Dnn.cpp
@@ -1,4 +1,4 @@
/* Copyright © 2017-2023 ABBYY
/* Copyright © 2017-2024 ABBYY
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -73,6 +73,7 @@ limitations under the License.
#include <NeoML/Dnn/Layers/DepthToSpaceLayer.h>
#include <NeoML/Dnn/Layers/DotProductLayer.h>
#include <NeoML/Dnn/Layers/EnumBinarizationLayer.h>
#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>
#include <NeoML/Dnn/Layers/FocalLossLayer.h>
#include <NeoML/Dnn/Layers/FullyConnectedSourceLayer.h>
#include <NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h>
@@ -89,6 +90,7 @@ limitations under the License.
#include <NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h>
#include <NeoML/Dnn/Layers/MobileNetV3BlockLayer.h>
#include <NeoML/Dnn/Layers/ModelWrapperLayer.h>
#include <NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h>
#include <NeoML/Dnn/Layers/MultiHingeLossLayer.h>
#include <NeoML/Dnn/Layers/PositionalEmbeddingLayer.h>
#include <NeoML/Dnn/Layers/PrecisionRecallLayer.h>
@@ -351,6 +353,7 @@ REGISTER_NEOML_LAYER( CCtcDecodingLayer, "FmlCnnCtcDecodingLayer" )
REGISTER_NEOML_LAYER( CCtcLossLayer, "FmlCnnCtcLossLayer" )
REGISTER_NEOML_LAYER( CDotProductLayer, "FmlCnnDotProductLayer" )
REGISTER_NEOML_LAYER( CEnumBinarizationLayer, "FmlCnnEnumBinarizationLayer" )
REGISTER_NEOML_LAYER( CFavorAttentionPerformerLayer, "NeoMLDnnFavorAttentionPerformerLayer" )
REGISTER_NEOML_LAYER( CGlobalMaxPoolingLayer, "FmlCnnGlobalMaxPoolingLayer" )
REGISTER_NEOML_LAYER( CGrnLayer, "NeoMLDnnGrnLayer" )
REGISTER_NEOML_LAYER( CGruLayer, "FmlCnnGruLayer" )
@@ -362,6 +365,7 @@ REGISTER_NEOML_LAYER( CLoraFullyConnectedLayer, "NeoMLDnnLoraFullyConnectedLayer
REGISTER_NEOML_LAYER( CMaxOverTimePoolingLayer, "FmlCnnMaxOverTimePoolingLayer" )
REGISTER_NEOML_LAYER( CMobileNetV3PreSEBlockLayer, "NeoMLDnnMobileNetV3PreSEBlockLayer" )
REGISTER_NEOML_LAYER( CMobileNetV3PostSEBlockLayer, "NeoMLDnnMobileNetV3PostSEBlockLayer" )
REGISTER_NEOML_LAYER( CMultiheadAttentionPerformerLayer, "NeoMLDnnMultiheadAttentionPerformerLayer" )
REGISTER_NEOML_LAYER( CMultiHingeLossLayer, "FmlCnnMultyHingeLossLayer" )
REGISTER_NEOML_LAYER( CMultiSquaredHingeLossLayer, "FmlCnnMultySquaredHingeLossLayer" )
REGISTER_NEOML_LAYER( CPixelToImageLayer, "FmlCnnPixelToImageLayerClass" )
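REGISTER_NEOML_LAYER binds each class to the archive name it is stored under, which is what lets a serialized network containing the new layers be restored by name. A serialization sketch, reusing the assumed dnn from the sketches above and treating the file name and archive construction as illustrative assumptions:

// Minimal sketch (assumed setup): storing and loading a network with the new layers
{
	CArchiveFile file( "performer.dnnarch", CArchive::store );
	CArchive archive( &file, CArchive::store );
	dnn.Serialize( archive ); // each layer is written under its registered name
}
{
	CArchiveFile file( "performer.dnnarch", CArchive::load );
	CArchive archive( &file, CArchive::load );
	dnn.Serialize( archive ); // registered names are used to re-create the layers
}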
