[NeoML] Add MultiheadAttentionPerformerLayer
Signed-off-by: Kirill Golikov <[email protected]>
favorart committed Jan 18, 2024
1 parent 7a71670 commit 3b1282b
Showing 11 changed files with 1,272 additions and 15 deletions.
97 changes: 97 additions & 0 deletions NeoML/include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h
@@ -0,0 +1,97 @@
/* Copyright © 2023-2024 ABBYY
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>

namespace NeoML {

struct CFavorAttentionDesc;

// Computes FAVOR normalized self-attention.
// https://arxiv.org/pdf/2009.14794.pdf.
//
// Inputs: query, key, value
// Approximates the equation: Output ~~ softmax( query * ( key )^T / normalizer ) * value
//
// output
// ^
// |
// +---------------+
// | F A V O R | <-- projection matrix
// | Attention | (random features)
// +---------------+
// ^ ^ ^
// | | |
// query key value
//
class NEOML_API CFavorAttentionPerformerLayer : public CBaseLayer {
NEOML_DNN_LAYER( CFavorAttentionPerformerLayer )
public:
// Possible activation kernel transformations
enum class TAKernel { SoftMax = 0, ReLU = 1 };
// Layer inputs enumeration
enum TInput { TI_Q = 0, TI_K = 1, TI_V = 2 };
// The method used to construct the random projection matrix Q
enum class TRandomMaxrixStructMode {
QMatrix, // QR-factorization of a random 2D-tensor
GivensRotations // Givens random rotations
};
static constexpr TRandomMaxrixStructMode StructMode = TRandomMaxrixStructMode::GivensRotations;
// To normalize the random matrix Q, use either the sum of its rows' norms or simply sqrt(dim)
static constexpr bool Scaling = false;

// Constructor
CFavorAttentionPerformerLayer( IMathEngine& mathEngine, const char* name = nullptr );

// The number of columns in the projection matrix, if it is used, or 0 otherwise
// Set to 0 if the projection matrix should not be used
int GetRandomFeaturesCount() const { return randomFeaturesCount; }
void SetRandomFeaturesCount( int randomFeaturesCount );
// The activation kernel transformation to use
int GetActivationKernel() const { return static_cast<int>( activation ); }
void SetActivationKernel( int activation );
// Whether auto-regressive (causal) attention is used
bool GetCausal() const { return causal; }
void SetCausal( bool causal );

void Serialize( CArchive& archive ) override;

protected:
~CFavorAttentionPerformerLayer();

// Create output blobs using the input blobs
void Reshape() override;
// One step of a forward pass
void RunOnce() override;
// One step of a backward pass
void BackwardOnce() override;

private:
// The number of random features to be used
// For SoftMax it must be > 0, so that the random projection matrix is applied
int randomFeaturesCount = 0;
TAKernel activation = TAKernel::SoftMax; // Activation Kernel type
bool causal = false; // Auto-regressive attention or not
CFavorAttentionDesc* desc = nullptr; // Favor attention description

void destroyFavorAttentionDesc();
};

NEOML_API CLayerWrapper<CFavorAttentionPerformerLayer> FavorAttentionPerformer(
int randomFeaturesCount, int activation, bool causal );

} // namespace NeoML
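The layer above approximates softmax attention with random features (FAVOR+), i.e. Output ≈ softmax( Q * K^T / normalizer ) * V, without materializing the full attention matrix. Below is a minimal usage sketch; the math engine call, the Source/Sink helpers and all layer names are illustrative assumptions, not part of this commit.

// Minimal sketch (assumed setup): wiring CFavorAttentionPerformerLayer into a CDnn
IMathEngine& mathEngine = GetDefaultCpuMathEngine(); // assumed way to obtain a math engine
CRandom random( 0x123 );
CDnn dnn( random, mathEngine );

// The three inputs of the layer: query, key and value
CPtr<CSourceLayer> query = Source( dnn, "query" );
CPtr<CSourceLayer> key = Source( dnn, "key" );
CPtr<CSourceLayer> value = Source( dnn, "value" );

CPtr<CFavorAttentionPerformerLayer> favor =
	new CFavorAttentionPerformerLayer( mathEngine, "favor" );
favor->SetRandomFeaturesCount( 64 ); // > 0 enables the random projection matrix (required for SoftMax)
favor->SetActivationKernel( 0 );     // 0 == TAKernel::SoftMax, 1 == TAKernel::ReLU
favor->SetCausal( false );           // no auto-regressive masking
dnn.AddLayer( *favor );
favor->Connect( CFavorAttentionPerformerLayer::TI_Q, *query );
favor->Connect( CFavorAttentionPerformerLayer::TI_K, *key );
favor->Connect( CFavorAttentionPerformerLayer::TI_V, *value );

CPtr<CSinkLayer> result = Sink( favor.Ptr(), "result" );

Once the query/key/value blobs are set on the source layers, dnn.RunOnce() computes the approximate attention and the output can be read from the sink layer.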
101 changes: 101 additions & 0 deletions NeoML/include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h
@@ -0,0 +1,101 @@
/* Copyright © 2023-2024 ABBYY
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>
#include <NeoML/Dnn/Layers/CompositeLayer.h>

namespace NeoML {

// Multihead Self Attention Performer
// https://arxiv.org/pdf/2009.14794.pdf
// Implementation of multiheaded FAVOR-attention & FAVOR-self-attention layers.
//
// +----------------------+--------+-------------------------------------------------------
// | Parameter | Type | Description
// +----------------------+--------+-------------------------------------------------------
// | HiddenSize | int | size of trainable matrices, output dim of hidden layer
// | HeadCount | int | number of heads to repeat the same attention structure
// | OutputSize | int | size of the output
// | ActivationKernel | int | activation (ReLU or SoftMax) kernel transformation
// | RandomFeaturesCount | int | projection matrix columns count, or 0 if it isn't used
// | Casual | bool | whether auto-regressive attention is used
// +----------------------+--------+-------------------------------------------------------
class NEOML_API CMultiheadAttentionPerformerLayer : public CCompositeLayer {
NEOML_DNN_LAYER( CMultiheadAttentionPerformerLayer )
public:
explicit CMultiheadAttentionPerformerLayer( IMathEngine& mathEngine );

// Activation kernel type: SoftMax(=0), ReLU(=1)
// SoftMax by default
int GetActivationKernel() const { return activationKernel; }
void SetActivationKernel( int activationKernel, int randomFeaturesCount, bool casual );
int GetRandomFeaturesCount() const { return randomFeaturesCount; }
bool GetCasual() const { return casual; }

// The number of heads in attention
// GetHiddenSize() must be a multiple of this value
// By default attention consists of 1 head
int GetHeadCount() const { return headCount; }
void SetHeadCount( int headCount );

// The size of trainable matrices
// Must be a multiple of GetHeadCount()
int GetHiddenSize() const { return hiddenSize; }
void SetHiddenSize( int hiddenSize );

// The size of output
int GetOutputSize() const { return outputSize; }
void SetOutputSize( int outputSize );

void Serialize( CArchive& archive ) override;

// Recreates the layer if forceRebuild is true or it doesn't contain sublayers
void Rebuild( bool forceRebuild );

protected:
void Reshape() override;

private:
// FAVOR+ attention settings
int activationKernel; // Activation kernel transformation
int randomFeaturesCount; // Projection matrix columns count, used if > 0
bool casual; // Whether attention is auto-regressive

// The number of heads
int headCount;
// The size of the trainable matrices
int hiddenSize;
// Output size
int outputSize;

// Layer inputs enumeration
enum TInputs { I_Q = 0, I_K = 1, I_V = 2 };

bool isCreated() const { return HasLayer( "Q" ); }
void create();

CBaseLayer* multiplyInputByMatrixWeights( int size, const char* name, TInputs input );
CBaseLayer* multiplyByMatrixWeights( CBaseLayer* input, int width );
CBaseLayer* prepareQ( CBaseLayer* input );
CBaseLayer* prepareKV( CBaseLayer* input, bool isK );
CBaseLayer* prepareOutput( CBaseLayer* input );
};

NEOML_API CLayerWrapper<CMultiheadAttentionPerformerLayer> MultiheadAttentionPerformer(
int headCount, int hiddenSize, int outputSize, int activationKernel, int randomFeaturesCount, bool casual );

} // namespace NeoML
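CMultiheadAttentionPerformerLayer is a composite layer: it builds the Q/K/V projections, the FAVOR attention block and the output projection internally. A configuration sketch, reusing the assumed dnn, mathEngine and query/key/value layers from the previous sketch (sizes and names are illustrative):

// Minimal sketch (assumed setup): configuring the multi-head Performer attention
CPtr<CMultiheadAttentionPerformerLayer> attention =
	new CMultiheadAttentionPerformerLayer( mathEngine );
attention->SetName( "attention" );
attention->SetHeadCount( 8 );
attention->SetHiddenSize( 512 ); // must be a multiple of the head count
attention->SetOutputSize( 512 );
// kernel: 0 == SoftMax, 1 == ReLU; 64 random features; no auto-regressive masking
attention->SetActivationKernel( 0, 64, false );
dnn.AddLayer( *attention );

// Three inputs: query (I_Q = 0), key (I_K = 1), value (I_V = 2)
attention->Connect( 0, *query );
attention->Connect( 1, *key );
attention->Connect( 2, *value );

The MultiheadAttentionPerformer() wrapper declared at the end of the header builds the same layer in a single call, taking the parameters listed in the table above.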
2 changes: 2 additions & 0 deletions NeoML/include/NeoML/NeoML.h
@@ -116,6 +116,7 @@ limitations under the License.
#include <NeoML/Dnn/Layers/DepthToSpaceLayer.h>
#include <NeoML/Dnn/Layers/DotProductLayer.h>
#include <NeoML/Dnn/Layers/EnumBinarizationLayer.h>
#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>
#include <NeoML/Dnn/Layers/FocalLossLayer.h>
#include <NeoML/Dnn/Layers/FullyConnectedSourceLayer.h>
#include <NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h>
@@ -131,6 +132,7 @@ limitations under the License.
#include <NeoML/Dnn/Layers/LrnLayer.h>
#include <NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h>
#include <NeoML/Dnn/Layers/ModelWrapperLayer.h>
#include <NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h>
#include <NeoML/Dnn/Layers/MultiHingeLossLayer.h>
#include <NeoML/Dnn/Layers/PositionalEmbeddingLayer.h>
#include <NeoML/Dnn/Layers/PrecisionRecallLayer.h>
4 changes: 4 additions & 0 deletions NeoML/src/CMakeLists.txt
@@ -118,6 +118,7 @@ set(NeoML_SOURCES
Dnn/Layers/DotProductLayer.cpp
Dnn/Layers/EnumBinarizationLayer.cpp
Dnn/Layers/FocalLossLayer.cpp
Dnn/Layers/FavorAttentionPerformerLayer.cpp
Dnn/Layers/FullyConnectedSourceLayer.cpp
Dnn/Layers/GlobalMaxPoolingLayer.cpp
Dnn/Layers/GlobalSumPoolingLayer.cpp
@@ -133,6 +134,7 @@
Dnn/Layers/MaxOverTimePoolingLayer.cpp
Dnn/Layers/MobileNetV3BlockLayer.cpp
Dnn/Layers/ModelWrapperLayer.cpp
Dnn/Layers/MultiheadAttentionPerformerLayer.cpp
Dnn/Layers/ObjectNormalizationLayer.cpp
Dnn/Layers/Onnx/OnnxEltwiseLayer.cpp
Dnn/Layers/Onnx/OnnxCastLayer.cpp
@@ -379,6 +381,7 @@ set(NeoML_HEADERS
../include/NeoML/Dnn/Layers/DotProductLayer.h
../include/NeoML/Dnn/Layers/EnumBinarizationLayer.h
../include/NeoML/Dnn/Layers/FocalLossLayer.h
../include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h
../include/NeoML/Dnn/Layers/FullyConnectedSourceLayer.h
../include/NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h
../include/NeoML/Dnn/Layers/GlobalSumPoolingLayer.h
@@ -394,6 +397,7 @@
../include/NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h
../include/NeoML/Dnn/Layers/MobileNetV3BlockLayer.h
../include/NeoML/Dnn/Layers/ModelWrapperLayer.h
../include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h
../include/NeoML/Dnn/Layers/MultiHingeLossLayer.h
../include/NeoML/Dnn/Layers/ObjectNormalizationLayer.h
../include/NeoML/Dnn/Layers/Onnx/OnnxEltwiseLayer.h
6 changes: 5 additions & 1 deletion NeoML/src/Dnn/Dnn.cpp
@@ -1,4 +1,4 @@
/* Copyright © 2017-2023 ABBYY
/* Copyright © 2017-2024 ABBYY
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -73,6 +73,7 @@ limitations under the License.
#include <NeoML/Dnn/Layers/DepthToSpaceLayer.h>
#include <NeoML/Dnn/Layers/DotProductLayer.h>
#include <NeoML/Dnn/Layers/EnumBinarizationLayer.h>
#include <NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h>
#include <NeoML/Dnn/Layers/FocalLossLayer.h>
#include <NeoML/Dnn/Layers/FullyConnectedSourceLayer.h>
#include <NeoML/Dnn/Layers/GlobalMaxPoolingLayer.h>
@@ -89,6 +90,7 @@ limitations under the License.
#include <NeoML/Dnn/Layers/MaxOverTimePoolingLayer.h>
#include <NeoML/Dnn/Layers/MobileNetV3BlockLayer.h>
#include <NeoML/Dnn/Layers/ModelWrapperLayer.h>
#include <NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h>
#include <NeoML/Dnn/Layers/MultiHingeLossLayer.h>
#include <NeoML/Dnn/Layers/PositionalEmbeddingLayer.h>
#include <NeoML/Dnn/Layers/PrecisionRecallLayer.h>
@@ -351,6 +353,7 @@ REGISTER_NEOML_LAYER( CCtcDecodingLayer, "FmlCnnCtcDecodingLayer" )
REGISTER_NEOML_LAYER( CCtcLossLayer, "FmlCnnCtcLossLayer" )
REGISTER_NEOML_LAYER( CDotProductLayer, "FmlCnnDotProductLayer" )
REGISTER_NEOML_LAYER( CEnumBinarizationLayer, "FmlCnnEnumBinarizationLayer" )
REGISTER_NEOML_LAYER( CFavorAttentionPerformerLayer, "NeoMLDnnFavorAttentionPerformerLayer" )
REGISTER_NEOML_LAYER( CGlobalMaxPoolingLayer, "FmlCnnGlobalMaxPoolingLayer" )
REGISTER_NEOML_LAYER( CGrnLayer, "NeoMLDnnGrnLayer" )
REGISTER_NEOML_LAYER( CGruLayer, "FmlCnnGruLayer" )
@@ -362,6 +365,7 @@ REGISTER_NEOML_LAYER( CLoraFullyConnectedLayer, "NeoMLDnnLoraFullyConnectedLayer
REGISTER_NEOML_LAYER( CMaxOverTimePoolingLayer, "FmlCnnMaxOverTimePoolingLayer" )
REGISTER_NEOML_LAYER( CMobileNetV3PreSEBlockLayer, "NeoMLDnnMobileNetV3PreSEBlockLayer" )
REGISTER_NEOML_LAYER( CMobileNetV3PostSEBlockLayer, "NeoMLDnnMobileNetV3PostSEBlockLayer" )
REGISTER_NEOML_LAYER( CMultiheadAttentionPerformerLayer, "NeoMLDnnMultiheadAttentionPerformerLayer" )
REGISTER_NEOML_LAYER( CMultiHingeLossLayer, "FmlCnnMultyHingeLossLayer" )
REGISTER_NEOML_LAYER( CMultiSquaredHingeLossLayer, "FmlCnnMultySquaredHingeLossLayer" )
REGISTER_NEOML_LAYER( CPixelToImageLayer, "FmlCnnPixelToImageLayerClass" )
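REGISTER_NEOML_LAYER binds each class to the archive name it is stored under, which is what lets a serialized network containing the new layers be restored by name. A serialization sketch, reusing the assumed dnn from the sketches above and treating the file name and archive construction as illustrative assumptions:

// Minimal sketch (assumed setup): storing and loading a network with the new layers
{
	CArchiveFile file( "performer.dnnarch", CArchive::store );
	CArchive archive( &file, CArchive::store );
	dnn.Serialize( archive ); // each layer is written under its registered name
}
{
	CArchiveFile file( "performer.dnnarch", CArchive::load );
	CArchive archive( &file, CArchive::load );
	dnn.Serialize( archive ); // registered names are used to re-create the layers
}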
