[NeoML] Add MultiheadAttentionPerformerLayer
Signed-off-by: Kirill Golikov <[email protected]>
Showing 11 changed files with 1,272 additions and 15 deletions.
NeoML/include/NeoML/Dnn/Layers/FavorAttentionPerformerLayer.h (97 additions, 0 deletions)
/* Copyright © 2023-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>

namespace NeoML {

struct CFavorAttentionDesc;

// Computes FAVOR normalized self-attention.
// https://arxiv.org/pdf/2009.14794.pdf
//
// Inputs: query, key, value
// Emulates the equation: Output ~~ softmax( query * ( key )^T / normalizer ) * value
//
//           output
//              ^
//              |
//      +---------------+
//      |  F A V O R    | <-- projection matrix
//      |   Attention   |     (random features)
//      +---------------+
//        ^     ^     ^
//        |     |     |
//      query  key  value
//
class NEOML_API CFavorAttentionPerformerLayer : public CBaseLayer {
	NEOML_DNN_LAYER( CFavorAttentionPerformerLayer )
public:
	// Possible activation kernel transformations
	enum class TAKernel { SoftMax = 0, ReLU = 1 };
	// Layer inputs numeration
	enum TInput { TI_Q = 0, TI_K = 1, TI_V = 2 };
	// How the random matrix Q is constructed
	enum class TRandomMaxrixStructMode {
		QMatrix,          // QR-factorization of a random 2D-tensor
		GivensRotations   // Givens random rotations
	};
	static constexpr TRandomMaxrixStructMode StructMode = TRandomMaxrixStructMode::GivensRotations;
	// To normalize the random matrix Q, use the sum of the norms of its rows, or simply sqrt(dim)
	static constexpr bool Scaling = false;

	// Constructor
	CFavorAttentionPerformerLayer( IMathEngine& mathEngine, const char* name = nullptr );

	// The number of columns in the projection matrix, or 0 if the projection matrix should not be used
	int GetRandomFeaturesCount() const { return randomFeaturesCount; }
	void SetRandomFeaturesCount( int randomFeaturesCount );
	// The activation kernel transformation that is used
	int GetActivationKernel() const { return static_cast<int>( activation ); }
	void SetActivationKernel( int activation );
	// Whether auto-regressive (causal) attention is used
	bool GetCausal() const { return causal; }
	void SetCausal( bool causal );

	void Serialize( CArchive& archive ) override;

protected:
	~CFavorAttentionPerformerLayer();

	// Create output blobs using the input blobs
	void Reshape() override;
	// One step of a forward pass
	void RunOnce() override;
	// One step of a backward pass
	void BackwardOnce() override;

private:
	// Number of random features to be used
	// For SoftMax it should be > 0, so the random projection matrix is applied
	int randomFeaturesCount = 0;
	TAKernel activation = TAKernel::SoftMax; // Activation kernel type
	bool causal = false;                     // Auto-regressive attention or not
	CFavorAttentionDesc* desc = nullptr;     // FAVOR attention description

	void destroyFavorAttentionDesc();
};

NEOML_API CLayerWrapper<CFavorAttentionPerformerLayer> FavorAttentionPerformer(
	int randomFeaturesCount, int activation, bool causal );

} // namespace NeoML
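
A minimal usage sketch (not part of this commit): wiring the FAVOR attention layer into a network with three source layers for query, key, and value. It assumes the standard NeoML CDnn / Source helpers and the default CPU math engine; the feature count and seed are illustrative.

	#include <NeoML/NeoML.h>

	using namespace NeoML;

	void BuildFavorAttention()
	{
		IMathEngine& mathEngine = GetDefaultCpuMathEngine();
		CRandom random( 0x123 );
		CDnn dnn( random, mathEngine );

		// Three sources: query, key, value
		CPtr<CSourceLayer> query = Source( dnn, "query" );
		CPtr<CSourceLayer> key = Source( dnn, "key" );
		CPtr<CSourceLayer> value = Source( dnn, "value" );

		CPtr<CFavorAttentionPerformerLayer> favor =
			new CFavorAttentionPerformerLayer( mathEngine, "favor" );
		favor->SetRandomFeaturesCount( 64 ); // must be > 0 for the SoftMax kernel
		favor->SetActivationKernel( static_cast<int>( CFavorAttentionPerformerLayer::TAKernel::SoftMax ) );
		favor->SetCausal( false );
		dnn.AddLayer( *favor );

		// Connect the inputs in the TI_Q, TI_K, TI_V order
		favor->Connect( CFavorAttentionPerformerLayer::TI_Q, *query );
		favor->Connect( CFavorAttentionPerformerLayer::TI_K, *key );
		favor->Connect( CFavorAttentionPerformerLayer::TI_V, *value );
	}

The same wiring can be written through the CLayerWrapper helper FavorAttentionPerformer( randomFeaturesCount, activation, causal ) declared at the end of the header.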
NeoML/include/NeoML/Dnn/Layers/MultiheadAttentionPerformerLayer.h (101 additions, 0 deletions)
/* Copyright © 2023-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#pragma once

#include <NeoML/Dnn/Dnn.h>
#include <NeoML/Dnn/Layers/CompositeLayer.h>

namespace NeoML {

// Multihead Self Attention Performer
// https://arxiv.org/pdf/2009.14794.pdf
// Implementation of the multiheaded FAVOR-attention & FAVOR-self-attention layers.
//
// +----------------------+--------+---------------------------------------------------------
// | Parameter            | Type   | Description
// +----------------------+--------+---------------------------------------------------------
// | HiddenSize           | int    | size of the trainable matrices, output dim of the hidden layer
// | HeadCount            | int    | number of heads that repeat the same attention structure
// | OutputSize           | int    | size of the output
// | ActivationKernel     | int    | activation (ReLU or SoftMax) kernel transformation
// | RandomFeaturesCount  | int    | number of projection matrix columns, or 0 if not used
// | Casual               | bool   | whether auto-regressive attention is used
// +----------------------+--------+---------------------------------------------------------
class NEOML_API CMultiheadAttentionPerformerLayer : public CCompositeLayer {
	NEOML_DNN_LAYER( CMultiheadAttentionPerformerLayer )
public:
	explicit CMultiheadAttentionPerformerLayer( IMathEngine& mathEngine );

	// Activation kernel type: SoftMax(=0), ReLU(=1)
	// SoftMax by default
	int GetActivationKernel() const { return activationKernel; }
	void SetActivationKernel( int activationKernel, int randomFeaturesCount, bool casual );
	int GetRandomFeaturesCount() const { return randomFeaturesCount; }
	bool GetCasual() const { return casual; }

	// The number of heads in attention
	// GetHiddenSize() must be a multiple of this value
	// By default the attention consists of 1 head
	int GetHeadCount() const { return headCount; }
	void SetHeadCount( int headCount );

	// The size of the trainable matrices
	// Must be a multiple of GetHeadCount()
	int GetHiddenSize() const { return hiddenSize; }
	void SetHiddenSize( int hiddenSize );

	// The size of the output
	int GetOutputSize() const { return outputSize; }
	void SetOutputSize( int outputSize );

	void Serialize( CArchive& archive ) override;

	// Recreates the internal structure if forceRebuild is true or if the layer doesn't contain sublayers yet
	void Rebuild( bool forceRebuild );

protected:
	void Reshape() override;

private:
	// FAVOR+ attention settings
	int activationKernel;    // Activation kernel transformation
	int randomFeaturesCount; // Projection matrix size, if > 0
	bool casual;             // Auto-regressive attention or not

	// The number of heads
	int headCount;
	// The size of the trainable matrices
	int hiddenSize;
	// Output size
	int outputSize;

	// Layer inputs numeration
	enum TInputs { I_Q = 0, I_K = 1, I_V = 2 };

	bool isCreated() const { return HasLayer( "Q" ); }
	void create();

	CBaseLayer* multiplyInputByMatrixWeights( int size, const char* name, TInputs input );
	CBaseLayer* multiplyByMatrixWeights( CBaseLayer* input, int width );
	CBaseLayer* prepareQ( CBaseLayer* input );
	CBaseLayer* prepareKV( CBaseLayer* input, bool isK );
	CBaseLayer* prepareOutput( CBaseLayer* input );
};

NEOML_API CLayerWrapper<CMultiheadAttentionPerformerLayer> MultiheadAttentionPerformer(
	int headCount, int hiddenSize, int outputSize, int activationKernel, int randomFeaturesCount, bool casual );

} // namespace NeoML
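
A minimal usage sketch (not part of this commit): configuring the multihead performer layer with the setters documented above and connecting the query, key, and value inputs. It assumes the default CPU math engine; the head count, hidden size, and random features count are illustrative.

	#include <NeoML/NeoML.h>

	using namespace NeoML;

	void BuildMultiheadAttentionPerformer()
	{
		IMathEngine& mathEngine = GetDefaultCpuMathEngine();
		CRandom random( 0x456 );
		CDnn dnn( random, mathEngine );

		CPtr<CSourceLayer> query = Source( dnn, "query" );
		CPtr<CSourceLayer> key = Source( dnn, "key" );
		CPtr<CSourceLayer> value = Source( dnn, "value" );

		CPtr<CMultiheadAttentionPerformerLayer> attention =
			new CMultiheadAttentionPerformerLayer( mathEngine );
		attention->SetName( "attentionPerformer" );
		attention->SetHeadCount( 4 );
		attention->SetHiddenSize( 256 ); // must be a multiple of the head count
		attention->SetOutputSize( 256 );
		// SoftMax(=0) kernel with 64 random features, non-causal attention
		attention->SetActivationKernel( 0, 64, false );
		dnn.AddLayer( *attention );

		// Inputs are connected in the query, key, value order (I_Q, I_K, I_V)
		attention->Connect( 0, *query );
		attention->Connect( 1, *key );
		attention->Connect( 2, *value );
	}

Alternatively, the CLayerWrapper helper MultiheadAttentionPerformer( headCount, hiddenSize, outputSize, activationKernel, randomFeaturesCount, casual ) declared at the end of the header packs the same configuration into a single call.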