Simple functions speed up #563

Open

Wants to merge 32 commits into base: master
32 commits
d42a688  Build Framework (NeoML-master 1.0.45.0): Incrementing version number. (NeoML-maintainer, Jan 14, 2021)
609aa76  Merge remote-tracking branch 'upstream/master' (yekatkov, Jan 19, 2021)
5c1519b  Merge remote-tracking branch 'upstream/master' (yekatkov, Feb 15, 2021)
d7d2e6c  Merge remote-tracking branch 'upstream/master' (yekatkov, Feb 19, 2021)
b509b4d  Merge branch 'master' of https://github.com/yekatkov/neoml (yekatkov, Jun 7, 2021)
c6382a4  Merge remote-tracking branch 'upstream/master' (yekatkov, Jun 15, 2021)
1ce6387  Merge remote-tracking branch 'upstream/master' (yekatkov, Jun 21, 2021)
613df30  Build Framework (NeoML-master 1.0.45.0): Incrementing version number. (NeoML-maintainer, Jan 14, 2021)
87cee43  Merge branch 'master' of https://github.com/yekatkov/neoml (yekatkov, Jun 25, 2021)
fa7aace  Merge branch 'master' of https://github.com/yekatkov/neoml (yekatkov, Jul 15, 2021)
b7d2e16  Merge branch 'master' of https://github.com/yekatkov/neoml (yekatkov, Aug 6, 2021)
7f7b3ee  Merge branch 'master' of https://github.com/yekatkov/neoml (yekatkov, Aug 25, 2021)
9f24e3e  Merge branch 'master' of https://github.com/yekatkov/neoml (yekatkov, Oct 13, 2021)
eb64373  Merge branch 'master' of https://github.com/yekatkov/neoml (yekatkov, Oct 27, 2021)
f098e0c  Merge remote-tracking branch 'upstream/master' (yekatkov, Oct 27, 2021)
bc0797d  Merge remote-tracking branch 'upstream/master' (yekatkov, Nov 15, 2021)
a34183f  Merge remote-tracking branch 'upstream/master' (yekatkov, Feb 8, 2022)
b0257d0  Implement some fimple functions in JIT. (yekatkov, Feb 3, 2022)
fe4f83d  Some function were replaced by analogue from cstdlib. (yekatkov, Feb 9, 2022)
de05811  Fix vectorFill using (yekatkov, Feb 9, 2022)
00866c2  Add dummy definitions for simple math functions. (yekatkov, Feb 10, 2022)
af6320d  Implemented another primitives. (yekatkov, Feb 10, 2022)
989c31c  Fix farsing floating point argument in linux. (yekatkov, Feb 11, 2022)
a042084  Fix indexing of GPR for Linux (yekatkov, Feb 11, 2022)
4f3e62b  Merge remote-tracking branch 'upstream/master' into SimpleFunctionsSp… (yekatkov, Feb 11, 2022)
ede7bb7  Fix macOs build and PR comment (yekatkov, Feb 13, 2022)
d46b17b  Fix macOs build. (yekatkov, Feb 14, 2022)
18b89ef  Fixed macOs build (yekatkov, Feb 14, 2022)
8120266  Fixed some PR bugs. (yekatkov, Feb 15, 2022)
c3c66d2  Replace 'lea' instruction with 'add'. (yekatkov, Feb 15, 2022)
3514989  Merge remote-tracking branch 'upstream/master' into SimpleFunctionsSp… (yekatkov, Feb 15, 2022)
dd0fe1d  Merge branch 'master' into SimpleFunctionsSpeedUp (Mar 2, 2022)
27 changes: 27 additions & 0 deletions NeoMathEngine/include/NeoMathEngine/SimdMathEngine.h
@@ -48,6 +48,33 @@ class ISimdMathEngine : public CCrtAllocatedObject {
virtual void Exp( float* dst, const float* src, size_t dataSize, bool isMultithread = true ) = 0;
virtual void RunOnceRestOfLstm( CMathEngineLstmDesc* desc, const CConstFloatHandle& inputStateBackLink,
const CFloatHandle& outputStateBackLink, const CFloatHandle& outputMainBackLink, bool isMultithread = true ) = 0;

using vectorAddFunc = void (*)( const float* first, const float* second, float* result, int vectorSize );
using alignedVectorAdd = void (*)( const float* first, float* second, int vectorSize );
using vectorEltwiseMax = void (*)( const float* first, const float* second, float* result, int vectorSize );
using vectorReLU = void (*)( const float* first, float* result, int vectorSize );
using vectorReLUTreshold = void (*)( const float* first, float* result, int vectorSize, float threshold );
using alignedVectorMultiplyAndAdd = void (*)( const float* first, const float* second,
float* result, int vectorSize, const float* mult );
using vectorMultiply = void (*)( const float* first, float multiplier, float* result, int vectorSize );
using vectorEltwiseMultiply = void (*)( const float* first, const float* second, float* result, int vectorSize );
using vectorEltwiseMultiplyAdd = void (*)( const float* first, const float* second, float* result, int vectorSize );
using vectorAddValue = void (*)( const float* first, float value, float* result, int vectorSize );
using vectorDotProduct = void (*)( const float* first, const float* second, float* result, int vectorSize );
using vectorMinMax = void (*)( const float* first, float* result, int vectorSize, const float minValue, const float maxValue );

virtual vectorAddFunc GetVectorAddFunc() = 0;
virtual alignedVectorAdd GetAlignedVectorAddFunc() = 0;
virtual vectorEltwiseMax GetVectorMaxFunc() = 0;
virtual vectorReLU GetVectorReLUFunc() = 0;
virtual vectorReLUTreshold GetVectorReLUTresholdFunc() = 0;
virtual alignedVectorMultiplyAndAdd GetAlignedVectorMultiplyAndAddFunc() = 0;
virtual vectorMultiply GetVectorMultiplyFunc() = 0;
virtual vectorEltwiseMultiply GetVectorEltwiseMultiplyFunc() = 0;
virtual vectorEltwiseMultiplyAdd GetVectorEltwiseMultiplyAddFunc() = 0;
virtual vectorAddValue GetVectorAddValueFunc() = 0;
virtual vectorDotProduct GetVectorDotProductFunc() = 0;
virtual vectorMinMax GetVectorMinMaxFunc() = 0;
};

}
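
The aliases above fix a single C-style signature for each kernel, and the Get*Func() methods let the CPU engine resolve an implementation once and then call it with no virtual dispatch on the hot path (the constructor change in CpuMathEngine.cpp below does exactly that). A minimal, self-contained sketch of the pattern; scalarVectorAdd and getVectorAddFunc are illustrative stand-ins, not names from the PR:

```cpp
#include <cstdio>

// Same shape as ISimdMathEngine::vectorAddFunc.
using vectorAddFunc = void (*)( const float* first, const float* second, float* result, int vectorSize );

// Scalar stand-in for a JIT/SIMD kernel.
static void scalarVectorAdd( const float* first, const float* second, float* result, int vectorSize )
{
    for( int i = 0; i < vectorSize; ++i ) {
        result[i] = first[i] + second[i];
    }
}

// Plays the role of GetVectorAddFunc(): a real engine would hand out its optimized kernel here.
static vectorAddFunc getVectorAddFunc()
{
    return &scalarVectorAdd;
}

int main()
{
    const float a[4] = { 1.f, 2.f, 3.f, 4.f };
    const float b[4] = { 10.f, 20.f, 30.f, 40.f };
    float c[4] = {};

    vectorAddFunc add = getVectorAddFunc(); // resolved once, as in the CCpuMathEngine constructor
    add( a, b, c, 4 );
    std::printf( "%g %g %g %g\n", c[0], c[1], c[2], c[3] ); // 11 22 33 44
    return 0;
}
```
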
28 changes: 28 additions & 0 deletions NeoMathEngine/src/CPU/CpuMathEngine.cpp
@@ -24,6 +24,7 @@ limitations under the License.
#include <NeoMathEngine/SimdMathEngine.h>
#include <DllLoader.h>
#include <CPUInfo.h>
#include <CpuMathEnginePrivate.h>

#if FINE_PLATFORM( FINE_ANDROID ) || FINE_PLATFORM( FINE_LINUX )
#include <PerformanceCountersCpuLinux.h>
@@ -78,6 +79,33 @@ CCpuMathEngine::CCpuMathEngine( int _threadCount, size_t _memoryLimit ) :
#ifdef NEOML_USE_MKL
vmlSetMode( VML_ERRMODE_NOERR );
#endif
if( simdMathEngine != nullptr ) {
vectorAdd = simdMathEngine->GetVectorAddFunc();
alignedVectorAdd = simdMathEngine->GetAlignedVectorAddFunc();
vectorEltwiseMax = simdMathEngine->GetVectorMaxFunc();
vectorReLU = simdMathEngine->GetVectorReLUFunc();
vectorReLUTreshold = simdMathEngine->GetVectorReLUTresholdFunc();
alignedVectorMultiplyAndAdd = simdMathEngine->GetAlignedVectorMultiplyAndAddFunc();
vectorMultiply = simdMathEngine->GetVectorMultiplyFunc();
vectorEltwiseMultiply = simdMathEngine->GetVectorEltwiseMultiplyFunc();
vectorEltwiseMultiplyAdd = simdMathEngine->GetVectorEltwiseMultiplyAddFunc();
vectorAddValue = simdMathEngine->GetVectorAddValueFunc();
vectorDotProduct = simdMathEngine->GetVectorDotProductFunc();
vectorMinMax = simdMathEngine->GetVectorMinMaxFunc();
} else {
vectorAdd = &NeoML::vectorAdd;
alignedVectorAdd = &NeoML::alignedVectorAdd;
vectorEltwiseMax = &NeoML::vectorEltwiseMax;
vectorReLU = &NeoML::vectorReLU;
vectorReLUTreshold = &NeoML::vectorReLUTreshold;
alignedVectorMultiplyAndAdd = &NeoML::alignedVectorMultiplyAndAdd;
vectorMultiply = &NeoML::vectorMultiply;
vectorEltwiseMultiply = &NeoML::vectorEltwiseMultiply;
vectorEltwiseMultiplyAdd = &NeoML::vectorEltwiseMultiplyAdd;
vectorAddValue = &NeoML::vectorAddValue;
vectorDotProduct = &NeoML::vectorDotProduct;
vectorMinMax = &NeoML::vectorMinMax;
}
}

CCpuMathEngine::~CCpuMathEngine()
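
When simdMathEngine is null, the pointers above fall back to the portable NeoML:: kernels, presumably the ones declared in the newly included CpuMathEnginePrivate.h. For orientation, a scalar sketch of what one such kernel computes; this is not the actual NeoML source, just the ReLU semantics:

```cpp
// Illustrative scalar fallback: result[i] = max(first[i], 0).
// The real NeoML::vectorReLU lives in the CPU private headers and may be vectorized.
inline void vectorReLUSketch( const float* first, float* result, int vectorSize )
{
    for( int i = 0; i < vectorSize; ++i ) {
        result[i] = first[i] > 0.f ? first[i] : 0.f;
    }
}
```
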
14 changes: 14 additions & 0 deletions NeoMathEngine/src/CPU/CpuMathEngine.h
@@ -558,6 +558,20 @@ class CCpuMathEngine : public IMathEngine, public IRawMemoryManager {
std::unique_ptr<ISimdMathEngine> simdMathEngine; // interface for using simd instructions
SgemmFunc customSgemmFunction; // Used when it is available and is faster than default sgemm

void ( *vectorAdd )( const float* first, const float* second, float* result, int vectorSize );
void ( *alignedVectorAdd )( const float* first, float* second, int vectorSize );
void ( *vectorEltwiseMax )( const float* first, const float* second, float* result, int vectorSize );
void ( *vectorReLU )( const float* first, float* result, int vectorSize );
void ( *vectorReLUTreshold )( const float* first, float* result, int vectorSize, float threshold );
void ( *alignedVectorMultiplyAndAdd )( const float* first, const float* second,
float* result, int vectorSize, const float* mult );
void ( *vectorMultiply )( const float* first, float multiplier, float* result, int vectorSize );
void ( *vectorEltwiseMultiply )( const float* first, const float* second, float* result, int vectorSize );
void ( *vectorEltwiseMultiplyAdd )( const float* first, const float* second, float* result, int vectorSize );
void ( *vectorAddValue )( const float* first, float value, float* result, int vectorSize );
void ( *vectorDotProduct )( const float* first, const float* second, float* result, int vectorSize );
void ( *vectorMinMax )( const float* first, float* result, int vectorSize, const float minValue, const float maxValue );

IMathEngine& mathEngine() { IMathEngine* engine = this; return *engine; }

void blob3dConvolution1x1x1( const CBlobDesc& source, const CBlobDesc& filter, const CBlobDesc& result,
6 changes: 3 additions & 3 deletions NeoMathEngine/src/CPU/CpuMathEngineBlas.cpp
@@ -219,7 +219,7 @@ void CCpuMathEngine::AddVectorToMatrixColumns(const CConstFloatHandle& matrixHan
const float* vector = GetRaw( vectorHandle );

for(int i = 0; i < matrixHeight; ++i) {
vectorAddValue(matrix, result, matrixWidth, *vector);
vectorAddValue(matrix, *vector, result, matrixWidth);
matrix += matrixWidth;
result += matrixWidth;
++vector;
@@ -276,7 +276,7 @@ void CCpuMathEngine::RowMultiplyMatrixByMatrix(const CConstFloatHandle& firstHan
float* result = GetRaw( resultHandle );

for(int i = 0; i < height; ++i) {
vectorDotProduct(first, second, width, result);
vectorDotProduct(first, second, result, width);
first += width;
second += width;
++result;
@@ -819,7 +819,7 @@ void CCpuMathEngine::MultiplyDiagMatrixByMatrix( const CConstFloatHandle& firstH
NEOML_OMP_FOR_NUM_THREADS( curThreadCount )
for( int i = 0; i < firstSize; i++ ) {
const float multiplier = *( first + i );
vectorMultiply( second + i * secondWidth, result + i * secondWidth, multiplier, secondWidth );
vectorMultiply( second + i * secondWidth, multiplier, result + i * secondWidth, secondWidth );
}
}

6 changes: 3 additions & 3 deletions NeoMathEngine/src/CPU/CpuMathEngineDnn3dConv.cpp
@@ -98,7 +98,7 @@ void CCpuMathEngine::blob3dConvolution1x1x1Backward( const CCommon3dConvolutionD
for( int i = 0; i < resultBlob.Width(); ++i ) {
float* inputDiffPixel = inputDiffCol;
for( int k = 0; k < resultBlob.Depth(); ++k ) {
NeoML::vectorAdd( inputDiffPixel, resultData, inputDiffPixel, inputDiff.Channels() );
vectorAdd( inputDiffPixel, resultData, inputDiffPixel, inputDiff.Channels() );
inputDiffPixel += inputDiff.Channels() * desc.StrideDepth;
resultData += inputDiff.Channels();
}
@@ -437,8 +437,8 @@ void CCpuMathEngine::blob3dConvolutionBackward( const CCommon3dConvolutionDesc&
int outputLineCount;
if( OmpGetTaskIndexAndCount( outputLineY, outputLineStart, outputLineCount ) ) {
if( freeTermData == 0 ) {
vectorFill( resultData + outputLineStart * outputRowSize,
0, outputLineCount * outputRowSize );
vectorFill0( resultData + outputLineStart * outputRowSize,
outputLineCount * outputRowSize );
}

int outputLineEnd = outputLineStart + outputLineCount;
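
This hunk and several below replace NeoML::vectorFill( ptr, 0, size ) with a dedicated NeoML::vectorFill0( ptr, size ). The definition of vectorFill0 is not part of this diff; assuming it simply zeroes the buffer, as the replaced calls did, a minimal form could look like this (sketch only, the real kernel may use SIMD stores):

```cpp
#include <cstring>

// Zero-fill specialization: an all-zero byte pattern is exactly 0.0f for IEEE-754 floats,
// so memset is a valid shortcut here.
inline void vectorFill0Sketch( float* result, int vectorSize )
{
    std::memset( result, 0, vectorSize * sizeof( float ) );
}
```
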
14 changes: 7 additions & 7 deletions NeoMathEngine/src/CPU/CpuMathEngineDnnChannelwiseConv.cpp
@@ -166,7 +166,7 @@ void CCpuMathEngine::blobChannelwiseConvolutionFilter3x3Padding1Stride2( const C
if( freeTerm != 0 ) {
fillResultRow( desc, freeTerm, resultFirstRow );
} else {
NeoML::vectorFill( resultFirstRow, 0, resultDesc.Width() * channels );
NeoML::vectorFill0( resultFirstRow, resultDesc.Width() * channels );
}

processFilterRowStride2( desc, filter + filterRowSize, sourceFirstRow, resultFirstRow );
@@ -181,7 +181,7 @@ void CCpuMathEngine::blobChannelwiseConvolutionFilter3x3Padding1Stride2( const C
if( freeTerm != 0 ) {
fillResultRow( desc, freeTerm, resRow );
} else {
NeoML::vectorFill( resRow, 0, resultDesc.Width() * channels );
NeoML::vectorFill0( resRow, resultDesc.Width() * channels );
}

processFilterRowStride2( desc, filter, srcRow, resRow );
@@ -193,7 +193,7 @@ void CCpuMathEngine::blobChannelwiseConvolutionFilter3x3Padding1Stride2( const C
if( freeTerm != 0 ) {
fillResultRow( desc, freeTerm, resultLastRow );
} else {
NeoML::vectorFill( resultLastRow, 0, resultDesc.Width() * channels );
NeoML::vectorFill0( resultLastRow, resultDesc.Width() * channels );
}

processFilterRowStride2( desc, filter, sourceLastRow, resultLastRow );
@@ -263,7 +263,7 @@ void CCpuMathEngine::blobChannelwiseConvolutionFilter3x3Padding1Stride1( const C
if( freeTerm != 0 ) {
fillResultRow( desc, freeTerm, resultFirstRow );
} else {
NeoML::vectorFill( resultFirstRow, 0, resultDesc.Width() * channels );
NeoML::vectorFill0( resultFirstRow, resultDesc.Width() * channels );
}
processFilterRowStride1( desc, filter + filterRowSize, sourceFirstRow, resultFirstRow );
if( resultCount >= 0 ) {
@@ -277,7 +277,7 @@ void CCpuMathEngine::blobChannelwiseConvolutionFilter3x3Padding1Stride1( const C
if( freeTerm != 0 ) {
fillResultRow( desc, freeTerm, resRow );
} else {
NeoML::vectorFill( resRow, 0, resultDesc.Width() * channels );
NeoML::vectorFill0( resRow, resultDesc.Width() * channels );
}

processFilterRowStride1( desc, filter, srcRow, resRow );
@@ -289,7 +289,7 @@ void CCpuMathEngine::blobChannelwiseConvolutionFilter3x3Padding1Stride1( const C
if( freeTerm != 0 ) {
fillResultRow( desc, freeTerm, resultLastRow );
} else {
NeoML::vectorFill( resultLastRow, 0, resultDesc.Width() * channels );
NeoML::vectorFill0( resultLastRow, resultDesc.Width() * channels );
}

processFilterRowStride1( desc, filter, sourceLastRow, resultLastRow );
@@ -366,7 +366,7 @@ void CCpuMathEngine::BlobChannelwiseConvolution( const CChannelwiseConvolutionDe
NeoML::dataCopy(rowStart, freeTerm, channels);
}
} else {
NeoML::vectorFill(resultRow, 0, resultDesc.Width() * channels);
NeoML::vectorFill0(resultRow, resultDesc.Width() * channels);
}

const int filterFirstRow = max( 0, -firstFilteredRow );
8 changes: 4 additions & 4 deletions NeoMathEngine/src/CPU/CpuMathEngineDnnConv.cpp
@@ -355,7 +355,7 @@ void CCpuMathEngine::fillTempData( const float* sourceData, float* tempData, con
for( int h = 0; h < desc.Filter.Height(); h++ ) {
if( 0 <= sourceHeight + h * desc.DilationHeight && sourceHeight + h * desc.DilationHeight < desc.Source.Height() ) {
if( startPaddingSize > 0 ) {
NeoML::vectorFill( tempStartPaddingPtr, 0.0, startPaddingSize * channelsCount );
NeoML::vectorFill0( tempStartPaddingPtr, startPaddingSize * channelsCount );
}

if( desc.DilationWidth == 1 ) {
@@ -369,10 +369,10 @@
}

if( endPaddingSize > 0 ) {
NeoML::vectorFill( tempEndPaddingPtr, 0.0, endPaddingSize * channelsCount );
NeoML::vectorFill0( tempEndPaddingPtr, endPaddingSize * channelsCount );
}
} else {
NeoML::vectorFill( tempStartPaddingPtr, 0.0, filterLineSize );
NeoML::vectorFill0( tempStartPaddingPtr, filterLineSize );
}

tempStartPaddingPtr += filterLineSize;
@@ -577,7 +577,7 @@ void CCpuMathEngine::backwardConvolutionAddFilterToOutput( const CCpuConvolution
// Set the free term
setVectorToMatrixRows( outputDataPtr, output.Width(), output.Depth() * output.Channels(), freeTermDataRaw );
} else {
vectorFill( outputDataPtr, 0, output.Width() * output.Depth() * output.Channels() );
vectorFill0( outputDataPtr, output.Width() * output.Depth() * output.Channels() );
}

int batch = step / output.Height();
6 changes: 3 additions & 3 deletions NeoMathEngine/src/CPU/CpuMathEngineDnnRleConv.cpp
@@ -115,7 +115,7 @@ static inline void updateFilterConv( IMathEngine& mathEngine, CCpuRleConvolution
const float* convertFilterDataPtr = GetRaw( desc.ConvertedFilter.GetHandle() );
for( int j = 0; j < filterHeight; ++j ) {
for( int i = 0; i < filterWidth; ++i ) {
alignedVectorAdd( zeroFilterConvPtr, convertFilterDataPtr, filterCount );
alignedVectorAdd( convertFilterDataPtr, zeroFilterConvPtr, filterCount );
convertFilterDataPtr += filterCount;
}
zeroFilterConvPtr += filterCount;
@@ -268,7 +268,7 @@ void CCpuMathEngine::BlobRleConvolution( const CRleConvolutionDesc& convDesc, co
const float* curFilterConvData = filterConvData + index * filterConvStep;
float* curOutput = output;
for( int j = 0; j < jCount; ++j ) {
alignedVectorAdd( curOutput, curFilterConvData, filterCount );
alignedVectorAdd( curFilterConvData, curOutput, filterCount );
curFilterConvData += strideHeight * filterCount;
curOutput += outputRowSize;
}
@@ -382,7 +382,7 @@ void CCpuMathEngine::BlobRleConvolutionLearnAdd( const CRleConvolutionDesc& conv
// Calculate diff separately for the free terms
for( int j = 0; j < outputDiff.Height(); ++j ) {
for( int k = 0; k < outputDiff.Width(); ++k ) {
alignedVectorAdd( freeTermDiffReductionPrivatePtr, outputDiffDataPtr, filterCount );
alignedVectorAdd( outputDiffDataPtr, freeTermDiffReductionPrivatePtr, filterCount );
outputDiffDataPtr += filterCount;
}
}
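
The three RLE-convolution call sites above swap the arguments so that the read-only addend comes first and the accumulated destination second, matching the alignedVectorAdd alias in SimdMathEngine.h ( const float* first, float* second, int vectorSize ). In scalar terms the operation is the following; this is a sketch of the semantics, not the SIMD source, and the real kernels expect both buffers to be suitably aligned:

```cpp
// second[i] += first[i] for every element.
inline void alignedVectorAddSketch( const float* first, float* second, int vectorSize )
{
    for( int i = 0; i < vectorSize; ++i ) {
        second[i] += first[i];
    }
}
```
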
16 changes: 8 additions & 8 deletions NeoMathEngine/src/CPU/CpuMathEngineVectorMath.cpp
@@ -128,11 +128,11 @@ void CCpuMathEngine::VectorAdd(const CConstFloatHandle& firstHandle, const CCons
NEOML_OMP_NUM_THREADS( curThreadCount ) {
int index, count;
if( OmpGetTaskIndexAndCount( vectorSize, 16, index, count ) ) {
NeoML::vectorAdd( GetRaw(firstHandle + index), GetRaw(secondHandle + index), GetRaw(resultHandle + index), count );
vectorAdd( GetRaw(firstHandle + index), GetRaw(secondHandle + index), GetRaw(resultHandle + index), count );
}
}
} else {
NeoML::vectorAdd( GetRaw(firstHandle), GetRaw(secondHandle), GetRaw(resultHandle), vectorSize );
vectorAdd( GetRaw(firstHandle), GetRaw(secondHandle), GetRaw(resultHandle), vectorSize );
}
}

@@ -352,7 +352,7 @@ void CCpuMathEngine::VectorAddValue(const CConstFloatHandle& firstHandle, const
float* result = GetRaw( resultHandle );
float value = *GetRaw( addition );

vectorAddValue( first, result, vectorSize, value );
vectorAddValue( first, value, result, vectorSize );
}

void CCpuMathEngine::VectorDotProduct(const CConstFloatHandle& firstHandle, const CConstFloatHandle& secondHandle,
@@ -367,7 +367,7 @@ void CCpuMathEngine::VectorDotProduct(const CConstFloatHandle& firstHandle, cons
const float* second = GetRaw( secondHandle );
float* result = GetRaw( resultHandle );

vectorDotProduct( first, second, vectorSize, result );
vectorDotProduct( first, second, result, vectorSize );
}

void CCpuMathEngine::VectorTopK(const CConstFloatHandle& firstHandle, int firstSize, int k, const CFloatHandle& resultHandle,
@@ -470,11 +470,11 @@ void CCpuMathEngine::VectorMultiply(const CConstFloatHandle& firstHandle,
NEOML_OMP_NUM_THREADS( curThreadCount ) {
int index, count;
if( OmpGetTaskIndexAndCount( vectorSize, 16, index, count ) ) {
vectorMultiply( GetRaw( firstHandle + index ), GetRaw( resultHandle + index ), multiplier, count );
vectorMultiply( GetRaw( firstHandle + index ), multiplier, GetRaw( resultHandle + index ), count );
}
}
} else {
vectorMultiply( GetRaw( firstHandle ), GetRaw( resultHandle ), multiplier, vectorSize );
vectorMultiply( GetRaw( firstHandle ), multiplier, GetRaw( resultHandle ), vectorSize );
}
}

@@ -656,11 +656,11 @@ void CCpuMathEngine::VectorMinMax(const CConstFloatHandle& firstHandle, const CF
NEOML_OMP_NUM_THREADS( curThreadCount ) {
int index, count;
if( OmpGetTaskIndexAndCount( vectorSize, 16, index, count ) ) {
vectorMinMax( GetRaw(firstHandle + index), GetRaw(resultHandle + index), minValue, maxValue, count );
vectorMinMax( GetRaw(firstHandle + index), GetRaw(resultHandle + index), count, minValue, maxValue );
}
}
} else {
vectorMinMax( GetRaw(firstHandle ), GetRaw(resultHandle ), minValue, maxValue, vectorSize );
vectorMinMax( GetRaw(firstHandle ), GetRaw(resultHandle ), vectorSize, minValue, maxValue );
}
}

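
The wrappers above split large vectors into per-thread chunks via NEOML_OMP_NUM_THREADS and OmpGetTaskIndexAndCount and run the cached kernel on each chunk. The same idea in plain OpenMP, for illustration only; the NeoML helpers additionally take a granularity argument (the 16 above), presumably to keep chunk boundaries aligned, and the single-thread path skips the parallel region entirely:

```cpp
#include <omp.h>

// Illustrative chunked dispatch: each thread applies the cached kernel to one contiguous slice.
static void parallelApply( const float* first, const float* second, float* result,
    int vectorSize, int threadCount,
    void ( *kernel )( const float*, const float*, float*, int ) )
{
    #pragma omp parallel num_threads( threadCount )
    {
        const int threads = omp_get_num_threads();
        const int index = omp_get_thread_num();
        const int chunk = ( vectorSize + threads - 1 ) / threads; // ceil( vectorSize / threads )
        const int start = index * chunk;
        if( start < vectorSize ) {
            const int count = start + chunk <= vectorSize ? chunk : vectorSize - start;
            kernel( first + start, second + start, result + start, count );
        }
    }
}
```
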
8 changes: 4 additions & 4 deletions NeoMathEngine/src/CPU/arm/CpuArmMathEngineVectorMathPrivate.h
@@ -272,7 +272,7 @@ inline void alignedVectorMultiplyAndAdd( const float* first, const float* second

//------------------------------------------------------------------------------------------------------------

inline void vectorMultiply( const float* first, float* result, float multiplier, int vectorSize )
inline void vectorMultiply( const float* first, float multiplier, float* result, int vectorSize )
{
int count = GetCount4(vectorSize);
float32x4_t mult = vdupq_n_f32(multiplier);
@@ -504,7 +504,7 @@ inline void vectorReLU( const float* first, float* result, int vectorSize, float

//------------------------------------------------------------------------------------------------------------

inline void vectorAddValue( const float* first, float* result, int vectorSize, float value )
inline void vectorAddValue( const float* first, float value, float* result, int vectorSize )
{
float32x4_t addition = vdupq_n_f32(value);

@@ -526,7 +526,7 @@ inline void vectorAddValue( const float* first, float* result, int vectorSize, f

//------------------------------------------------------------------------------------------------------------

inline void vectorDotProduct( const float* first, const float* second, int vectorSize, float* result )
inline void vectorDotProduct( const float* first, const float* second, float* result, int vectorSize )
{
float32x4_t acc = vdupq_n_f32(0);

Expand Down Expand Up @@ -709,7 +709,7 @@ static inline void qrnnIfPoolingStep( const float* z, const float* f, const floa
}
}

inline void vectorMinMax( const float* first, float* result, const float minValue, const float maxValue, int vectorSize )
inline void vectorMinMax( const float* first, float* result, int vectorSize, const float minValue, const float maxValue )
{
int count = GetCount4(vectorSize);

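
The ARM changes above only reorder parameters; the NEON bodies are unchanged and largely collapsed by the diff viewer. For orientation, a minimal NEON sketch of vectorMinMax with the new parameter order; the real version uses the library's GetCount4 helper and its own unrolling, so details will differ:

```cpp
#include <arm_neon.h>
#include <math.h>

// Clamp every element to [minValue, maxValue], four floats per iteration, scalar tail at the end.
inline void vectorMinMaxSketch( const float* first, float* result, int vectorSize,
    const float minValue, const float maxValue )
{
    const float32x4_t minV = vdupq_n_f32( minValue );
    const float32x4_t maxV = vdupq_n_f32( maxValue );

    int count = vectorSize / 4;
    while( count-- > 0 ) {
        float32x4_t v = vld1q_f32( first );
        v = vmaxq_f32( vminq_f32( v, maxV ), minV );
        vst1q_f32( result, v );
        first += 4;
        result += 4;
    }
    for( int i = 0; i < vectorSize % 4; ++i ) {
        result[i] = fminf( fmaxf( first[i], minValue ), maxValue );
    }
}
```
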