neoml-lib · favorart · Jul 14, 2023 · Jul 14, 2023 · Oct 13, 2023 · Oct 13, 2023
diff --git a/NeoMathEngine/CMakeLists.txt b/NeoMathEngine/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.11 FATAL_ERROR)
 
-project(NeoMathEngine LANGUAGES CXX)
+project(NeoMathEngine LANGUAGES CXX C ASM)
 
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
 if(USE_FINE_OBJECTS)

diff --git a/NeoMathEngine/src/CMakeLists.txt b/NeoMathEngine/src/CMakeLists.txt
@@ -170,7 +170,22 @@ if((DARWIN AND BUILD_ARCH MATCHES "^arm64.*") OR (ANDROID AND ANDROID_ABI MATCHE
     target_include_directories(${PROJECT_NAME} PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/CPU/arm>)
 else()
     message(STATUS "USE X86 SOURCES")
+    if(CMAKE_SIZEOF_VOID_P EQUAL 8) # x64
+        if(WIN32)
+            set(CPU_X86_ASM_SOURCES
+                CPU/x86/CpuX86FmaCount/fma_shuffle_tpt.asm
+                CPU/x86/CpuX86FmaCount/fma_only_tpt.asm)
+        else() #if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
+            target_compile_options(${PROJECT_NAME} PRIVATE 
+                $<$<COMPILE_LANGUAGE:ASM>:-x$<SEMICOLON>assembler-with-cpp>)
+            set(CPU_X86_ASM_SOURCES
+                CPU/x86/CpuX86FmaCount/fma_shuffle_tpt.s
+                CPU/x86/CpuX86FmaCount/fma_only_tpt.s)
+        endif()
+    endif()
+
     set(CPU_X86_SOURCES
+        CPU/x86/CpuX86FmaCount.cpp
         CPU/x86/CpuX86MathEngineBlas.cpp
         CPU/x86/CpuX86MathEngineBlasMkl.cpp
         CPU/x86/CpuX86MathEngineDnn.cpp
@@ -180,6 +195,7 @@ else()
     target_sources(${PROJECT_NAME} 
     PRIVATE
         ${CPU_X86_SOURCES}
+        ${CPU_X86_ASM_SOURCES}
         CPU/x86/CpuX86.h
         CPU/x86/CpuX86Functors.h
         CPU/x86/CpuX86MathEngineVectorMathPrivate.h
@@ -202,6 +218,20 @@ else()
         set_property(SOURCE ${CPU_AVX_SOURCES} PROPERTY COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma>)
     endif()
 
+    set(CPU_AVX512_SOURCES
+        CPU/x86/avx512/Avx512VectorFunctions.cpp
+    )
+    target_sources(${PROJECT_NAME} PRIVATE
+        ${CPU_AVX512_SOURCES}
+        CPU/x86/avx512/Avx512Functions.h
+    )
+    set_property(SOURCE ${CPU_AVX512_SOURCES} PROPERTY UNITY_GROUP 3)
+    if(WIN32)
+        set_property(SOURCE ${CPU_AVX512_SOURCES} PROPERTY COMPILE_OPTIONS /arch:AVX512)
+    else()
+        set_property(SOURCE ${CPU_AVX512_SOURCES} PROPERTY COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-mfma -mavx -mavx2 -mavx512f -mavx512dq -mavx512vl>)
+    endif()
+
     if(NEOML_USE_AVX)
         target_sources(${PROJECT_NAME}
         PRIVATE

diff --git a/NeoMathEngine/src/CPU/CPUInfo.h b/NeoMathEngine/src/CPU/CPUInfo.h
@@ -1,4 +1,4 @@
-/* Copyright © 2017-2023 ABBYY
+/* Copyright © 2017-2024 ABBYY
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -21,15 +21,22 @@ limitations under the License.
 #include <cpuid.h>
 #elif FINE_PLATFORM( FINE_WINDOWS )
 #include <intrin.h>
-#endif
+#endif // FINE_PLATFORM
 
-#endif // !FINE_ARCHITECTURE( FINE_ARM64 )
+#endif // !FINE_ARCHITECTURE( ARM )
 
 #include <cstring>
 
+#if FINE_ARCHITECTURE( FINE_X64 )
+// Intel X86/X64 optimization manual
+// https://github.com/intel/optimization-manual/tree/main/chap18/ex25
+int Avx512FmaUnitCount();
+#else   // !x64
+inline int Avx512FmaUnitCount() { return 0; }
+#endif  // !x64
 
 // The structure with CPU information
-struct CCPUInfo {
+struct CCPUInfo final {
 	enum class TCpuArch {
 		Intel,
 		AMD,
@@ -117,7 +124,7 @@ struct CCPUInfo {
 	{
 #ifdef NEOML_USE_NEON
 		return 4;
-#else
+#else // !NEOML_USE_NEON
 		int floatAlignment = 4; // SSE alignment
 
 		Regs regs;
@@ -138,15 +145,20 @@ struct CCPUInfo {
 			}
 #elif FINE_PLATFORM(FINE_LINUX) || FINE_PLATFORM(FINE_DARWIN) || FINE_PLATFORM(FINE_ANDROID) || FINE_PLATFORM(FINE_IOS)
 			floatAlignment = 8;
-#else
+#else  // ERROR FINE_PLATFORM
 #error "Platform isn't supported!"
-#endif
+#endif // ERROR FINE_PLATFORM
+		}
+		if( HasAvx512And2Fma ) {
+			floatAlignment = 16;
 		}
 
 		return floatAlignment;
-#endif // NEOML_USE_NEON
+#endif // !NEOML_USE_NEON
 	}
 
+	static const bool NEOMATHENGINE_API HasAvx512;
+	static const bool HasAvx512And2Fma;
 	static const bool HasAvxAndFma;
 	static const bool IsNotIntel;
 
@@ -184,9 +196,9 @@ struct CCPUInfo {
 
 #if FINE_PLATFORM(FINE_WINDOWS)
 	typedef int RegType;
-#else
+#else  // !FINE_WINDOWS
 	typedef unsigned int RegType;
-#endif
+#endif // !FINE_WINDOWS
 	struct Regs {
 		RegType eax;
 		RegType ebx;
@@ -201,12 +213,12 @@ struct CCPUInfo {
 		__cpuid( ( RegType* )( &outRegs ), eax );
 #elif FINE_PLATFORM( FINE_LINUX ) || FINE_PLATFORM( FINE_DARWIN )
 		__get_cpuid( eax, &outRegs.eax, &outRegs.ebx, &outRegs.ecx, &outRegs.edx );
-#else
+#else  // ERROR FINE_PLATFORM
 	( void ) eax;
-#endif
-#else
+#endif // ERROR FINE_PLATFORM
+#else  // ERROR FINE_ARCHITECTURE
 	( void ) eax;
-#endif // !FINE_ARCHITECTURE( FINE_ARM64 )
+#endif // ERROR FINE_ARCHITECTURE
 	}
 
 	static void callCpuIdEx( Regs& outRegs, const RegType& eax, const RegType& ecx ) {
@@ -216,14 +228,14 @@ struct CCPUInfo {
 		__cpuidex((RegType*)( &outRegs ), eax, ecx );
 #elif FINE_PLATFORM( FINE_LINUX ) || FINE_PLATFORM( FINE_DARWIN )
 		__cpuid_count( eax, ecx, outRegs.eax, outRegs.ebx, outRegs.ecx, outRegs.edx );
-#else
+#else  // ERROR FINE_PLATFORM
 	( void ) eax;
 	( void ) ecx;
-#endif
-#else
+#endif // ERROR FINE_PLATFORM
+#else  // ERROR FINE_ARCHITECTURE
 	( void ) eax;
 	( void ) ecx;
-#endif // !FINE_ARCHITECTURE( FINE_ARM64 )
+#endif // ERROR FINE_ARCHITECTURE
 	}
 
 };
diff --git a/NeoMathEngine/src/CPU/CpuMathEngine.cpp b/NeoMathEngine/src/CPU/CpuMathEngine.cpp
@@ -1,4 +1,4 @@
-/* Copyright © 2017-2023 ABBYY
+/* Copyright © 2017-2024 ABBYY
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -43,6 +43,8 @@ limitations under the License.
 
 #endif // NEOML_USE_MKL
 
+const bool NEOMATHENGINE_API CCPUInfo::HasAvx512 = CCPUInfo::IsAvx512Available();
+const bool CCPUInfo::HasAvx512And2Fma = CCPUInfo::HasAvx512 && Avx512FmaUnitCount() > 1;
 const bool CCPUInfo::HasAvxAndFma = CCPUInfo::IsAvxAndFmaAvailable();
 const bool CCPUInfo::IsNotIntel = CCPUInfo::GetCpuArch() != CCPUInfo::TCpuArch::Intel;
 

diff --git a/NeoMathEngine/src/CPU/x86/CpuX86.h b/NeoMathEngine/src/CPU/x86/CpuX86.h
@@ -1,4 +1,4 @@
-/* Copyright © 2017-2023 ABBYY
+/* Copyright © 2017-2024 ABBYY
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@ limitations under the License.
 #include <NeoMathEngine/CrtAllocatedObject.h>
 
 #include "avx2/Avx2Functions.h"
+#include "avx512/Avx512Functions.h"
 #include "../CPUInfo.h"
 
 namespace NeoML {
@@ -341,7 +342,10 @@ inline void dataCopy(float* dst, const float* src, int vectorSize)
 {
 	static_assert( sizeof(float) == sizeof(unsigned int), "Size of float isn't equal to size of unsigned int." );
 
-	if( CCPUInfo::HasAvxAndFma && vectorSize >= NeoML::Avx2::VectorMathMinSize ) {
+	if( CCPUInfo::HasAvx512And2Fma && vectorSize >= NeoML::Avx512::VectorMathMinSize ) {
+		NeoML::Avx512::dataCopy( dst, src, vectorSize );
+		return;
+	} else if( CCPUInfo::HasAvxAndFma && vectorSize >= NeoML::Avx2::VectorMathMinSize ) {
 		NeoML::Avx2::dataCopy( dst, src, vectorSize );
 		return;
 	}

diff --git a/NeoMathEngine/src/CPU/x86/CpuX86FmaCount.cpp b/NeoMathEngine/src/CPU/x86/CpuX86FmaCount.cpp
@@ -0,0 +1,111 @@
+/* Copyright © 2023-2024 ABBYY
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+--------------------------------------------------------------------------------------------------------------*/
+
+#include <common.h>
+#pragma hdrstop
+
+#include <NeoMathEngine/NeoMathEngineDefs.h>
+#include <CPUInfo.h>
+
+#if FINE_ARCHITECTURE( FINE_X64 )
+
+#include <stdint.h>
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#include <immintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void fma_shuffle_tpt( uint64_t loop_cnt );
+void fma_only_tpt( uint64_t loop_cnt );
+
+int64_t rdtsc( void )
+{
+	return __rdtsc();
+}
+
+int fma_unit_count( void )
+{
+	int i;
+	uint64_t fma_shuf_tpt_test[3];
+	uint64_t fma_shuf_tpt_test_min;
+	uint64_t fma_only_tpt_test[3];
+	uint64_t fma_only_tpt_test_min;
+	uint64_t start = 0;
+	int number_of_fma_units_per_core = 2;
+
+	/*********************************************************/
+	/* Step 1: Warmup */
+	/*********************************************************/
+
+	fma_only_tpt( 100000 );
+
+	/*********************************************************/
+	/* Step 2: Execute FMA and Shuffle TPT Test */
+	/*********************************************************/
+	for( i = 0; i < 3; ++i ) {
+		start = rdtsc();
+		fma_shuffle_tpt( 1000 );
+		fma_shuf_tpt_test[i] = rdtsc() - start;
+	}
+
+	/*********************************************************/
+	/* Step 3: Execute FMA only TPT Test */
+	/*********************************************************/
+	for( i = 0; i < 3; ++i ) {
+		start = rdtsc();
+		fma_only_tpt( 1000 );
+		fma_only_tpt_test[i] = rdtsc() - start;
+	}
+
+	/*********************************************************/
+	/* Step 4: Decide if 1 FMA server or 2 FMA server */
+	/*********************************************************/
+	fma_shuf_tpt_test_min = fma_shuf_tpt_test[0];
+	fma_only_tpt_test_min = fma_only_tpt_test[0];
+	for( i = 1; i < 3; ++i ) {
+		if( (int)fma_shuf_tpt_test[i] < (int)fma_shuf_tpt_test_min ) {
+			fma_shuf_tpt_test_min = fma_shuf_tpt_test[i];
+		}
+		if( (int)fma_only_tpt_test[i] < (int)fma_only_tpt_test_min ) {
+			fma_only_tpt_test_min = fma_only_tpt_test[i];
+		}
+	}
+
+	if( ( double( fma_shuf_tpt_test_min ) / fma_only_tpt_test_min ) < 1.5 ) {
+		number_of_fma_units_per_core = 1;
+	}
+
+	printf( " *** x64 AVX512 %d FMA units per core *** \n", number_of_fma_units_per_core );
+	return number_of_fma_units_per_core;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+//-------------------------------------------------------------------------------------------------
+
+int Avx512FmaUnitCount()
+{
+	return fma_unit_count();
+}
+
+#endif // x64