Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NeoMathEngine] try AVX512 vector functions (research..) #895

Draft
wants to merge 8 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion NeoMathEngine/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.11 FATAL_ERROR)

project(NeoMathEngine LANGUAGES CXX)
project(NeoMathEngine LANGUAGES CXX C ASM)

list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
if(USE_FINE_OBJECTS)
Expand Down
30 changes: 30 additions & 0 deletions NeoMathEngine/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,22 @@ if((DARWIN AND BUILD_ARCH MATCHES "^arm64.*") OR (ANDROID AND ANDROID_ABI MATCHE
target_include_directories(${PROJECT_NAME} PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/CPU/arm>)
else()
message(STATUS "USE X86 SOURCES")
if(CMAKE_SIZEOF_VOID_P EQUAL 8) # x64
if(WIN32)
set(CPU_X86_ASM_SOURCES
CPU/x86/CpuX86FmaCount/fma_shuffle_tpt.asm
CPU/x86/CpuX86FmaCount/fma_only_tpt.asm)
else() #if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
target_compile_options(${PROJECT_NAME} PRIVATE
$<$<COMPILE_LANGUAGE:ASM>:-x$<SEMICOLON>assembler-with-cpp>)
set(CPU_X86_ASM_SOURCES
CPU/x86/CpuX86FmaCount/fma_shuffle_tpt.s
CPU/x86/CpuX86FmaCount/fma_only_tpt.s)
endif()
endif()

set(CPU_X86_SOURCES
CPU/x86/CpuX86FmaCount.cpp
CPU/x86/CpuX86MathEngineBlas.cpp
CPU/x86/CpuX86MathEngineBlasMkl.cpp
CPU/x86/CpuX86MathEngineDnn.cpp
Expand All @@ -180,6 +195,7 @@ else()
target_sources(${PROJECT_NAME}
PRIVATE
${CPU_X86_SOURCES}
${CPU_X86_ASM_SOURCES}
CPU/x86/CpuX86.h
CPU/x86/CpuX86Functors.h
CPU/x86/CpuX86MathEngineVectorMathPrivate.h
Expand All @@ -202,6 +218,20 @@ else()
set_property(SOURCE ${CPU_AVX_SOURCES} PROPERTY COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma>)
endif()

set(CPU_AVX512_SOURCES
CPU/x86/avx512/Avx512VectorFunctions.cpp
)
target_sources(${PROJECT_NAME} PRIVATE
${CPU_AVX512_SOURCES}
CPU/x86/avx512/Avx512Functions.h
)
set_property(SOURCE ${CPU_AVX512_SOURCES} PROPERTY UNITY_GROUP 3)
if(WIN32)
set_property(SOURCE ${CPU_AVX512_SOURCES} PROPERTY COMPILE_OPTIONS /arch:AVX512)
else()
set_property(SOURCE ${CPU_AVX512_SOURCES} PROPERTY COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:-mfma -mavx -mavx2 -mavx512f -mavx512dq -mavx512vl>)
endif()

if(NEOML_USE_AVX)
target_sources(${PROJECT_NAME}
PRIVATE
Expand Down
48 changes: 30 additions & 18 deletions NeoMathEngine/src/CPU/CPUInfo.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright © 2017-2023 ABBYY
/* Copyright © 2017-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand All @@ -21,15 +21,22 @@ limitations under the License.
#include <cpuid.h>
#elif FINE_PLATFORM( FINE_WINDOWS )
#include <intrin.h>
#endif
#endif // FINE_PLATFORM

#endif // !FINE_ARCHITECTURE( FINE_ARM64 )
#endif // !FINE_ARCHITECTURE( ARM )

#include <cstring>

#if FINE_ARCHITECTURE( FINE_X64 )
// Intel X86/X64 optimization manual
// https://github.com/intel/optimization-manual/tree/main/chap18/ex25
int Avx512FmaUnitCount();
#else // !x64
inline int Avx512FmaUnitCount() { return 0; }
#endif // !x64

// The structure with CPU information
struct CCPUInfo {
struct CCPUInfo final {
enum class TCpuArch {
Intel,
AMD,
Expand Down Expand Up @@ -117,7 +124,7 @@ struct CCPUInfo {
{
#ifdef NEOML_USE_NEON
return 4;
#else
#else // !NEOML_USE_NEON
int floatAlignment = 4; // SSE alignment

Regs regs;
Expand All @@ -138,15 +145,20 @@ struct CCPUInfo {
}
#elif FINE_PLATFORM(FINE_LINUX) || FINE_PLATFORM(FINE_DARWIN) || FINE_PLATFORM(FINE_ANDROID) || FINE_PLATFORM(FINE_IOS)
floatAlignment = 8;
#else
#else // ERROR FINE_PLATFORM
#error "Platform isn't supported!"
#endif
#endif // ERROR FINE_PLATFORM
}
if( HasAvx512And2Fma ) {
floatAlignment = 16;
}

return floatAlignment;
#endif // NEOML_USE_NEON
#endif // !NEOML_USE_NEON
}

static const bool NEOMATHENGINE_API HasAvx512;
static const bool HasAvx512And2Fma;
static const bool HasAvxAndFma;
static const bool IsNotIntel;

Expand Down Expand Up @@ -184,9 +196,9 @@ struct CCPUInfo {

#if FINE_PLATFORM(FINE_WINDOWS)
typedef int RegType;
#else
#else // !FINE_WINDOWS
typedef unsigned int RegType;
#endif
#endif // !FINE_WINDOWS
struct Regs {
RegType eax;
RegType ebx;
Expand All @@ -201,12 +213,12 @@ struct CCPUInfo {
__cpuid( ( RegType* )( &outRegs ), eax );
#elif FINE_PLATFORM( FINE_LINUX ) || FINE_PLATFORM( FINE_DARWIN )
__get_cpuid( eax, &outRegs.eax, &outRegs.ebx, &outRegs.ecx, &outRegs.edx );
#else
#else // ERROR FINE_PLATFORM
( void ) eax;
#endif
#else
#endif // ERROR FINE_PLATFORM
#else // ERROR FINE_ARCHITECTURE
( void ) eax;
#endif // !FINE_ARCHITECTURE( FINE_ARM64 )
#endif // ERROR FINE_ARCHITECTURE
}

static void callCpuIdEx( Regs& outRegs, const RegType& eax, const RegType& ecx ) {
Expand All @@ -216,14 +228,14 @@ struct CCPUInfo {
__cpuidex((RegType*)( &outRegs ), eax, ecx );
#elif FINE_PLATFORM( FINE_LINUX ) || FINE_PLATFORM( FINE_DARWIN )
__cpuid_count( eax, ecx, outRegs.eax, outRegs.ebx, outRegs.ecx, outRegs.edx );
#else
#else // ERROR FINE_PLATFORM
( void ) eax;
( void ) ecx;
#endif
#else
#endif // ERROR FINE_PLATFORM
#else // ERROR FINE_ARCHITECTURE
( void ) eax;
( void ) ecx;
#endif // !FINE_ARCHITECTURE( FINE_ARM64 )
#endif // ERROR FINE_ARCHITECTURE
}

};
4 changes: 3 additions & 1 deletion NeoMathEngine/src/CPU/CpuMathEngine.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright © 2017-2023 ABBYY
/* Copyright © 2017-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -43,6 +43,8 @@ limitations under the License.

#endif // NEOML_USE_MKL

const bool NEOMATHENGINE_API CCPUInfo::HasAvx512 = CCPUInfo::IsAvx512Available();
const bool CCPUInfo::HasAvx512And2Fma = CCPUInfo::HasAvx512 && Avx512FmaUnitCount() > 1;
const bool CCPUInfo::HasAvxAndFma = CCPUInfo::IsAvxAndFmaAvailable();
const bool CCPUInfo::IsNotIntel = CCPUInfo::GetCpuArch() != CCPUInfo::TCpuArch::Intel;

Expand Down
8 changes: 6 additions & 2 deletions NeoMathEngine/src/CPU/x86/CpuX86.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright © 2017-2023 ABBYY
/* Copyright © 2017-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -35,6 +35,7 @@ limitations under the License.
#include <NeoMathEngine/CrtAllocatedObject.h>

#include "avx2/Avx2Functions.h"
#include "avx512/Avx512Functions.h"
#include "../CPUInfo.h"

namespace NeoML {
Expand Down Expand Up @@ -341,7 +342,10 @@ inline void dataCopy(float* dst, const float* src, int vectorSize)
{
static_assert( sizeof(float) == sizeof(unsigned int), "Size of float isn't equal to size of unsigned int." );

if( CCPUInfo::HasAvxAndFma && vectorSize >= NeoML::Avx2::VectorMathMinSize ) {
if( CCPUInfo::HasAvx512And2Fma && vectorSize >= NeoML::Avx512::VectorMathMinSize ) {
NeoML::Avx512::dataCopy( dst, src, vectorSize );
return;
} else if( CCPUInfo::HasAvxAndFma && vectorSize >= NeoML::Avx2::VectorMathMinSize ) {
NeoML::Avx2::dataCopy( dst, src, vectorSize );
return;
}
Expand Down
111 changes: 111 additions & 0 deletions NeoMathEngine/src/CPU/x86/CpuX86FmaCount.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/* Copyright © 2023-2024 ABBYY

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--------------------------------------------------------------------------------------------------------------*/

#include <common.h>
#pragma hdrstop

#include <NeoMathEngine/NeoMathEngineDefs.h>
#include <CPUInfo.h>

#if FINE_ARCHITECTURE( FINE_X64 )

#include <stdint.h>
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#include <immintrin.h>

#ifdef __cplusplus
extern "C" {
#endif

void fma_shuffle_tpt( uint64_t loop_cnt );
void fma_only_tpt( uint64_t loop_cnt );

int64_t rdtsc( void )
{
return __rdtsc();
}

int fma_unit_count( void )
{
int i;
uint64_t fma_shuf_tpt_test[3];
uint64_t fma_shuf_tpt_test_min;
uint64_t fma_only_tpt_test[3];
uint64_t fma_only_tpt_test_min;
uint64_t start = 0;
int number_of_fma_units_per_core = 2;

/*********************************************************/
/* Step 1: Warmup */
/*********************************************************/

fma_only_tpt( 100000 );

/*********************************************************/
/* Step 2: Execute FMA and Shuffle TPT Test */
/*********************************************************/
for( i = 0; i < 3; ++i ) {
start = rdtsc();
fma_shuffle_tpt( 1000 );
fma_shuf_tpt_test[i] = rdtsc() - start;
}

/*********************************************************/
/* Step 3: Execute FMA only TPT Test */
/*********************************************************/
for( i = 0; i < 3; ++i ) {
start = rdtsc();
fma_only_tpt( 1000 );
fma_only_tpt_test[i] = rdtsc() - start;
}

/*********************************************************/
/* Step 4: Decide if 1 FMA server or 2 FMA server */
/*********************************************************/
fma_shuf_tpt_test_min = fma_shuf_tpt_test[0];
fma_only_tpt_test_min = fma_only_tpt_test[0];
for( i = 1; i < 3; ++i ) {
if( (int)fma_shuf_tpt_test[i] < (int)fma_shuf_tpt_test_min ) {
fma_shuf_tpt_test_min = fma_shuf_tpt_test[i];
}
if( (int)fma_only_tpt_test[i] < (int)fma_only_tpt_test_min ) {
fma_only_tpt_test_min = fma_only_tpt_test[i];
}
}

if( ( double( fma_shuf_tpt_test_min ) / fma_only_tpt_test_min ) < 1.5 ) {
number_of_fma_units_per_core = 1;
}

printf( " *** x64 AVX512 %d FMA units per core *** \n", number_of_fma_units_per_core );
return number_of_fma_units_per_core;
}

#ifdef __cplusplus
}
#endif

//-------------------------------------------------------------------------------------------------

int Avx512FmaUnitCount()
{
return fma_unit_count();
}

#endif // x64
Loading