Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test zen4 support on azure #548

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/tests_archdetect.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ jobs:
- x86_64/intel/skylake_avx512/archspec-linux-6132
- x86_64/amd/zen2/Azure-CentOS7-7V12
- x86_64/amd/zen3/Azure-CentOS7-7V73X
- x86_64/amd/zen4/Azure-Alma8-9V33X
- x86_64/amd/zen4/Shinx-RHEL8-9654
- aarch64/neoverse_n1/Azure-Ubuntu20-Altra
- aarch64/neoverse_n1/AWS-awslinux-graviton2
- aarch64/neoverse_v1/AWS-awslinux-graviton3
Expand Down
31 changes: 30 additions & 1 deletion EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,24 @@ else
mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
fi

# If this script runs for the first time for a software subdirectory, e.g., to
# start building for a new stack, we need to ensure certain files are present in
# ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
#   - .lmod/lmodrc.lua
#   - .lmod/SitePackage.lua
# For each file that is missing, the matching script in ${TOPDIR} is run with
# the software path as its only argument (presumably that script creates the
# file; it is not part of this section).
# All expansions are quoted so that paths containing whitespace neither break
# the file tests nor get split into several arguments.
_eessi_software_path="${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}"
_lmod_cfg_dir="${_eessi_software_path}/.lmod"
_lmod_rc_file="${_lmod_cfg_dir}/lmodrc.lua"
if [ ! -f "${_lmod_rc_file}" ]; then
    # show which python3 is going to be used (useful when reading build logs)
    command -V python3
    python3 "${TOPDIR}/create_lmodrc.py" "${_eessi_software_path}"
fi
_lmod_sitepackage_file="${_lmod_cfg_dir}/SitePackage.lua"
if [ ! -f "${_lmod_sitepackage_file}" ]; then
    # show which python3 is going to be used (useful when reading build logs)
    command -V python3
    python3 "${TOPDIR}/create_lmodsitepackage.py" "${_eessi_software_path}"
fi

# Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE)
# $EESSI_SILENT - don't print any messages
# $EESSI_BASIC_ENV - give a basic set of environment variables
Expand Down Expand Up @@ -203,10 +221,21 @@ ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}
# Hardcode this for now, see if it works
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
# The install_cuda... script uses EasyBuild. So, we need to check if we have an
# EasyBuild module available, or skip the CUDA installation step otherwise.
# The lines of 'module avail' that mention EasyBuild are stored in
# ${module_avail_out} so they can be inspected afterwards.
# Fall back to /tmp when TMPDIR is unset or empty; otherwise the redirect below
# would target '/ml.out'.
module_avail_out="${TMPDIR:-/tmp}/ml.out"
# 'module avail' is combined with 2>&1 so that grep sees both of its streams;
# the pipeline is tested directly, so its status is grep's match result
if module avail 2>&1 | grep EasyBuild &> "${module_avail_out}"; then
    echo_green ">> Found an EasyBuild module"
else
    echo_yellow ">> No EasyBuild module found: skipping step to install CUDA (see output in ${module_avail_out})"
    export skip_cuda_install=True
fi

# Install the CUDA SDK (12.1.1) in host_injections, unless skip_cuda_install
# has been set to a non-empty value. Any non-empty value skips the step, so a
# single emptiness test is sufficient here.
if [ -z "${skip_cuda_install}" ]; then
    "${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh" -c 12.1.1 --accept-cuda-eula
else
    # one message covering both reasons for skipping
    echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
fi

# Install drivers in host_injections
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Easystack: list of easyconfigs to install.
# EasyBuild-4.9.1.eb is requested with the 'from-pr: 20299' option;
# ReFrame-4.3.3.eb is requested without extra options.
easyconfigs:
  - EasyBuild-4.9.1.eb:
      options:
        from-pr: 20299
  - ReFrame-4.3.3.eb
42 changes: 39 additions & 3 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -594,11 +594,47 @@ declare -a EESSI_FUSE_MOUNTS=()
# always mount cvmfs-config repo (to get access to software.eessi.io)
EESSI_FUSE_MOUNTS+=("--fusemount" "container:cvmfs2 cvmfs-config.cern.ch /cvmfs/cvmfs-config.cern.ch")

# check if we got some data via --resume and, if so, use the overlayfs
# example scenario:
#   1st step: some software is built in rw mode
#   2nd step: the software is tested in ro mode (same access as when we would
#             use a repository)
# In both cases the mount specifications are appended to EESSI_FUSE_MOUNTS as
# ("--fusemount" "<spec>") pairs, each pair exactly once.
if [[ "${ACCESS}" == "ro" ]]; then
    if [[ -d "${EESSI_TMPDIR}/overlay-upper" ]]; then
        # the overlay-upper directory is only created in a read-write-session, thus
        # we are resuming from such a session here (otherwise there shouldn't be such
        # directory yet as it is only created for read-write-sessions a bit further
        # below); the overlay-upper directory can only exist because it is part of
        # the ${RESUME} directory or tarball
        # to be able to see the contents of the read-write session we have to mount
        # the fuse-overlayfs (in read-only mode) on top of the CernVM-FS repository

        # make sure the overlay-upper directory exists
        mkdir -p "${EESSI_TMPDIR}/overlay-upper"

        # make the target CernVM-FS repository available under /cvmfs_ro
        export EESSI_READONLY="container:cvmfs2 ${repo_name} /cvmfs_ro/${repo_name}"

        EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY}")

        # now, put the overlay-upper read-only on top of the repo and make it
        # available under the usual prefix /cvmfs
        EESSI_READONLY_OVERLAY="container:fuse-overlayfs"
        # ${EESSI_TMPDIR} is bind mounted to /tmp, hence ${EESSI_TMPDIR}/overlay-upper
        # becomes available as /tmp/overlay-upper
        # the left-most lower dir is put on top; with no upperdir=... the whole
        # overlayfs is made available read-only
        EESSI_READONLY_OVERLAY+=" -o lowerdir=/tmp/overlay-upper:/cvmfs_ro/${repo_name}"
        EESSI_READONLY_OVERLAY+=" ${EESSI_CVMFS_REPO}"
        export EESSI_READONLY_OVERLAY

        EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY_OVERLAY}")
        export EESSI_FUSE_MOUNTS
    else
        # no overlay-upper directory means we are in a plain read-only session and
        # don't need any fuse-overlayfs
        export EESSI_READONLY="container:cvmfs2 ${repo_name} /cvmfs/${repo_name}"

        EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY}")
        export EESSI_FUSE_MOUNTS
    fi
fi

if [[ "${ACCESS}" == "rw" ]]; then
Expand Down
7 changes: 4 additions & 3 deletions init/arch_specs/eessi_arch_x86.spec
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# x86_64 CPU architecture specifications
# Software path in EESSI | Vendor ID | List of defining CPU features
"x86_64/intel/haswell" "GenuineIntel" "avx2 fma" # Intel Haswell, Broadwell
"x86_64/intel/haswell" "GenuineIntel" "avx2 fma" # Intel Haswell, Broadwell
"x86_64/intel/skylake_avx512" "GenuineIntel" "avx2 fma avx512f avx512bw avx512cd avx512dq avx512vl" # Intel Skylake, Cascade Lake
"x86_64/amd/zen2" "AuthenticAMD" "avx2 fma" # AMD Rome
"x86_64/amd/zen3" "AuthenticAMD" "avx2 fma vaes" # AMD Milan, Milan-X
"x86_64/amd/zen2" "AuthenticAMD" "avx2 fma" # AMD Rome
"x86_64/amd/zen3" "AuthenticAMD" "avx2 fma vaes" # AMD Milan, Milan-X
"x86_64/amd/zen4" "AuthenticAMD" "avx2 fma vaes avx512f avx512ifma" # AMD Genoa, Genoa-X
51 changes: 32 additions & 19 deletions init/eessi_environment_variables
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,39 @@ if [ -d $EESSI_PREFIX ]; then
if [ -d $EESSI_EPREFIX ]; then

# determine subdirectory in software layer
if [ "$EESSI_USE_ARCHDETECT" == "1" ]; then
# if archdetect is enabled, use internal code
all_cpupaths=$(${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh -a cpupath)
# iterate over colon-separated list verifying if the architecture is present
# under $EESSI_PREFIX/software/$EESSI_OS_TYPE; if so use the architecture as best match
IFS=: read -r -a archs <<< "${all_cpupaths}"
for arch in "${archs[@]}"; do
if [ -d ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${arch} ]; then
export EESSI_SOFTWARE_SUBDIR=${arch}
show_msg "archdetect says ${EESSI_SOFTWARE_SUBDIR}"
break
fi
done
elif [ "$EESSI_USE_ARCHSPEC" == "1" ]; then
# note: eessi_software_subdir_for_host.py will pick up value from $EESSI_SOFTWARE_SUBDIR_OVERRIDE if it's defined!
export EESSI_EPREFIX_PYTHON=$EESSI_EPREFIX/usr/bin/python3
export EESSI_SOFTWARE_SUBDIR=$($EESSI_EPREFIX_PYTHON ${EESSI_INIT_DIR_PATH}/eessi_software_subdir_for_host.py $EESSI_PREFIX)
show_msg "archspec says ${EESSI_SOFTWARE_SUBDIR}"
# Determine EESSI_SOFTWARE_SUBDIR, in order of precedence:
#   1. $EESSI_SOFTWARE_SUBDIR_OVERRIDE, if non-empty (no detection is done)
#   2. archdetect, if $EESSI_USE_ARCHDETECT is "1"
#   3. archspec, if $EESSI_USE_ARCHSPEC is "1"
# If none applies, error is called. With archdetect, EESSI_SOFTWARE_SUBDIR is
# left untouched when none of the reported CPU paths exists as a directory.
# Paths are quoted so that the directory tests also work when they contain
# whitespace.
if [ -n "$EESSI_SOFTWARE_SUBDIR_OVERRIDE" ]; then
    # a specific software subdirectory is given, so we use that and don't
    # detect it
    export EESSI_SOFTWARE_SUBDIR=${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
    # however, check if the directory is present in the CernVM-FS repository
    # and show a warning if it isn't
    cvmfs_software_path=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR}
    if [ ! -d "${cvmfs_software_path}" ]; then
        show_msg "Warning: EESSI_SOFTWARE_SUBDIR_OVERRIDE is '${EESSI_SOFTWARE_SUBDIR_OVERRIDE}',"
        show_msg " but directory '${cvmfs_software_path}' does NOT exist in CernVM-FS repository"
    fi
elif [ "$EESSI_USE_ARCHDETECT" == "1" ]; then
    # if archdetect is enabled, use internal code
    all_cpupaths=$("${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh" -a cpupath)
    # iterate over colon-separated list verifying if the architecture is present
    # under $EESSI_PREFIX/software/$EESSI_OS_TYPE; if so use the architecture as best match
    IFS=: read -r -a archs <<< "${all_cpupaths}"
    for arch in "${archs[@]}"; do
        if [ -d "${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${arch}" ]; then
            export EESSI_SOFTWARE_SUBDIR=${arch}
            show_msg "archdetect says ${EESSI_SOFTWARE_SUBDIR}"
            break
        fi
    done
elif [ "$EESSI_USE_ARCHSPEC" == "1" ]; then
    # note: eessi_software_subdir_for_host.py will pick up value from $EESSI_SOFTWARE_SUBDIR_OVERRIDE if it's defined!
    export EESSI_EPREFIX_PYTHON=$EESSI_EPREFIX/usr/bin/python3
    export EESSI_SOFTWARE_SUBDIR=$("$EESSI_EPREFIX_PYTHON" "${EESSI_INIT_DIR_PATH}/eessi_software_subdir_for_host.py" "$EESSI_PREFIX")
    show_msg "archspec says ${EESSI_SOFTWARE_SUBDIR}"
else
    error "Don't know how to detect host CPU, giving up!"
fi
if [ ! -z $EESSI_SOFTWARE_SUBDIR ]; then

Expand Down
8 changes: 6 additions & 2 deletions test_suite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,17 @@ fi
# Point TMPDIR at a freshly created temporary directory (replaces any value
# TMPDIR had before this line).
TMPDIR=$(mktemp -d)

echo ">> Setting up environment..."
# Unload all currently loaded modules.
module --force purge
# Set EESSI_SOFTWARE_SUBDIR_OVERRIDE from the output of eessi_software_subdir.py,
# which is run with $DETECTION_PARAMETERS as its arguments.
# NOTE(review): the two TODOs below carry commented-out copies of this line and
# of the purge above - confirm which variant is meant to be active.
export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS)
# TODO test if module is declared, if so purge modules
#module --force purge
# TODO why do we need to set this here? should already be defined by bot
#export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS)
# Print the value in effect so it shows up in the test log.
echo "EESSI_SOFTWARE_SUBDIR_OVERRIDE='${EESSI_SOFTWARE_SUBDIR_OVERRIDE}'"

# Run the init script in the current shell.
source $TOPDIR/init/bash

# Load the ReFrame module
# Currently, we load the default version. Maybe we should somehow make this configurable in the future?
# TODO what if no ReFrame module is available yet? --> FAILURE with description?
module load ReFrame
if [[ $? -eq 0 ]]; then
echo_green ">> Loaded ReFrame module"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
x86_64/amd/zen4:x86_64/amd/zen3:x86_64/amd/zen2:x86_64/generic
27 changes: 27 additions & 0 deletions tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.cpuinfo
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
processor : 0
vendor_id : AuthenticAMD
cpu family : 25
model : 17
model name : AMD EPYC 9V33X 96-Core Processor
stepping : 1
microcode : 0xffffffff
cpu MHz : 3705.853
cache size : 1024 KB
physical id : 0
siblings : 88
core id : 0
cpu cores : 88
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx512_bf16 clzero xsaveerptr arat npt nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload avx512vbmi umip avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm
bugs : sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass
bogomips : 5100.08
TLB size : 3584 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 48 bits physical, 48 bits virtual
power management:
1 change: 1 addition & 0 deletions tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
x86_64/amd/zen4
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
x86_64/amd/zen4:x86_64/amd/zen3:x86_64/amd/zen2:x86_64/generic
27 changes: 27 additions & 0 deletions tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.cpuinfo
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
processor : 0
vendor_id : AuthenticAMD
cpu family : 25
model : 17
model name : AMD EPYC 9654 96-Core Processor
stepping : 1
microcode : 0xa10113e
cpu MHz : 3699.993
cache size : 1024 KB
physical id : 0
siblings : 96
core id : 0
cpu cores : 96
apicid : 0
initial apicid : 0
fpu : yes
fpu_exception : yes
cpuid level : 16
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba perfmon_v2 ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d
bugs : sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass
bogomips : 4799.99
TLB size : 3584 4K pages
clflush size : 64
cache_alignment : 64
address sizes : 52 bits physical, 57 bits virtual
power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14]
1 change: 1 addition & 0 deletions tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.output
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
x86_64/amd/zen4