diff --git a/.github/workflows/tests_archdetect.yml b/.github/workflows/tests_archdetect.yml index 922c9a1bf0..bee348995d 100644 --- a/.github/workflows/tests_archdetect.yml +++ b/.github/workflows/tests_archdetect.yml @@ -13,6 +13,8 @@ jobs: - x86_64/intel/skylake_avx512/archspec-linux-6132 - x86_64/amd/zen2/Azure-CentOS7-7V12 - x86_64/amd/zen3/Azure-CentOS7-7V73X + - x86_64/amd/zen4/Azure-Alma8-9V33X + - x86_64/amd/zen4/Shinx-RHEL8-9654 - aarch64/neoverse_n1/Azure-Ubuntu20-Altra - aarch64/neoverse_n1/AWS-awslinux-graviton2 - aarch64/neoverse_v1/AWS-awslinux-graviton3 diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 91effe4aba..8a5789c2b2 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -147,6 +147,24 @@ else mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE} fi +# if we run the script for the first time, e.g., to start building for a new +# stack, we need to ensure certain files are present in +# ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE} +# - .lmod/lmodrc.lua +# - .lmod/SitePackage.lua +_eessi_software_path=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE} +_lmod_cfg_dir=${_eessi_software_path}/.lmod +_lmod_rc_file=${_lmod_cfg_dir}/lmodrc.lua +if [ ! -f ${_lmod_rc_file} ]; then + command -V python3 + python3 ${TOPDIR}/create_lmodrc.py ${_eessi_software_path} +fi +_lmod_sitepackage_file=${_lmod_cfg_dir}/SitePackage.lua +if [ ! -f ${_lmod_sitepackage_file} ]; then + command -V python3 + python3 ${TOPDIR}/create_lmodsitepackage.py ${_eessi_software_path} +fi + # Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE) # $EESSI_SILENT - don't print any messages # $EESSI_BASIC_ENV - give a basic set of environment variables @@ -203,10 +221,21 @@ ${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX} # Hardcode this for now, see if it works # TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install # Allow skipping CUDA SDK install in e.g. CI environments +# The install_cuda... script uses EasyBuild. So, we need to check if we have EB +# or skip this step. +module_avail_out=$TMPDIR/ml.out +module avail 2>&1 | grep EasyBuild &> ${module_avail_out} +if [[ $? -eq 0 ]]; then + echo_green ">> Found an EasyBuild module" +else + echo_yellow ">> No EasyBuild module found: skipping step to install CUDA (see output in ${module_avail_out})" + export skip_cuda_install=True +fi + if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then ${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula else - echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed" + echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found" fi # Install drivers in host_injections diff --git a/easystacks/software.eessi.io/2023.06/zen4/eessi-2023.06-eb-4.9.1-001-system.yml b/easystacks/software.eessi.io/2023.06/zen4/eessi-2023.06-eb-4.9.1-001-system.yml new file mode 100644 index 0000000000..c4da7a90e7 --- /dev/null +++ b/easystacks/software.eessi.io/2023.06/zen4/eessi-2023.06-eb-4.9.1-001-system.yml @@ -0,0 +1,5 @@ +easyconfigs: + - EasyBuild-4.9.1.eb: + options: + from-pr: 20299 + - ReFrame-4.3.3.eb diff --git a/eessi_container.sh b/eessi_container.sh index 7d00d1400c..8430a44d2c 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -594,11 +594,47 @@ declare -a EESSI_FUSE_MOUNTS=() # always mount cvmfs-config repo (to get access to software.eessi.io) EESSI_FUSE_MOUNTS+=("--fusemount" "container:cvmfs2 cvmfs-config.cern.ch /cvmfs/cvmfs-config.cern.ch") +# check if we got some data via --resume and, if so, use the overlayfs +# example scenario: +# 1st step: some software is build in rw mode +# 2nd step: the software is tested in ro mode (same access as when we would use a +# repository) if [[ "${ACCESS}" == "ro" ]]; then - export EESSI_READONLY="container:cvmfs2 ${repo_name} /cvmfs/${repo_name}" + if [[ -d ${EESSI_TMPDIR}/overlay-upper ]]; then + # the overlay-upper directory is only created in a read-write-session, thus + # we are resuming from such a session here (otherwise there shouldn't be such + # directory yet as it is only created for read-write-sessions a bit further + # below); the overlay-upper directory can only exist because it is part of + # the ${RESUME} directory or tarball + # to be able to see the contents of the read-write session we have to mount + # the fuse-overlayfs (in read-only mode) on top of the CernVM-FS repository + + # make sure the overlay-upper directory exists + mkdir -p ${EESSI_TMPDIR}/overlay-upper + + # make the target CernVM-FS repository available under /cvmfs_ro + export EESSI_READONLY="container:cvmfs2 ${repo_name} /cvmfs_ro/${repo_name}" + + EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY}") + + # now, put the overlay-upper read-only on top of the repo and make it under the usual prefix /cvmfs available + EESSI_READONLY_OVERLAY="container:fuse-overlayfs" + # ${EESSI_TMPDIR} is bind mounted to /tmp, hence ${EESSI_TMPDIR}/overlay-upper becomes available as /tmp/overlay-upper + # the left-most lower dir is put on top, with no upperdir=... the whole overlayfs is made available read-only + EESSI_READONLY_OVERLAY+=" -o lowerdir=/tmp/overlay-upper:/cvmfs_ro/${repo_name}" + EESSI_READONLY_OVERLAY+=" ${EESSI_CVMFS_REPO}" + export EESSI_READONLY_OVERLAY + + EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY_OVERLAY}") + export EESSI_FUSE_MOUNTS + else + # no overlay-upper directory means we are in a plain read-only session and + # don't need any fuse-overlayfs + export EESSI_READONLY="container:cvmfs2 ${repo_name} /cvmfs/${repo_name}" - EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY}") - export EESSI_FUSE_MOUNTS + EESSI_FUSE_MOUNTS+=("--fusemount" "${EESSI_READONLY}") + export EESSI_FUSE_MOUNTS + fi fi if [[ "${ACCESS}" == "rw" ]]; then diff --git a/init/arch_specs/eessi_arch_x86.spec b/init/arch_specs/eessi_arch_x86.spec index 8d01cb0c03..81a7429a8d 100755 --- a/init/arch_specs/eessi_arch_x86.spec +++ b/init/arch_specs/eessi_arch_x86.spec @@ -1,6 +1,7 @@ # x86_64 CPU architecture specifications # Software path in EESSI | Vendor ID | List of defining CPU features -"x86_64/intel/haswell" "GenuineIntel" "avx2 fma" # Intel Haswell, Broadwell +"x86_64/intel/haswell" "GenuineIntel" "avx2 fma" # Intel Haswell, Broadwell "x86_64/intel/skylake_avx512" "GenuineIntel" "avx2 fma avx512f avx512bw avx512cd avx512dq avx512vl" # Intel Skylake, Cascade Lake -"x86_64/amd/zen2" "AuthenticAMD" "avx2 fma" # AMD Rome -"x86_64/amd/zen3" "AuthenticAMD" "avx2 fma vaes" # AMD Milan, Milan-X +"x86_64/amd/zen2" "AuthenticAMD" "avx2 fma" # AMD Rome +"x86_64/amd/zen3" "AuthenticAMD" "avx2 fma vaes" # AMD Milan, Milan-X +"x86_64/amd/zen4" "AuthenticAMD" "avx2 fma vaes avx512f avx512ifma" # AMD Genoa, Genoa-X diff --git a/init/eessi_environment_variables b/init/eessi_environment_variables index 5450b2bfb4..855693ce2d 100644 --- a/init/eessi_environment_variables +++ b/init/eessi_environment_variables @@ -25,26 +25,39 @@ if [ -d $EESSI_PREFIX ]; then if [ -d $EESSI_EPREFIX ]; then # determine subdirectory in software layer - if [ "$EESSI_USE_ARCHDETECT" == "1" ]; then - # if archdetect is enabled, use internal code - all_cpupaths=$(${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh -a cpupath) - # iterate over colon-separated list verifying if the architecture is present - # under $EESSI_PREFIX/software/$EESSI_OS_TYPE; if so use the architecture as best match - IFS=: read -r -a archs <<< "${all_cpupaths}" - for arch in "${archs[@]}"; do - if [ -d ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${arch} ]; then - export EESSI_SOFTWARE_SUBDIR=${arch} - show_msg "archdetect says ${EESSI_SOFTWARE_SUBDIR}" - break - fi - done - elif [ "$EESSI_USE_ARCHSPEC" == "1" ]; then - # note: eessi_software_subdir_for_host.py will pick up value from $EESSI_SOFTWARE_SUBDIR_OVERRIDE if it's defined! - export EESSI_EPREFIX_PYTHON=$EESSI_EPREFIX/usr/bin/python3 - export EESSI_SOFTWARE_SUBDIR=$($EESSI_EPREFIX_PYTHON ${EESSI_INIT_DIR_PATH}/eessi_software_subdir_for_host.py $EESSI_PREFIX) - show_msg "archspec says ${EESSI_SOFTWARE_SUBDIR}" + if [ ! -z "$EESSI_SOFTWARE_SUBDIR_OVERRIDE" ]; then + # a specific software subdirectory is given, so we use that and don't + # detect it + export EESSI_SOFTWARE_SUBDIR=${EESSI_SOFTWARE_SUBDIR_OVERRIDE} + # however, check if the directory is present in the CernVM-FS repository + # and show a warning if it isn't + cvmfs_software_path=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR} + if [ ! -d ${cvmfs_software_path} ]; then + show_msg "Warning: EESSI_SOFTWARE_SUBDIR_OVERRIDE is '${EESSI_SOFTWARE_SUBDIR_OVERRIDE}'," + show_msg " but directory '${cvmfs_software_path}' does NOT exist in CernVM-FS repository" + fi else - error "Don't know how to detect host CPU, giving up!" + if [ "$EESSI_USE_ARCHDETECT" == "1" ]; then + # if archdetect is enabled, use internal code + all_cpupaths=$(${EESSI_INIT_DIR_PATH}/eessi_archdetect.sh -a cpupath) + # iterate over colon-separated list verifying if the architecture is present + # under $EESSI_PREFIX/software/$EESSI_OS_TYPE; if so use the architecture as best match + IFS=: read -r -a archs <<< "${all_cpupaths}" + for arch in "${archs[@]}"; do + if [ -d ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${arch} ]; then + export EESSI_SOFTWARE_SUBDIR=${arch} + show_msg "archdetect says ${EESSI_SOFTWARE_SUBDIR}" + break + fi + done + elif [ "$EESSI_USE_ARCHSPEC" == "1" ]; then + # note: eessi_software_subdir_for_host.py will pick up value from $EESSI_SOFTWARE_SUBDIR_OVERRIDE if it's defined! + export EESSI_EPREFIX_PYTHON=$EESSI_EPREFIX/usr/bin/python3 + export EESSI_SOFTWARE_SUBDIR=$($EESSI_EPREFIX_PYTHON ${EESSI_INIT_DIR_PATH}/eessi_software_subdir_for_host.py $EESSI_PREFIX) + show_msg "archspec says ${EESSI_SOFTWARE_SUBDIR}" + else + error "Don't know how to detect host CPU, giving up!" + fi fi if [ ! -z $EESSI_SOFTWARE_SUBDIR ]; then diff --git a/test_suite.sh b/test_suite.sh index 95eb9daa2a..3712c8c716 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -74,13 +74,17 @@ fi TMPDIR=$(mktemp -d) echo ">> Setting up environment..." -module --force purge -export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS) +# TODO test if module is declared, if so purge modules +#module --force purge +# TODO why do we need to set this here? should already be defined by bot +#export EESSI_SOFTWARE_SUBDIR_OVERRIDE=$(python3 $TOPDIR/eessi_software_subdir.py $DETECTION_PARAMETERS) +echo "EESSI_SOFTWARE_SUBDIR_OVERRIDE='${EESSI_SOFTWARE_SUBDIR_OVERRIDE}'" source $TOPDIR/init/bash # Load the ReFrame module # Currently, we load the default version. Maybe we should somehow make this configurable in the future? +# TODO what if no ReFrame module is available yet? --> FAILURE with description? module load ReFrame if [[ $? -eq 0 ]]; then echo_green ">> Loaded ReFrame module" diff --git a/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.all.output b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.all.output new file mode 100644 index 0000000000..e1bbd79e4a --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.all.output @@ -0,0 +1 @@ +x86_64/amd/zen4:x86_64/amd/zen3:x86_64/amd/zen2:x86_64/generic diff --git a/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.cpuinfo b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.cpuinfo new file mode 100644 index 0000000000..4a97da862c --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.cpuinfo @@ -0,0 +1,27 @@ +processor : 0 +vendor_id : AuthenticAMD +cpu family : 25 +model : 17 +model name : AMD EPYC 9V33X 96-Core Processor +stepping : 1 +microcode : 0xffffffff +cpu MHz : 3705.853 +cache size : 1024 KB +physical id : 0 +siblings : 88 +core id : 0 +cpu cores : 88 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core invpcid_single vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx512_bf16 clzero xsaveerptr arat npt nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload avx512vbmi umip avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm +bugs : sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass +bogomips : 5100.08 +TLB size : 3584 4K pages +clflush size : 64 +cache_alignment : 64 +address sizes : 48 bits physical, 48 bits virtual +power management: diff --git a/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.output b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.output new file mode 100644 index 0000000000..950740a78c --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Azure-Alma8-9V33X.output @@ -0,0 +1 @@ +x86_64/amd/zen4 diff --git a/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.all.output b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.all.output new file mode 100644 index 0000000000..e1bbd79e4a --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.all.output @@ -0,0 +1 @@ +x86_64/amd/zen4:x86_64/amd/zen3:x86_64/amd/zen2:x86_64/generic diff --git a/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.cpuinfo b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.cpuinfo new file mode 100644 index 0000000000..f28381d7a2 --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.cpuinfo @@ -0,0 +1,27 @@ +processor : 0 +vendor_id : AuthenticAMD +cpu family : 25 +model : 17 +model name : AMD EPYC 9654 96-Core Processor +stepping : 1 +microcode : 0xa10113e +cpu MHz : 3699.993 +cache size : 1024 KB +physical id : 0 +siblings : 96 +core id : 0 +cpu cores : 96 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 16 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba perfmon_v2 ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a avx512f avx512dq rdseed adx avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local avx512_bf16 clzero irperf xsaveerptr wbnoinvd amd_ppin cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl avx512vbmi umip pku ospke avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid overflow_recov succor smca fsrm flush_l1d +bugs : sysret_ss_attrs spectre_v1 spectre_v2 spec_store_bypass +bogomips : 4799.99 +TLB size : 3584 4K pages +clflush size : 64 +cache_alignment : 64 +address sizes : 52 bits physical, 57 bits virtual +power management: ts ttp tm hwpstate cpb eff_freq_ro [13] [14] diff --git a/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.output b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.output new file mode 100644 index 0000000000..950740a78c --- /dev/null +++ b/tests/archdetect/x86_64/amd/zen4/Shinx-RHEL8-9654.output @@ -0,0 +1 @@ +x86_64/amd/zen4