diff --git a/bot/build.sh b/bot/build.sh
index 5667c06f..69b4c9f8 100755
--- a/bot/build.sh
+++ b/bot/build.sh
@@ -261,13 +261,6 @@ declare -a BUILD_STEP_ARGS=()
 BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
 BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
 
-# add options required to handle NVIDIA support
-if nvidia_gpu_available; then
-    BUILD_STEP_ARGS+=("--nvidia" "all")
-else
-    BUILD_STEP_ARGS+=("--nvidia" "install")
-fi
-
 # Retain location for host injections so we don't reinstall CUDA
 # (Always need to run the driver installation as available driver may change)
 if [[ ! -z ${SHARED_FS_PATH} ]]; then
@@ -294,19 +287,38 @@ else
     # prepend accel/ to all array elements
     EESSI_ACCELERATOR_TARGET_OVERRIDES=("${ACCEL_OVERRIDES_ARRAY[@]/#/accel/}")
 fi
+RESUME_DIR=""
+
 for ACCEL_OVERRIDE in "${EESSI_ACCELERATOR_TARGET_OVERRIDES[@]}"; do
+    # copy the common build step arguments to a per-accelerator array
+    BUILD_STEP_ARGS_ACCEL=("${BUILD_STEP_ARGS[@]}")
+    if [[ "${ACCEL_OVERRIDE}" == "accel/nvidia/"* ]]; then
+        nvidia_cc=${ACCEL_OVERRIDE##*/cc}
+        # add options required to handle NVIDIA support
+        # only make the GPU available in the container if the host has a GPU and it has the correct compute capability
+        if nvidia_gpu_available && nvidia_gpu_has_compute_capability "${nvidia_cc}" ; then
+            BUILD_STEP_ARGS_ACCEL+=("--nvidia" "all")
+        else
+            BUILD_STEP_ARGS_ACCEL+=("--nvidia" "install")
+        fi
+    fi
+    # resume from the previous accelerator's build directory
+    # as we want to combine all accelerator builds into a single tarball in the end
+    if [[ -n "${RESUME_DIR}" ]]; then
+        BUILD_STEP_ARGS_ACCEL+=("--resume" "${RESUME_DIR}")
+    fi
+
     export EESSI_ACCELERATOR_TARGET_OVERRIDE="${ACCEL_OVERRIDE}"
     echo "bot/build.sh: EESSI_ACCELERATOR_TARGET_OVERRIDE='${ACCEL_OVERRIDE}'"
     echo "Executing command to build software:"
-    echo "$software_layer_dir/eessi_container.sh ${COMMON_ARGS[@]} ${BUILD_STEP_ARGS[@]}"
+    echo "$software_layer_dir/eessi_container.sh ${COMMON_ARGS[@]} ${BUILD_STEP_ARGS_ACCEL[@]}"
     echo " -- $software_layer_dir/install_software_layer.sh \"${INSTALL_SCRIPT_ARGS[@]}\" \"$@\" 2>&1 | tee -a ${build_outerr}"
-    $software_layer_dir/eessi_container.sh "${COMMON_ARGS[@]}" "${BUILD_STEP_ARGS[@]}" \
+    $software_layer_dir/eessi_container.sh "${COMMON_ARGS[@]}" "${BUILD_STEP_ARGS_ACCEL[@]}" \
                          -- $software_layer_dir/install_software_layer.sh "${INSTALL_SCRIPT_ARGS[@]}" "$@" 2>&1 | tee -a ${build_outerr}
 
     # determine temporary directory to resume from for the next accelerator,
-    # as we want to combine all accelerator builds into a single tarball in the end
-    BUILD_TMPDIR=$(grep ' as tmp directory ' ${build_outerr} | cut -d ' ' -f 2)
-    BUILD_STEP_ARGS+=("--resume" "${BUILD_TMPDIR}")
+    # keeping only the last match: the log is appended to on every iteration and may hold several
+    RESUME_DIR=$(grep ' as tmp directory ' "${build_outerr}" | cut -d ' ' -f 2 | tail -n 1)
 done
 
 # prepare directory to store tarball of tmp for tarball step
diff --git a/scripts/utils.sh b/scripts/utils.sh
index 51fb2155..d5bcfe8d 100644
--- a/scripts/utils.sh
+++ b/scripts/utils.sh
@@ -166,3 +166,40 @@ function nvidia_gpu_available {
         return 2
     fi
 }
+
+# Check whether the first GPU reported by nvidia-smi has the requested compute capability.
+# Arguments: $1 - compute capability, with or without a period (e.g. 8.0 or 80)
+# Returns:   0 on match, 1 if the nvidia-smi query returned nothing, 2 on mismatch,
+#            ANY_ERROR_EXITCODE (1 if that variable is unset) when not given exactly one argument
+function nvidia_gpu_has_compute_capability {
+    # Ensure we are given a single compute capability argument
+    if [ $# -ne 1 ]; then
+        echo_red "Function requires a single compute capability argument" >&2
+        return "${ANY_ERROR_EXITCODE:-1}"
+    fi
+    # Remove period (if present) from the given compute capability, i.e. 8.0 -> 80
+    local requested_cc=${1//./}
+    local -a gpu_ccs=()
+    # We are careful here in case we are running in a container and LD_LIBRARY_PATH has been wiped:
+    # the inherited value is only appended when it is non-empty. nvidia-smi documents csv as a mandatory --format keyword.
+    mapfile -t gpu_ccs < <(LD_LIBRARY_PATH="/.singularity.d/libs${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" nvidia-smi --query-gpu=compute_cap --format=csv,noheader)
+    # Remove the periods from all compute capabilities
+    gpu_ccs=("${gpu_ccs[@]//./}")
+    # On a multi-GPU system we may get the compute capabilities of all GPUs, one per line.
+    # In that case we print a warning and check the first GPU.
+    if [ ${#gpu_ccs[@]} -eq 0 ]; then
+        echo_red "Error: querying for the GPU's compute capability did not return anything."
+        return 1
+    else
+        if [ ${#gpu_ccs[@]} -gt 1 ]; then
+            echo_yellow "Warning: multiple GPUs detected, checking the compute capability of the first GPU."
+        fi
+        if [ "$requested_cc" == "${gpu_ccs[0]}" ]; then
+            echo_green "Requested compute capability matches the one from the GPU."
+            return 0
+        else
+            echo_red "Error: the compute capability of the GPU (${gpu_ccs[0]}) does not match the requested compute capability ($requested_cc)."
+            return 2
+        fi
+    fi
+}