AWS Código de script de configuración del nodo de inicio de sesión multiclúster PCS - AWS PCS

Las traducciones son generadas a través de traducción automática. En caso de conflicto entre la traducción y la version original de inglés, prevalecerá la version en inglés.

AWS Código de script de configuración del nodo de inicio de sesión multiclúster PCS

Guarde el siguiente código fuente en un archivo con el siguiente nombre:

pcs-multi-cluster-login-configure.sh

Código fuente del script

#!/bin/bash # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # AWS PCS Multi-Cluster Standalone Login Node Configuration Script # # This script configures AWS Parallel Computing Service (PCS) multi-cluster stand alone login nodes # by setting up the Slurm authentication and credential kiosk daemon (sackd) # for connecting to remote PCS clusters. # # Prerequisites: # - AWS CLI configured with appropriate permissions # - Slurm version 25.05 or later # - Root privileges for system configuration # - Network connectivity to AWS PCS endpoints set -eo pipefail # Function to display usage usage() { echo "Usage: $0 --cluster-identifier <cluster-identifier> [--endpoint-url <endpoint-url>]" echo " $0 -h|--help" } # Function to display help help() { echo "AWS PCS Multi-Cluster Standalone Login Node Configuration Script" echo "===============================================" echo echo "This script configures multi-cluster standalone login node for AWS Parallel Computing Service (PCS)" echo "by setting up the Slurm authentication and credential kiosk daemon (sackd)." echo usage echo echo "Options:" echo " --cluster-identifier <id> AWS PCS cluster identifier (required)" echo " --endpoint-url <url> Custom PCS endpoint URL (optional)" echo " -h, --help Show this help message" echo echo "Examples:" echo " $0 --cluster-identifier my-pcs-cluster" echo echo "Note: This script requires root privileges and Slurm version 25.05 or later." } # Function to retrieve authentication key get_auth_key() { if [ "$ALTERNATE_SECRET_RETRIEVAL" = "true" ]; then echo "Retrieving authentication key from AWS Secrets Manager..." >&2 local auth_key_arn=$(echo "$CLUSTER_INFO" | jq -r '.cluster.slurmConfiguration.authKey.secretArn') local auth_key_version=$(echo "$CLUSTER_INFO" | jq -r '.cluster.slurmConfiguration.authKey.secretVersion') if [ "$auth_key_arn" = "null" ] || [ "$auth_key_version" = "null" ]; then echo "Error: Auth key information not found in cluster configuration" >&2 exit 1 fi if ! aws secretsmanager get-secret-value --secret-id "$auth_key_arn" --version-id "$auth_key_version" --query SecretString --output text --region "$REGION" 2>/dev/null; then echo "Error: Failed to retrieve auth key from Secrets Manager" >&2 exit 1 fi else echo "Please enter the base64-encoded Slurm authentication key:" >&2 echo -n "Base64 of the Slurm secret key: " >&2 local key read -rs key echo >&2 echo "$key" fi } # Function to get next available SACKD port get_next_sackd_port() { local exclude_file="$1" local port=6918 local used_ports=() # Get all currently used SACKD ports into an array while IFS= read -r line; do used_ports+=("$line") done < <(find /etc/sysconfig -name "sackd-pcs-*" ! -path "$exclude_file" \ -exec grep SACKD_PORT= '{}' ';' 2>/dev/null | \ sed 's/.*SACKD_PORT=//' | sort -n) # Loop through used ports to find first available port for used_port in "${used_ports[@]}"; do if [ "$port" -lt "$used_port" ]; then break elif [ "$port" -eq "$used_port" ]; then ((port++)) fi done echo "$port" } # Function to configure cluster configure_cluster() { mkdir -p /etc/slurm SLURM_JWKS_FILE="/etc/slurm/slurm-${CLUSTER_NAME}.jwks" echo '{"keys":[{"alg":"HS256","kty":"oct","kid":"key-'"${CLUSTER_ID}"'","k":"'"${BASE64_SLURM_KEY}"'"}]}' | jq -c '.' > "${SLURM_JWKS_FILE}" chmod 0600 "$SLURM_JWKS_FILE" chown slurm:slurm "$SLURM_JWKS_FILE" SLURM_INSTALL_PATH="/opt/aws/pcs/scheduler/slurm-${SLURM_VERSION}" SACKD_RUNTIME_DIRECTORY="/run/slurm-${CLUSTER_NAME}" mkdir -p "${SACKD_RUNTIME_DIRECTORY}" chown slurm:slurm "${SACKD_RUNTIME_DIRECTORY}" mkdir -p /etc/sysconfig SACKD_SERVICE_NAME="sackd-pcs-${CLUSTER_NAME}" SACKD_SERVICE_ENV="/etc/sysconfig/${SACKD_SERVICE_NAME}" SACKD_PORT=$(get_next_sackd_port "$SACKD_SERVICE_ENV") cat > "${SACKD_SERVICE_ENV}" << EOF SACKD_OPTIONS='--conf-server=$ENDPOINTS' SLURM_SACK_JWKS='$SLURM_JWKS_FILE' RUNTIME_DIRECTORY='$SACKD_RUNTIME_DIRECTORY' SACKD_PORT=$SACKD_PORT EOF SACKD_SERVICE_PATH="/etc/systemd/system/${SACKD_SERVICE_NAME}.service" cat << EOF > "$SACKD_SERVICE_PATH" [Unit] Description=Slurm auth and cred kiosk daemon After=network-online.target remote-fs.target Wants=network-online.target ConditionPathExists=${SACKD_SERVICE_ENV} [Service] Type=notify EnvironmentFile=${SACKD_SERVICE_ENV} User=slurm Group=slurm RuntimeDirectory=slurm-${CLUSTER_NAME} RuntimeDirectoryMode=0755 ExecStart=${SLURM_INSTALL_PATH}/sbin/sackd --systemd \$SACKD_OPTIONS ExecReload=/bin/kill -HUP \$MAINPID KillMode=process LimitNOFILE=131072 LimitMEMLOCK=infinity LimitSTACK=infinity [Install] WantedBy=multi-user.target EOF chown root:root "$SACKD_SERVICE_PATH" chmod 0644 "$SACKD_SERVICE_PATH" systemctl daemon-reload && systemctl enable "$SACKD_SERVICE_NAME" systemctl restart "$SACKD_SERVICE_NAME" ACTIVATE_SCRIPT="activate-pcs-${CLUSTER_NAME}" cat > "$ACTIVATE_SCRIPT" << EOF # Activate script for Slurm cluster ${CLUSTER_NAME} # Add Slurm paths export PATH="${SLURM_INSTALL_PATH}/bin:\$PATH" export MANPATH="${SLURM_INSTALL_PATH}/share/man:\$MANPATH" export LD_LIBRARY_PATH="${SLURM_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" ldconfig # Set Slurm configuration export SLURM_CONF="/run/slurm-${CLUSTER_NAME}/conf/slurm.conf" export PCS_CLUSTER_NAME="${CLUSTER_NAME}" export PCS_CLUSTER_IDENTIFIER="${CLUSTER_IDENTIFIER}" export PCS_CLUSTER_ID="${CLUSTER_ID}" echo "Activated PCS cluster environment: ${CLUSTER_NAME}" # Deactivate function function deactivate-pcs-${CLUSTER_NAME}() { export PATH="\$(echo "\$PATH" | sed -e "s|${SLURM_INSTALL_PATH}/bin:||g" -e "s|:${SLURM_INSTALL_PATH}/bin||g" -e "s|^${SLURM_INSTALL_PATH}/bin\$||")" export MANPATH="\$(echo "\$MANPATH" | sed -e "s|${SLURM_INSTALL_PATH}/share/man:||g" -e "s|:${SLURM_INSTALL_PATH}/share/man||g" -e "s|^${SLURM_INSTALL_PATH}/share/man\$||")" export LD_LIBRARY_PATH="\$(echo "\$LD_LIBRARY_PATH" | sed -e "s|${SLURM_INSTALL_PATH}/lib:||g" -e "s|:${SLURM_INSTALL_PATH}/lib||g" -e "s|^${SLURM_INSTALL_PATH}/lib\$||")" unset SLURM_CONF unset PCS_CLUSTER_NAME unset PCS_CLUSTER_IDENTIFIER unset PCS_CLUSTER_ID unset -f deactivate-pcs-${CLUSTER_NAME} ldconfig echo "Deactivated PCS cluster environment: ${CLUSTER_NAME}" } export -f deactivate-pcs-${CLUSTER_NAME} EOF } # Main function main() { # Parse arguments CLUSTER_IDENTIFIER="" PCS_ENDPOINT_URL="" while [ "$1" != "" ]; do case $1 in --cluster-identifier) shift CLUSTER_IDENTIFIER="$1" ;; --endpoint-url) shift PCS_ENDPOINT_URL="--endpoint-url $1" ;; -h|--help) help exit 0 ;; *) echo "Invalid argument: $1" >&2 usage >&2 exit 1 ;; esac shift done # Validate required arguments if [ -z "$CLUSTER_IDENTIFIER" ]; then echo "Error: --cluster-identifier is required" >&2 usage >&2 exit 1 fi # Validate running as root if [ "$EUID" -ne 0 ]; then echo "Error: This script must be run as root" >&2 exit 1 fi # Validate required commands are available for cmd in aws jq curl; do if ! command -v "$cmd" &> /dev/null; then echo "Error: Required command '$cmd' not found" >&2 exit 1 fi done # Get the region name from IMDS v2 with error handling (try IPv6 first, fallback to IPv4) echo "Retrieving AWS region from instance metadata..." # Try IPv6 IMDS endpoint first (fd00:ec2::254) with fast timeout (1s connect, 2s total) # If IPv6 fails, fallback to IPv4 IMDS endpoint (169.254.169.254) IMDS_ENDPOINT="http://[fd00:ec2::254]" if ! TOKEN=$(curl -s -X PUT "${IMDS_ENDPOINT}/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" --connect-timeout 1 --max-time 2 2>/dev/null); then IMDS_ENDPOINT="http://169.254.169.254" if ! TOKEN=$(curl -s -X PUT "${IMDS_ENDPOINT}/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" --max-time 5); then echo "Error: Failed to retrieve IMDS token. Ensure this script is running on an EC2 instance." >&2 exit 1 fi fi if ! REGION=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" "${IMDS_ENDPOINT}/latest/dynamic/instance-identity/document" --max-time 5 | jq -r '.region'); then echo "Error: Failed to retrieve AWS region from instance metadata" >&2 exit 1 fi echo "Detected AWS region: $REGION" # Retrieve cluster information from AWS PCS echo "Retrieving cluster information for: $CLUSTER_IDENTIFIER" # shellcheck disable=SC2086 if ! CLUSTER_INFO=$(aws pcs get-cluster --region "$REGION" --cluster-identifier "$CLUSTER_IDENTIFIER" $PCS_ENDPOINT_URL 2>/dev/null); then echo "Error: Failed to retrieve cluster information. Check cluster identifier and AWS permissions." >&2 exit 1 fi CLUSTER_ID=$(echo "$CLUSTER_INFO" | jq -r '.cluster.id') CLUSTER_NAME="$(echo "$CLUSTER_INFO" | jq -r '.cluster.name')" SLURM_VERSION=$(echo "$CLUSTER_INFO" | jq -r '.cluster.scheduler.version') SLURM_VERSION=${SLURM_VERSION#Slurm_} # Check if Slurm version is >= 25.05 # shellcheck disable=SC2072 if [[ "$SLURM_VERSION" < "25.05" ]]; then echo "Error: This script requires Slurm version 25.05 or later. Found version: $SLURM_VERSION" >&2 exit 1 fi ENDPOINTS=$(echo "$CLUSTER_INFO" | jq -r '.cluster.endpoints[] | select(.type == "SLURMCTLD") | (if .privateIpAddress != "" then .privateIpAddress else "[" + .ipv6Address + "]" end) + ":" + .port' | tr '\n' ',' | sed 's/,$//') # Get BASE64_SLURM_KEY BASE64_SLURM_KEY=$(get_auth_key) if [ -z "$BASE64_SLURM_KEY" ]; then echo "Error: base64 Slurm key cannot be empty" >&2 exit 1 fi configure_cluster # Final configuration summary echo "========================================" echo "Configuration completed successfully!" echo "========================================" echo "Cluster Name: $CLUSTER_NAME" echo "Cluster ID: $CLUSTER_ID" echo "Slurm Version: $SLURM_VERSION" echo "Service Name: $SACKD_SERVICE_NAME" echo "SACKD Port: $SACKD_PORT" echo echo "To activate this cluster environment, run:" echo " source ./$ACTIVATE_SCRIPT" echo echo "To deactivate this cluster environment, run:" echo " deactivate-pcs-${CLUSTER_NAME}" echo echo "To check service status:" echo " systemctl status $SACKD_SERVICE_NAME" echo echo "To view service logs:" echo " journalctl -u $SACKD_SERVICE_NAME -f" } # Exit if being sourced for testing [[ "${BASH_SOURCE[0]}" != "${0}" ]] && return # Execute main function main "$@"