AWS Kode skrip konfigurasi node login multi-cluster PCS - AWS PCS

Terjemahan disediakan oleh mesin penerjemah. Jika konten terjemahan yang diberikan bertentangan dengan versi bahasa Inggris aslinya, utamakan versi bahasa Inggris.

AWS Kode skrip konfigurasi node login multi-cluster PCS

Simpan kode sumber berikut ke file dengan nama berikut:

pcs-multi-cluster-login-configure.sh

Kode sumber skrip

#!/bin/bash # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # AWS PCS Multi-Cluster Standalone Login Node Configuration Script # # This script configures AWS Parallel Computing Service (PCS) multi-cluster stand alone login nodes # by setting up the Slurm authentication and credential kiosk daemon (sackd) # for connecting to remote PCS clusters. # # Prerequisites: # - AWS CLI configured with appropriate permissions # - Slurm version 25.05 or later # - Root privileges for system configuration # - Network connectivity to AWS PCS endpoints set -eo pipefail # Function to display usage usage() { echo "Usage: $0 --cluster-identifier <cluster-identifier> [--endpoint-url <endpoint-url>]" echo " $0 -h|--help" } # Function to display help help() { echo "AWS PCS Multi-Cluster Standalone Login Node Configuration Script" echo "===============================================" echo echo "This script configures multi-cluster standalone login node for AWS Parallel Computing Service (PCS)" echo "by setting up the Slurm authentication and credential kiosk daemon (sackd)." echo usage echo echo "Options:" echo " --cluster-identifier <id> AWS PCS cluster identifier (required)" echo " --endpoint-url <url> Custom PCS endpoint URL (optional)" echo " -h, --help Show this help message" echo echo "Examples:" echo " $0 --cluster-identifier my-pcs-cluster" echo echo "Note: This script requires root privileges and Slurm version 25.05 or later." } # Function to retrieve authentication key get_auth_key() { if [ "$ALTERNATE_SECRET_RETRIEVAL" = "true" ]; then echo "Retrieving authentication key from AWS Secrets Manager..." >&2 local auth_key_arn=$(echo "$CLUSTER_INFO" | jq -r '.cluster.slurmConfiguration.authKey.secretArn') local auth_key_version=$(echo "$CLUSTER_INFO" | jq -r '.cluster.slurmConfiguration.authKey.secretVersion') if [ "$auth_key_arn" = "null" ] || [ "$auth_key_version" = "null" ]; then echo "Error: Auth key information not found in cluster configuration" >&2 exit 1 fi if ! aws secretsmanager get-secret-value --secret-id "$auth_key_arn" --version-id "$auth_key_version" --query SecretString --output text --region "$REGION" 2>/dev/null; then echo "Error: Failed to retrieve auth key from Secrets Manager" >&2 exit 1 fi else echo "Please enter the base64-encoded Slurm authentication key:" >&2 echo -n "Base64 of the Slurm secret key: " >&2 local key read -rs key echo >&2 echo "$key" fi } # Function to get next available SACKD port get_next_sackd_port() { local exclude_file="$1" local port=6918 local used_ports=() # Get all currently used SACKD ports into an array while IFS= read -r line; do used_ports+=("$line") done < <(find /etc/sysconfig -name "sackd-pcs-*" ! -path "$exclude_file" \ -exec grep SACKD_PORT= '{}' ';' 2>/dev/null | \ sed 's/.*SACKD_PORT=//' | sort -n) # Loop through used ports to find first available port for used_port in "${used_ports[@]}"; do if [ "$port" -lt "$used_port" ]; then break elif [ "$port" -eq "$used_port" ]; then ((port++)) fi done echo "$port" } # Function to configure cluster configure_cluster() { mkdir -p /etc/slurm SLURM_JWKS_FILE="/etc/slurm/slurm-${CLUSTER_NAME}.jwks" echo '{"keys":[{"alg":"HS256","kty":"oct","kid":"key-'"${CLUSTER_ID}"'","k":"'"${BASE64_SLURM_KEY}"'"}]}' | jq -c '.' > "${SLURM_JWKS_FILE}" chmod 0600 "$SLURM_JWKS_FILE" chown slurm:slurm "$SLURM_JWKS_FILE" SLURM_INSTALL_PATH="/opt/aws/pcs/scheduler/slurm-${SLURM_VERSION}" SACKD_RUNTIME_DIRECTORY="/run/slurm-${CLUSTER_NAME}" mkdir -p "${SACKD_RUNTIME_DIRECTORY}" chown slurm:slurm "${SACKD_RUNTIME_DIRECTORY}" mkdir -p /etc/sysconfig SACKD_SERVICE_NAME="sackd-pcs-${CLUSTER_NAME}" SACKD_SERVICE_ENV="/etc/sysconfig/${SACKD_SERVICE_NAME}" SACKD_PORT=$(get_next_sackd_port "$SACKD_SERVICE_ENV") cat > "${SACKD_SERVICE_ENV}" << EOF SACKD_OPTIONS='--conf-server=$ENDPOINTS' SLURM_SACK_JWKS='$SLURM_JWKS_FILE' RUNTIME_DIRECTORY='$SACKD_RUNTIME_DIRECTORY' SACKD_PORT=$SACKD_PORT EOF SACKD_SERVICE_PATH="/etc/systemd/system/${SACKD_SERVICE_NAME}.service" cat << EOF > "$SACKD_SERVICE_PATH" [Unit] Description=Slurm auth and cred kiosk daemon After=network-online.target remote-fs.target Wants=network-online.target ConditionPathExists=${SACKD_SERVICE_ENV} [Service] Type=notify EnvironmentFile=${SACKD_SERVICE_ENV} User=slurm Group=slurm RuntimeDirectory=slurm-${CLUSTER_NAME} RuntimeDirectoryMode=0755 ExecStart=${SLURM_INSTALL_PATH}/sbin/sackd --systemd \$SACKD_OPTIONS ExecReload=/bin/kill -HUP \$MAINPID KillMode=process LimitNOFILE=131072 LimitMEMLOCK=infinity LimitSTACK=infinity [Install] WantedBy=multi-user.target EOF chown root:root "$SACKD_SERVICE_PATH" chmod 0644 "$SACKD_SERVICE_PATH" systemctl daemon-reload && systemctl enable "$SACKD_SERVICE_NAME" systemctl restart "$SACKD_SERVICE_NAME" ACTIVATE_SCRIPT="activate-pcs-${CLUSTER_NAME}" cat > "$ACTIVATE_SCRIPT" << EOF # Activate script for Slurm cluster ${CLUSTER_NAME} # Add Slurm paths export PATH="${SLURM_INSTALL_PATH}/bin:\$PATH" export MANPATH="${SLURM_INSTALL_PATH}/share/man:\$MANPATH" export LD_LIBRARY_PATH="${SLURM_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" ldconfig # Set Slurm configuration export SLURM_CONF="/run/slurm-${CLUSTER_NAME}/conf/slurm.conf" export PCS_CLUSTER_NAME="${CLUSTER_NAME}" export PCS_CLUSTER_IDENTIFIER="${CLUSTER_IDENTIFIER}" export PCS_CLUSTER_ID="${CLUSTER_ID}" echo "Activated PCS cluster environment: ${CLUSTER_NAME}" # Deactivate function function deactivate-pcs-${CLUSTER_NAME}() { export PATH="\$(echo "\$PATH" | sed -e "s|${SLURM_INSTALL_PATH}/bin:||g" -e "s|:${SLURM_INSTALL_PATH}/bin||g" -e "s|^${SLURM_INSTALL_PATH}/bin\$||")" export MANPATH="\$(echo "\$MANPATH" | sed -e "s|${SLURM_INSTALL_PATH}/share/man:||g" -e "s|:${SLURM_INSTALL_PATH}/share/man||g" -e "s|^${SLURM_INSTALL_PATH}/share/man\$||")" export LD_LIBRARY_PATH="\$(echo "\$LD_LIBRARY_PATH" | sed -e "s|${SLURM_INSTALL_PATH}/lib:||g" -e "s|:${SLURM_INSTALL_PATH}/lib||g" -e "s|^${SLURM_INSTALL_PATH}/lib\$||")" unset SLURM_CONF unset PCS_CLUSTER_NAME unset PCS_CLUSTER_IDENTIFIER unset PCS_CLUSTER_ID unset -f deactivate-pcs-${CLUSTER_NAME} ldconfig echo "Deactivated PCS cluster environment: ${CLUSTER_NAME}" } export -f deactivate-pcs-${CLUSTER_NAME} EOF } # Main function main() { # Parse arguments CLUSTER_IDENTIFIER="" PCS_ENDPOINT_URL="" while [ "$1" != "" ]; do case $1 in --cluster-identifier) shift CLUSTER_IDENTIFIER="$1" ;; --endpoint-url) shift PCS_ENDPOINT_URL="--endpoint-url $1" ;; -h|--help) help exit 0 ;; *) echo "Invalid argument: $1" >&2 usage >&2 exit 1 ;; esac shift done # Validate required arguments if [ -z "$CLUSTER_IDENTIFIER" ]; then echo "Error: --cluster-identifier is required" >&2 usage >&2 exit 1 fi # Validate running as root if [ "$EUID" -ne 0 ]; then echo "Error: This script must be run as root" >&2 exit 1 fi # Validate required commands are available for cmd in aws jq curl; do if ! command -v "$cmd" &> /dev/null; then echo "Error: Required command '$cmd' not found" >&2 exit 1 fi done # Get the region name from IMDS v2 with error handling (try IPv6 first, fallback to IPv4) echo "Retrieving AWS region from instance metadata..." # Try IPv6 IMDS endpoint first (fd00:ec2::254) with fast timeout (1s connect, 2s total) # If IPv6 fails, fallback to IPv4 IMDS endpoint (169.254.169.254) IMDS_ENDPOINT="http://[fd00:ec2::254]" if ! TOKEN=$(curl -s -X PUT "${IMDS_ENDPOINT}/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" --connect-timeout 1 --max-time 2 2>/dev/null); then IMDS_ENDPOINT="http://169.254.169.254" if ! TOKEN=$(curl -s -X PUT "${IMDS_ENDPOINT}/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" --max-time 5); then echo "Error: Failed to retrieve IMDS token. Ensure this script is running on an EC2 instance." >&2 exit 1 fi fi if ! REGION=$(curl -s -H "X-aws-ec2-metadata-token: $TOKEN" "${IMDS_ENDPOINT}/latest/dynamic/instance-identity/document" --max-time 5 | jq -r '.region'); then echo "Error: Failed to retrieve AWS region from instance metadata" >&2 exit 1 fi echo "Detected AWS region: $REGION" # Retrieve cluster information from AWS PCS echo "Retrieving cluster information for: $CLUSTER_IDENTIFIER" # shellcheck disable=SC2086 if ! CLUSTER_INFO=$(aws pcs get-cluster --region "$REGION" --cluster-identifier "$CLUSTER_IDENTIFIER" $PCS_ENDPOINT_URL 2>/dev/null); then echo "Error: Failed to retrieve cluster information. Check cluster identifier and AWS permissions." >&2 exit 1 fi CLUSTER_ID=$(echo "$CLUSTER_INFO" | jq -r '.cluster.id') CLUSTER_NAME="$(echo "$CLUSTER_INFO" | jq -r '.cluster.name')" SLURM_VERSION=$(echo "$CLUSTER_INFO" | jq -r '.cluster.scheduler.version') SLURM_VERSION=${SLURM_VERSION#Slurm_} # Check if Slurm version is >= 25.05 # shellcheck disable=SC2072 if [[ "$SLURM_VERSION" < "25.05" ]]; then echo "Error: This script requires Slurm version 25.05 or later. Found version: $SLURM_VERSION" >&2 exit 1 fi ENDPOINTS=$(echo "$CLUSTER_INFO" | jq -r '.cluster.endpoints[] | select(.type == "SLURMCTLD") | (if .privateIpAddress != "" then .privateIpAddress else "[" + .ipv6Address + "]" end) + ":" + .port' | tr '\n' ',' | sed 's/,$//') # Get BASE64_SLURM_KEY BASE64_SLURM_KEY=$(get_auth_key) if [ -z "$BASE64_SLURM_KEY" ]; then echo "Error: base64 Slurm key cannot be empty" >&2 exit 1 fi configure_cluster # Final configuration summary echo "========================================" echo "Configuration completed successfully!" echo "========================================" echo "Cluster Name: $CLUSTER_NAME" echo "Cluster ID: $CLUSTER_ID" echo "Slurm Version: $SLURM_VERSION" echo "Service Name: $SACKD_SERVICE_NAME" echo "SACKD Port: $SACKD_PORT" echo echo "To activate this cluster environment, run:" echo " source ./$ACTIVATE_SCRIPT" echo echo "To deactivate this cluster environment, run:" echo " deactivate-pcs-${CLUSTER_NAME}" echo echo "To check service status:" echo " systemctl status $SACKD_SERVICE_NAME" echo echo "To view service logs:" echo " journalctl -u $SACKD_SERVICE_NAME -f" } # Exit if being sourced for testing [[ "${BASH_SOURCE[0]}" != "${0}" ]] && return # Execute main function main "$@"