Terjemahan disediakan oleh mesin penerjemah. Jika konten terjemahan yang diberikan bertentangan dengan versi bahasa Inggris aslinya, utamakan versi bahasa Inggris.
SageMaker HyperPod FAQs
-
-
"logs": { "logs_collected": { "files": { "collect_list": [ { "file_path": "/var/log/provision/provisioning.log", "log_group_name": "/aws/sagemaker/Clusters/[ClusterName]/[ClusterID]", "log_stream_name": "LifecycleConfig/[InstanceGroupName]/{instance_id}", "retention_in_days": -1 } ] } }, "force_flush_interval": 3 } -
"file_path": "/var/log/custom-provision/custom-provisioning.log" -
sudo /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl \ -a fetch-config -m ec2 -s -c \ file:/opt/aws/amazon-cloudwatch-agent/sagemaker_cwagent_config.json
-
penting
-
TaskPlugin=task/none SchedulerParameters=permit_job_expansion
Untuk mempelajari selengkapnya, lihat dan .
-
Berikut adalah contoh skrip.
#!/bin/bash : <<'SUMMARY' Script: epilog.sh Use this script with caution, as it can potentially delete unnecessary resources and cause issues if you don't use it correctly. Note: You must save this script in a shared in a shared location that is accessible to all nodes in the cluster, such as/fsx volume. Workers must be able to access the script to run the script after jobs. SUMMARY # Define the log directory and create it if it doesn't exist LOG_DIR="/<PLACEHOLDER>/epilogue" #NOTE: UpdatePLACEHOLDERto be a shared value path, such as/fsx/epilogue. mkdir -p "$LOG_DIR" # Name the log file using the Slurm job name and job ID log_file="$LOG_DIR/epilogue-${SLURM_JOB_NAME}_${SLURM_JOB_ID}.log" logging() { echo "[$(date)] $1" | tee -a "$log_file" } # Slurm epilogue script to clean up IPC resources logging "Starting IPC cleanup for Job $SLURM_JOB_ID" # Clean up shared memory segments by username for seg in $(ipcs -m | awk -v owner="$SLURM_JOB_USER" '$3 == owner {print $2}'); do if ipcrm -m "$seg"; then logging "Removed shared memory segment $seg" else logging "Failed to remove shared memory segment $seg" fi done # Clean up semaphores by username for sem in $(ipcs -s | awk -v user="$SLURM_JOB_USER" '$3 == user {print $2}'); do if ipcrm -s "$sem"; then logging "Removed semaphore $sem" else logging "Failed to remove semaphore $sem" fi done # Clean up NCCL IPC NCCL_IPC_PATH="/dev/shm/nccl-*" for file in $NCCL_IPC_PATH; do if [ -e "$file" ]; then if rm "$file"; then logging "Removed NCCL IPC file $file" else logging "Failed to remove NCCL IPC file $file" fi fi done logging "IPC cleanup completed for Job $SLURM_JOB_ID" exit 0 -
Epilog="/path/to/epilog.sh" #For example: /fsx/epilogue/epilog.sh -
chown slurm:slurm /path/to/epilog.sh chmod +x /path/to/epilog.sh
TaskPlugin=task/cgroup
CgroupAutomount=yes ConstrainCores=yes CgroupPlugin=autodetect ConstrainDevices=yes ConstrainRAMSpace=yes ConstrainSwapSpace=yes SignalChildrenProcesses=yes MaxRAMPercent=99 MaxSwapPercent=80 MinRAMSpace=100