사전 구성된 알림 - Amazon SageMaker AI

기계 번역으로 제공되는 번역입니다. 제공된 번역과 원본 영어의 내용이 상충하는 경우에는 영어 버전이 우선합니다.

사전 구성된 알림

Amazon SageMaker HyperPod(SageMaker HyperPod) 관찰성 추가 기능을 사용하면 클러스터 및 워크로드에 대한 기본 알림을 통해 시스템이 클러스터 성능 저하의 일반적인 초기 지표를 감지하면 사용자에게 알릴 수 있습니다. 이러한 알림은 Amazon Managed Grafana 기본 제공 알림 시스템 내에 정의됩니다. 이러한 사전 구성된 알림을 수정하거나 새 알림을 생성하는 방법에 대한 자세한 내용은 Amazon Managed Grafana 사용 설명서의 Grafana 버전 10의 알림을 참조하세요. 다음 YAML은 기본 알림을 보여줍니다.

# Default alert rules for the SageMaker HyperPod observability add-on.
# Each rule pairs a PromQL expression (`expr`) with a `severity` label and
# human-readable annotations. Where present, `for` is the time the expression
# must keep matching before the alert fires.
groups:
  - name: sagemaker_hyperpod_alerts
    rules:
      # GPU_TEMP_ABOVE_80C
      # Warning when DCGM_FI_DEV_GPU_TEMP stays above 80 for 5 minutes.
      - alert: GPUHighTemperature
        expr: DCGM_FI_DEV_GPU_TEMP > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "GPU Temperature Above 80C"
          description: "GPU {{ $labels.gpu }} temperature is {{ $value }}°C."

      # GPU_TEMP_ABOVE_85C
      # Critical when DCGM_FI_DEV_GPU_TEMP stays above 85 for 1 minute.
      - alert: GPUCriticalTemperature
        expr: DCGM_FI_DEV_GPU_TEMP > 85
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "GPU Temperature Above 85C"
          description: "GPU {{ $labels.gpu }} temperature is {{ $value }}°C."

      # GPU_MEMORY_ERROR
      # Any ECC double-bit errors indicate serious memory issues requiring
      # immediate attention.
      # Matches when DCGM_FI_DEV_ECC_DBE_VOL_TOTAL is above 0, or when
      # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL exceeds its own value from 5 minutes
      # earlier. No `for` duration is specified for this rule.
      - alert: GPUMemoryErrorDetected
        expr: >-
          DCGM_FI_DEV_ECC_DBE_VOL_TOTAL > 0 or
          DCGM_FI_DEV_ECC_DBE_AGG_TOTAL > DCGM_FI_DEV_ECC_DBE_AGG_TOTAL offset 5m
        labels:
          severity: critical
        annotations:
          summary: "GPU ECC Double-Bit Error Detected"
          description: "GPU {{ $labels.gpu }} has detected ECC double-bit errors."

      # GPU_POWER_WARNING
      # Sustained power limit violations can impact performance and stability.
      # Warning when DCGM_FI_DEV_POWER_VIOLATION stays above 100 for 5 minutes.
      - alert: GPUPowerViolation
        expr: DCGM_FI_DEV_POWER_VIOLATION > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "GPU Power Violation"
          description: "GPU {{ $labels.gpu }} has been operating at power limit for extended period."

      # GPU_NVLINK_ERROR
      # NVLink errors above threshold indicate interconnect stability issues.
      # Matches on any recovery error (> 0) or more than 10 replay errors.
      # No `for` duration is specified for this rule.
      - alert: NVLinkErrorsDetected
        expr: >-
          DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL > 0 or
          DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL > 10
        labels:
          severity: warning
        annotations:
          summary: "NVLink Errors Detected"
          description: "GPU {{ $labels.gpu }} has detected NVLink errors."

      # GPU_THERMAL_VIOLATION
      # Immediate alert on thermal violations to prevent hardware damage.
      # Critical when DCGM_FI_DEV_THERMAL_VIOLATION has increased over the last
      # 5 minutes and that condition holds for 1 minute.
      - alert: GPUThermalViolation
        expr: increase(DCGM_FI_DEV_THERMAL_VIOLATION[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "GPU Thermal Violation Detected"
          description: "GPU {{ $labels.gpu }} has thermal violations on node {{ $labels.Hostname }}"

      # GPU_XID_ERROR
      # XID errors indicate driver or hardware level GPU issues requiring
      # investigation.
      # Critical as soon as DCGM_FI_DEV_XID_ERRORS is above 0 (`for: 0m`); the
      # metric value is reported in the description as the XID error.
      - alert: GPUXidError
        expr: DCGM_FI_DEV_XID_ERRORS > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "GPU XID Error Detected"
          description: "GPU {{ $labels.gpu }} experienced XID error {{ $value }} on node {{ $labels.Hostname }}"

      # DISK_SPACE_WARNING
      # 90% threshold ensures time to respond before complete disk exhaustion.
      # Usage is computed as (size - free) / size * 100 and must stay above 90
      # for 5 minutes.
      - alert: NodeDiskSpaceWarning
        expr: >-
          (node_filesystem_size_bytes - node_filesystem_free_bytes)
          / node_filesystem_size_bytes * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Disk Usage"
          description: "Node {{ $labels.instance }} disk usage is above 90%"

      # FSX_STORAGE_WARNING
      # 80% FSx utilization allows buffer for burst workloads.
      # Usage is computed as used / capacity * 100 and must stay above 80 for
      # 5 minutes.
      - alert: FsxLustreStorageWarning
        expr: fsx_lustre_storage_used_bytes / fsx_lustre_storage_capacity_bytes * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High FSx Lustre Usage"
          description: "FSx Lustre storage usage is above 80% on file system {{ $labels.filesystem_id }}"