Terjemahan disediakan oleh mesin penerjemah. Jika konten terjemahan yang diberikan bertentangan dengan versi bahasa Inggris aslinya, utamakan versi bahasa Inggris.
Topik
Prasyarat
aws sagemaker create-cluster \ --cluster-name my-mig-cluster \ --orchestrator 'Eks={ClusterArn=arn:aws:eks:region:account:cluster/cluster-name}' \ --instance-groups '{ "InstanceGroupName": "gpu-group", "InstanceType": "ml.p4d.24xlarge", "InstanceCount": 1, "LifeCycleConfig": { "SourceS3Uri": "s3://my-bucket", "OnCreate": "on_create_script.sh" }, "KubernetesConfig": { "Labels": { "nvidia.com/mig.config": "all-1g.5gb" } }, "ExecutionRole": "arn:aws:iam::account:role/execution-role", "ThreadsPerCore": 1 }' \ --vpc-config '{ "SecurityGroupIds": ["sg-12345"], "Subnets": ["subnet-12345"] }' \ --node-provisioning-mode Continuous
{ "ClusterName": "my-mig-cluster", "InstanceGroups": [ { "InstanceGroupName": "gpu-group", "InstanceType": "ml.p4d.24xlarge", "InstanceCount": 1, "KubernetesConfig": { "Labels": { "nvidia.com/mig.config": "all-2g.10gb" } }, "ExecutionRole": "arn:aws:iam::account:role/execution-role" } ], "Orchestrator": { "Eks": { "ClusterArn": "arn:aws:eks:region:account:cluster/cluster-name" } }, "NodeProvisioningMode": "Continuous" }
helm install gpuo helm_chart/HyperPodHelmChart/charts/gpu-operator \ -f helm_chart/HyperPodHelmChart/charts/gpu-operator/regional-values/values-{$AWS_REGION}.yaml \ -n kube-system
kubectl get pods -n kube-system | grep -E "(gpu-operator|nvidia-)"
helm upgrade dependencies helm_chart/HyperPodHelmChart \ --set nvidia-device-plugin.devicePlugin.enabled=false \ -n kube-system
kubectl describe nodes | grep "nvidia.com/gpu"
# List all nodes in the instance group kubectl get nodes -l sagemaker.amazonaws.com/instance-group-name=INSTANCE_GROUP_NAME# Example: kubectl get nodes -l sagemaker.amazonaws.com/instance-group-name=p4d-group
# Cordon a single node kubectl cordonNODE_NAME# Example: kubectl cordon hyperpod-i-014a41a7001adca60
# Drain the node (ignore DaemonSets and evict pods) kubectl drainNODE_NAME\ --ignore-daemonsets \ --delete-emptydir-data \ --force \ --grace-period=300 # Example: kubectl drain hyperpod-i-014a41a7001adca60 \ --ignore-daemonsets \ --delete-emptydir-data \ --force \ --grace-period=300
penting
# Check for any remaining pods outside system namespaces kubectl get pods --all-namespaces --field-selector spec.nodeName=NODE_NAME\ | grep -v "kube-system" \ | grep -v "cert-manager" \ | grep -v "kubeflow" \ | grep -v "hyperpod-inference-system" \ | grep -v "kube-public" \ | grep -v "mpi-operator" \ | grep -v "gpu-operator" \ | grep -v "aws-hyperpod" \ | grep -v "jupyter-k8s-system" \ | grep -v "hyperpod-observability" \ | grep -v "kueue-system" \ | grep -v "keda" # Example: kubectl get pods --all-namespaces --field-selector spec.nodeName=hyperpod-i-014a41a7001adca60 \ | grep -v "kube-system" \ | grep -v "cert-manager" \ | grep -v "kubeflow" \ | grep -v "hyperpod-inference-system" \ | grep -v "kube-public" \ | grep -v "mpi-operator" \ | grep -v "gpu-operator" \ | grep -v "aws-hyperpod" \ | grep -v "jupyter-k8s-system" \ | grep -v "hyperpod-observability" \ | grep -v "kueue-system" \ | grep -v "keda"
# Check node status - should show "SchedulingDisabled" kubectl get nodes -l sagemaker.amazonaws.com/instance-group-name=INSTANCE_GROUP_NAME
aws sagemaker update-cluster \ --cluster-name my-mig-cluster \ --instance-groups '{ "InstanceGroupName": "gpu-group", "InstanceType": "ml.p4d.24xlarge", "InstanceCount": 1, "KubernetesConfig": { "Labels": { "nvidia.com/mig.config": "all-3g.20gb" } }, "ExecutionRole": "arn:aws:iam::account:role/execution-role" }'
catatan
# Update kubeconfig aws eks update-kubeconfig --nameyour-eks-cluster--regionus-east-2# Check MIG labels kubectl get nodeNODE_NAME-o=jsonpath='{.metadata.labels}' | grep mig # Check available MIG resources kubectl describe nodeNODE_NAME| grep -A 10 "Allocatable:"
# Check GPU Operator status kubectl get pods -n gpu-operator-resources # View MIG configuration kubectl exec -n gpu-operator-resources nvidia-driver-XXXXX -- nvidia-smi mig -lgi # Check device plugin configuration kubectl logs -n gpu-operator-resources nvidia-device-plugin-XXXXX # Monitor node events kubectl get events --field-selector involvedObject.name=NODE_NAME