Terjemahan disediakan oleh mesin penerjemah. Jika konten terjemahan yang diberikan bertentangan dengan versi bahasa Inggris aslinya, utamakan versi bahasa Inggris.
apiVersion: inference.sagemaker.aws.amazon.com/v1 kind: JumpStartModel metadata: name: deepseek-sample624 namespace: ns-team-a spec: sageMakerEndpoint: name: deepsek7bsme624 model: modelHubName: SageMakerPublicHub modelId: deepseek-llm-r1-distill-qwen-1-5b modelVersion: 2.0.4 server: instanceType: ml.g5.8xlarge metrics: enabled: true environmentVariables: - name: SAMPLE_ENV_VAR value: "sample_value" maxDeployTimeInSeconds: 1800 tlsConfig: tlsCertificateOutputS3Uri: "s3://{USER}-tls-bucket-{REGION}/certificates" autoScalingSpec: minReplicaCount: 0 maxReplicaCount: 5 pollingInterval: 15 initialCooldownPeriod: 60 cooldownPeriod: 120 scaleDownStabilizationTime: 60 scaleUpStabilizationTime: 0 cloudWatchTrigger: name: "SageMaker-Invocations" namespace: "AWS/SageMaker" useCachedMetrics: false metricName: "Invocations" targetValue: 10.5 activationTargetValue: 5.0 minValue: 0.0 metricCollectionStartTime: 300 metricCollectionPeriod: 30 metricStat: "Sum" metricType: "Average" dimensions: - name: "EndpointName" value: "deepsek7bsme624" - name: "VariantName" value: "AllTraffic" prometheusTrigger: name: "Prometheus-Trigger" useCachedMetrics: false serverAddress: http://<prometheus-host>:9090 query: sum(rate(http_requests_total{deployment="my-deployment"}[2m])) targetValue: 10.0 activationTargetValue: 5.0 namespace: "namespace" customHeaders: "X-Client-Id=cid" metricType: "Value"
-
-
-
Default: 30 detik.
-
-
-
-
cloudWatchTrigger-
-
-
-
-
-
-
-
-
Default:
Average. -
Default:
Average.
-
prometheusTrigger-
-
-
-
-
-
-
-
-
Default:
Average.
-
catatan
apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: name: invocations-scaledobject # name of the scaled object that will be created by this namespace: ns-team-a # namespace that this scaled object targets spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment name: $DEPLOYMENT_NAME # name of the model deployment minReplicaCount: 1 # minimum number of pods to be maintained maxReplicaCount: 4 # maximum number of pods to scale to pollingInterval: 10 triggers: - type: aws-cloudwatch metadata: namespace: AWS/SageMaker metricName: Invocations targetMetricValue: "1" minMetricValue: "1" awsRegion: "us-west-2" dimensionName: EndpointName;VariantName dimensionValue: $ENDPOINT_NAME;$VARIANT_NAME metricStatPeriod: "30" # seconds metricStat: "Sum" identityOwner: operator
apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: name: invocations-scaledobject # name of the scaled object that will be created by this namespace: ns-team-a # namespace that this scaled object targets spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment name: $DEPLOYMENT_NAME # name of the model deployment minReplicaCount: 1 # minimum number of pods to be maintained maxReplicaCount: 4 # maximum number of pods to scale to pollingInterval: 10 triggers: - type: aws-sqs-queue metadata: queueURL: https://sqs.eu-west-1.amazonaws.com/account_id/QueueName queueLength: "5" # Default: "5" awsRegion: "us-west-1" scaleOnInFlight: true identityOwner: operator
Metrik Prometheus
apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: name: invocations-scaledobject # name of the scaled object that will be created by this namespace: ns-team-a # namespace that this scaled object targets spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment name: $DEPLOYMENT_NAME # name of the model deployment minReplicaCount: 1 # minimum number of pods to be maintained maxReplicaCount: 4 # maximum number of pods to scale to pollingInterval: 10 triggers: - type: prometheus metadata: serverAddress: http://<prometheus-host>:9090 query: avg(rate(http_requests_total{deployment="$DEPLOYMENT_NAME"}[2m])) # Note: query must return a vector/scalar single element response threshold: '100.50' namespace: example-namespace # for namespaced queries, eg. Thanos customHeaders: X-Client-Id=cid,X-Tenant-Id=tid,X-Organization-Id=oid # Optional. Custom headers to include in query. In case of auth header, use the custom authentication or relevant authModes. unsafeSsl: "false" # Default is `false`, Used for skipping certificate check when having self-signed certs for Prometheus endpoint timeout: 1000 # Custom timeout for the HTTP client used in this scaler identityOwner: operator
apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: name: invocations-scaledobject # name of the scaled object that will be created by this namespace: ns-team-a # namespace that this scaled object targets spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment name: $DEPLOYMENT_NAME # name of the model deployment minReplicaCount: 1 # minimum number of pods to be maintained maxReplicaCount: 4 # maximum number of pods to scale to pollingInterval: 10 triggers: - type: cpu metricType: Utilization # Allowed types are 'Utilization' or 'AverageValue' metadata: value: "60" containerName: "" # Optional. You can use this to target a specific container
apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: name: invocations-scaledobject # name of the scaled object that will be created by this namespace: ns-team-a # namespace that this scaled object targets spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment name: $DEPLOYMENT_NAME # name of the model deployment minReplicaCount: 1 # minimum number of pods to be maintained maxReplicaCount: 4 # maximum number of pods to scale to pollingInterval: 10 triggers: - type: memory metricType: Utilization # Allowed types are 'Utilization' or 'AverageValue' metadata: value: "60" containerName: "" # Optional. You can use this to target a specific container in a pod
catatan
apiVersion: keda.sh/v1alpha1 kind: ScaledObject metadata: name: invocations-scaledobject # name of the scaled object that will be created by this namespace: ns-team-a # namespace that this scaled object targets spec: scaleTargetRef: apiVersion: apps/v1 kind: Deployment name: $DEPLOYMENT_NAME # name of the model deployment minReplicaCount: 0 # minimum number of pods to be maintained maxReplicaCount: 4 # maximum number of pods to scale to pollingInterval: 10 cooldownPeriod: 30 initialCooldownPeriod: 180 # time before scaling down the pods after initial deployment triggers: - type: prometheus metadata: serverAddress: http://<prometheus-host>:9090 query: sum(rate(http_requests_total{deployment="my-deployment"}[2m])) # Note: query must return a vector/scalar single element response threshold: '100.50' activationThreshold: '5.5' # Required if minReplicaCount is 0 for initial scaling namespace: example-namespace timeout: 1000 identityOwner: operator
catatan