本文為英文版的機器翻譯版本，如內容有任何歧義或不一致之處，概以英文版為準。 # 附錄 B-示例卡方計算以下是收集錯誤測量結果和對資料執行卡方測試的範例。程式碼尚未就緒，不會執行必要的錯誤處理，但確實提供了邏輯運作方式的概念證明。您應該更新此範例以符合您的需求。首先，一個 Lambda 函數是由亞馬遜每分鐘調用EventBridge已排程的事件。事件的內容設定為下列資料： ``` { "timestamp": "2023-03-15T15:26:37.527Z", "namespace": "multi-az/frontend", "metricName": "5xx", "dimensions": [ { "Name": "Region", "Value": "us-east-1" }, { "Name": "Controller", "Value": "Home" }, { "Name": "Action", "Value": "Index" } ], "period": 60, "stat": "Sum", "unit": "Count", "chiSquareMetricName": "multi-az/chi-squared", "azs": [ "use1-az2", "use1-az4", "use1-az6" ] } ``` 該數據用於指定檢索適當所需的共同數據CloudWatch量度 (例如命名空間、量度名稱和維度)，然後針對每個可用區域發佈卡方結果。在拉姆達函數中的代碼看起來像下面使用 Python 3.9。在較高的水平，它收集指定的CloudWatch前一分鐘的指標，對該資料執行卡方測試，然後發佈CloudWatch與指定之每個可用區域之測試結果相關的測量結果。 ``` import os import boto3 import datetime import copy import json from datetime import timedelta from scipy.stats import chisquare from aws_embedded_metrics import metric_scope cw_client = boto3.client("cloudwatch", os.environ.get("AWS_REGION", "us-east-1")) @metric_scope def handler(event, context, metrics): metrics.set_property("Event", json.loads(json.dumps(event, default = str))) time = datetime.datetime.strptime(event["timestamp"], "%Y-%m-%dT%H:%M:%S.%fZ") # Round down to the previous minute end: datetime = roundTime(time) # Subtract a minute for the start start: datetime = end - timedelta(minutes = 1) # Get all the metrics that match the query results = get_all_metrics(event, start, end, metrics) metrics.set_property("MetricCounts", results) # Calculate the chi squared result chi_sq_result = chisquare(list(results.values())) expected = sum(list(results.values())) / len(results.values()) metrics.set_property("ChiSquaredResult", chi_sq_result) # Put the chi square metrics into CloudWatch put_all_metrics(event, results, chi_sq_result[1], expected, start, metrics) def get_all_metrics(detail: dict, start: datetime, end: datetime, metrics): """ Gets all of the error metrics for each AZ specified """ metric_query = { "MetricDataQueries": [ ], "StartTime": start, "EndTime": end } for az in detail["azs"]: dim = copy.deepcopy(detail["dimensions"]) dim.append({"Name": "AZ-ID", "Value": az}) query = { "Id": az.replace("-", "_"), "MetricStat": { "Metric": { "Namespace": detail["namespace"], "MetricName": detail["metricName"], "Dimensions": dim }, "Period": int(detail["period"]), "Stat": detail["stat"], "Unit": detail["unit"] }, "Label": az, "ReturnData": True } metric_query["MetricDataQueries"].append(query) metrics.set_property("GetMetricRequest", json.loads(json.dumps(metric_query, default=str))) next_token: str = None results = {} while True: if next_token is not None: metric_query["NextToken"] = next_token data = cw_client.get_metric_data(**metric_query) if next_token is not None: metrics.set_property("GetMetricResult::" + next_token, json.loads(json.dumps(data, default = str))) else: metrics.set_property("GetMetricResult", json.loads(json.dumps(data, default = str))) for item in data["MetricDataResults"]: key = item["Id"].replace("_", "-") if key not in results: results[key] = 0 results[key] += sum(item["Values"]) if "NextToken" in data: next_token = data["NextToken"] if next_token is None: break return results def put_all_metrics(detail: dict, results: dict, chi_sq_value: float, expected: float, timestamp: datetime, metrics): """ Adds the chi squared metric for all AZs to CloudWatch """ farthest_from_expected = None if len(results) > 0: keys = list(results.keys()) farthest_from_expected = keys[0] for key in keys: if abs(results[key] - expected) > abs(results[farthest_from_expected] - expected): farthest_from_expected = key metric_query = { "Namespace": detail["namespace"], "MetricData": [] } for az in detail["azs"]: dim = copy.deepcopy(detail["dimensions"]) dim.append({"Name": "AZ-ID", "Value": az}) query = { "MetricName": detail["chiSquareMetricName"], "Dimensions": dim, "Timestamp": timestamp, } if chi_sq_value <= 0.05 and az == farthest_from_expected: query["Value"] = 1 else: query["Value"] = 0 metric_query["MetricData"].append(query) metrics.set_property("PutMetricRequest", json.loads(json.dumps(metric_query, default = str))) cw_client.put_metric_data(**metric_query) def roundTime(dt=None, roundTo=60): """Round a datetime object to any time lapse in seconds dt : datetime.datetime object, default now. roundTo : Closest number of seconds to round to, default 1 minute. """ if dt == None : dt = datetime.datetime.now() seconds = (dt.replace(tzinfo=None) - dt.min).seconds rounding = (seconds+roundTo/2) // roundTo * roundTo return dt + datetime.timedelta(0,rounding-seconds,-dt.microsecond) ``` 然後，您可以為每個 AZ 建立警示。下面的例子是`use1-az2`以及連續三個 1 分鐘資料點的警示，其上限值等於 1 (1 是卡方測試判斷錯誤率的統計上顯著偏斜時所發佈的量度)。 ``` { "Type": "AWS::CloudWatch::Alarm", "Properties": { "AlarmName": "use1-az2-chi-squared", "ActionsEnabled": true, "OKActions": [], "AlarmActions": [], "InsufficientDataActions": [], "MetricName": "multi-az/chi-squared", "Namespace": "multi-az/frontend", "Statistic": "Maximum", "Dimensions": [ { "Name": "AZ-ID", "Value": "use1-az2" }, { "Name": "Action", "Value": "Index" }, { "Name": "Region", "Value": "us-east-1" }, { "Name": "Controller", "Value": "Home" } ], "Period": 60, "EvaluationPeriods": 3, "DatapointsToAlarm": 3, "Threshold": 1, "ComparisonOperator": "GreaterThanOrEqualToThreshold", "TreatMissingData": "missing" } } ``` 您也可以建立m-of-n報警並將這兩個警報與複合警報結合在一起。您也需要為每個可用區域中的每個控制器/動作組合或微服務建立相同的警示。最後，您可以將卡方複合警報新增至每個控制器/動作組合的可用區域特定警報，如圖所示[使用離群值偵測進行故障偵測](failure-detection-using-outlier-detection.md)。