// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1;

import "google/api/resource.proto";
import "google/cloud/aiplatform/v1/io.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1";
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb";
option java_multiple_files = true;
option java_outer_classname = "ModelMonitoringProto";
option java_package = "com.google.cloud.aiplatform.v1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
option ruby_package = "Google::Cloud::AIPlatform::V1";
option (google.api.resource_definition) = {
  type: "monitoring.googleapis.com/NotificationChannel"
  pattern: "projects/{project}/notificationChannels/{notification_channel}"
};

// The objective configuration for model monitoring, including the information
// needed to detect anomalies for one particular model.
message ModelMonitoringObjectiveConfig {
  // Training Dataset information.
  message TrainingDataset {
    // Exactly one source for the training data may be set.
    oneof data_source {
      // The resource name of the Dataset used to train this Model.
      string dataset = 3 [(google.api.resource_reference) = {
        type: "aiplatform.googleapis.com/Dataset"
      }];

      // The Google Cloud Storage uri of the unmanaged Dataset used to train
      // this Model.
      GcsSource gcs_source = 4;

      // The BigQuery table of the unmanaged Dataset used to train this
      // Model.
      BigQuerySource bigquery_source = 5;
    }

    // Data format of the dataset, only applicable if the input is from
    // Google Cloud Storage.
    // The possible formats are:
    //
    // "tf-record"
    // The source file is a TFRecord file.
    //
    // "csv"
    // The source file is a CSV file.
    // "jsonl"
    // The source file is a JSONL file.
    string data_format = 2;

    // The target field name the model is to predict.
    // This field will be excluded when doing Predict and (or) Explain for the
    // training data.
    string target_field = 6;

    // Strategy to sample data from Training Dataset.
    // If not set, we process the whole dataset.
    SamplingStrategy logging_sampling_strategy = 7;
  }

  // The config for Training & Prediction data skew detection. It specifies the
  // training dataset sources and the skew detection parameters.
  message TrainingPredictionSkewDetectionConfig {
    // Key is the feature name and value is the threshold. If a feature needs
    // to be monitored for skew, a value threshold must be configured for that
    // feature. The threshold here is against feature distribution distance
    // between the training and prediction feature.
    map<string, ThresholdConfig> skew_thresholds = 1;

    // Key is the feature name and value is the threshold. The threshold here
    // is against attribution score distance between the training and
    // prediction feature.
    map<string, ThresholdConfig> attribution_score_skew_thresholds = 2;

    // Skew anomaly detection threshold used by all features.
    // When the per-feature thresholds are not set, this field can be used to
    // specify a threshold for all features.
    ThresholdConfig default_skew_threshold = 6;
  }

  // The config for Prediction data drift detection.
  message PredictionDriftDetectionConfig {
    // Key is the feature name and value is the threshold. If a feature needs
    // to be monitored for drift, a value threshold must be configured for that
    // feature. The threshold here is against feature distribution distance
    // between different time windows.
    map<string, ThresholdConfig> drift_thresholds = 1;

    // Key is the feature name and value is the threshold. The threshold here
    // is against attribution score distance between different time windows.
    map<string, ThresholdConfig> attribution_score_drift_thresholds = 2;

    // Drift anomaly detection threshold used by all features.
    // When the per-feature thresholds are not set, this field can be used to
    // specify a threshold for all features.
    ThresholdConfig default_drift_threshold = 5;
  }

  // The config for integrating with Vertex Explainable AI. Only applicable if
  // the Model has explanation_spec populated.
  message ExplanationConfig {
    // Output from
    // [BatchPredictionJob][google.cloud.aiplatform.v1.BatchPredictionJob] for
    // Model Monitoring baseline dataset, which can be used to generate
    // baseline attribution scores.
    message ExplanationBaseline {
      // The storage format of the predictions generated by the
      // BatchPrediction job.
      enum PredictionFormat {
        // Should not be set.
        PREDICTION_FORMAT_UNSPECIFIED = 0;

        // Predictions are in JSONL files.
        JSONL = 2;

        // Predictions are in BigQuery.
        BIGQUERY = 3;
      }

      // The configuration specifying the BatchExplain job output. This can be
      // used to generate the baseline of feature attribution scores.
      oneof destination {
        // Cloud Storage location for BatchExplain output.
        GcsDestination gcs = 2;

        // BigQuery location for BatchExplain output.
        BigQueryDestination bigquery = 3;
      }

      // The storage format of the predictions generated by the
      // BatchPrediction job.
      PredictionFormat prediction_format = 1;
    }

    // Whether to analyze the Vertex Explainable AI feature attribute scores.
    // If set to true, Vertex AI will log the feature attributions from the
    // explain response and do the skew/drift detection for them.
    bool enable_feature_attributes = 1;

    // Predictions generated by the BatchPredictionJob using baseline dataset.
    ExplanationBaseline explanation_baseline = 2;
  }

  // Training dataset for models. This field has to be set only if
  // TrainingPredictionSkewDetectionConfig is specified.
  TrainingDataset training_dataset = 1;

  // The config for skew between training data and prediction data.
  TrainingPredictionSkewDetectionConfig
      training_prediction_skew_detection_config = 2;

  // The config for drift of prediction data.
  PredictionDriftDetectionConfig prediction_drift_detection_config = 3;

  // The config for integrating with Vertex Explainable AI.
  ExplanationConfig explanation_config = 5;
}

// The alert config for model monitoring.
message ModelMonitoringAlertConfig {
  // The config for email alert.
  message EmailAlertConfig {
    // The email addresses to send the alert.
    repeated string user_emails = 1;
  }

  oneof alert {
    // Email alert config.
    EmailAlertConfig email_alert_config = 1;
  }

  // Dump the anomalies to Cloud Logging. The anomalies will be put to json
  // payload encoded from proto
  // [google.cloud.aiplatform.logging.ModelMonitoringAnomaliesLogEntry][].
  // This can be further routed to Pub/Sub or any other services supported
  // by Cloud Logging.
  bool enable_logging = 2;

  // Resource names of the NotificationChannels to send alert.
  // Must be of the format
  // `projects/<project_id_or_number>/notificationChannels/<channel_id>`
  repeated string notification_channels = 3 [(google.api.resource_reference) = {
    type: "monitoring.googleapis.com/NotificationChannel"
  }];
}

// The config for feature monitoring threshold.
message ThresholdConfig {
  oneof threshold {
    // Specify a threshold value that can trigger the alert.
    // If this threshold config is for feature distribution distance:
    // 1. For categorical feature, the distribution distance is calculated by
    //    L-infinity norm.
    // 2. For numerical feature, the distribution distance is calculated by
    //    Jensen–Shannon divergence.
    // Each feature must have a non-zero threshold if they need to be
    // monitored. Otherwise no alert will be triggered for that feature.
    double value = 1;
  }
}

// Sampling Strategy for logging, can be for both training and prediction
// dataset.
message SamplingStrategy {
  // Requests are randomly selected.
  message RandomSampleConfig {
    // Sample rate (0, 1]
    double sample_rate = 1;
  }

  // Random sample config. Will support more sampling strategies later.
  RandomSampleConfig random_sample_config = 1;
}