1// Copyright 2020 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5package main 6 7import ( 8 "context" 9 "errors" 10 "flag" 11 "fmt" 12 "time" 13 14 "cloud.google.com/go/datastore" 15 16 "go.skia.org/infra/autoroll/go/manual" 17 "go.skia.org/infra/go/auth" 18 "go.skia.org/infra/go/firestore" 19 "go.skia.org/infra/go/skerr" 20 "go.skia.org/infra/task_driver/go/lib/auth_steps" 21 "go.skia.org/infra/task_driver/go/lib/checkout" 22 "go.skia.org/infra/task_driver/go/td" 23) 24 25const ( 26 MinWaitDuration = 4 * time.Minute 27) 28 29var ( 30 canaryRollNotCreatedErr = errors.New("Canary roll could not be created. Ask the Infra Gardener to investigate (or directly ping rmistry@) if this happens consistently.") 31 canaryRollSuccessTooQuicklyErr = fmt.Errorf("Canary roll returned success in less than %s. Failing canary due to skbug.com/10563.", MinWaitDuration) 32 33 // Lets add the roll link only once to step data. 34 addedRollLinkStepData = false 35) 36 37func main() { 38 var ( 39 projectId = flag.String("project_id", "", "ID of the Google Cloud project.") 40 taskId = flag.String("task_id", "", "ID of this task.") 41 taskName = flag.String("task_name", "", "Name of the task.") 42 output = flag.String("o", "", "If provided, dump a JSON blob of step data to the given file. Prints to stdout if '-' is given.") 43 local = flag.Bool("local", true, "True if running locally (as opposed to on the bots)") 44 45 checkoutFlags = checkout.SetupFlags(nil) 46 47 rollerName = flag.String("roller_name", "", "The roller we will use to create the canary with.") 48 ) 49 ctx := td.StartRun(projectId, taskId, taskName, output, local) 50 defer td.EndRun(ctx) 51 if *rollerName == "" { 52 td.Fatalf(ctx, "--roller_name must be specified") 53 } 54 55 rs, err := checkout.GetRepoState(checkoutFlags) 56 if err != nil { 57 td.Fatal(ctx, skerr.Wrap(err)) 58 } 59 if rs.Issue == "" || rs.Patchset == "" { 60 td.Fatalf(ctx, "This task driver should be run only as a try bot") 61 } 62 63 // Create token source with scope for datastore access. 64 ts, err := auth_steps.Init(ctx, *local, auth.ScopeUserinfoEmail, datastore.ScopeDatastore) 65 if err != nil { 66 td.Fatal(ctx, skerr.Wrap(err)) 67 } 68 69 // Add documentation link for canary rolls. 70 td.StepText(ctx, "Canary roll doc", "https://goto.google.com/autoroller-canary-bots") 71 72 // Instantiate firestore DB. 73 manualRollDB, err := manual.NewDBWithParams(ctx, firestore.FIRESTORE_PROJECT, "production", ts) 74 if err != nil { 75 td.Fatal(ctx, skerr.Wrap(err)) 76 } 77 78 // Retry if canary roll could not be created or if canary roll returned 79 // success too quickly (skbug.com/10563). 80 retryAttempts := 3 81 for retry := 0; ; retry++ { 82 retryText := "" 83 if retry > 0 { 84 retryText = fmt.Sprintf(" (Retry #%d)", retry) 85 } 86 87 req := manual.ManualRollRequest{ 88 Requester: *rollerName, 89 RollerName: *rollerName, 90 Status: manual.STATUS_PENDING, 91 Timestamp: firestore.FixTimestamp(time.Now()), 92 Revision: rs.GetPatchRef(), 93 94 DryRun: true, 95 NoEmail: true, 96 NoResolveRevision: true, 97 Canary: true, 98 } 99 if err := td.Do(ctx, td.Props(fmt.Sprintf("Trigger canary roll%s", retryText)).Infra(), func(ctx context.Context) error { 100 return manualRollDB.Put(&req) 101 }); err != nil { 102 // Immediately fail for errors in triggering. 103 td.Fatal(ctx, skerr.Wrap(err)) 104 } 105 106 if err := waitForCanaryRoll(ctx, manualRollDB, req.Id, fmt.Sprintf("Wait for canary roll%s", retryText)); err != nil { 107 // Retry these errors. 108 if err == canaryRollNotCreatedErr || err == canaryRollSuccessTooQuicklyErr { 109 if retry >= (retryAttempts - 1) { 110 td.Fatal(ctx, skerr.Wrapf(err, "failed inspite of 3 retries")) 111 } 112 time.Sleep(time.Minute) 113 continue 114 } 115 // Immediately fail for all other errors. 116 td.Fatal(ctx, skerr.Wrap(err)) 117 } else { 118 // The canary roll was successful, break out of the 119 // retry loop. 120 break 121 } 122 } 123} 124 125func waitForCanaryRoll(parentCtx context.Context, manualRollDB manual.DB, rollId, stepName string) error { 126 ctx := td.StartStep(parentCtx, td.Props(stepName)) 127 defer td.EndStep(ctx) 128 startTime := time.Now() 129 130 // For writing to the step's log stream. 131 stdout := td.NewLogStream(ctx, "stdout", td.SeverityInfo) 132 for { 133 roll, err := manualRollDB.Get(ctx, rollId) 134 if err != nil { 135 return td.FailStep(ctx, fmt.Errorf("Could not find canary roll with ID: %s", rollId)) 136 } 137 cl := roll.Url 138 var rollStatus string 139 if cl == "" { 140 rollStatus = fmt.Sprintf("Canary roll has status %s", roll.Status) 141 } else { 142 if !addedRollLinkStepData { 143 // Add the roll link to both the current step and it's parent. 144 td.StepText(ctx, "Canary roll CL", cl) 145 td.StepText(parentCtx, "Canary roll CL", cl) 146 addedRollLinkStepData = true 147 } 148 rollStatus = fmt.Sprintf("Canary roll [ %s ] has status %s", roll.Url, roll.Status) 149 } 150 if _, err := stdout.Write([]byte(rollStatus)); err != nil { 151 return td.FailStep(ctx, fmt.Errorf("Could not write to stdout: %s", err)) 152 } 153 154 if roll.Status == manual.STATUS_COMPLETE { 155 if roll.Result == manual.RESULT_SUCCESS { 156 // This is a hopefully temperory workaround for skbug.com/10563. Sometimes 157 // Canary-Chromium returns success immediately after creating a change and 158 // before the tryjobs have a chance to run. If we have waited 159 // for < MinWaitDuration then be cautious and assume failure. 160 if time.Now().Before(startTime.Add(MinWaitDuration)) { 161 return td.FailStep(ctx, canaryRollSuccessTooQuicklyErr) 162 } 163 return nil 164 } else if roll.Result == manual.RESULT_FAILURE { 165 if cl == "" { 166 return td.FailStep(ctx, canaryRollNotCreatedErr) 167 } 168 return td.FailStep(ctx, fmt.Errorf("Canary roll [ %s ] failed", cl)) 169 } else if roll.Result == manual.RESULT_UNKNOWN { 170 return td.FailStep(ctx, fmt.Errorf("Canary roll [ %s ] completed with an unknown result", cl)) 171 } 172 } 173 time.Sleep(30 * time.Second) 174 } 175} 176