From e2f4bf8f3818d49baf0f3789add75d5fd506ad8d Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 24 Apr 2018 11:22:07 +0200 Subject: pkg/gce: improve API rate limit logic Sometimes we see rate limiting errors. This is especially bad for e.g. patch testing requests. Increase default API delay and add backoff logic. --- pkg/gce/gce.go | 104 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 73 insertions(+), 31 deletions(-) diff --git a/pkg/gce/gce.go b/pkg/gce/gce.go index f2675339f..bb6d89838 100644 --- a/pkg/gce/gce.go +++ b/pkg/gce/gce.go @@ -14,6 +14,7 @@ package gce import ( "fmt" "io/ioutil" + "math/rand" "net/http" "strings" "time" @@ -37,14 +38,14 @@ type Context struct { computeService *compute.Service // apiCallTicker ticks regularly, preventing us from accidentally making - // GCE API calls too quickly. Our quota is 20 QPS, but we temporarily - // limit ourselves to less than that. + // GCE API calls too quickly. Our quota is 20 QPS, but we limit ourselves + // to less than that because several independent programs can do API calls. apiRateGate <-chan time.Time } func NewContext() (*Context, error) { ctx := &Context{ - apiRateGate: time.NewTicker(time.Second / 10).C, + apiRateGate: time.NewTicker(time.Second).C, } background := context.Background() tokenSource, err := google.DefaultTokenSource(background, compute.CloudPlatformScope) @@ -137,8 +138,11 @@ func (ctx *Context) CreateInstance(name, machineType, image, sshkey string) (str } retry: - <-ctx.apiRateGate - op, err := ctx.computeService.Instances.Insert(ctx.ProjectID, ctx.ZoneID, instance).Do() + var op *compute.Operation + err := ctx.apiCall(func() (err error) { + op, err = ctx.computeService.Instances.Insert(ctx.ProjectID, ctx.ZoneID, instance).Do() + return + }) if err != nil { return "", fmt.Errorf("failed to create instance: %v", err) } @@ -150,8 +154,11 @@ retry: return "", err } - <-ctx.apiRateGate - inst, err := ctx.computeService.Instances.Get(ctx.ProjectID, ctx.ZoneID, name).Do() + var inst *compute.Instance + err = ctx.apiCall(func() (err error) { + inst, err = ctx.computeService.Instances.Get(ctx.ProjectID, ctx.ZoneID, name).Do() + return + }) if err != nil { return "", fmt.Errorf("error getting instance %s details after creation: %v", name, err) } @@ -171,8 +178,11 @@ retry: } func (ctx *Context) DeleteInstance(name string, wait bool) error { - <-ctx.apiRateGate - op, err := ctx.computeService.Instances.Delete(ctx.ProjectID, ctx.ZoneID, name).Do() + var op *compute.Operation + err := ctx.apiCall(func() (err error) { + op, err = ctx.computeService.Instances.Delete(ctx.ProjectID, ctx.ZoneID, name).Do() + return + }) if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code == 404 { return nil } @@ -188,12 +198,15 @@ func (ctx *Context) DeleteInstance(name string, wait bool) error { } func (ctx *Context) IsInstanceRunning(name string) bool { - <-ctx.apiRateGate - instance, err := ctx.computeService.Instances.Get(ctx.ProjectID, ctx.ZoneID, name).Do() + var inst *compute.Instance + err := ctx.apiCall(func() (err error) { + inst, err = ctx.computeService.Instances.Get(ctx.ProjectID, ctx.ZoneID, name).Do() + return + }) if err != nil { return false } - return instance.Status == "RUNNING" + return inst.Status == "RUNNING" } func (ctx *Context) CreateImage(imageName, gcsFile string) error { @@ -206,16 +219,21 @@ func (ctx *Context) CreateImage(imageName, gcsFile string) error { "https://www.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx", }, } - <-ctx.apiRateGate - op, err := ctx.computeService.Images.Insert(ctx.ProjectID, image).Do() + var op *compute.Operation + err := ctx.apiCall(func() (err error) { + op, err = ctx.computeService.Images.Insert(ctx.ProjectID, image).Do() + return + }) if err != nil { // Try again without the vmx license in case it is not supported. image.Licenses = nil - <-ctx.apiRateGate - op, err = ctx.computeService.Images.Insert(ctx.ProjectID, image).Do() - } - if err != nil { - return fmt.Errorf("failed to create image: %v", err) + err := ctx.apiCall(func() (err error) { + op, err = ctx.computeService.Images.Insert(ctx.ProjectID, image).Do() + return + }) + if err != nil { + return fmt.Errorf("failed to create image: %v", err) + } } if err := ctx.waitForCompletion("global", "create image", op.Name, false); err != nil { return err @@ -224,8 +242,11 @@ func (ctx *Context) CreateImage(imageName, gcsFile string) error { } func (ctx *Context) DeleteImage(imageName string) error { - <-ctx.apiRateGate - op, err := ctx.computeService.Images.Delete(ctx.ProjectID, imageName).Do() + var op *compute.Operation + err := ctx.apiCall(func() (err error) { + op, err = ctx.computeService.Images.Delete(ctx.ProjectID, imageName).Do() + return + }) if apiErr, ok := err.(*googleapi.Error); ok && apiErr.Code == 404 { return nil } @@ -247,17 +268,18 @@ func (err resourcePoolExhaustedError) Error() string { func (ctx *Context) waitForCompletion(typ, desc, opName string, ignoreNotFound bool) error { for { time.Sleep(2 * time.Second) - <-ctx.apiRateGate - var err error var op *compute.Operation - switch typ { - case "global": - op, err = ctx.computeService.GlobalOperations.Get(ctx.ProjectID, opName).Do() - case "zone": - op, err = ctx.computeService.ZoneOperations.Get(ctx.ProjectID, ctx.ZoneID, opName).Do() - default: - panic("unknown operation type: " + typ) - } + err := ctx.apiCall(func() (err error) { + switch typ { + case "global": + op, err = ctx.computeService.GlobalOperations.Get(ctx.ProjectID, opName).Do() + case "zone": + op, err = ctx.computeService.ZoneOperations.Get(ctx.ProjectID, ctx.ZoneID, opName).Do() + default: + panic("unknown operation type: " + typ) + } + return + }) if err != nil { return fmt.Errorf("failed to get %v operation %v: %v", desc, opName, err) } @@ -302,3 +324,23 @@ func (ctx *Context) getMeta(path string) (string, error) { } return string(body), nil } + +func (ctx *Context) apiCall(fn func() error) error { + rateLimited := 0 + for { + <-ctx.apiRateGate + err := fn() + if err != nil { + if strings.Contains(err.Error(), "Rate Limit Exceeded") || + strings.Contains(err.Error(), "rateLimitExceeded") { + rateLimited++ + backoff := time.Duration(float64(rateLimited) * 1e9 * (rand.Float64() + 1)) + time.Sleep(backoff) + if rateLimited < 20 { + continue + } + } + } + return err + } +} -- cgit mrf-deployment