From 9d2118d7625f7448c44ecc1d158a608479121ee4 Mon Sep 17 00:00:00 2001 From: Pavel Boldyrev <627562+bpg@users.noreply.github.com> Date: Tue, 17 Dec 2024 14:31:46 -0500 Subject: [PATCH] fix(vm): retry `start` if it fails with a transient error (#1685) Signed-off-by: Pavel Boldyrev <627562+bpg@users.noreply.github.com> --- proxmox/nodes/vms/vms.go | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/proxmox/nodes/vms/vms.go b/proxmox/nodes/vms/vms.go index fded8410..2571a654 100644 --- a/proxmox/nodes/vms/vms.go +++ b/proxmox/nodes/vms/vms.go @@ -423,7 +423,24 @@ func (c *Client) StartVMAsync(ctx context.Context, timeoutSec int) (*string, err } resBody := &StartResponseBody{} - err := c.DoRequest(ctx, http.MethodPost, c.ExpandPath("status/start"), reqBody, resBody) + // PVE may return a 500 error "got no worker upid - start worker failed", so we retry a few times. + err := retry.Do( + func() error { + err := c.DoRequest(ctx, http.MethodPost, c.ExpandPath("status/start"), reqBody, resBody) + if err != nil && strings.Contains(err.Error(), "already running") { + return nil + } + + return err + }, + retry.Context(ctx), + retry.Attempts(3), + retry.Delay(1*time.Second), + retry.LastErrorOnly(true), + retry.RetryIf(func(err error) bool { + return strings.Contains(err.Error(), "got no worker upid") + }), + ) if err != nil { return nil, fmt.Errorf("error starting VM: %w", err) }