// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package jobs3

import (
	"context"
	"fmt"
	"io"
	"math/rand"
	"os"
	"regexp"
	"strings"
	"testing"
	"time"

	"github.com/hashicorp/go-set/v3"
	nomadapi "github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/e2e/v3/util3"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/jobspec2"
	"github.com/shoenig/test"
	"github.com/shoenig/test/must"
	"github.com/shoenig/test/wait"
)

type Submission struct {
	t *testing.T

	nomadClient *nomadapi.Client

	jobSpec       string
	jobID         string
	origJobID     string
	noRandomJobID bool
	noCleanup     bool
	timeout       time.Duration
	verbose       bool
	detach        bool
	dispatcher    bool

	// jobspec mutator funcs
	mutators []func(string) string

	// preCleanup funcs to run before deregistering the job
	preCleanup []func(*Submission)

	vars         Vars
	waitComplete *set.Set[string] // groups to wait until complete

	inNamespace       string
	authToken         string
	legacyConsulToken string
}

func (sub *Submission) queryOptions() *nomadapi.QueryOptions {
	return &nomadapi.QueryOptions{
		Namespace: sub.inNamespace,
		AuthToken: sub.authToken,
	}
}

func (sub *Submission) Evals() []*nomadapi.Evaluation {
	sub.t.Helper()
	evals, _, err := sub.nomadClient.Jobs().
		Evaluations(sub.JobID(), sub.queryOptions())
	must.NoError(sub.t, err)
	return evals
}

func (sub *Submission) Allocs() []*nomadapi.AllocationListStub {
	sub.t.Helper()
	allocs, _, err := sub.nomadClient.Jobs().
		Allocations(sub.jobID, true, sub.queryOptions())
	must.NoError(sub.t, err, must.Sprint("could not get allocs"))
	return allocs
}

type TaskEvents struct {
	Group  string
	Task   string
	Events []*nomadapi.TaskEvent
}

// AllocEvents returns a map of TaskEvents with alloc ID keys
func (sub *Submission) AllocEvents() map[string]TaskEvents {
	sub.t.Helper()
	allocs := sub.Allocs()
	events := make(map[string]TaskEvents)
	for _, alloc := range allocs {
		for task, state := range alloc.TaskStates {
			events[alloc.ID] = TaskEvents{
				Group:  alloc.TaskGroup,
				Task:   task,
				Events: state.Events,
			}
		}
	}
	return events
}

type Logs struct {
	Stdout string
	Stderr string
}

// TaskLogs returns the logs of the given task, using a random allocation of
// the given group.
func (sub *Submission) TaskLogs(group, task string) Logs {
	byAlloc := sub.TaskLogsByAlloc(group, task)
	must.Positive(sub.t, len(byAlloc), must.Sprintf("no allocations found for %s/%s", group, task))
	var result Logs
	for _, logs := range byAlloc {
		result = logs
		break
	}
	return result
}
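// Example: a minimal sketch of fetching task logs from a test. The job file
// path "./input/web.hcl", group "web", and task "http" are hypothetical:
//
//	sub, cleanup := jobs3.Submit(t, "./input/web.hcl")
//	defer cleanup()
//	logs := sub.TaskLogs("web", "http")
//	must.StrContains(t, logs.Stdout, "listening")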
// TaskLogsByAlloc returns the logs of the given task, organized by allocation.
func (sub *Submission) TaskLogsByAlloc(group, task string) map[string]Logs {
	result := make(map[string]Logs)

	// get list of allocs for the job
	queryOpts := sub.queryOptions()
	jobsAPI := sub.nomadClient.Jobs()
	stubs, _, err := jobsAPI.Allocations(sub.jobID, false, queryOpts)
	must.NoError(sub.t, err, must.Sprintf("failed to query allocations for %s/%s", group, task))

	// get logs for each task in the group allocations
	for _, stub := range stubs {
		if stub.TaskGroup == group {
			result[stub.ID] = sub.getTaskLogs(stub.ID, task)
		}
	}
	return result
}

func (sub *Submission) getTaskLogs(allocID, task string) Logs {
	queryOpts := sub.queryOptions()
	allocAPI := sub.nomadClient.Allocations()
	alloc, _, err := allocAPI.Info(allocID, queryOpts)
	must.NoError(sub.t, err, must.Sprintf("failed to query allocation for %s", allocID))

	fsAPI := sub.nomadClient.AllocFS()
	read := func(path string) string {
		var content string
		f := func() error {
			rc, err := fsAPI.ReadAt(alloc, path, 0, 0, queryOpts)
			if err != nil {
				return fmt.Errorf("failed to read alloc %s logs: %w", allocID, err)
			}
			b, err := io.ReadAll(rc)
			if err != nil {
				return fmt.Errorf("failed to read alloc %s logs: %w", allocID, err)
			}
			content = string(b)
			return rc.Close()
		}
		must.Wait(sub.t, wait.InitialSuccess(
			wait.ErrorFunc(f),
			wait.Timeout(15*time.Second),
			wait.Gap(1*time.Second),
		))
		return content
	}

	stdout := fmt.Sprintf("alloc/logs/%s.stdout.0", task)
	stderr := fmt.Sprintf("alloc/logs/%s.stderr.0", task)
	return Logs{
		Stdout: read(stdout),
		Stderr: read(stderr),
	}
}

// JobID provides the (possibly) randomized jobID associated with this Submission.
func (sub *Submission) JobID() string {
	return sub.jobID
}

// AllocID returns the ID of an alloc of the given task group. If there is more than
// one allocation for the task group, an ID is chosen at random. If there is no
// allocation of the given task group the test assertion fails.
func (sub *Submission) AllocID(group string) string {
	queryOpts := sub.queryOptions()
	jobsAPI := sub.nomadClient.Jobs()
	stubs, _, err := jobsAPI.Allocations(sub.jobID, false, queryOpts)
	must.NoError(sub.t, err)
	for _, stub := range stubs {
		if stub.TaskGroup == group {
			return stub.ID
		}
	}
	must.Unreachable(sub.t, must.Sprintf("no alloc id found for group %q", group))
	panic("bug")
}
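// Example: a sketch of targeting one allocation's logs. The group "web" and
// task "http" are hypothetical names, not part of this package:
//
//	allocID := sub.AllocID("web")
//	logs := sub.TaskLogsByAlloc("web", "http")
//	must.MapContainsKey(t, logs, allocID)
//	t.Log(logs[allocID].Stderr)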
func (sub *Submission) logf(msg string, args ...any) {
	sub.t.Helper()
	util3.Log3(sub.t, sub.verbose, msg, args...)
}

func (sub *Submission) cleanup() {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	if sub.noCleanup {
		return
	}
	sub.noCleanup = true // so this isn't attempted more than once

	// deregister the job that was submitted
	jobsAPI := sub.nomadClient.Jobs()
	sub.logf("deregister job %q", sub.jobID)
	_, _, err := jobsAPI.Deregister(sub.jobID, true, &nomadapi.WriteOptions{
		Namespace: sub.inNamespace,
	})
	test.NoError(sub.t, err, test.Sprintf("failed to deregister job %q", sub.jobID))

	// force a system gc just in case
	sysAPI := sub.nomadClient.System()
	sub.logf("system gc")
	err = sysAPI.GarbageCollect()
	test.NoError(sub.t, err, test.Sprint("failed to gc"))

	// todo: should probably loop over the gc until the job is actually gone
}

type Option func(*Submission)

type Cleanup func()

func Submit(t *testing.T, filename string, opts ...Option) (*Submission, Cleanup) {
	t.Helper()

	sub := initialize(t, filename)
	for _, opt := range opts {
		opt(sub)
	}

	sub.setClient() // setup base api clients
	sub.run()       // submit job and wait on deployment
	sub.waits()     // wait on batch/sysbatch allocations

	return sub, sub.cleanup
}

func Namespace(name string) Option {
	return func(sub *Submission) {
		sub.inNamespace = name
	}
}

func AuthToken(token string) Option {
	return func(sub *Submission) {
		sub.authToken = token
	}
}

var (
	idRe = regexp.MustCompile(`(?m)^job "(.*)" \{`)
)

// Rerun re-registers the job, keeping the job ID randomized by the original
// run, and waits on the new evaluation/deployment just like Submit.
func (sub *Submission) Rerun(opts ...Option) {
	sub.noRandomJobID = true
	for _, opt := range opts {
		opt(sub)
	}
	sub.run()
	sub.waits()
}
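// Example: a hypothetical end-to-end submission with options; the file path,
// group name, and variable below are illustrative only:
//
//	sub, cleanup := jobs3.Submit(t, "./input/batch.hcl",
//		jobs3.Timeout(60*time.Second),
//		jobs3.WaitComplete("work"),
//		jobs3.Var("count", "3"),
//	)
//	defer cleanup()
//	must.Positive(t, len(sub.Allocs()))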
func (sub *Submission) run() {
	if !sub.noRandomJobID {
		sub.jobID = fmt.Sprintf("%s-%03d", sub.origJobID, rand.Int()%1000)
		sub.jobSpec = idRe.ReplaceAllString(sub.jobSpec, fmt.Sprintf("job %q {", sub.jobID))
	}

	for _, mut := range sub.mutators {
		sub.jobSpec = mut(sub.jobSpec)
	}

	parseConfig := &jobspec2.ParseConfig{
		// Path
		Body:    []byte(sub.jobSpec),
		AllowFS: true,
		ArgVars: sub.vars.Slice(),
		// VarFiles
		// VarContent
		// Envs
		// Strict
	}

	job, err := jobspec2.ParseWithConfig(parseConfig)
	must.NoError(sub.t, err, must.Sprint("failed to parse job"))
	must.NotNil(sub.t, job)

	if job.Type == nil {
		job.Type = pointer.Of("service")
	}

	registerOpts := &nomadapi.RegisterOptions{
		Submission: &nomadapi.JobSubmission{
			Source:    sub.jobSpec,
			Variables: sub.vars.String(),
		},
	}
	writeOpts := &nomadapi.WriteOptions{
		Namespace: sub.inNamespace,
		AuthToken: sub.authToken,
	}

	jobsAPI := sub.nomadClient.Jobs()
	sub.logf("register (%s) job: %q", *job.Type, sub.jobID)
	regResp, _, err := jobsAPI.RegisterOpts(job, registerOpts, writeOpts)
	must.NoError(sub.t, err)

	if !sub.noCleanup {
		sub.t.Cleanup(sub.cleanup)
	}

	// pre-cleanup callbacks run before main cleanup (reverse order of their
	// addition with t.Cleanup())
	for _, f := range sub.preCleanup {
		sub.t.Cleanup(func() { f(sub) })
	}

	if sub.dispatcher {
		return
	}

	evalID := regResp.EvalID
	queryOpts := &nomadapi.QueryOptions{
		Namespace: sub.inNamespace,
		AuthToken: sub.authToken,
	}

	// setup a context with our submission timeout
	ctx, cancel := context.WithTimeout(context.Background(), sub.timeout)
	defer cancel()

	// we need to go through evals until we find the deployment
	evalAPI := sub.nomadClient.Evaluations()

	// start eval lookup loop
	var deploymentID string
EVAL:
	for {
		// check if we have passed timeout expiration
		select {
		case <-ctx.Done():
			must.Unreachable(sub.t, must.Sprint("timeout reached waiting for eval"))
		default:
		}

		eval, _, err := evalAPI.Info(evalID, queryOpts)
		must.NoError(sub.t, err)

		sub.logf("checking eval: %s, status: %s", evalID, eval.Status)

		switch eval.Status {
		case nomadapi.EvalStatusComplete:
			deploymentID = eval.DeploymentID
			break EVAL
		case nomadapi.EvalStatusFailed:
			must.Unreachable(sub.t, must.Sprintf("eval failed: %s, triggered by: %s, failed allocs: %d",
				eval.StatusDescription, eval.TriggeredBy, len(eval.FailedTGAllocs)))
		case nomadapi.EvalStatusCancelled:
			sub.logf("dumping information about a cancelled evaluation")
			sub.logf("\tJobID: %s", eval.JobID)
			sub.logf("\tNodeID: %s", eval.NodeID)
			sub.logf("\tDeploymentID: %s", eval.DeploymentID)
			sub.logf("\tType: %s", eval.Type)
			sub.logf("\tTriggeredBy: %s", eval.TriggeredBy)
			sub.logf("\tStatus: %s %q", eval.Status, eval.StatusDescription)
			sub.logf("\tPriority: %d", eval.Priority)
			sub.logf("\tBlockedEval: %s", eval.BlockedEval)
			sub.logf("\tClassEligibility: %v", eval.ClassEligibility)
			sub.logf("\tQuotaLimitReached: %s", eval.QuotaLimitReached)
			for group, metric := range eval.FailedTGAllocs {
				sub.logf("\t[%s]: %v", group, metric)
			}
			sub.logf("eval dump complete")
			must.Unreachable(sub.t, must.Sprintf("eval canceled: %s", eval.StatusDescription))
		default:
			time.Sleep(1 * time.Second)
		}

		nextEvalID := eval.NextEval
		if nextEvalID != "" {
			evalID = nextEvalID
			continue
		}
	}

	if sub.detach {
		return
	}

	switch *job.Type {
	case "service":
		// need to monitor the deployment until it is complete
		depAPI := sub.nomadClient.Deployments()
	DEPLOY:
		for {
			// check if we have passed timeout expiration
			select {
			case <-ctx.Done():
				must.Unreachable(sub.t, must.Sprint("timeout reached waiting for deployment"))
			default:
			}

			dep, _, err := depAPI.Info(deploymentID, queryOpts)
			must.NoError(sub.t, err)

			sub.logf("checking deployment: %s, status: %s", dep.ID, dep.Status)

			switch dep.Status {
			case nomadapi.DeploymentStatusBlocked:
				must.Unreachable(sub.t, must.Sprint("deployment is blocked"))
			case nomadapi.DeploymentStatusCancelled:
				must.Unreachable(sub.t, must.Sprint("deployment is cancelled"))
			case nomadapi.DeploymentStatusFailed:
				must.Unreachable(sub.t, must.Sprint("deployment is failed"))
			case nomadapi.DeploymentStatusPaused:
				must.Unreachable(sub.t, must.Sprint("deployment is paused"))
			case nomadapi.DeploymentStatusPending:
				break
			case nomadapi.DeploymentStatusRunning:
				break
			case nomadapi.DeploymentStatusSuccessful:
				sub.logf("deployment %s was a success", dep.ID)
				break DEPLOY
			case nomadapi.DeploymentStatusUnblocking:
				must.Unreachable(sub.t, must.Sprint("deployment is unblocking"))
			default:
				break
			}
			time.Sleep(1 * time.Second)
		}
		// todo: more job types
	default:
	}
}
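// Example: a sketch of registering a job without monitoring its deployment.
// Detach still waits for the evaluation to complete, but skips the deployment
// watch; "./input/canary.hcl" is a hypothetical path:
//
//	sub, cleanup := jobs3.Submit(t, "./input/canary.hcl", jobs3.Detach())
//	defer cleanup()
//	must.Positive(t, len(sub.Evals()))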
func (sub *Submission) waitAlloc(group, id string) {
	queryOpts := sub.queryOptions()
	allocAPI := sub.nomadClient.Allocations()

	// Set up a context with our submission timeout.
	ctx, cancel := context.WithTimeout(context.Background(), sub.timeout)
	defer cancel()

ALLOCATION:
	for {
		// Check if we have passed timeout expiration.
		select {
		case <-ctx.Done():
			must.Unreachable(sub.t, must.Sprint("timeout reached waiting for alloc"))
		default:
		}

		latest, _, err := allocAPI.Info(id, queryOpts)
		must.NoError(sub.t, err)

		status := latest.ClientStatus
		sub.logf("wait for %q allocation %s, status: %s", group, id, status)

		switch status {
		case nomadapi.AllocClientStatusLost:
			must.Unreachable(sub.t, must.Sprintf("group %q allocation %s lost", group, id))
		case nomadapi.AllocClientStatusFailed:
			must.Unreachable(sub.t, must.Sprintf("group %q allocation %s failed", group, id))
		case nomadapi.AllocClientStatusPending:
			break
		case nomadapi.AllocClientStatusRunning:
			break
		case nomadapi.AllocClientStatusComplete:
			break ALLOCATION
		}
		time.Sleep(1 * time.Second)
	}
}

func (sub *Submission) waits() {
	queryOpts := sub.queryOptions()
	jobsAPI := sub.nomadClient.Jobs()
	allocations, _, err := jobsAPI.Allocations(sub.jobID, false, queryOpts)
	must.NoError(sub.t, err)

	// for each alloc, if this is an alloc we want to wait on, wait on it
	for _, alloc := range allocations {
		id := alloc.ID
		group := alloc.TaskGroup
		if sub.waitComplete.Contains(group) {
			sub.waitAlloc(group, id)
		}
	}
}

func (sub *Submission) setClient() {
	nomadClient, nomadErr := nomadapi.NewClient(nomadapi.DefaultConfig())
	must.NoError(sub.t, nomadErr, must.Sprint("failed to create nomad api client"))
	sub.nomadClient = nomadClient
}

func initialize(t *testing.T, filename string) *Submission {
	b, err := os.ReadFile(filename)
	must.NoError(t, err, must.Sprintf("failed to read job file %q", filename))

	job := string(b)
	jobID := idRe.FindStringSubmatch(job)[1]
	must.NotEq(t, "", jobID, must.Sprintf("could not find job id in %q", filename))

	return &Submission{
		t:            t,
		jobSpec:      job,
		jobID:        jobID,
		origJobID:    jobID,
		timeout:      20 * time.Second,
		vars:         Vars{},
		waitComplete: set.New[string](0),
		preCleanup:   []func(*Submission){defaultPreCleanup},
	}
}

func DisableRandomJobID() Option {
	return func(sub *Submission) {
		sub.noRandomJobID = true
	}
}

func DisableCleanup() Option {
	return func(sub *Submission) {
		sub.noCleanup = true
	}
}

func Detach() Option {
	return func(c *Submission) {
		c.detach = true
	}
}

func MutateJobSpec(mut func(string) string) Option {
	return func(c *Submission) {
		c.mutators = append(c.mutators, mut)
	}
}

func ReplaceInJobSpec(old, new string) Option {
	return MutateJobSpec(func(j string) string {
		return strings.ReplaceAll(j, old, new)
	})
}

func Timeout(timeout time.Duration) Option {
	return func(c *Submission) {
		c.timeout = timeout
	}
}

// Verbose will turn on verbose logging.
func Verbose(on bool) Option {
	return func(c *Submission) {
		c.verbose = on
	}
}

// Var sets an HCL variable.
func Var(key, value string) Option {
	return func(sub *Submission) {
		sub.vars[key] = value
	}
}

type Vars map[string]string

func (v Vars) Slice() []string {
	s := make([]string, 0, len(v))
	for k, v := range v {
		s = append(s, fmt.Sprintf("%s=%s", k, v))
	}
	return s
}

func (v Vars) String() string {
	s := ""
	for k, v := range v {
		s = s + fmt.Sprintf("%s=%q\n", k, v)
	}
	return s
}

// WaitComplete will wait until all allocations of the given group are in the
// "complete" state; the test fails if the timeout elapses first or if an
// allocation becomes lost or failed.
func WaitComplete(group string) Option {
	return func(sub *Submission) {
		sub.waitComplete.Insert(group)
	}
}
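// Example: a sketch of passing HCL variables and waiting on a batch group.
// The path "./input/oneshot.hcl", the "image" variable, and the group
// "runner" are hypothetical:
//
//	sub, cleanup := jobs3.Submit(t, "./input/oneshot.hcl",
//		jobs3.Var("image", "busybox:1"),
//		jobs3.WaitComplete("runner"),
//	)
//	defer cleanup()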
// PreCleanup runs a function after run has completed, before cleanup.
func PreCleanup(cb func(*Submission)) Option {
	return func(sub *Submission) {
		sub.preCleanup = append(sub.preCleanup, cb)
	}
}

// Dispatcher indicates the job is the parent for dispatched jobs, so we
// shouldn't wait for evals or deployments.
func Dispatcher() Option {
	return func(sub *Submission) {
		sub.dispatcher = true
	}
}

// defaultPreCleanup looks for blocked evals, alloc errors, and task events
// only when the test has failed.
func defaultPreCleanup(job *Submission) {
	if !job.t.Failed() {
		return
	}
	for _, eval := range job.Evals() {
		for group, block := range eval.FailedTGAllocs {
			job.t.Logf("eval for tg '%s' failed; constraints: %+v", group, block.ConstraintFiltered)
		}
	}
	for _, alloc := range job.Allocs() {
		job.t.Logf("tg '%s' alloc status '%s': %s", alloc.TaskGroup, alloc.ClientStatus, alloc.ClientDescription)
	}
	for _, ae := range job.AllocEvents() {
		for _, event := range ae.Events {
			job.t.Logf("tg '%s' task '%s' event: %s", ae.Group, ae.Task, event.DisplayMessage)
		}
	}
}

// SkipEvalComplete will skip waiting for the evaluation(s) to be complete.
//
// Implies SkipDeploymentHealthy.
func SkipEvalComplete() Option { panic("not yet implemented") }

// SkipDeploymentHealthy will skip waiting for the deployment to become
// healthy.
func SkipDeploymentHealthy() Option { panic("not yet implemented") }

func LegacyConsulToken(token string) Option {
	return func(c *Submission) {
		c.legacyConsulToken = token
	}
}
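// Example: a sketch of combining Dispatcher with a custom PreCleanup hook;
// "./input/dispatch.hcl" stands in for a hypothetical parameterized job file:
//
//	sub, cleanup := jobs3.Submit(t, "./input/dispatch.hcl",
//		jobs3.Dispatcher(),
//		jobs3.PreCleanup(func(s *jobs3.Submission) {
//			t.Logf("deregistering %s; %d allocs", s.JobID(), len(s.Allocs()))
//		}),
//	)
//	defer cleanup()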