// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package state

import (
	"context"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"time"

	"github.com/go-ozzo/ozzo-validation/v4"
	"github.com/hashicorp/go-memdb"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/timestamppb"
	"gorm.io/gorm"
	// "gorm.io/gorm/clause"

	"github.com/hashicorp/vagrant/internal/server"
	"github.com/hashicorp/vagrant/internal/server/logbuffer"
	"github.com/hashicorp/vagrant/internal/server/proto/vagrant_server"
)

func init() {
	models = append(models, &InternalJob{})
	dbIndexers = append(dbIndexers, (*State).jobIndexInit)
	schemas = append(schemas, jobSchema)
}

var (
	jobBucket = []byte("jobs")

	jobWaitingTimeout   = 2 * time.Minute
	jobHeartbeatTimeout = 2 * time.Minute
)

const (
	jobTableName          = "jobs"
	jobIdIndexName        = "id"
	jobStateIndexName     = "state"
	jobQueueTimeIndexName = "queue-time"
	jobTargetIdIndexName  = "target-id"

	maximumJobsInMem = 10000
)

type JobState uint8

const (
	JOB_STATE_UNKNOWN JobState = JobState(vagrant_server.Job_UNKNOWN)
	JOB_STATE_QUEUED           = JobState(vagrant_server.Job_QUEUED)
	JOB_STATE_WAITING          = JobState(vagrant_server.Job_WAITING)
	JOB_STATE_RUNNING          = JobState(vagrant_server.Job_RUNNING)
	JOB_STATE_ERROR            = JobState(vagrant_server.Job_ERROR)
	JOB_STATE_SUCCESS          = JobState(vagrant_server.Job_SUCCESS)
)

type InternalJob struct {
	Model

	AssignTime          *time.Time
	AckTime             *time.Time
	AssignedRunnerID    *uint `mapstructure:"-"`
	AssignedRunner      *Runner
	CancelTime          *time.Time
	CompleteTime        *time.Time
	DataSource          *ProtoValue
	DataSourceOverrides MetadataSet
	Error               *ProtoValue
	ExpireTime          *time.Time
	Labels              MetadataSet
	Jid                 *string     `gorm:"uniqueIndex" mapstructure:"Id"`
	Operation           *ProtoValue `mapstructure:"Operation"`
	QueueTime           *time.Time
	Result              *ProtoValue
	Scope               scope  `gorm:"-:all"`
	ScopeID             *uint  `mapstructure:"-"`
	ScopeType           string `mapstructure:"-"`
	State               JobState
	TargetRunner        *ProtoValue
}

// Job should come with an ID assigned to it, but if it doesn't for
// some reason, we assign one now.
func (i *InternalJob) BeforeCreate(tx *gorm.DB) error {
	if i.Jid == nil {
		id, err := server.Id()
		if err != nil {
			return err
		}
		i.Jid = &id
	}

	return nil
}

// If the job has a scope assigned to it, persist it.
func (i *InternalJob) BeforeSave(tx *gorm.DB) (err error) {
	if i.Scope == nil {
		i.ScopeID = nil
		i.ScopeType = ""
		return nil
	}

	switch v := i.Scope.(type) {
	case *Basis:
		i.ScopeID = &v.ID
		i.ScopeType = "basis"
	case *Project:
		i.ScopeID = &v.ID
		i.ScopeType = "project"
	case *Target:
		i.ScopeID = &v.ID
		i.ScopeType = "target"
	default:
		return fmt.Errorf("unknown scope type (%T)", i.Scope)
	}

	return nil
}

// If the job has a scope, load it.
func (i *InternalJob) AfterFind(tx *gorm.DB) (err error) {
	if i.ScopeID == nil {
		return nil
	}

	switch i.ScopeType {
	case "basis":
		var b Basis
		result := tx.First(&b, &Basis{Model: Model{ID: *i.ScopeID}})
		if result.Error != nil {
			return result.Error
		}
		i.Scope = &b
	case "project":
		var p Project
		result := tx.First(&p, &Project{Model: Model{ID: *i.ScopeID}})
		if result.Error != nil {
			return result.Error
		}
		i.Scope = &p
	case "target":
		var t Target
		result := tx.First(&t, &Target{Model: Model{ID: *i.ScopeID}})
		if result.Error != nil {
			return result.Error
		}
		i.Scope = &t
	default:
		return fmt.Errorf("unknown scope type (%s)", i.ScopeType)
	}

	return nil
}
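
// Usage sketch (illustrative only; db and target are assumed to exist in the
// caller's context): the Scope field is polymorphic in memory but is persisted
// as a (ScopeID, ScopeType) pair by BeforeSave and rehydrated by AfterFind.
//
//	job := &InternalJob{Scope: target} // target is a previously loaded *Target
//	_ = db.Save(job)                   // BeforeSave stores ScopeID/ScopeType as (target.ID, "target")
//
//	var loaded InternalJob
//	_ = db.First(&loaded, job.ID)      // AfterFind reloads loaded.Scope as a *Target
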
func (i *InternalJob) Validate(tx *gorm.DB) error {
	return validation.ValidateStruct(i,
		validation.Field(&i.Jid,
			validation.Required,
			validation.By(
				checkUnique(
					tx.Model((*InternalJob)(nil)).
						Where(&InternalJob{Jid: i.Jid}).
						Not(&InternalJob{Model: Model{ID: i.ID}}),
				),
			),
		),
	)
}

// Convert job to a protobuf message
func (i *InternalJob) ToProto() *vagrant_server.Job {
	if i == nil {
		return nil
	}

	var j vagrant_server.Job
	err := decode(i, &j)
	if err != nil {
		panic("failed to decode job: " + err.Error())
	}

	return &j
}

func (s *State) InternalJobFromProto(job *vagrant_server.Job) (*InternalJob, error) {
	if job == nil {
		return nil, ErrEmptyProtoArgument
	}

	if job.Id == "" {
		return nil, gorm.ErrRecordNotFound
	}

	var j InternalJob
	result := s.search().First(&j, &InternalJob{Jid: &job.Id})
	if result.Error != nil {
		if !errors.Is(result.Error, gorm.ErrRecordNotFound) {
			panic(result.Error)
		}
		return nil, result.Error
	}

	return &j, nil
}

func jobSchema() *memdb.TableSchema {
	return &memdb.TableSchema{
		Name: jobTableName,
		Indexes: map[string]*memdb.IndexSchema{
			jobIdIndexName: {
				Name:         jobIdIndexName,
				AllowMissing: false,
				Unique:       true,
				Indexer: &memdb.StringFieldIndex{
					Field: "Id",
				},
			},

			jobStateIndexName: {
				Name:         jobStateIndexName,
				AllowMissing: true,
				Unique:       false,
				Indexer: &memdb.IntFieldIndex{
					Field: "State",
				},
			},

			jobQueueTimeIndexName: {
				Name:         jobQueueTimeIndexName,
				AllowMissing: true,
				Unique:       false,
				Indexer: &memdb.CompoundIndex{
					Indexes: []memdb.Indexer{
						&memdb.IntFieldIndex{
							Field: "State",
						},

						&IndexTime{
							Field: "QueueTime",
							Asc:   true,
						},
					},
				},
			},

			jobTargetIdIndexName: {
				Name:         jobTargetIdIndexName,
				AllowMissing: true,
				Unique:       true,
				Indexer: &memdb.CompoundIndex{
					Indexes: []memdb.Indexer{
						&memdb.IntFieldIndex{
							Field: "State",
						},

						&memdb.StringFieldIndex{
							Field:     "TargetRunnerId",
							Lowercase: true,
						},

						&IndexTime{
							Field: "QueueTime",
							Asc:   true,
						},
					},
				},
			},
		},
	}
}

type jobIndex struct {
	Id string

	// OpType is the operation type for the job.
	OpType reflect.Type

	// The basis/project/machine that this job is part of. This is used
	// to determine if the job is blocked. See job_assigned.go for more details.
	Scope interface {
		GetResourceId() string
	}

	// QueueTime is the time that the job was queued.
	QueueTime time.Time

	// TargetAny will be true if this job targets anything
	TargetAny bool

	// TargetRunnerId is the ID of the runner to target.
	TargetRunnerId string

	// State is the current state of this job.
	State vagrant_server.Job_State

	// StateTimer holds a timer that is usually acting as a timeout mechanism
	// on the current state. When the state changes, the timer should be cancelled.
	StateTimer *time.Timer

	// OutputBuffer stores the terminal output
	OutputBuffer *logbuffer.Buffer
}

// A helper, pulled out rather than on a value to allow it to be used against
// vagrant_server.Jobs and jobIndexes alike.
func jobIsCompleted(state vagrant_server.Job_State) bool {
	switch state {
	case vagrant_server.Job_ERROR, vagrant_server.Job_SUCCESS:
		return true

	default:
		return false
	}
}

// Job is the exported structure that is returned for most state APIs
// and gives callers access to more information than the pure job structure.
type Job struct {
	// Full job structure.
	*vagrant_server.Job

	// OutputBuffer is the terminal output for this job. This is a buffer
	// that may not contain the full amount of output depending on the
	// time of connection.
	OutputBuffer *logbuffer.Buffer

	// Blocked is true if this job is blocked on another job for the same
	// scope (basis/project/target).
	Blocked bool
}
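
// Usage sketch (illustrative only; st is assumed to be a *State): a job is
// stored twice, as an InternalJob row in the gorm database and as a jobIndex
// record in the in-memory go-memdb table described by jobSchema. The protobuf
// message is the common interchange form between the two.
//
//	rec, err := st.InternalJobFromProto(&vagrant_server.Job{Id: "some-job-id"})
//	if err == nil {
//		jobpb := rec.ToProto() // back to the protobuf representation
//		_ = jobpb
//	}
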
func (s *State) JobValidate(jobpb *vagrant_server.Job) error {
	var err error
	var job *InternalJob
	if job, err = s.InternalJobFromProto(jobpb); err != nil {
		return errorToStatus(err)
	}
	if err := s.softDecode(jobpb, &job); err != nil {
		return errorToStatus(err)
	}
	if err := job.Validate(s.db); err != nil {
		return errorToStatus(err)
	}

	return nil
}

// JobCreate queues the given job.
func (s *State) JobCreate(jobpb *vagrant_server.Job) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	err := s.jobCreate(txn, jobpb)
	if err == nil {
		txn.Commit()
	}

	if err != nil {
		return lookupErrorToStatus("job", err)
	}

	return nil
}

// JobList returns the list of jobs.
func (s *State) JobList() ([]*vagrant_server.Job, error) {
	memTxn := s.inmem.Txn(false)
	defer memTxn.Abort()

	iter, err := memTxn.Get(jobTableName, jobIdIndexName+"_prefix", "")
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}

	var result []*vagrant_server.Job
	for {
		next := iter.Next()
		if next == nil {
			break
		}
		idx := next.(*jobIndex)
		job, err := s.jobById(idx.Id)
		if err != nil {
			return nil, lookupErrorToStatus("job", err)
		}
		result = append(result, job)
	}

	return result, nil
}

// JobById looks up a job by ID. The returned Job will be a deep copy
// of the job so it is safe to read/write. If the job can't be found,
// a nil result with no error is returned.
func (s *State) JobById(id string, ws memdb.WatchSet) (*Job, error) {
	memTxn := s.inmem.Txn(false)
	defer memTxn.Abort()

	watchCh, raw, err := memTxn.FirstWatch(jobTableName, jobIdIndexName, id)
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}
	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	jobIdx := raw.(*jobIndex)

	// Get blocked status if it is queued.
	var blocked bool
	if jobIdx.State == vagrant_server.Job_QUEUED {
		blocked, err = s.jobIsBlocked(memTxn, jobIdx, ws)
		if err != nil {
			return nil, lookupErrorToStatus("job", err)
		}
	}

	job, err := s.jobById(jobIdx.Id)
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}

	result := jobIdx.Job(job)
	result.Blocked = blocked

	return result, nil
}
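
// Usage sketch (illustrative only; st, ctx, and jobId are hypothetical): the
// memdb.WatchSet passed to JobById can be used to block until the job record
// changes, which lets callers follow a job's state without busy-looping.
//
//	ws := memdb.NewWatchSet()
//	job, err := st.JobById(jobId, ws)
//	if err == nil && job != nil && !jobIsCompleted(job.State) {
//		ws.WatchCtx(ctx) // returns when the job is modified or ctx is done
//	}
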
// JobAssignForRunner will wait for and assign a job to a specific runner.
// This will automatically evaluate any conditions that the runner and/or
// job may have on assignability.
//
// The assigned job is put into a "waiting" state until the runner
// acks the assignment which can be set with JobAck.
//
// If ctx is provided and assignment has to block waiting for new jobs,
// this will cancel when the context is done.
func (s *State) JobAssignForRunner(ctx context.Context, r *vagrant_server.Runner) (*Job, error) {
RETRY_ASSIGN:
	txn := s.inmem.Txn(false)
	defer txn.Abort()

	// Turn our runner into a runner record so we can more efficiently assign
	runnerRec, err := s.RunnerFromProto(r)
	if err != nil {
		return nil, fmt.Errorf("runner lookup failed: %w", err)
	}

	// candidateQuery finds candidate jobs to assign.
	type candidateFunc func(*memdb.Txn, memdb.WatchSet, *Runner) (*jobIndex, error)
	candidateQuery := []candidateFunc{
		s.jobCandidateById,
		s.jobCandidateAny,
	}

	// If the runner is by id only, then explicitly set it to by id only.
	// We explicitly set the full list so that if we add more candidate
	// searches in the future, we're unlikely to break this.
	if r.ByIdOnly {
		candidateQuery = []candidateFunc{s.jobCandidateById}
	}

	// Build the list of candidates
	var candidates []*jobIndex
	ws := memdb.NewWatchSet()
	for _, f := range candidateQuery {
		job, err := f(txn, ws, runnerRec)
		if err != nil {
			return nil, err
		}
		if job == nil {
			continue
		}

		candidates = append(candidates, job)
	}

	// If we have no candidates, then we have to wait for a job to show up.
	// We set up a blocking query on the job table for a non-assigned job.
	if len(candidates) == 0 {
		iter, err := txn.Get(jobTableName, jobStateIndexName, vagrant_server.Job_QUEUED)
		if err != nil {
			return nil, err
		}

		ws.Add(iter.WatchCh())
	}

	// We're done reading so abort the transaction
	txn.Abort()

	// If we have a watch channel set that means we didn't find any
	// results and we need to retry after waiting for changes.
	if len(candidates) == 0 {
		ws.WatchCtx(ctx)
		if err := ctx.Err(); err != nil {
			return nil, err
		}

		goto RETRY_ASSIGN
	}

	// We sort our candidates by queue time so that we can find the earliest
	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].QueueTime.Before(candidates[j].QueueTime)
	})

	// Grab a write lock since we're going to delete, modify, add the
	// job that we chose. No need to defer here since the first defer works
	// at the top of the func.
	//
	// Write locks are exclusive so this will ensure we're the only one
	// writing at a time. This lets us be sure we're the only one "assigning"
	// a job candidate.
	txn = s.inmem.Txn(true)
	for _, job := range candidates {
		// Get the job
		raw, err := txn.First(jobTableName, jobIdIndexName, job.Id)
		if err != nil {
			return nil, err
		}
		if raw == nil {
			// The job no longer exists. It may be canceled or something.
			// Invalid candidate, continue to next.
			continue
		}

		// We need to verify that in the time between our candidate search
		// and our write lock acquisition, that this job hasn't been assigned,
		// canceled, etc. If so, this is an invalid candidate.
		job := raw.(*jobIndex)
		if job == nil || job.State != vagrant_server.Job_QUEUED {
			continue
		}

		// We also need to recheck that we aren't blocked. If we're blocked
		// now then we need to skip this job.
		if blocked, err := s.jobIsBlocked(txn, job, nil); blocked {
			continue
		} else if err != nil {
			return nil, err
		}

		// Update our state and update our on-disk job
		job.State = vagrant_server.Job_WAITING
		result, err := s.jobReadAndUpdate(job.Id, func(jobpb *vagrant_server.Job) error {
			jobpb.State = job.State
			jobpb.AssignTime = timestamppb.New(time.Now())

			return nil
		})
		if err != nil {
			return nil, err
		}

		// Create our timer to requeue this if it isn't acked
		job.StateTimer = time.AfterFunc(jobWaitingTimeout, func() {
			s.log.Info("job ack timer expired", "job", job.Id, "timeout", jobWaitingTimeout)
			s.JobAck(job.Id, false)
		})

		if err := txn.Insert(jobTableName, job); err != nil {
			return nil, err
		}

		// Update our assignment state
		if err := s.jobAssignedSet(txn, job, true); err != nil {
			s.JobAck(job.Id, false)
			return nil, err
		}

		txn.Commit()
		return job.Job(result), nil
	}
	txn.Abort()

	// If we reached here, all of our candidates were invalid, we retry
	goto RETRY_ASSIGN
}
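
// Usage sketch (illustrative only; st, ctx, runnerpb, and result are
// hypothetical): the expected runner-side flow for the state machine
// implemented here is assign -> ack -> periodic heartbeat -> complete.
//
//	job, err := st.JobAssignForRunner(ctx, runnerpb) // blocks until a job is available
//	if err == nil {
//		_, _ = st.JobAck(job.Id, true)          // accept; JobAck(job.Id, false) would requeue it
//		_ = st.JobHeartbeat(job.Id)             // repeat within jobHeartbeatTimeout while working
//		_ = st.JobComplete(job.Id, result, nil) // a nil error marks the job SUCCESS
//	}
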
// JobAck acknowledges that a job has been accepted or rejected by the runner.
// If ack is false, then this will move the job back to the queued state
// and be eligible for assignment.
func (s *State) JobAck(id string, ack bool) (*Job, error) {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}
	if raw == nil {
		return nil, status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	// If the job is not in the assigned state, then this is an error.
	if job.State != vagrant_server.Job_WAITING {
		return nil, status.Errorf(codes.FailedPrecondition,
			"job can't be acked from state: %s", job.State.String())
	}

	result, err := s.jobReadAndUpdate(job.Id, func(jobpb *vagrant_server.Job) error {
		if ack {
			// Set to accepted
			job.State = vagrant_server.Job_RUNNING
			jobpb.State = job.State
			jobpb.AckTime = timestamppb.New(time.Now())

			// We also initialize the output buffer here because we can
			// expect output to begin streaming in.
			job.OutputBuffer = logbuffer.New()
		} else {
			// Set to queued
			job.State = vagrant_server.Job_QUEUED
			jobpb.State = job.State
			jobpb.AssignTime = nil
		}

		return nil
	})
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}

	// Cancel our timer
	if job.StateTimer != nil {
		job.StateTimer.Stop()
		job.StateTimer = nil
	}

	// Create a new timer that we'll use for our heartbeat. After this
	// timer expires, the job will immediately move to an error state.
	job.StateTimer = time.AfterFunc(jobHeartbeatTimeout, func() {
		s.log.Info("canceling job due to heartbeat timeout", "job", job.Id)
		// Force cancel
		err := s.JobCancel(job.Id, true)
		if err != nil {
			s.log.Error("error canceling job due to heartbeat failure",
				"error", err, "job", job.Id)
		}
	})

	s.log.Debug("heartbeat timer set", "job", job.Id, "timeout", jobHeartbeatTimeout)

	// Insert to update
	if err := txn.Insert(jobTableName, job); err != nil {
		return nil, saveErrorToStatus("job", err)
	}

	// Update our assigned state if we nacked
	if !ack {
		if err := s.jobAssignedSet(txn, job, false); err != nil {
			return nil, saveErrorToStatus("job", err)
		}
	}

	txn.Commit()
	return job.Job(result), nil
}
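
// Usage sketch (illustrative only; st, jobId, and opErr are hypothetical):
// completing a job with a non-nil error stores the error as a gRPC status on
// the job record and moves it to the ERROR state instead of SUCCESS.
//
//	opErr := status.Errorf(codes.Internal, "operation failed")
//	_ = st.JobComplete(jobId, nil, opErr) // job ends in vagrant_server.Job_ERROR
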
// JobComplete marks a running job as complete. If an error is given,
// the job is marked as failed (a completed state). If no error is given,
// the job is marked as successful.
func (s *State) JobComplete(id string, result *vagrant_server.Job_Result, cerr error) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return lookupErrorToStatus("job", err)
	}
	if raw == nil {
		return status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	// Update our assigned state
	if err := s.jobAssignedSet(txn, job, false); err != nil {
		return saveErrorToStatus("job", err)
	}

	// If the job is not in the running state, then this is an error.
	if job.State != vagrant_server.Job_RUNNING {
		return status.Errorf(codes.FailedPrecondition,
			"job can't be completed from state: %s", job.State.String())
	}

	_, err = s.jobReadAndUpdate(job.Id, func(jobpb *vagrant_server.Job) error {
		// Set to complete, assume success for now
		job.State = vagrant_server.Job_SUCCESS
		jobpb.State = job.State
		jobpb.Result = result
		jobpb.CompleteTime = timestamppb.New(time.Now())
		if cerr != nil {
			job.State = vagrant_server.Job_ERROR
			jobpb.State = job.State

			st, _ := status.FromError(cerr)
			jobpb.Error = st.Proto()
		}

		return nil
	})
	if err != nil {
		return saveErrorToStatus("job", err)
	}

	// End the job
	job.End()

	// Insert to update
	if err := txn.Insert(jobTableName, job); err != nil {
		return saveErrorToStatus("job", err)
	}

	txn.Commit()
	return nil
}

// JobCancel marks a job as cancelled. This will set the internal state
// and request the cancel, but if the job is running then it is up to downstream
// to listen for and react to Job changes for cancellation.
func (s *State) JobCancel(id string, force bool) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return lookupErrorToStatus("job", err)
	}
	if raw == nil {
		return status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	if err := s.jobCancel(txn, job, force); err != nil {
		return saveErrorToStatus("job", err)
	}

	txn.Commit()
	return nil
}

func (s *State) jobCancel(txn *memdb.Txn, job *jobIndex, force bool) error {
	oldState := job.State

	// How we handle cancel depends on the state
	switch job.State {
	case vagrant_server.Job_ERROR, vagrant_server.Job_SUCCESS:
		s.log.Debug("attempted to cancel completed job", "state", job.State.String(), "job", job.Id)
		// Jobs that are already completed do nothing for cancellation.
		// We do not mark that they were requested as cancelled since they
		// completed fine.
		return nil

	case vagrant_server.Job_QUEUED:
		// For queued jobs, we immediately transition them to an error state.
		job.State = vagrant_server.Job_ERROR

	case vagrant_server.Job_WAITING, vagrant_server.Job_RUNNING:
		// For these states, we just need to mark it as cancelled and have
		// downstream listeners complete the job. However, if we are forcing
		// then we immediately transition to error.
		if force {
			job.State = vagrant_server.Job_ERROR
			job.End()
		}
	}

	s.log.Debug("changing job state for cancel", "old-state", oldState.String(),
		"new-state", job.State.String(), "job", job.Id, "force", force)

	if force && job.State == vagrant_server.Job_ERROR {
		// Update our assigned state to unblock future jobs
		if err := s.jobAssignedSet(txn, job, false); err != nil {
			return err
		}
	}

	// Persist the on-disk data
	_, err := s.jobReadAndUpdate(job.Id, func(jobpb *vagrant_server.Job) error {
		jobpb.State = job.State
		jobpb.CancelTime = timestamppb.New(time.Now())

		// If we transitioned to the error state we note that we were force
		// cancelled. We can only be in the error state under that scenario
		// since otherwise we would've returned early.
		if jobpb.State == vagrant_server.Job_ERROR {
			jobpb.Error = status.New(codes.Canceled, "canceled").Proto()
		}

		return nil
	})
	if err != nil {
		return err
	}

	// Store the inmem data
	// This will be seen by a currently running RunnerJobStream goroutine, which
	// will then see that the job has been canceled and send the request to cancel
	// down to the runner.
	if err := txn.Insert(jobTableName, job); err != nil {
		return err
	}

	return nil
}
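
// Usage sketch (illustrative only; st and jobId are hypothetical): a
// non-forced cancel of a WAITING or RUNNING job only records the cancel
// request (CancelTime) and relies on the runner to stop the work; a forced
// cancel transitions the job to ERROR immediately.
//
//	_ = st.JobCancel(jobId, false) // cooperative: runner is expected to finish the job
//	_ = st.JobCancel(jobId, true)  // force: job ends now in vagrant_server.Job_ERROR
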
// JobHeartbeat resets the heartbeat timer for a running job. If the job
// is not currently running this does nothing and does not return an error.
// If the job doesn't exist then this will return an error.
func (s *State) JobHeartbeat(id string) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	if err := s.jobHeartbeat(txn, id); err != nil {
		return err
	}

	txn.Commit()
	return nil
}

func (s *State) jobHeartbeat(txn *memdb.Txn, id string) error {
	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return err
	}
	if raw == nil {
		return status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	// If the job is not in the running state, we do nothing.
	if job.State != vagrant_server.Job_RUNNING {
		return nil
	}

	// If the state timer is nil... that is weird but we ignore it here.
	// It is up to other parts of the job system to ensure a running
	// job has a heartbeat timer.
	if job.StateTimer == nil {
		s.log.Info("job with no start timer detected", "job", id)
		return nil
	}

	// Reset the timer
	job.StateTimer.Reset(jobHeartbeatTimeout)

	return nil
}

// JobExpire expires a job. This will cancel the job if it is still queued.
func (s *State) JobExpire(id string) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return err
	}
	if raw == nil {
		return status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	// How we handle depends on the state
	switch job.State {
	case vagrant_server.Job_QUEUED, vagrant_server.Job_WAITING:
		if err := s.jobCancel(txn, job, false); err != nil {
			return err
		}

	default:
	}

	txn.Commit()
	return nil
}

// JobIsAssignable returns whether there is a registered runner that
// meets the requirements to run this job.
//
// If this returns true, a queued job should eventually be assigned
// successfully to a runner. An assignable result does NOT mean that it will
// spend only a short amount of time in the queue.
//
// Note the result is a point-in-time result. If the only candidate runners
// deregister between this returning true and queueing, the job may still
// sit in a queue indefinitely.
func (s *State) JobIsAssignable(ctx context.Context, jobpb *vagrant_server.Job) (bool, error) {
	// If we have no runners, we cannot be assigned
	empty, err := s.runnerEmpty()
	if err != nil {
		return false, err
	}
	if empty {
		return false, nil
	}

	// If we have a special targeting constraint, that has to be met
	tx := s.db.Model(&Runner{})
	switch v := jobpb.TargetRunner.Target.(type) {
	case *vagrant_server.Ref_Runner_Any:
		tx = tx.Where("by_id_only = ?", false)
	case *vagrant_server.Ref_Runner_Id:
		tx = tx.Where("rid = ?", v.Id.Id)
	default:
		return false, fmt.Errorf("unknown runner target value: %#v", jobpb.TargetRunner.Target)
	}

	var c int64
	result := tx.Count(&c)
	if result.Error != nil {
		return false, result.Error
	}

	return c > 0, result.Error
}
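
// Usage sketch (illustrative only; st, ctx, and jobpb are hypothetical): a
// caller can check assignability before queueing to fail fast when no
// registered runner matches the job's target.
//
//	ok, err := st.JobIsAssignable(ctx, jobpb)
//	if err == nil && ok {
//		_ = st.JobCreate(jobpb) // queue it; JobAssignForRunner will pick it up
//	}
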
// jobIndexInit initializes the config index from persisted data.
func (s *State) jobIndexInit(memTxn *memdb.Txn) error {
	var jobs []InternalJob
	// Get all jobs which are not completed
	result := s.search().
		Where(&InternalJob{State: JOB_STATE_UNKNOWN}).
		Or(&InternalJob{State: JOB_STATE_QUEUED}).
		Or(&InternalJob{State: JOB_STATE_WAITING}).
		Or(&InternalJob{State: JOB_STATE_RUNNING}).
		Find(&jobs)
	if result.Error != nil {
		return result.Error
	}

	// Load all incomplete jobs into memory
	for _, j := range jobs {
		job := j.ToProto()
		if j.Jid == nil {
			continue
		}
		idx, err := s.jobIndexSet(memTxn, []byte(*j.Jid), job)
		if err != nil {
			return err
		}

		// If the job was running or waiting, set it as assigned.
		if j.State == JOB_STATE_WAITING || j.State == JOB_STATE_RUNNING {
			if err = s.jobAssignedSet(memTxn, idx, true); err != nil {
				return err
			}
		}
	}

	return nil
}

// jobIndexSet writes an index record for a single job.
func (s *State) jobIndexSet(txn *memdb.Txn, id []byte, jobpb *vagrant_server.Job) (*jobIndex, error) {
	rec := &jobIndex{
		Id:     jobpb.Id,
		State:  jobpb.State,
		OpType: reflect.TypeOf(jobpb.Operation),
	}

	switch v := jobpb.Scope.(type) {
	case *vagrant_server.Job_Basis:
		rec.Scope = v.Basis
	case *vagrant_server.Job_Project:
		rec.Scope = v.Project
	case *vagrant_server.Job_Target:
		rec.Scope = v.Target
	}

	// Target
	if jobpb.TargetRunner == nil || jobpb.TargetRunner.Target == nil {
		return nil, fmt.Errorf("job target runner must be set")
	}
	switch v := jobpb.TargetRunner.Target.(type) {
	case *vagrant_server.Ref_Runner_Any:
		rec.TargetAny = true

	case *vagrant_server.Ref_Runner_Id:
		rec.TargetRunnerId = v.Id.Id

	default:
		return nil, fmt.Errorf("unknown runner target value: %#v", jobpb.TargetRunner)
	}

	// Timestamps
	timestamps := []struct {
		Field *time.Time
		Src   *timestamppb.Timestamp
	}{
		{&rec.QueueTime, jobpb.QueueTime},
	}
	for _, ts := range timestamps {
		err := ts.Src.CheckValid()
		if err != nil {
			return nil, err
		}
		*ts.Field = ts.Src.AsTime()
	}

	// If this job is assigned, then we have to start a nacking timer.
	// We reset the nack timer so it gives runners time to reconnect.
	if rec.State == vagrant_server.Job_WAITING {
		// Create our timer to requeue this if it isn't acked
		rec.StateTimer = time.AfterFunc(jobWaitingTimeout, func() {
			s.JobAck(rec.Id, false)
		})
	}

	// If this job is running, we need to restart a heartbeat timeout.
	// This should only happen on reinit. This is tested.
	if rec.State == vagrant_server.Job_RUNNING {
		rec.StateTimer = time.AfterFunc(jobHeartbeatTimeout, func() {
			// Force cancel
			s.JobCancel(rec.Id, true)
		})
	}

	// If we have an expiry, we need to set a timer to expire this job.
	if jobpb.ExpireTime != nil {
		now := time.Now()

		err := jobpb.ExpireTime.CheckValid()
		if err != nil {
			return nil, err
		}
		dur := jobpb.ExpireTime.AsTime().Sub(now)
		if dur < 0 {
			dur = 1
		}

		time.AfterFunc(dur, func() { s.JobExpire(jobpb.Id) })
	}

	// Insert the index
	return rec, txn.Insert(jobTableName, rec)
}

func (s *State) jobCreate(memTxn *memdb.Txn, jobpb *vagrant_server.Job) error {
	// Setup our initial job state
	var err error
	jobpb.State = vagrant_server.Job_QUEUED
	jobpb.QueueTime = timestamppb.New(time.Now())

	// Convert the job proto into a record
	job, err := s.InternalJobFromProto(jobpb)
	// If the job already exists, force an error
	if job != nil {
		return fmt.Errorf("job already exists, cannot create (jid: %s)", *job.Jid)
	}

	job = &InternalJob{}
	if err = s.softDecode(jobpb, job); err != nil {
		return err
	}

	// Save the record into the db
	result := s.db.Create(job)
	if result.Error != nil {
		return result.Error
	}

	id := []byte(*job.Jid)

	// Insert into the in memory db
	_, err = s.jobIndexSet(memTxn, id, job.ToProto())

	s.pruneMu.Lock()
	defer s.pruneMu.Unlock()
	s.indexedJobs++

	return err
}

func (s *State) jobById(sid string) (*vagrant_server.Job, error) {
	job, err := s.InternalJobFromProto(&vagrant_server.Job{Id: sid})
	if err != nil {
		return nil, err
	}

	return job.ToProto(), nil
}

func (s *State) jobReadAndUpdate(id string, f func(*vagrant_server.Job) error) (*vagrant_server.Job, error) {
	var err error
	if db, err := s.db.DB(); err == nil && db.Ping() != nil {
		return nil, fmt.Errorf("cannot update job information, database closed")
	}

	j, err := s.jobById(id)
	if err != nil {
		return nil, err
	}

	if err := f(j); err != nil {
		return nil, err
	}

	ij, err := s.InternalJobFromProto(j)
	if err != nil {
		return nil, err
	}

	if err := s.softDecode(j, ij); err != nil {
		return nil, err
	}

	result := s.db.Save(ij)
	if result.Error != nil {
		return nil, result.Error
	}

	return ij.ToProto(), nil
}

// jobCandidateById returns the most promising candidate job to assign
// that is targeting a specific runner by ID.
func (s *State) jobCandidateById(memTxn *memdb.Txn, ws memdb.WatchSet, r *Runner) (*jobIndex, error) {
	iter, err := memTxn.LowerBound(
		jobTableName,
		jobTargetIdIndexName,
		vagrant_server.Job_QUEUED,
		*r.Rid,
		time.Unix(0, 0),
	)
	if err != nil {
		return nil, err
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		job := raw.(*jobIndex)
		if job.State != vagrant_server.Job_QUEUED || job.TargetRunnerId != *r.Rid {
			continue
		}

		// If this job is blocked, it is not a candidate.
		if blocked, err := s.jobIsBlocked(memTxn, job, ws); err != nil {
			return nil, err
		} else if blocked {
			continue
		}

		return job, nil
	}

	return nil, nil
}
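
// Illustrative note (sketch only; the runner id is hypothetical): the
// "target-id" compound index used by jobCandidateById is keyed by
// (State, TargetRunnerId, QueueTime), so a LowerBound scan starting at
// (Job_QUEUED, <runner id>, epoch) should visit that runner's queued jobs
// oldest-first given the ascending IndexTime indexer, which is why the first
// non-blocked match can be returned immediately.
//
//	iter, _ := memTxn.LowerBound(jobTableName, jobTargetIdIndexName,
//		vagrant_server.Job_QUEUED, "runner-id", time.Unix(0, 0))
//	_ = iter
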
// jobCandidateAny returns the first candidate job that targets any runner.
func (s *State) jobCandidateAny(memTxn *memdb.Txn, ws memdb.WatchSet, r *Runner) (*jobIndex, error) {
	iter, err := memTxn.LowerBound(
		jobTableName,
		jobQueueTimeIndexName,
		vagrant_server.Job_QUEUED,
		time.Unix(0, 0),
	)
	if err != nil {
		return nil, err
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		job := raw.(*jobIndex)
		if job.State != vagrant_server.Job_QUEUED || !job.TargetAny {
			continue
		}

		// If this job is blocked, it is not a candidate.
		if blocked, err := s.jobIsBlocked(memTxn, job, ws); err != nil {
			return nil, err
		} else if blocked {
			continue
		}

		return job, nil
	}

	return nil, nil
}

func (s *State) jobsPruneOld(memTxn *memdb.Txn, max int) (int, error) {
	// Prune from memdb
	return pruneOld(memTxn, pruneOp{
		lock:      &s.pruneMu,
		table:     jobTableName,
		index:     jobQueueTimeIndexName,
		indexArgs: []interface{}{vagrant_server.Job_QUEUED, time.Unix(0, 0)},
		max:       max,
		cur:       &s.indexedJobs,
		check: func(raw interface{}) bool {
			job := raw.(*jobIndex)
			return !jobIsCompleted(job.State)
		},
	})
}

func (s *State) JobsDBPruneOld(max int) (int, error) {
	var jobs []InternalJob
	result := s.db.Select("id").Order("queue_time asc").Offset(max).Find(&jobs)
	if result.Error != nil {
		return 0, result.Error
	}

	deleted := len(jobs)
	if deleted < 1 {
		return deleted, nil
	}

	result = s.db.Unscoped().Delete(jobs)
	if result.Error != nil {
		return 0, result.Error
	}

	return deleted, nil
}

// Job returns the Job for an index.
func (idx *jobIndex) Job(jobpb *vagrant_server.Job) *Job {
	return &Job{
		Job:          jobpb,
		OutputBuffer: idx.OutputBuffer,
	}
}

// End notes this job is complete and performs any cleanup on the index.
func (idx *jobIndex) End() {
	if idx.StateTimer != nil {
		idx.StateTimer.Stop()
		idx.StateTimer = nil
	}
}
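
// Usage sketch (illustrative only; st is hypothetical and maximumJobsInMem is
// assumed to be the intended cap): periodic maintenance can keep both the
// in-memory index and the persisted job rows bounded by pruning beyond the
// configured limit.
//
//	memTxn := st.inmem.Txn(true)
//	pruned, err := st.jobsPruneOld(memTxn, maximumJobsInMem) // trim the memdb index
//	if err == nil {
//		memTxn.Commit()
//		_, _ = st.JobsDBPruneOld(maximumJobsInMem) // trim persisted job rows beyond the cap
//	} else {
//		memTxn.Abort()
//	}
//	_ = pruned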