package state

import (
	"context"
	"errors"
	"fmt"
	"reflect"
	"sort"
	"time"

	"github.com/go-ozzo/ozzo-validation/v4"
	"github.com/hashicorp/go-memdb"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/types/known/timestamppb"
	"gorm.io/gorm"
	// "gorm.io/gorm/clause"

	"github.com/hashicorp/vagrant/internal/server"
	"github.com/hashicorp/vagrant/internal/server/logbuffer"
	"github.com/hashicorp/vagrant/internal/server/proto/vagrant_server"
)

func init() {
	models = append(models, &InternalJob{})
	dbIndexers = append(dbIndexers, (*State).jobIndexInit)
	schemas = append(schemas, jobSchema)
}

var (
	jobBucket           = []byte("jobs")
	jobWaitingTimeout   = 2 * time.Minute
	jobHeartbeatTimeout = 2 * time.Minute
)

const (
	jobTableName          = "jobs"
	jobIdIndexName        = "id"
	jobStateIndexName     = "state"
	jobQueueTimeIndexName = "queue-time"
	jobTargetIdIndexName  = "target-id"
	maximumJobsInMem      = 10000
)
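
// JobState is the persisted representation of a job's lifecycle state,
// mirroring the vagrant_server.Job_State protobuf values.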
type JobState uint8

const (
	JOB_STATE_UNKNOWN JobState = JobState(vagrant_server.Job_UNKNOWN)
	JOB_STATE_QUEUED           = JobState(vagrant_server.Job_QUEUED)
	JOB_STATE_WAITING          = JobState(vagrant_server.Job_WAITING)
	JOB_STATE_RUNNING          = JobState(vagrant_server.Job_RUNNING)
	JOB_STATE_ERROR            = JobState(vagrant_server.Job_ERROR)
	JOB_STATE_SUCCESS          = JobState(vagrant_server.Job_SUCCESS)
)
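
// InternalJob is the GORM model used to persist jobs in the database.
// Its exported fields are decoded to and from the vagrant_server.Job
// protobuf message.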
type InternalJob struct {
	Model

	AssignTime          *time.Time
	AckTime             *time.Time
	AssignedRunnerID    *uint `mapstructure:"-"`
	AssignedRunner      *Runner
	CancelTime          *time.Time
	CompleteTime        *time.Time
	DataSource          *ProtoValue
	DataSourceOverrides MetadataSet
	Error               *ProtoValue
	ExpireTime          *time.Time
	Labels              MetadataSet
	Jid                 *string     `gorm:"uniqueIndex" mapstructure:"Id"`
	Operation           *ProtoValue `mapstructure:"Operation"`
	QueueTime           *time.Time
	Result              *ProtoValue
	Scope               scope  `gorm:"-:all"`
	ScopeID             *uint  `mapstructure:"-"`
	ScopeType           string `mapstructure:"-"`
	State               JobState
	TargetRunner        *ProtoValue
}

// BeforeCreate ensures the job has an ID. Jobs should arrive with an ID
// already assigned, but if one is missing we generate it here.
func (i *InternalJob) BeforeCreate(tx *gorm.DB) error {
	if i.Jid == nil {
		id, err := server.Id()
		if err != nil {
			return err
		}
		i.Jid = &id
	}

	return nil
}

// BeforeSave persists the job's scope reference, if one is assigned.
func (i *InternalJob) BeforeSave(tx *gorm.DB) (err error) {
	if i.Scope == nil {
		i.ScopeID = nil
		i.ScopeType = ""
		return nil
	}
	switch v := i.Scope.(type) {
	case *Basis:
		i.ScopeID = &v.ID
		i.ScopeType = "basis"
	case *Project:
		i.ScopeID = &v.ID
		i.ScopeType = "project"
	case *Target:
		i.ScopeID = &v.ID
		i.ScopeType = "target"
	default:
		return fmt.Errorf("unknown scope type (%T)", i.Scope)
	}

	return nil
}

// AfterFind loads the job's scope, if one was persisted.
func (i *InternalJob) AfterFind(tx *gorm.DB) (err error) {
	if i.ScopeID == nil {
		return nil
	}
	switch i.ScopeType {
	case "basis":
		var b Basis
		result := tx.First(&b, &Basis{Model: Model{ID: *i.ScopeID}})
		if result.Error != nil {
			return result.Error
		}
		i.Scope = &b
	case "project":
		var p Project
		result := tx.First(&p, &Project{Model: Model{ID: *i.ScopeID}})
		if result.Error != nil {
			return result.Error
		}
		i.Scope = &p
	case "target":
		var t Target
		result := tx.First(&t, &Target{Model: Model{ID: *i.ScopeID}})
		if result.Error != nil {
			return result.Error
		}
		i.Scope = &t
	default:
		return fmt.Errorf("unknown scope type (%s)", i.ScopeType)
	}

	return nil
}
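
// Validate checks that the record is well-formed before it is written,
// ensuring the job ID is present and unique.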
func (i *InternalJob) Validate(tx *gorm.DB) error {
	return validation.ValidateStruct(i,
		validation.Field(&i.Jid,
			validation.Required,
			validation.By(
				checkUnique(
					tx.Model((*InternalJob)(nil)).
						Where(&InternalJob{Jid: i.Jid}).
						Not(&InternalJob{Model: Model{ID: i.ID}}),
				),
			),
		),
	)
}

// ToProto converts the job record to its protobuf message.
func (i *InternalJob) ToProto() *vagrant_server.Job {
	if i == nil {
		return nil
	}

	var j vagrant_server.Job
	err := decode(i, &j)
	if err != nil {
		panic("failed to decode job: " + err.Error())
	}

	return &j
}
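
// InternalJobFromProto looks up the persisted job record matching the
// ID of the given protobuf job.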
func (s *State) InternalJobFromProto(job *vagrant_server.Job) (*InternalJob, error) {
	if job == nil {
		return nil, ErrEmptyProtoArgument
	}

	if job.Id == "" {
		return nil, gorm.ErrRecordNotFound
	}

	var j InternalJob
	result := s.search().First(&j, &InternalJob{Jid: &job.Id})
	if result.Error != nil {
		return nil, result.Error
	}

	return &j, nil
}
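
// jobSchema returns the memdb table schema for the in-memory job index.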
func jobSchema() *memdb.TableSchema {
	return &memdb.TableSchema{
		Name: jobTableName,
		Indexes: map[string]*memdb.IndexSchema{
			jobIdIndexName: {
				Name:         jobIdIndexName,
				AllowMissing: false,
				Unique:       true,
				Indexer: &memdb.StringFieldIndex{
					Field: "Id",
				},
			},

			jobStateIndexName: {
				Name:         jobStateIndexName,
				AllowMissing: true,
				Unique:       false,
				Indexer: &memdb.IntFieldIndex{
					Field: "State",
				},
			},

			jobQueueTimeIndexName: {
				Name:         jobQueueTimeIndexName,
				AllowMissing: true,
				Unique:       false,
				Indexer: &memdb.CompoundIndex{
					Indexes: []memdb.Indexer{
						&memdb.IntFieldIndex{
							Field: "State",
						},

						&IndexTime{
							Field: "QueueTime",
							Asc:   true,
						},
					},
				},
			},

			jobTargetIdIndexName: {
				Name:         jobTargetIdIndexName,
				AllowMissing: true,
				Unique:       true,
				Indexer: &memdb.CompoundIndex{
					Indexes: []memdb.Indexer{
						&memdb.IntFieldIndex{
							Field: "State",
						},

						&memdb.StringFieldIndex{
							Field:     "TargetRunnerId",
							Lowercase: true,
						},

						&IndexTime{
							Field: "QueueTime",
							Asc:   true,
						},
					},
				},
			},
		},
	}
}
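
// jobIndex is the record stored in the in-memory database (memdb) for
// each job, carrying just enough state for assignment and lookups.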
type jobIndex struct {
	Id string

	// OpType is the operation type for the job.
	OpType reflect.Type

	// Scope is the basis/project/target that this job is part of. This is
	// used to determine if the job is blocked. See job_assigned.go for more
	// details.
	Scope interface {
		GetResourceId() string
	}

	// QueueTime is the time that the job was queued.
	QueueTime time.Time

	// TargetAny will be true if this job targets any runner.
	TargetAny bool

	// TargetRunnerId is the ID of the runner to target.
	TargetRunnerId string

	// State is the current state of this job.
	State vagrant_server.Job_State

	// StateTimer holds a timer that is usually acting as a timeout mechanism
	// on the current state. When the state changes, the timer should be
	// canceled.
	StateTimer *time.Timer

	// OutputBuffer stores the terminal output.
	OutputBuffer *logbuffer.Buffer
}

// jobIsCompleted reports whether a job state is terminal. It is a plain
// function rather than a method so it can be used against both
// vagrant_server.Job values and jobIndex values.
func jobIsCompleted(state vagrant_server.Job_State) bool {
	switch state {
	case vagrant_server.Job_ERROR, vagrant_server.Job_SUCCESS:
		return true
	default:
		return false
	}
}

// Job is the exported structure that is returned for most state APIs
// and gives callers access to more information than the pure job structure.
type Job struct {
	// Full job structure.
	*vagrant_server.Job

	// OutputBuffer is the terminal output for this job. This is a buffer
	// that may not contain the full amount of output depending on the
	// time of connection.
	OutputBuffer *logbuffer.Buffer

	// Blocked is true if this job is blocked on another job for the same
	// basis/project/target.
	Blocked bool
}
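
// JobValidate validates a job protobuf message without persisting it.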
func (s *State) JobValidate(jobpb *vagrant_server.Job) error {
	var job InternalJob

	if err := s.softDecode(jobpb, &job); err != nil {
		return errorToStatus(err)
	}

	if err := job.Validate(s.db); err != nil {
		return errorToStatus(err)
	}

	return nil
}

// JobCreate queues the given job.
func (s *State) JobCreate(jobpb *vagrant_server.Job) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	err := s.jobCreate(txn, jobpb)
	if err == nil {
		txn.Commit()
	}

	if err != nil {
		return lookupErrorToStatus("job", err)
	}

	return nil
}

// JobList returns the list of jobs.
func (s *State) JobList() ([]*vagrant_server.Job, error) {
	memTxn := s.inmem.Txn(false)
	defer memTxn.Abort()

	iter, err := memTxn.Get(jobTableName, jobIdIndexName+"_prefix", "")
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}

	var result []*vagrant_server.Job
	for {
		next := iter.Next()
		if next == nil {
			break
		}
		idx := next.(*jobIndex)

		job, err := s.jobById(idx.Id)
		if err != nil {
			return nil, lookupErrorToStatus("job", err)
		}

		result = append(result, job)
	}

	return result, nil
}

// JobById looks up a job by ID. The returned Job will be a deep copy
// of the job, so it is safe to read/write. If the job can't be found,
// a nil result with no error is returned.
func (s *State) JobById(id string, ws memdb.WatchSet) (*Job, error) {
	memTxn := s.inmem.Txn(false)
	defer memTxn.Abort()

	watchCh, raw, err := memTxn.FirstWatch(jobTableName, jobIdIndexName, id)
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}

	ws.Add(watchCh)

	if raw == nil {
		return nil, nil
	}
	jobIdx := raw.(*jobIndex)

	// Get blocked status if it is queued.
	var blocked bool
	if jobIdx.State == vagrant_server.Job_QUEUED {
		blocked, err = s.jobIsBlocked(memTxn, jobIdx, ws)
		if err != nil {
			return nil, lookupErrorToStatus("job", err)
		}
	}

	job, err := s.jobById(jobIdx.Id)
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}

	result := jobIdx.Job(job)
	result.Blocked = blocked

	return result, nil
}
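
// A minimal usage sketch for the assignment flow below (assuming a runner
// protobuf and a cancellable context; the surrounding service wiring is
// not shown):
//
//	job, err := state.JobAssignForRunner(ctx, runnerpb)
//	if err == nil {
//		state.JobAck(job.Id, true) // accept, moving the job to RUNNING
//	}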
// JobAssignForRunner will wait for and assign a job to a specific runner.
// This will automatically evaluate any conditions that the runner and/or
// job may have on assignability.
//
// The assigned job is put into a "waiting" state until the runner
// acks the assignment, which can be done with JobAck.
//
// If ctx is provided and assignment has to block waiting for new jobs,
// this will cancel when the context is done.
func (s *State) JobAssignForRunner(ctx context.Context, r *vagrant_server.Runner) (*Job, error) {
RETRY_ASSIGN:
	txn := s.inmem.Txn(false)
	defer txn.Abort()

	// Turn our runner into a runner record so we can more efficiently assign
	runnerRec, err := s.RunnerFromProto(r)
	if err != nil {
		return nil, fmt.Errorf("runner lookup failed: %w", err)
	}

	// candidateQuery finds candidate jobs to assign.
	type candidateFunc func(*memdb.Txn, memdb.WatchSet, *Runner) (*jobIndex, error)
	candidateQuery := []candidateFunc{
		s.jobCandidateById,
		s.jobCandidateAny,
	}

	// If the runner accepts jobs only by ID, restrict the candidate search
	// to the by-ID query. We explicitly set the full list so that if we add
	// more candidate searches in the future, we're unlikely to break this.
	if r.ByIdOnly {
		candidateQuery = []candidateFunc{s.jobCandidateById}
	}

	// Build the list of candidates
	var candidates []*jobIndex
	ws := memdb.NewWatchSet()
	for _, f := range candidateQuery {
		job, err := f(txn, ws, runnerRec)
		if err != nil {
			return nil, err
		}
		if job == nil {
			continue
		}

		candidates = append(candidates, job)
	}

	// If we have no candidates, then we have to wait for a job to show up.
	// We set up a blocking query on the job table for a non-assigned job.
	if len(candidates) == 0 {
		iter, err := txn.Get(jobTableName, jobStateIndexName, vagrant_server.Job_QUEUED)
		if err != nil {
			return nil, err
		}

		ws.Add(iter.WatchCh())
	}

	// We're done reading so abort the transaction
	txn.Abort()

	// If we have no candidates, we found no results and need to retry
	// after waiting for changes.
	if len(candidates) == 0 {
		ws.WatchCtx(ctx)
		if err := ctx.Err(); err != nil {
			return nil, err
		}

		goto RETRY_ASSIGN
	}

	// We sort our candidates by queue time so that we can find the earliest
	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].QueueTime.Before(candidates[j].QueueTime)
	})

	// Grab a write lock since we're going to delete, modify, and re-insert
	// the job that we choose.
	//
	// Write locks are exclusive so this will ensure we're the only one
	// writing at a time. This lets us be sure we're the only one "assigning"
	// a job candidate.
	//
	// Note: the defer at the top of the function captured the earlier read
	// transaction, so this write transaction needs its own deferred Abort
	// (a no-op once the transaction commits).
	txn = s.inmem.Txn(true)
	defer txn.Abort()
	for _, job := range candidates {
		// Get the job
		raw, err := txn.First(jobTableName, jobIdIndexName, job.Id)
		if err != nil {
			return nil, err
		}
		if raw == nil {
			// The job no longer exists. It may be canceled or something.
			// Invalid candidate, continue to next.
			continue
		}

		// We need to verify that in the time between our candidate search
		// and our write lock acquisition, this job hasn't been assigned,
		// canceled, etc. If so, this is an invalid candidate.
		job := raw.(*jobIndex)
		if job == nil || job.State != vagrant_server.Job_QUEUED {
			continue
		}

		// We also need to recheck that we aren't blocked. If we're blocked
		// now then we need to skip this job.
		if blocked, err := s.jobIsBlocked(txn, job, nil); blocked {
			continue
		} else if err != nil {
			return nil, err
		}

		// Update our state and update our on-disk job
		job.State = vagrant_server.Job_WAITING
		result, err := s.jobReadAndUpdate(job.Id, func(jobpb *vagrant_server.Job) error {
			jobpb.State = job.State
			jobpb.AssignTime = timestamppb.New(time.Now())
			return nil
		})
		if err != nil {
			return nil, err
		}

		// Create our timer to requeue this if it isn't acked
		job.StateTimer = time.AfterFunc(jobWaitingTimeout, func() {
			s.log.Info("job ack timer expired", "job", job.Id, "timeout", jobWaitingTimeout)
			s.JobAck(job.Id, false)
		})

		if err := txn.Insert(jobTableName, job); err != nil {
			return nil, err
		}

		// Update our assignment state
		if err := s.jobAssignedSet(txn, job, true); err != nil {
			s.JobAck(job.Id, false)
			return nil, err
		}

		txn.Commit()
		return job.Job(result), nil
	}
	txn.Abort()

	// If we reached here, all of our candidates were invalid; retry.
	goto RETRY_ASSIGN
}

// JobAck acknowledges that a job has been accepted or rejected by the runner.
// If ack is false, then this will move the job back to the queued state
// and it will be eligible for assignment again.
func (s *State) JobAck(id string, ack bool) (*Job, error) {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}
	if raw == nil {
		return nil, status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	// If the job is not in the assigned (waiting) state, then this is an error.
	if job.State != vagrant_server.Job_WAITING {
		return nil, status.Errorf(codes.FailedPrecondition,
			"job can't be acked from state: %s",
			job.State.String())
	}

	result, err := s.jobReadAndUpdate(job.Id, func(jobpb *vagrant_server.Job) error {
		if ack {
			// Set to accepted
			job.State = vagrant_server.Job_RUNNING
			jobpb.State = job.State
			jobpb.AckTime = timestamppb.New(time.Now())

			// We also initialize the output buffer here because we can
			// expect output to begin streaming in.
			job.OutputBuffer = logbuffer.New()
		} else {
			// Set to queued
			job.State = vagrant_server.Job_QUEUED
			jobpb.State = job.State
			jobpb.AssignTime = nil
		}

		return nil
	})
	if err != nil {
		return nil, lookupErrorToStatus("job", err)
	}

	// Cancel our timer
	if job.StateTimer != nil {
		job.StateTimer.Stop()
		job.StateTimer = nil
	}

	// Create a new timer that we'll use for our heartbeat. After this
	// timer expires, the job will immediately move to an error state.
	job.StateTimer = time.AfterFunc(jobHeartbeatTimeout, func() {
		s.log.Info("canceling job due to heartbeat timeout", "job", job.Id)
		// Force cancel
		err := s.JobCancel(job.Id, true)
		if err != nil {
			s.log.Error("error canceling job due to heartbeat failure", "error", err, "job", job.Id)
		}
	})

	s.log.Debug("heartbeat timer set", "job", job.Id, "timeout", jobHeartbeatTimeout)

	// Insert to update
	if err := txn.Insert(jobTableName, job); err != nil {
		return nil, saveErrorToStatus("job", err)
	}

	// Update our assigned state if we nacked
	if !ack {
		if err := s.jobAssignedSet(txn, job, false); err != nil {
			return nil, saveErrorToStatus("job", err)
		}
	}

	txn.Commit()
	return job.Job(result), nil
}

// JobComplete marks a running job as complete. If an error is given,
// the job is marked as failed (a completed state). If no error is given,
// the job is marked as successful.
func (s *State) JobComplete(id string, result *vagrant_server.Job_Result, cerr error) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return lookupErrorToStatus("job", err)
	}
	if raw == nil {
		return status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	// Update our assigned state
	if err := s.jobAssignedSet(txn, job, false); err != nil {
		return saveErrorToStatus("job", err)
	}

	// If the job is not in the running state, then this is an error.
	if job.State != vagrant_server.Job_RUNNING {
		return status.Errorf(codes.FailedPrecondition,
			"job can't be completed from state: %s",
			job.State.String())
	}

	_, err = s.jobReadAndUpdate(job.Id, func(jobpb *vagrant_server.Job) error {
		// Set to complete, assume success for now
		job.State = vagrant_server.Job_SUCCESS
		jobpb.State = job.State
		jobpb.Result = result
		jobpb.CompleteTime = timestamppb.New(time.Now())

		if cerr != nil {
			job.State = vagrant_server.Job_ERROR
			jobpb.State = job.State

			st, _ := status.FromError(cerr)
			jobpb.Error = st.Proto()
		}

		return nil
	})
	if err != nil {
		return saveErrorToStatus("job", err)
	}

	// End the job
	job.End()

	// Insert to update
	if err := txn.Insert(jobTableName, job); err != nil {
		return saveErrorToStatus("job", err)
	}

	txn.Commit()
	return nil
}

// JobCancel marks a job as canceled. This will set the internal state
// and request the cancel, but if the job is running then it is up to
// downstream to listen for and react to Job changes for cancellation.
func (s *State) JobCancel(id string, force bool) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return lookupErrorToStatus("job", err)
	}
	if raw == nil {
		return status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	if err := s.jobCancel(txn, job, force); err != nil {
		return saveErrorToStatus("job", err)
	}

	txn.Commit()
	return nil
}
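
// jobCancel cancels a job within an existing write transaction,
// transitioning its state based on where it is in its lifecycle.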
func (s *State) jobCancel(txn *memdb.Txn, job *jobIndex, force bool) error {
	oldState := job.State

	// How we handle cancel depends on the state
	switch job.State {
	case vagrant_server.Job_ERROR, vagrant_server.Job_SUCCESS:
		s.log.Debug("attempted to cancel completed job", "state", job.State.String(), "job", job.Id)
		// Jobs that are already completed do nothing for cancellation.
		// We do not mark that they were requested as canceled since they
		// completed fine.
		return nil

	case vagrant_server.Job_QUEUED:
		// For queued jobs, we immediately transition them to an error state.
		job.State = vagrant_server.Job_ERROR

	case vagrant_server.Job_WAITING, vagrant_server.Job_RUNNING:
		// For these states, we just need to mark it as canceled and have
		// downstream listeners complete the job. However, if we are forcing
		// then we immediately transition to error.
		if force {
			job.State = vagrant_server.Job_ERROR
			job.End()
		}
	}

	s.log.Debug("changing job state for cancel", "old-state", oldState.String(), "new-state", job.State.String(), "job", job.Id, "force", force)

	if force && job.State == vagrant_server.Job_ERROR {
		// Update our assigned state to unblock future jobs
		if err := s.jobAssignedSet(txn, job, false); err != nil {
			return err
		}
	}

	// Persist the on-disk data
	_, err := s.jobReadAndUpdate(job.Id, func(jobpb *vagrant_server.Job) error {
		jobpb.State = job.State
		jobpb.CancelTime = timestamppb.New(time.Now())

		// If we transitioned to the error state (a queued job, or a force
		// cancel), record the cancellation as the job's error.
		if jobpb.State == vagrant_server.Job_ERROR {
			jobpb.Error = status.New(codes.Canceled, "canceled").Proto()
		}

		return nil
	})
	if err != nil {
		return err
	}

	// Store the in-memory data.
	// This will be seen by a currently running RunnerJobStream goroutine,
	// which will then see that the job has been canceled and send the
	// cancel request down to the runner.
	if err := txn.Insert(jobTableName, job); err != nil {
		return err
	}

	return nil
}

// JobHeartbeat resets the heartbeat timer for a running job. If the job
// is not currently running, this does nothing and does not return an error.
// If the job doesn't exist then this will return an error.
func (s *State) JobHeartbeat(id string) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	if err := s.jobHeartbeat(txn, id); err != nil {
		return err
	}

	txn.Commit()
	return nil
}
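
// jobHeartbeat resets the heartbeat timer for a job within an existing
// write transaction.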
func (s *State) jobHeartbeat(txn *memdb.Txn, id string) error {
	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return err
	}
	if raw == nil {
		return status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	// If the job is not in the running state, we do nothing.
	if job.State != vagrant_server.Job_RUNNING {
		return nil
	}

	// If the state timer is nil... that is weird but we ignore it here.
	// It is up to other parts of the job system to ensure a running
	// job has a heartbeat timer.
	if job.StateTimer == nil {
		s.log.Info("job with no state timer detected", "job", id)
		return nil
	}

	// Reset the timer
	job.StateTimer.Reset(jobHeartbeatTimeout)

	return nil
}

// JobExpire expires a job. This will cancel the job if it is still queued.
func (s *State) JobExpire(id string) error {
	txn := s.inmem.Txn(true)
	defer txn.Abort()

	// Get the job
	raw, err := txn.First(jobTableName, jobIdIndexName, id)
	if err != nil {
		return err
	}
	if raw == nil {
		return status.Errorf(codes.NotFound, "job not found: %s", id)
	}
	job := raw.(*jobIndex)

	// How we handle the expiration depends on the state
	switch job.State {
	case vagrant_server.Job_QUEUED, vagrant_server.Job_WAITING:
		if err := s.jobCancel(txn, job, false); err != nil {
			return err
		}

	default:
		// Running or completed jobs are not affected by expiration.
	}

	txn.Commit()
	return nil
}

// JobIsAssignable returns whether there is a registered runner that
// meets the requirements to run this job.
//
// If this returns true, the job, if queued, should eventually be assigned
// to a runner. An assignable result does NOT mean that the job will spend
// only a short amount of time in the queue.
//
// Note the result is a point-in-time result. If the only candidate runners
// deregister between this returning true and queueing, the job may still
// sit in a queue indefinitely.
func (s *State) JobIsAssignable(ctx context.Context, jobpb *vagrant_server.Job) (bool, error) {
	// If we have no runners, we cannot be assigned
	empty, err := s.runnerEmpty()
	if err != nil {
		return false, err
	}
	if empty {
		return false, nil
	}

	// If we have a special targeting constraint, that has to be met
	tx := s.db.Model(&Runner{})
	switch v := jobpb.TargetRunner.Target.(type) {
	case *vagrant_server.Ref_Runner_Any:
		tx = tx.Where("by_id_only = ?", false)
	case *vagrant_server.Ref_Runner_Id:
		tx = tx.Where("rid = ?", v.Id.Id)
	default:
		return false, fmt.Errorf("unknown runner target value: %#v", jobpb.TargetRunner.Target)
	}

	var c int64
	result := tx.Count(&c)
	if result.Error != nil {
		return false, result.Error
	}

	return c > 0, nil
}

// jobIndexInit initializes the job index from persisted data.
func (s *State) jobIndexInit(memTxn *memdb.Txn) error {
	var jobs []InternalJob

	// Get all jobs which are not completed
	result := s.search().
		Where(&InternalJob{State: JOB_STATE_UNKNOWN}).
		Or(&InternalJob{State: JOB_STATE_QUEUED}).
		Or(&InternalJob{State: JOB_STATE_WAITING}).
		Or(&InternalJob{State: JOB_STATE_RUNNING}).
		Find(&jobs)
	if result.Error != nil {
		return result.Error
	}

	// Load all incomplete jobs into memory
	for _, j := range jobs {
		if j.Jid == nil {
			continue
		}
		job := j.ToProto()
		idx, err := s.jobIndexSet(memTxn, []byte(*j.Jid), job)
		if err != nil {
			return err
		}

		// If the job was running or waiting, set it as assigned.
		if j.State == JOB_STATE_WAITING || j.State == JOB_STATE_RUNNING {
			if err = s.jobAssignedSet(memTxn, idx, true); err != nil {
				return err
			}
		}
	}

	return nil
}

// jobIndexSet writes an index record for a single job.
func (s *State) jobIndexSet(txn *memdb.Txn, id []byte, jobpb *vagrant_server.Job) (*jobIndex, error) {
	rec := &jobIndex{
		Id:     jobpb.Id,
		State:  jobpb.State,
		OpType: reflect.TypeOf(jobpb.Operation),
	}

	switch v := jobpb.Scope.(type) {
	case *vagrant_server.Job_Basis:
		rec.Scope = v.Basis
	case *vagrant_server.Job_Project:
		rec.Scope = v.Project
	case *vagrant_server.Job_Target:
		rec.Scope = v.Target
	}

	// Target
	if jobpb.TargetRunner == nil || jobpb.TargetRunner.Target == nil {
		return nil, fmt.Errorf("job target runner must be set")
	}
	switch v := jobpb.TargetRunner.Target.(type) {
	case *vagrant_server.Ref_Runner_Any:
		rec.TargetAny = true

	case *vagrant_server.Ref_Runner_Id:
		rec.TargetRunnerId = v.Id.Id

	default:
		return nil, fmt.Errorf("unknown runner target value: %#v", jobpb.TargetRunner)
	}

	// Timestamps
	timestamps := []struct {
		Field *time.Time
		Src   *timestamppb.Timestamp
	}{
		{&rec.QueueTime, jobpb.QueueTime},
	}
	for _, ts := range timestamps {
		err := ts.Src.CheckValid()
		if err != nil {
			return nil, err
		}

		*ts.Field = ts.Src.AsTime()
	}

	// If this job is assigned, then we have to start a nacking timer.
	// We reset the nack timer so it gives runners time to reconnect.
	if rec.State == vagrant_server.Job_WAITING {
		// Create our timer to requeue this if it isn't acked
		rec.StateTimer = time.AfterFunc(jobWaitingTimeout, func() {
			s.JobAck(rec.Id, false)
		})
	}

	// If this job is running, we need to restart a heartbeat timeout.
	// This should only happen on reinit. This is tested.
	if rec.State == vagrant_server.Job_RUNNING {
		rec.StateTimer = time.AfterFunc(jobHeartbeatTimeout, func() {
			// Force cancel
			s.JobCancel(rec.Id, true)
		})
	}

	// If we have an expiry, we need to set a timer to expire this job.
	if jobpb.ExpireTime != nil {
		now := time.Now()

		err := jobpb.ExpireTime.CheckValid()
		if err != nil {
			return nil, err
		}

		dur := jobpb.ExpireTime.AsTime().Sub(now)
		if dur < 0 {
			dur = 1
		}

		time.AfterFunc(dur, func() { s.JobExpire(jobpb.Id) })
	}

	// Insert the index
	return rec, txn.Insert(jobTableName, rec)
}
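
// jobCreate queues a new job: it persists the record to the database
// and inserts it into the in-memory index.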
func (s *State) jobCreate(memTxn *memdb.Txn, jobpb *vagrant_server.Job) error {
	// Set up our initial job state
	jobpb.State = vagrant_server.Job_QUEUED
	jobpb.QueueTime = timestamppb.New(time.Now())

	// Convert the job proto into a record
	job, err := s.InternalJobFromProto(jobpb)
	if err != nil && !errors.Is(err, gorm.ErrRecordNotFound) {
		return err
	}

	if err != nil {
		job = &InternalJob{}
	}

	if err = s.softDecode(jobpb, job); err != nil {
		return err
	}

	// Save the record into the db
	result := s.db.Create(job)
	if result.Error != nil {
		return result.Error
	}

	id := []byte(*job.Jid)

	// Insert into the in-memory db
	_, err = s.jobIndexSet(memTxn, id, job.ToProto())

	s.pruneMu.Lock()
	defer s.pruneMu.Unlock()
	s.indexedJobs++

	return err
}
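
// jobById returns the protobuf representation of the persisted job
// with the given ID.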
func (s *State) jobById(sid string) (*vagrant_server.Job, error) {
	job, err := s.InternalJobFromProto(&vagrant_server.Job{Id: sid})
	if err != nil {
		return nil, err
	}

	return job.ToProto(), nil
}
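
// jobReadAndUpdate loads the job with the given ID, applies the given
// update callback to its protobuf form, and persists the result.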
func (s *State) jobReadAndUpdate(id string, f func(*vagrant_server.Job) error) (*vagrant_server.Job, error) {
	j, err := s.jobById(id)
	if err != nil {
		return nil, err
	}

	if err := f(j); err != nil {
		return nil, err
	}

	ij, err := s.InternalJobFromProto(j)
	if err != nil {
		return nil, err
	}

	if err := s.softDecode(j, ij); err != nil {
		return nil, err
	}

	result := s.db.Save(ij)
	if result.Error != nil {
		return nil, result.Error
	}

	return ij.ToProto(), nil
}

// jobCandidateById returns the most promising candidate job to assign
// that is targeting a specific runner by ID.
func (s *State) jobCandidateById(memTxn *memdb.Txn, ws memdb.WatchSet, r *Runner) (*jobIndex, error) {
	iter, err := memTxn.LowerBound(
		jobTableName,
		jobTargetIdIndexName,
		vagrant_server.Job_QUEUED,
		*r.Rid,
		time.Unix(0, 0),
	)
	if err != nil {
		return nil, err
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		job := raw.(*jobIndex)
		if job.State != vagrant_server.Job_QUEUED || job.TargetRunnerId != *r.Rid {
			continue
		}

		// If this job is blocked, it is not a candidate.
		if blocked, err := s.jobIsBlocked(memTxn, job, ws); err != nil {
			return nil, err
		} else if blocked {
			continue
		}

		return job, nil
	}

	return nil, nil
}

// jobCandidateAny returns the first candidate job that targets any runner.
func (s *State) jobCandidateAny(memTxn *memdb.Txn, ws memdb.WatchSet, r *Runner) (*jobIndex, error) {
	iter, err := memTxn.LowerBound(
		jobTableName,
		jobQueueTimeIndexName,
		vagrant_server.Job_QUEUED,
		time.Unix(0, 0),
	)
	if err != nil {
		return nil, err
	}

	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		job := raw.(*jobIndex)
		if job.State != vagrant_server.Job_QUEUED || !job.TargetAny {
			continue
		}

		// If this job is blocked, it is not a candidate.
		if blocked, err := s.jobIsBlocked(memTxn, job, ws); err != nil {
			return nil, err
		} else if blocked {
			continue
		}

		return job, nil
	}

	return nil, nil
}
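
// jobsPruneOld prunes completed jobs from the in-memory index so that
// at most max entries remain.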
func (s *State) jobsPruneOld(memTxn *memdb.Txn, max int) (int, error) {
	// Prune from memdb
	return pruneOld(memTxn, pruneOp{
		lock:      &s.pruneMu,
		table:     jobTableName,
		index:     jobQueueTimeIndexName,
		indexArgs: []interface{}{vagrant_server.Job_QUEUED, time.Unix(0, 0)},
		max:       max,
		cur:       &s.indexedJobs,
		check: func(raw interface{}) bool {
			job := raw.(*jobIndex)
			return !jobIsCompleted(job.State)
		},
	})
}
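
// JobsDBPruneOld prunes persisted jobs from the database, keeping at
// most max records ordered by queue time, and returns how many were
// deleted.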
func (s *State) JobsDBPruneOld(max int) (int, error) {
	var jobs []InternalJob
	result := s.db.Select("id").Order("queue_time asc").Offset(max).Find(&jobs)
	if result.Error != nil {
		return 0, result.Error
	}
	deleted := len(jobs)
	if deleted < 1 {
		return deleted, nil
	}
	result = s.db.Unscoped().Delete(jobs)
	if result.Error != nil {
		return 0, result.Error
	}

	return deleted, nil
}

// Job returns the exported Job structure for an index entry, pairing the
// given protobuf job with the index's output buffer.
func (idx *jobIndex) Job(jobpb *vagrant_server.Job) *Job {
	return &Job{
		Job:          jobpb,
		OutputBuffer: idx.OutputBuffer,
	}
}

// End notes this job is complete and performs any cleanup on the index.
func (idx *jobIndex) End() {
	if idx.StateTimer != nil {
		idx.StateTimer.Stop()
		idx.StateTimer = nil
	}
}