// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

//go:build linux

package executor

import (
	"fmt"
	"os"
	"os/exec"
	"strconv"
	"syscall"

	"github.com/hashicorp/go-set/v3"
	"github.com/hashicorp/nomad/client/lib/cgroupslib"
	"github.com/hashicorp/nomad/client/lib/nsutil"
	"github.com/hashicorp/nomad/drivers/shared/executor/procstats"
	"github.com/hashicorp/nomad/plugins/drivers"
	"github.com/opencontainers/runc/libcontainer/cgroups"
	"golang.org/x/sys/unix"
)

const (
	// memoryNoLimit is a sentinel value for memory_max that indicates the
	// raw_exec driver should not enforce a maximum memory limit.
	//
	// computeMemory returns it unchanged as the hard limit (it is not
	// converted from MB to bytes), and configureCG2 translates it into the
	// literal "max" when writing memory.max.
	memoryNoLimit = -1
)

// setSubCmdCgroup arranges for cmd, a non-Task child process of the
// executor.Executor, to be started inside the given cgroup (in cg2 the
// executor itself lives outside the task's cgroup).
//
// It returns a cleanup function that releases the cgroup file descriptor
// handed to cmd. When cgroups are off or in v1 mode nothing needs to be set
// up and the cleanup function is a no-op. On error the returned function is
// nil.
func (e *UniversalExecutor) setSubCmdCgroup(cmd *exec.Cmd, cgroup string) (func(), error) {
	// only cg v2 needs the extra setup below
	if mode := cgroupslib.GetMode(); mode == cgroupslib.OFF || mode == cgroupslib.CG1 {
		return func() {}, nil
	}

	if cgroup == "" {
		return nil, fmt.Errorf("error setting up exec subcommand: %w", ErrCgroupMustBeSet)
	}

	cgroupFD, closeFD, err := e.statCG(cgroup)
	if err != nil {
		return nil, err
	}

	// the command may not have any process attributes yet
	attrs := cmd.SysProcAttr
	if attrs == nil {
		attrs = &syscall.SysProcAttr{}
		cmd.SysProcAttr = attrs
	}
	attrs.UseCgroupFD = true
	attrs.CgroupFD = cgroupFD

	return closeFD, nil
}

// ListProcesses returns the collection of process IDs associated with the
// executor's task.
//
// When cgroups are unavailable (possibly because of a rootless nomad client)
// the lookup is delegated to procstats.ListByPid using the pid of the child
// command; otherwise it is delegated to procstats.List with the executor's
// command.
func (e *UniversalExecutor) ListProcesses() set.Collection[procstats.ProcessID] {
	if cgroupslib.GetMode() == cgroupslib.OFF {
		return procstats.ListByPid(e.childCmd.Process.Pid)
	}
	return procstats.List(e.command)
}

// statCG opens the cgroup directory at the given path and returns a file
// descriptor referring to it, suitable for syscall.SysProcAttr.CgroupFD.
//
// The second return value closes the descriptor. It is never nil: when the
// open fails it is a no-op, the descriptor is -1, and the error is wrapped
// with the offending path.
func (e *UniversalExecutor) statCG(cgroup string) (int, func(), error) {
	// O_PATH: only a reference to the directory is needed, no read/write.
	// O_CLOEXEC: the descriptor is consumed by the parent when the child is
	// created, so it must not be inherited by the exec'd program.
	fd, err := unix.Open(cgroup, unix.O_PATH|unix.O_CLOEXEC, 0)
	if err != nil {
		return -1, func() {}, fmt.Errorf("unable to open cgroup %q: %w", cgroup, err)
	}

	cleanup := func() {
		_ = unix.Close(fd)
	}
	return fd, cleanup, nil
}

// runningFunc is called once the task has been started and is running.
//
// Its use case is moving the executor process out of the task cgroup once
// the child task process has been started (cgroups v1 only).
type runningFunc func() error

// cleanupFunc is called after task shutdown.
//
// Its use case is removing the cgroup from the system once it is no longer
// being used for running the task. The cleanup functions built in this file
// close the task cgroup's file descriptor (cgroups v2) or do nothing
// (cgroups v1).
type cleanupFunc func()

// configureResourceContainer on Linux configures the cgroups to be used to track
// pids created by the executor.
//
// command: the task command whose resources and cgroup paths are applied
// pid: pid of the executor (i.e. ourself); only used for logging here
//
// The returned functions are never nil, even when an error is returned,
// because callers may call them regardless of the error.
func (e *UniversalExecutor) configureResourceContainer(
	command *ExecCommand,
	pid int,
) (runningFunc, cleanupFunc, error) {
	cgroup := command.StatsCgroup()

	// start with empty but non-nil funcs so every return path is safe to call
	//
	// moveProcess  - v1: moves the executor process out of the task's cgroups
	// deleteCgroup - v2: lets go of the file descriptor of the task's cgroup
	var (
		deleteCgroup = func() {}
		moveProcess  = func() error { return nil }
	)

	// ensure tasks get the desired oom_score_adj value set
	if err := e.setOomAdj(command.OOMScoreAdj); err != nil {
		return moveProcess, deleteCgroup, err
	}

	// manually configure cgroup for cpu / memory constraints
	switch cgroupslib.GetMode() {
	case cgroupslib.CG1:
		if err := e.configureCG1(cgroup, command); err != nil {
			return moveProcess, deleteCgroup, err
		}
		moveProcess, deleteCgroup = e.enterCG1(cgroup, command.CpusetCgroup())

	case cgroupslib.OFF:
		// do nothing

	default:
		e.configureCG2(cgroup, command)
		// configure child process to spawn in the cgroup
		// get file descriptor of the cgroup made for this task
		fd, cleanup, err := e.statCG(cgroup)
		if err != nil {
			return moveProcess, deleteCgroup, err
		}
		// make sure attrs struct has been set before writing into it, as
		// setSubCmdCgroup does, rather than panicking on a nil pointer
		if e.childCmd.SysProcAttr == nil {
			e.childCmd.SysProcAttr = new(syscall.SysProcAttr)
		}
		e.childCmd.SysProcAttr.UseCgroupFD = true
		e.childCmd.SysProcAttr.CgroupFD = fd
		deleteCgroup = cleanup
	}

	e.logger.Info("configured cgroup for executor", "pid", pid)

	return moveProcess, deleteCgroup, nil
}

// enterCG1 writes the executor PID (i.e. itself) into the cgroups created for
// the task, so that the task and its children will spawn in those cgroups.
// Failures to write are logged as warnings and otherwise ignored.
//
// The returned runningFunc moves the executor back out of the task's cgroups
// and into the nomad/ parent cgroups, stopping at the first interface that
// fails. The returned cleanupFunc does nothing in cgroups v1.
func (e *UniversalExecutor) enterCG1(statsCgroup, cpusetCgroup string) (runningFunc, cleanupFunc) {
	cpusetEditor := cgroupslib.OpenPath(cpusetCgroup)
	self := strconv.Itoa(unix.Getpid())

	// join all the normal interfaces first
	controllers := []string{"freezer", "cpu", "memory"}
	for i := range controllers {
		name := controllers[i]
		editor := cgroupslib.OpenFromFreezerCG1(statsCgroup, name)
		if err := editor.Write("cgroup.procs", self); err != nil {
			e.logger.Warn("failed to write cgroup", "interface", name, "error", err)
		}
	}

	// then the cpuset interface, whose path varies between reserve/share
	if err := cpusetEditor.Write("cgroup.procs", self); err != nil {
		e.logger.Warn("failed to write cpuset cgroup", "error", err)
	}

	// leave puts the executor back into the nomad parent cgroups
	leave := func() error {
		targets := append(controllers, "cpuset")
		for _, name := range targets {
			if err := cgroupslib.WriteNomadCG1(name, "cgroup.procs", self); err != nil {
				e.logger.Warn("failed to move executor cgroup", "interface", name, "error", err)
				return err
			}
		}
		return nil
	}

	// nothing to clean up in cgroups v1
	noop := func() {}

	return leave, noop
}

// configureCG1 applies the task's memory, cpu and cpuset settings to its
// cgroups v1 interfaces.
//
// If the executor's command specifies custom v1 cgroups (OverrideCgroupV1),
// the executor process is instead written into each of those cgroups and the
// /nomad cgroups are left unconfigured; a failure to join any of them is
// returned as a wrapped error. Writes of the individual resource limits
// discard their errors.
func (e *UniversalExecutor) configureCG1(cgroup string, command *ExecCommand) error {
	// some drivers like qemu entirely own resource management
	if command.Resources == nil || command.Resources.LinuxResources == nil {
		return nil
	}

	// if custom cgroups are set join those instead of configuring the /nomad
	// cgroups we are not going to use
	//
	// NOTE(review): this reads e.command rather than the command parameter;
	// presumably they refer to the same command - confirm against the caller.
	if len(e.command.OverrideCgroupV1) > 0 {
		pid := strconv.Itoa(unix.Getpid())
		for controller, path := range e.command.OverrideCgroupV1 {
			absPath := cgroupslib.CustomPathCG1(controller, path)
			ed := cgroupslib.OpenPath(absPath)
			if err := ed.Write("cgroup.procs", pid); err != nil {
				e.logger.Error("unable to write to custom cgroup", "controller", controller, "error", err)
				// wrap with %w so callers can inspect the underlying error
				return fmt.Errorf("unable to write to custom cgroup: %w", err)
			}
		}
		return nil
	}

	// write memory limits; the soft limit is only written when it is set
	memHard, memSoft := e.computeMemory(command)
	ed := cgroupslib.OpenFromFreezerCG1(cgroup, "memory")
	_ = ed.Write("memory.limit_in_bytes", strconv.FormatInt(memHard, 10))
	if memSoft > 0 {
		_ = ed.Write("memory.soft_limit_in_bytes", strconv.FormatInt(memSoft, 10))
	}

	// write memory swappiness, when cgroupslib provides a value for it
	swappiness := cgroupslib.MaybeDisableMemorySwappiness()
	if swappiness != nil {
		value := int64(*swappiness)
		_ = ed.Write("memory.swappiness", strconv.FormatInt(value, 10))
	}

	// write cpu shares
	cpuShares := strconv.FormatInt(command.Resources.LinuxResources.CPUShares, 10)
	ed = cgroupslib.OpenFromFreezerCG1(cgroup, "cpu")
	_ = ed.Write("cpu.shares", cpuShares)

	// write cpuset, if set
	if cpuSet := command.Resources.LinuxResources.CpusetCpus; cpuSet != "" {
		cpusetPath := command.Resources.LinuxResources.CpusetCgroupPath
		ed = cgroupslib.OpenPath(cpusetPath)
		_ = ed.Write("cpuset.cpus", cpuSet)
	}

	return nil
}

// configureCG2 applies the task's memory, cpu and cpuset settings to its
// cgroups v2 directory. Errors from the individual writes are discarded.
func (e *UniversalExecutor) configureCG2(cgroup string, command *ExecCommand) {
	// some drivers like qemu entirely own resource management
	if command.Resources == nil || command.Resources.LinuxResources == nil {
		return
	}

	hardLimit, softLimit := e.computeMemory(command)
	editor := cgroupslib.OpenPath(cgroup)

	// hard memory limit; the no-limit sentinel is spelled "max" in cg v2
	maxValue := "max"
	if hardLimit != memoryNoLimit {
		maxValue = strconv.FormatInt(hardLimit, 10)
	}
	_ = editor.Write("memory.max", maxValue)

	// soft memory limit, only when one was computed
	if softLimit > 0 {
		editor = cgroupslib.OpenPath(cgroup)
		_ = editor.Write("memory.low", strconv.FormatInt(softLimit, 10))
	}

	// memory swappiness, when cgroupslib provides a value for it
	if swappiness := cgroupslib.MaybeDisableMemorySwappiness(); swappiness != nil {
		swapEditor := cgroupslib.OpenPath(cgroup)
		_ = swapEditor.Write("memory.swappiness", strconv.FormatInt(int64(*swappiness), 10))
	}

	// cpu weight derived from the task's cpu shares
	weight := e.computeCPU(command)
	editor = cgroupslib.OpenPath(cgroup)
	_ = editor.Write("cpu.weight", strconv.FormatUint(weight, 10))

	// cpuset is written unconditionally, even when the value is empty
	_ = editor.Write("cpuset.cpus", command.Resources.LinuxResources.CpusetCpus)
}

// setOomAdj writes oomScore, formatted as a decimal string, to the calling
// process's oom_score_adj file and returns any error from that write.
func (e *UniversalExecutor) setOomAdj(oomScore int32) error {
	// /proc/self/oom_score_adj should work on both cgroups v1 and v2 systems
	// range is -1000 to 1000; 0 is the default
	const oomAdjPath = "/proc/self/oom_score_adj"

	value := strconv.FormatInt(int64(oomScore), 10)
	return os.WriteFile(oomAdjPath, []byte(value), 0644)
}

// computeCPU converts the command's cpu shares into the equivalent cgroups v2
// value using cgroups.ConvertCPUSharesToCgroupV2Value.
func (*UniversalExecutor) computeCPU(command *ExecCommand) uint64 {
	shares := uint64(command.Resources.LinuxResources.CPUShares)
	return cgroups.ConvertCPUSharesToCgroupV2Value(shares)
}

// mbToBytes converts a size in megabytes (1 MB = 1024 * 1024 bytes) to bytes.
func mbToBytes(n int64) int64 {
	const bytesPerMB int64 = 1024 * 1024
	return n * bytesPerMB
}

// computeMemory returns the hard and soft memory limits for the task, in that
// order. Limits are in bytes, except that a hard limit of memoryNoLimit is
// passed through as-is, and a soft limit of 0 means none is set.
func (*UniversalExecutor) computeMemory(command *ExecCommand) (int64, int64) {
	memory := command.Resources.NomadResources.Memory
	softMB, hardMB := memory.MemoryMB, memory.MemoryMaxMB

	// typical case where 'memory' is the hard limit
	if hardMB == 0 {
		return mbToBytes(softMB), 0
	}

	// special oversub case where 'memory' is soft limit and there is no
	// hard limit - helping re-create old raw_exec behavior
	if hardMB == memoryNoLimit {
		return memoryNoLimit, mbToBytes(softMB)
	}

	// typical oversub case where 'memory' is soft limit and 'memory_max'
	// is hard limit
	return mbToBytes(hardMB), mbToBytes(softMB)
}

// withNetworkIsolation calls the passed function inside the network namespace
// described by spec.
//
// When spec is nil or has an empty Path, f is called directly in the current
// namespace. Otherwise a handle to the namespace at spec.Path is obtained, f
// is run through that handle's Do method, and the handle is closed before
// returning. The result is the error from obtaining the handle, or else
// whatever Do returns.
func withNetworkIsolation(f func() error, spec *drivers.NetworkIsolationSpec) error {
	// no isolation requested
	if spec == nil || spec.Path == "" {
		return f()
	}

	// Get a handle to the target network namespace
	netNS, err := nsutil.GetNS(spec.Path)
	if err != nil {
		return err
	}
	// release the namespace handle once f has run so it is not leaked
	// NOTE(review): assumes nsutil.NetNS exposes Close() like the CNI ns
	// package it mirrors - confirm
	defer func() { _ = netNS.Close() }()

	// Start the container in the network namespace
	return netNS.Do(func(nsutil.NetNS) error {
		return f()
	})
}