zlh-agent/internal/system/supervisor.go
2026-03-20 23:17:19 +00:00

127 lines
2.8 KiB
Go

package system
import (
"fmt"
"log"
"os"
"os/exec"
"sync"
"time"
"zlh-agent/internal/state"
)
// Tuning knobs for the process supervisor.
const (
	// supervisorStabilityWindow is the uptime at or above which an exit is
	// treated as a fresh failure: nextAttempt zeroes the restart counter and
	// Watch calls state.ResetCrashCount.
	supervisorStabilityWindow = 30 * time.Second

	// maxRestartAttempts is the highest attempt number Watch will act on;
	// once nextAttempt returns a larger value Watch gives up and moves the
	// agent to state.StateError.
	maxRestartAttempts = 3
)
// restartBackoffSchedule holds the delay Watch waits before each restart.
// Watch looks it up with attempt-1, so entry 0 is the delay before the first
// restart attempt.
var restartBackoffSchedule = []time.Duration{
	30 * time.Second,
	time.Minute,
	2 * time.Minute,
}
// processSupervisor counts consecutive restart attempts for a supervised
// process. The zero value is ready to use.
type processSupervisor struct {
	// mu guards restartAttempts.
	mu sync.Mutex

	// restartAttempts is the number of restart attempts handed out since the
	// counter was last zeroed by Reset or by nextAttempt.
	restartAttempts int
}
// gameSupervisor is the package-level supervisor instance; it starts with a
// zero restart count.
var gameSupervisor = new(processSupervisor)
// Reset zeroes the restart-attempt counter so the next crash is handled as
// attempt 1. It is safe to call concurrently with nextAttempt.
func (s *processSupervisor) Reset() {
	s.mu.Lock()
	s.restartAttempts = 0
	s.mu.Unlock()
}
// nextAttempt records one more restart attempt and returns its 1-based number.
//
// uptime is how long the process ran before exiting. When it reaches
// supervisorStabilityWindow the earlier attempts are forgotten first, so the
// returned value is 1.
func (s *processSupervisor) nextAttempt(uptime time.Duration) int {
	s.mu.Lock()
	defer s.mu.Unlock()

	previous := s.restartAttempts
	if uptime >= supervisorStabilityWindow {
		// The process ran long enough to count as stable; start over.
		previous = 0
	}

	s.restartAttempts = previous + 1
	return s.restartAttempts
}
// Watch blocks until cmd exits, records the outcome, and, unless the stop was
// intentional or the restart limit has been reached, restarts the server
// after a backoff delay.
//
// cfg is passed back to StartServer on restart and its VMID tags every log
// line. cmd and ptmx are the process and PTY being supervised; both are
// released through clearProcess as soon as the process exits. startedAt is
// used to compute the process uptime.
//
// The call blocks for the lifetime of the process plus the backoff delay, so
// callers presumably run it in its own goroutine.
func (s *processSupervisor) Watch(cfg *state.Config, cmd *exec.Cmd, ptmx *os.File, startedAt time.Time) {
	err := cmd.Wait()
	uptime := time.Since(startedAt)
	s.clearProcess(cmd, ptmx)

	// A requested stop is not a crash: consume the flag, go idle, and do not
	// restart.
	if state.IsIntentionalStop() {
		state.ClearIntentionalStop()
		state.SetState(state.StateIdle)
		state.SetReadyState(false, "", "")
		log.Printf("[process] vmid=%d server exited after intentional stop", cfg.VMID)
		return
	}

	// A run that lasted the full stability window resets the crash count
	// before this crash is recorded.
	if uptime >= supervisorStabilityWindow {
		state.ResetCrashCount()
	}

	crashInfo := buildCrashInfo(cfg, err, startedAt)
	state.SetLastCrash(crashInfo)
	log.Printf("[process] vmid=%d server crashed exit_code=%d signal=%d uptime=%ds", cfg.VMID, crashInfo.ExitCode, crashInfo.Signal, crashInfo.UptimeSeconds)
	if len(crashInfo.LogTail) > 0 {
		log.Printf("[process] vmid=%d crash log tail:", cfg.VMID)
		for _, line := range lastLines(crashInfo.LogTail, 20) {
			log.Printf("[process] vmid=%d %s", cfg.VMID, line)
		}
	}

	// cmd.Wait returns nil for a clean exit; an exit nobody asked for is
	// still recorded as a crash, so substitute a generic error.
	recordErr := err
	if recordErr == nil {
		recordErr = fmt.Errorf("server exited unexpectedly")
	}
	state.RecordCrash(recordErr)

	attempt := s.nextAttempt(uptime)
	if attempt > maxRestartAttempts {
		restartErr := fmt.Errorf("server crashed repeatedly")
		state.SetError(restartErr)
		state.SetState(state.StateError)
		log.Printf("[process] vmid=%d restart limit reached attempts=%d", cfg.VMID, attempt-1)
		return
	}

	// Clamp the lookup so that a maxRestartAttempts larger than the schedule
	// reuses the final delay instead of panicking on an out-of-range index.
	// An empty schedule means no delay.
	var delay time.Duration
	if n := len(restartBackoffSchedule); n > 0 {
		idx := attempt - 1
		if idx >= n {
			idx = n - 1
		}
		delay = restartBackoffSchedule[idx]
	}
	log.Printf("[process] vmid=%d restart attempt=%d delay=%s", cfg.VMID, attempt, delay)
	time.Sleep(delay)

	// State may have changed while waiting: skip the restart if a stop was
	// requested or the agent has entered the error state.
	if state.IsIntentionalStop() || state.GetState() == state.StateError {
		return
	}

	if startErr := StartServer(cfg); startErr != nil {
		state.SetError(startErr)
		state.SetState(state.StateError)
		log.Printf("[process] vmid=%d restart attempt=%d failed err=%v", cfg.VMID, attempt, startErr)
		return
	}
}
// clearProcess closes the PTY of an exited process and drops the
// package-level references to it.
//
// ptmx is closed when non-nil. serverCmd and serverPTY are cleared only if
// they still refer to cmd and ptmx, so a different process registered in the
// meantime is left alone. serverStartTime is zeroed only when no process
// remains registered afterwards.
//
// The package-level mu (not s.mu) is held for the duration, since the state
// touched here is package-level.
func (s *processSupervisor) clearProcess(cmd *exec.Cmd, ptmx *os.File) {
	mu.Lock()
	defer mu.Unlock()

	if ptmx != nil {
		// Best effort: the process has already exited, so a close error is
		// not actionable here.
		_ = ptmx.Close()
	}
	if serverCmd == cmd {
		serverCmd = nil
	}
	if serverPTY == ptmx {
		serverPTY = nil
	}
	// Reset the start time only when nothing is registered any more.
	// Zeroing it unconditionally would wipe the start time of a different
	// process that now owns serverCmd.
	// NOTE(review): assumes serverStartTime is always set together with
	// serverCmd - confirm against StartServer.
	if serverCmd == nil {
		serverStartTime = time.Time{}
	}
}