127 lines
2.8 KiB
Go
127 lines
2.8 KiB
Go
package system
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"os/exec"
|
|
"sync"
|
|
"time"
|
|
|
|
"zlh-agent/internal/state"
|
|
)
|
|
|
|
// Supervisor tuning knobs.
const (
	// maxRestartAttempts is the number of consecutive restart attempts the
	// supervisor makes before giving up and reporting an error.
	maxRestartAttempts = 3

	// supervisorStabilityWindow is the uptime a process must reach for its
	// exit to be treated as a fresh failure rather than part of a crash loop.
	supervisorStabilityWindow = 30 * time.Second
)
|
|
|
|
// restartBackoffSchedule lists the delay applied before each consecutive
// restart attempt; entry i is used for attempt i+1.
var restartBackoffSchedule = []time.Duration{
	30 * time.Second,
	time.Minute,
	2 * time.Minute,
}
|
|
|
|
// processSupervisor tracks how many consecutive restart attempts have been
// made for a supervised process. It contains a mutex and must not be copied;
// all methods use pointer receivers.
type processSupervisor struct {
	// mu guards restartAttempts.
	mu sync.Mutex
	// restartAttempts is the number of restart attempts counted so far in the
	// current crash streak.
	restartAttempts int
}
|
|
|
|
// gameSupervisor is the package-wide supervisor instance, starting with a
// zero restart-attempt count.
var gameSupervisor = new(processSupervisor)
|
|
|
|
func (s *processSupervisor) Reset() {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
s.restartAttempts = 0
|
|
}
|
|
|
|
func (s *processSupervisor) nextAttempt(uptime time.Duration) int {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
|
|
if uptime >= supervisorStabilityWindow {
|
|
s.restartAttempts = 0
|
|
}
|
|
s.restartAttempts++
|
|
return s.restartAttempts
|
|
}
|
|
|
|
func (s *processSupervisor) Watch(cfg *state.Config, cmd *exec.Cmd, ptmx *os.File, startedAt time.Time) {
|
|
err := cmd.Wait()
|
|
uptime := time.Since(startedAt)
|
|
|
|
s.clearProcess(cmd, ptmx)
|
|
|
|
if state.IsIntentionalStop() {
|
|
state.ClearIntentionalStop()
|
|
state.SetState(state.StateIdle)
|
|
state.SetReadyState(false, "", "")
|
|
log.Printf("[process] vmid=%d server exited after intentional stop", cfg.VMID)
|
|
return
|
|
}
|
|
|
|
if uptime >= supervisorStabilityWindow {
|
|
state.ResetCrashCount()
|
|
}
|
|
|
|
crashInfo := buildCrashInfo(cfg, err, startedAt)
|
|
state.SetLastCrash(crashInfo)
|
|
log.Printf("[process] vmid=%d server crashed exit_code=%d signal=%d uptime=%ds", cfg.VMID, crashInfo.ExitCode, crashInfo.Signal, crashInfo.UptimeSeconds)
|
|
if len(crashInfo.LogTail) > 0 {
|
|
log.Printf("[process] vmid=%d crash log tail:", cfg.VMID)
|
|
for _, line := range lastLines(crashInfo.LogTail, 20) {
|
|
log.Printf("[process] vmid=%d %s", cfg.VMID, line)
|
|
}
|
|
}
|
|
|
|
recordErr := err
|
|
if recordErr == nil {
|
|
recordErr = fmt.Errorf("server exited unexpectedly")
|
|
}
|
|
state.RecordCrash(recordErr)
|
|
|
|
attempt := s.nextAttempt(uptime)
|
|
if attempt > maxRestartAttempts {
|
|
restartErr := fmt.Errorf("server crashed repeatedly")
|
|
state.SetError(restartErr)
|
|
state.SetState(state.StateError)
|
|
log.Printf("[process] vmid=%d restart limit reached attempts=%d", cfg.VMID, attempt-1)
|
|
return
|
|
}
|
|
|
|
delay := restartBackoffSchedule[attempt-1]
|
|
log.Printf("[process] vmid=%d restart attempt=%d delay=%s", cfg.VMID, attempt, delay)
|
|
|
|
timer := time.NewTimer(delay)
|
|
defer timer.Stop()
|
|
|
|
<-timer.C
|
|
|
|
if state.IsIntentionalStop() || state.GetState() == state.StateError {
|
|
return
|
|
}
|
|
|
|
if err := StartServer(cfg); err != nil {
|
|
state.SetError(err)
|
|
state.SetState(state.StateError)
|
|
log.Printf("[process] vmid=%d restart attempt=%d failed err=%v", cfg.VMID, attempt, err)
|
|
return
|
|
}
|
|
}
|
|
|
|
func (s *processSupervisor) clearProcess(cmd *exec.Cmd, ptmx *os.File) {
|
|
mu.Lock()
|
|
defer mu.Unlock()
|
|
|
|
if ptmx != nil {
|
|
_ = ptmx.Close()
|
|
}
|
|
if serverCmd == cmd {
|
|
serverCmd = nil
|
|
}
|
|
if serverPTY == ptmx {
|
|
serverPTY = nil
|
|
}
|
|
serverStartTime = time.Time{}
|
|
}
|