package system

import (
	"fmt"
	"log"
	"os"
	"os/exec"
	"sync"
	"time"

	"zlh-agent/internal/state"
)

const (
	// supervisorStabilityWindow is the uptime at or above which an exited
	// process is treated as having run stably: the restart-attempt counter
	// and the crash count are reset before the exit is handled.
	supervisorStabilityWindow = 30 * time.Second

	// maxRestartAttempts is the number of automatic restarts attempted before
	// Watch gives up and moves the agent into the error state.
	maxRestartAttempts = 3
)

// restartBackoffSchedule lists the delay applied before each restart attempt,
// indexed by attempt-1.
var restartBackoffSchedule = []time.Duration{
	30 * time.Second,
	60 * time.Second,
	120 * time.Second,
}

// processSupervisor tracks how many consecutive automatic restarts have been
// attempted for a supervised process.
type processSupervisor struct {
	mu              sync.Mutex // guards restartAttempts
	restartAttempts int
}

// gameSupervisor is the package-wide supervisor instance.
var gameSupervisor = &processSupervisor{}

// Reset clears the consecutive restart-attempt counter.
func (s *processSupervisor) Reset() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.restartAttempts = 0
}

// nextAttempt records one more restart attempt and returns its 1-based number.
// If the process stayed up for at least supervisorStabilityWindow, the counter
// is reset first, so the returned attempt is 1.
func (s *processSupervisor) nextAttempt(uptime time.Duration) int {
	s.mu.Lock()
	defer s.mu.Unlock()
	if uptime >= supervisorStabilityWindow {
		s.restartAttempts = 0
	}
	s.restartAttempts++
	return s.restartAttempts
}

// restartBackoffDelay returns the backoff delay for the given 1-based attempt.
// Attempts beyond the end of restartBackoffSchedule reuse its last entry and
// attempts below 1 use its first, so changing maxRestartAttempts or the
// schedule independently can never index out of range. An empty schedule
// yields a zero delay.
func restartBackoffDelay(attempt int) time.Duration {
	if len(restartBackoffSchedule) == 0 {
		return 0
	}
	idx := attempt - 1
	if idx < 0 {
		idx = 0
	}
	if last := len(restartBackoffSchedule) - 1; idx > last {
		idx = last
	}
	return restartBackoffSchedule[idx]
}

// Watch blocks until cmd exits, releases the process bookkeeping, and then
// either acknowledges an intentional stop or records the crash and attempts a
// restart after a backoff delay.
//
// cfg is the server configuration (used for logging and passed to StartServer
// on restart), cmd is the already-started process, ptmx is its pseudo-terminal
// (may be nil), and startedAt is the time the process was launched.
//
// Watch blocks for the lifetime of the process plus any backoff delay;
// presumably callers run it in its own goroutine — confirm at the call site.
func (s *processSupervisor) Watch(cfg *state.Config, cmd *exec.Cmd, ptmx *os.File, startedAt time.Time) {
	err := cmd.Wait()
	uptime := time.Since(startedAt)
	s.clearProcess(cmd, ptmx)

	// An exit that was requested is not a crash: return to idle and stop here.
	if state.IsIntentionalStop() {
		state.ClearIntentionalStop()
		state.SetState(state.StateIdle)
		state.SetReadyState(false, "", "")
		log.Printf("[process] vmid=%d server exited after intentional stop", cfg.VMID)
		return
	}

	// A sufficiently long run wipes the crash history before this crash is
	// recorded below.
	if uptime >= supervisorStabilityWindow {
		state.ResetCrashCount()
	}

	crashInfo := buildCrashInfo(cfg, err, startedAt)
	state.SetLastCrash(crashInfo)
	log.Printf("[process] vmid=%d server crashed exit_code=%d signal=%d uptime=%ds", cfg.VMID, crashInfo.ExitCode, crashInfo.Signal, crashInfo.UptimeSeconds)
	if len(crashInfo.LogTail) > 0 {
		log.Printf("[process] vmid=%d crash log tail:", cfg.VMID)
		for _, line := range lastLines(crashInfo.LogTail, 20) {
			log.Printf("[process] vmid=%d %s", cfg.VMID, line)
		}
	}

	// Wait returns nil on a clean exit; an exit nobody asked for is still
	// recorded as a crash, so substitute a generic error.
	recordErr := err
	if recordErr == nil {
		recordErr = fmt.Errorf("server exited unexpectedly")
	}
	state.RecordCrash(recordErr)

	attempt := s.nextAttempt(uptime)
	if attempt > maxRestartAttempts {
		restartErr := fmt.Errorf("server crashed repeatedly")
		state.SetError(restartErr)
		state.SetState(state.StateError)
		log.Printf("[process] vmid=%d restart limit reached attempts=%d", cfg.VMID, attempt-1)
		return
	}

	delay := restartBackoffDelay(attempt)
	log.Printf("[process] vmid=%d restart attempt=%d delay=%s", cfg.VMID, attempt, delay)
	time.Sleep(delay)

	// The state may have changed during the backoff: skip the restart if a
	// stop was requested or the agent entered the error state meanwhile.
	if state.IsIntentionalStop() || state.GetState() == state.StateError {
		return
	}

	if startErr := StartServer(cfg); startErr != nil {
		state.SetError(startErr)
		state.SetState(state.StateError)
		log.Printf("[process] vmid=%d restart attempt=%d failed err=%v", cfg.VMID, attempt, startErr)
		return
	}
}

// clearProcess closes ptmx (when non-nil) and drops the package-level process
// bookkeeping that still refers to the exited process. serverCmd and serverPTY
// are cleared only when they still point at cmd and ptmx respectively.
//
// serverStartTime is reset only when no process remains registered afterwards,
// so the start time of a different process that is already registered is left
// untouched.
func (s *processSupervisor) clearProcess(cmd *exec.Cmd, ptmx *os.File) {
	mu.Lock()
	defer mu.Unlock()

	if ptmx != nil {
		// Best-effort close: the process has already exited, so a close
		// failure leaves nothing to recover.
		_ = ptmx.Close()
	}
	if serverCmd == cmd {
		serverCmd = nil
	}
	if serverPTY == ptmx {
		serverPTY = nil
	}
	if serverCmd == nil {
		serverStartTime = time.Time{}
	}
}