- package main
-
- import (
- "flag"
- "fmt"
- "log"
- "os"
- "os/signal"
- "os/user"
- "path"
- "strconv"
- "strings"
- "sync"
- "syscall"
- "time"
-
- "github.com/czxichen/command/watchdog"
- conf "github.com/dlintw/goconf"
- )
-
-
const (
	// logDir is the relative directory under which per-service log files
	// are created.
	logDir = "./watchdog"
)
-
- func newProc(svc *Service,null,pw *os.File) *os.ProcAttr {
- return &os.ProcAttr{Dir: svc.path,Files: []*os.File{null,pw,pw}}
- }
-
// setPriority is a stub: it ignores both pid and priority and always reports
// success by returning errno 0.
func setPriority(pid, priority uintptr) syscall.Errno {
	const success syscall.Errno = 0
	return success
}
-
// logpath is the -log_path flag; it defaults to the empty string.
var logpath = flag.String("log_path", "", "Specify log path")

// configFile is the -config flag naming the watchdog configuration file; it
// defaults to "watchdog.ini".
var configFile = flag.String("config", "watchdog.ini", "Watchdog configuration file")
-
- func cfgOpt(cfg *conf.ConfigFile,section,option string) string {
- if !cfg.HasOption(section,option) {
- return ""
- }
- s,err := cfg.GetString(section,option)
- if err != nil {
- log.Fatalf("Failed to get %s for %s: %v",option,err)
- }
- return s
- }
-
- func svcOpt(cfg *conf.ConfigFile,service,option string,required bool) string {
- opt := cfgOpt(cfg,option)
- if opt == "" && required {
- log.Fatalf("Service %s has missing %s option",option)
- }
- return opt
- }
-
// signalNames maps the signals the watchdog handles to their conventional
// names, for use in log messages.
var signalNames = map[syscall.Signal]string{
	syscall.SIGTERM: "SIGTERM",
	syscall.SIGQUIT: "SIGQUIT",
	syscall.SIGINT:  "SIGINT",
}
-
- func signalName(s syscall.Signal) string {
- if name,ok := signalNames[s]; ok {
- return name
- }
- return fmt.Sprintf("SIG %d",s)
- }
-
// Shutdowner is implemented by anything that can be asked to shut down.
type Shutdowner interface {
	// Shutdown requests that the implementation begin shutting down.
	Shutdown()
}
-
- func shutdownHandler(server Shutdowner) {
- sigc := make(chan os.Signal, 3)
- signal.Notify(sigc,syscall.SIGINT,syscall.SIGQUIT,syscall.SIGTERM)
- go func() {
- for s := range sigc {
- name := s.String()
- if sig,ok := s.(syscall.Signal); ok {
- name = signalName(sig)
- }
- log.Printf("Received %v,initiating shutdown...",name)
- server.Shutdown()
- }
- }()
- }
-
// restartDelay is the pause between a service run finishing and the next
// restart attempt.
var restartDelay = 2 * time.Second

// restartBackoff is multiplied by a service's failure count to obtain the
// extra delay applied before restarting a failing service.
var restartBackoff = 5 * time.Second

// restartBackoffMax caps the failure backoff delay.
var restartBackoffMax = 60 * time.Second
-
- type Watchdog struct {
- services map[string]*Service
- shutdown chan bool
- }
-
- func NewWatchdog() *Watchdog {
- return &Watchdog{
- services: make(map[string]*Service),shutdown: make(chan bool),}
- }
-
- func (w *Watchdog) Shutdown() {
- select {
- case w.shutdown <- true:
- default:
- }
- }
-
- func (w *Watchdog) AddService(name,binary string) (*Service,error) {
- if _,ok := w.services[name]; ok {
- return nil,fmt.Errorf("Service %q already exists",name)
- }
-
- svc := newService(name,binary)
- w.services[name] = svc
-
- return svc,nil
- }
-
- func (w *Watchdog) Walk() {
- log.Printf("Seesaw watchdog starting...")
-
- w.mapDependencies()
-
- for _,svc := range w.services {
- go svc.run()
- }
- <-w.shutdown
- for _,svc := range w.services {
- go svc.stop()
- }
- for _,svc := range w.services {
- stopped := <-svc.stopped
- svc.stopped <- stopped
- }
- }
-
- func (w *Watchdog) mapDependencies() {
- for name := range w.services {
- svc := w.services[name]
- for depName := range svc.dependencies {
- dep,ok := w.services[depName]
- if !ok {
- log.Fatalf("Failed to find dependency %q for service %q",depName,name)
- }
- svc.dependencies[depName] = dep
- dep.dependents[svc.name] = svc
- }
- }
- }
-
const (
	// prioProcess is not referenced elsewhere in the visible code.
	// NOTE(review): presumably mirrors PRIO_PROCESS for setpriority(2) - confirm.
	prioProcess = 0
)
-
// Service describes one supervised process together with the bookkeeping the
// watchdog needs to start, restart and stop it.
type Service struct {
	name   string   // service name; also passed to the child as argv[0]
	binary string   // executable handed to os.StartProcess
	path   string   // working directory for the child process
	args   []string // arguments following argv[0]

	uid      uint32 // numeric user ID recorded by SetUser
	gid      uint32 // numeric group ID recorded by SetUser
	priority int    // priority recorded by SetPriority (-20..19)

	dependencies map[string]*Service // services this one waits for before starting
	dependents   map[string]*Service // services that must stop before this one

	termTimeout time.Duration // grace period between SIGTERM and SIGKILL in stop

	lock    sync.Mutex  // held while setting or signalling process
	process *os.Process // most recently started process, nil before the first start

	done     chan bool // sent by run when its loop has exited
	shutdown chan bool // sent by stop to tell run to exit
	started  chan bool // sent by runOnce once the process has been launched
	stopped  chan bool // sent by stop once the service has been stopped

	failures uint64 // failures since the last clean exit; drives the backoff
	restarts uint64 // number of times run has invoked runOnce

	lastFailure time.Time // time of the most recent failure
	lastRestart time.Time // time of the most recent start attempt
}
-
- func newService(name,binary string) *Service {
- return &Service{
- name: name,binary: binary,args: make([]string, 0),dependencies: make(map[string]*Service),dependents: make(map[string]*Service),done: make(chan bool),shutdown: make(chan bool, 1),started: make(chan bool,stopped: make(chan bool,termTimeout: 5 * time.Second,}
- }
-
- func (svc *Service) AddDependency(name string) {
- svc.dependencies[name] = nil
- }
-
- func (svc *Service) AddArgs(args string) {
- svc.args = strings.Fields(args)
- }
-
- func (svc *Service) SetPriority(priority int) error {
- if priority < -20 || priority > 19 {
- return fmt.Errorf("Invalid priority %d - must be between -20 and 19",priority)
- }
- svc.priority = priority
- return nil
- }
-
- func (svc *Service) SetTermTimeout(tt time.Duration) {
- svc.termTimeout = tt
- }
-
- func (svc *Service) SetUser(username string) error {
- u,err := user.Lookup(username)
- if err != nil {
- return err
- }
- uid,err := strconv.Atoi(u.Uid)
- if err != nil {
- return err
- }
- gid,err := strconv.Atoi(u.Gid)
- if err != nil {
- return err
- }
- svc.uid = uint32(uid)
- svc.gid = uint32(gid)
- return nil
- }
-
- func (svc *Service) run() {
-
- for _,dep := range svc.dependencies {
- log.Printf("Service %s waiting for %s to start",svc.name,dep.name)
- select {
- case started := <-dep.started:
- dep.started <- started
- case <-svc.shutdown:
- goto done
- }
- }
-
- for {
-
-
- if svc.failures > 0 {
- delay := time.Duration(svc.failures) * restartBackoff
- if delay > restartBackoffMax {
- delay = restartBackoffMax
- }
- log.Printf("Service %s has Failed %d times - delaying %s before restart",svc.failures,delay)
-
- select {
- case <-time.After(delay):
- case <-svc.shutdown:
- goto done
- }
- }
-
- svc.restarts++
- svc.lastRestart = time.Now()
- svc.runOnce()
-
- select {
- case <-time.After(restartDelay):
- case <-svc.shutdown:
- goto done
- }
- }
- done:
- svc.done <- true
- }
-
- func (svc *Service) logFile() (*os.File,error) {
- logName := svc.name + ".log"
-
- if err := os.MkdirAll(logDir, 0666); err != nil {
- if !os.IsExist(err) {
- return nil,err
- }
- }
- f,err := os.Create(path.Join(logDir,logName))
- if err != nil {
- return nil,err
- }
- fmt.Fprintf(f,"Log file for %s (stdout/stderr)\n",svc.name)
- fmt.Fprintf(f,"Created at: %s\n",time.Now().Format("2006/01/02 15:04:05"))
- return f,nil
- }
-
- func (svc *Service) runOnce() {
- args := make([]string,len(svc.args)+1)
- args[0] = svc.name
- copy(args[1:],svc.args)
-
- fmt.Println("Args:",args)
- null,err := os.Open(os.DevNull)
- if err != nil {
- log.Printf("Service %s - Failed to open %s: %v",os.DevNull,err)
- return
- }
-
- lfile,err := svc.logFile()
- if err != nil {
- log.Printf("Service %s - Failed to create log file: %v",err)
- null.Close()
- return
- }
-
- attr := newProc(svc,lfile)
-
- log.Printf("Starting service %s...",svc.name)
- proc,err := os.StartProcess(svc.binary,args,attr)
- if err != nil {
- log.Printf("Service %s Failed to start: %v",err)
- svc.lastFailure = time.Now()
- svc.failures++
- null.Close()
- return
- }
-
- null.Close()
- lfile.Close()
- svc.lock.Lock()
- svc.process = proc
- svc.lock.Unlock()
-
- if err := setPriority(uintptr(proc.Pid),uintptr(svc.priority)); err != 0 {
- log.Printf("Failed to set priority to %d for service %s: %v",svc.priority,err)
- }
- select {
- case svc.started <- true:
- default:
- }
-
- state,err := svc.process.Wait()
- if err != nil {
- log.Printf("Service %s wait Failed with %v",err)
- svc.lastFailure = time.Now()
- svc.failures++
- return
- }
- if !state.Success() {
- log.Printf("Service %s exited with %v",state)
- svc.lastFailure = time.Now()
- svc.failures++
- return
- }
-
- svc.failures = 0
- log.Printf("Service %s exited normally.",svc.name)
- }
-
- func (svc *Service) signal(sig os.Signal) error {
- svc.lock.Lock()
- defer svc.lock.Unlock()
- if svc.process == nil {
- return nil
- }
- return svc.process.Signal(sig)
- }
-
- func (svc *Service) stop() {
- log.Printf("Stopping service %s...",svc.name)
-
- for _,dep := range svc.dependents {
- log.Printf("Service %s waiting for %s to stop",dep.name)
- stopped := <-dep.stopped
- dep.stopped <- stopped
- }
-
- svc.shutdown <- true
-
- svc.signal(syscall.SIGTERM)
- select {
- case <-svc.done:
- case <-time.After(svc.termTimeout):
- svc.signal(syscall.SIGKILL)
- <-svc.done
- }
- log.Printf("Service %s stopped",svc.name)
- svc.stopped <- true
- }
-
- func main() {
- flag.Parse()
- if *logpath == "" {
- *logpath = os.Args[0] + ".log"
- }
- logFile,err := os.Create(*logpath)
- if err != nil {
- log.Fatalf("Create log file error:%s\n",err.Error())
- }
- defer logFile.Close()
- log.SetOutput(logFile)
-
- cfg,err := conf.ReadConfigFile(*configFile)
- if err != nil {
- log.Fatalf("Failed to read config file %q: %v",*configFile,err)
- }
-
- fido := watchdog.NewWatchdog()
-
- shutdownHandler(fido)
- for _,name := range cfg.GetSections() {
- if name == "default" {
- continue
- }
-
- binary := svcOpt(cfg,name,"binary",true)
- args := svcOpt(cfg,"args",false)
-
- svc,err := fido.AddService(name,binary)
- if err != nil {
- log.Fatalf("Failed to add service %q: %v",err)
- }
- svc.AddArgs(args)
- if dep := svcOpt(cfg,"dependency",false); dep != "" {
- svc.AddDependency(dep)
- }
- if opt := svcOpt(cfg,"priority",false); opt != "" {
- prio,err := strconv.Atoi(opt)
- if err != nil {
- log.Fatalf("Service %s has invalid priority %q: %v",opt,err)
- }
- if err := svc.SetPriority(prio); err != nil {
- log.Fatalf("Failed to set priority for service %s: %v",err)
- }
- }
- if opt := svcOpt(cfg,"term_timeout",false); opt != "" {
- tt,err := time.ParseDuration(opt)
- if err != nil {
- log.Fatalf("Service %s has invalid term_timeout %q: %v",err)
- }
- svc.SetTermTimeout(tt)
- }
-
- if user := svcOpt(cfg,"user",false); user != "" {
- if err := svc.SetUser(user); err != nil {
- log.Fatalf("Failed to set user for service %s: %v",err)
- }
- }
- }
-
- fido.Walk()
- }