关于无损启动的尝试

105 阅读10分钟

由于本人之前基本从事PHP开发,所以从来没有考虑过关于无损启动的问题。众所周知,PHP基本是通过FPM方式运行的。当然还有swoole之类的常驻进程方案,但常驻进程应该是殊途同归,处理方法应该是一样的。

道听途说

之前学习Go的时候,一直有个疑问,像单体之类的web应用,Go和Java之类常驻进程是怎么做到无损重启的。

如果只是重载配置之类还好说,像代码做变更的时候是怎么做到无损重启的。

我先尝试之前听说的发送信号的方式,为了达到理想的尽可能少的损失连接,逻辑好像写复杂了。

信号重启大法

show code

package main

import (
    "context"
    "fmt"
    "io/fs"
    "math/rand/v2"
    "net/http"
    "os"
    "os/signal"
    "strconv"
    "sync"
    "syscall"
    "time"
)

// Process-wide state for the signal-driven restart demo.
var (
	// pid is the ID of the running process.
	pid = os.Getpid()
	// wg keeps main alive until the signal loop in listenSig returns.
	wg sync.WaitGroup
	// addr is the one listen address shared by the old and the new process.
	addr = "0.0.0.0:8080"
	// srv has no explicit Handler, so it serves http.DefaultServeMux.
	srv = &http.Server{Addr: addr}
	// reloadSig is what a freshly started process sends to its predecessor.
	reloadSig = syscall.SIGUSR2
	// startSig is what the predecessor sends back after shutting its server down.
	startSig = syscall.SIGUSR1
)

// main performs the hand-over from a previous instance, if one is recorded in
// main.pid, and then blocks until the signal loop has finished.
func main() {
    // Read the predecessor's PID before overwriting the file with our own.
    oldPid := getPid()
    recordPidToFile(pid, "main.pid")

    wg.Add(1)
    go listenSig(oldPid)
    // Give listenSig time to subscribe to signals before the old process can
    // answer the reload request below.
    time.Sleep(time.Second)

    // Ask the old process to release the port; hasOld is false when there is
    // no PID on record or the signal could not be delivered.
    hasOld := sendSig(oldPid, reloadSig)
    fmt.Printf("send reloadSig to %d, hasOld:%v\n", oldPid, hasOld)

    // Without a predecessor nobody will send startSig, so start serving now.
    if !hasOld {
       go startSrv()
    }

    wg.Wait()
    fmt.Println("server is stop")
}

// startSrv registers the demo handler on the default mux and serves on addr
// until srv is shut down. It blocks, so callers run it in its own goroutine.
func startSrv() {
	fmt.Println("start srv")
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Simulate endpoints with different latencies: 0 to 4.9s in 100ms steps.
		time.Sleep(time.Millisecond * 100 * time.Duration(rand.N(50)))
		_, _ = w.Write([]byte("hello, World!"))
	})

	// ListenAndServe always returns a non-nil error. ErrServerClosed is the
	// normal outcome of srv.Shutdown; anything else (for example the port is
	// still held by another process) is a real failure and must be visible.
	if err := srv.ListenAndServe(); err != nil && err != http.ErrServerClosed {
		fmt.Printf("listen failed: %s, err: %v\n", addr, err)
		return
	}
	fmt.Printf("listen close: %s\n", addr)
}

// listenSig is the signal loop that drives the hand-over between an old and a
// new process. It marks wg as done when it returns, which lets main exit.
//
// oldPid is the PID that was read from main.pid at start-up (0 if none).
//
// Handling, per signal:
//   - reloadSig: shut the server down, then send startSig to the PID that is
//     now stored in main.pid.
//   - startSig: with a predecessor, start serving; without one, shut down.
//   - any other subscribed signal: shut down, empty main.pid and return.
//
// A process with a live predecessor handles two signals (normally startSig
// followed later by a reload or stop); otherwise it handles one.
func listenSig(oldPid int) {
	defer wg.Done()

	// A PID left behind by a crashed or killed process must not count as a
	// predecessor. main starts the server itself when the reload signal cannot
	// be delivered, so waiting for a second signal here would leave this
	// process hanging after it has already handed the port over. Signal 0 runs
	// the delivery checks of kill(2) without sending anything.
	hasOld := oldPid > 0 && syscall.Kill(oldPid, syscall.Signal(0)) == nil

	signalChan := make(chan os.Signal, 2)
	signal.Notify(signalChan, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR1, syscall.SIGUSR2)
	fmt.Printf("start listen sig pid:%d hasOld:%v\n", pid, hasOld)

	listenTimes := 1
	if hasOld {
		listenTimes = 2
	}

	for i := 0; i < listenTimes; i++ {
		fmt.Println("wait signal ", i)
		s := <-signalChan
		fmt.Printf("receive sig %v\n", s)

		// Dispatch on the concrete signal.
		switch s {
		case reloadSig:
			fmt.Println("receive reloadSig case")
			servShutdown()
			// The requester has already written its own PID to main.pid.
			newPid := getPid()
			fmt.Printf("newPid %d , sendStart cond:%v\n", newPid, newPid > 0)

			if newPid > 0 {
				fmt.Println("before send startSig")
				ok := sendSig(newPid, startSig)
				fmt.Printf("send startSig %d send res:%v\n", newPid, ok)
			}
		case startSig:
			fmt.Println("receive startSig case")
			if hasOld {
				go startSrv()
				fmt.Println("received startSig from old process start serv")
			} else {
				servShutdown()
				fmt.Println("Received SIGUSR1")
			}
		default:
			servShutdown()
			fmt.Println("receive signal, existing...")
			// Empty the PID file so the next start does not look for us.
			_ = os.WriteFile("main.pid", []byte{}, os.ModePerm)
			return
		}
	}
}

// servShutdown stops srv gracefully, giving in-flight requests up to ten
// seconds to finish.
func servShutdown() {
	fmt.Println("serv shutdown")

	// Let running requests complete as far as possible.
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// A non-nil error (typically the deadline expiring) means requests were
	// still running when we gave up, so report it instead of hiding it.
	if err := srv.Shutdown(ctx); err != nil {
		fmt.Println("serv shutdown incomplete:", err)
	}
}

// recordPidToFile writes id, in decimal, to the file at path, creating or
// truncating it. A write failure is printed and otherwise ignored.
func recordPidToFile(id int, path string) {
	// Format the id argument, not the package-level pid, so the function
	// records whatever PID the caller asks for.
	if err := os.WriteFile(path, []byte(strconv.Itoa(id)), fs.ModePerm); err != nil {
		fmt.Println("record pid to file failed: ", err)
	}
}

// sendSig delivers sig to the process with the given pid and reports whether
// the signal was sent.
//
// Only a strictly positive pid is accepted. kill(2) treats 0 as "my own
// process group", -1 as "every process I may signal" and other negative values
// as a process group, so a corrupt PID file must never reach the system call.
func sendSig(pid int, sig syscall.Signal) bool {
	if pid <= 0 {
		return false
	}

	if err := syscall.Kill(pid, sig); err != nil {
		fmt.Println("Error sending signal:", err)
		return false
	}

	fmt.Printf("Success sent sig to process: %d, signal:%v \n", pid, sig)
	return true
}

// getPid returns the number stored in main.pid in the working directory and
// echoes the raw file content to stdout. Read and parse errors are ignored, so
// a missing or non-numeric file yields 0.
func getPid() int {
	raw, _ := os.ReadFile("main.pid")
	content := string(raw)
	fmt.Println("main.pid - ", content)

	recorded, _ := strconv.Atoi(content)
	return recorded
}
重启压测

然后开始wrk 压测,并在压测时重启服务

# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://127.0.0.1:8080"
Running 20s test @ http://127.0.0.1:8080
  2 threads and 100 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     2.29s     1.48s    4.91s    55.31%
    Req/Sec    19.54     13.57    80.00     78.47%
  Latency Distribution
     50%    2.20s 
     75%    3.61s 
     90%    4.40s 
     99%    4.90s 
  622 requests in 20.02s, 80.82KB read
  Socket errors: connect 0, read 0, write 663530, timeout 0
Requests/sec:     31.08
Transfer/sec:      4.04KB

很遗憾,不知道是不是我太菜了, 在重启的一瞬间,大量请求进不来, 不过所幸的是所有在处理的请求没有失败的,勉强达到预期

正常压测

未重启的情况下也压测了一下

# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://127.0.0.1:8080"
Running 20s test @ http://127.0.0.1:8080
  2 threads and 100 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     2.32s     1.45s    4.90s    57.63%
    Req/Sec    21.03     12.90    70.00     81.20%
  Latency Distribution
     50%    2.30s 
     75%    3.60s 
     90%    4.40s 
     99%    4.90s 
  786 requests in 20.01s, 99.79KB read
Requests/sec:     39.27
Transfer/sec:      4.99KB

数据上来看区别不大,说明平常不会有错误发生

信号+Nginx负载多端口的方案

package main

import (
    "context"
    "fmt"
    "io/fs"
    "log"
    "math/rand/v2"
    "net/http"
    "os"
    "os/signal"
    "strconv"
    "strings"
    "syscall"
    "time"
)

var (
    // adds lists the candidate listen addresses; main tries them in order.
    adds = [...]string{
       "0.0.0.0:8080",
       "0.0.0.0:8081",
       "0.0.0.0:8082",
       "0.0.0.0:8083",
       "0.0.0.0:8084",
    }
    // srv is the server for the address currently being tried or served.
    // main assigns it in its listen loop and listenStop shuts it down.
    srv *http.Server
    // pid is the ID of the running process; main records it in main.pid.
    pid = os.Getpid()
)

// main runs the multi-port variant: it tells the previous instance (if any) to
// stop, then serves on the first address in adds that can be bound.
func main() {
	oldPid := getPid("main.pid")
	recordPidToFile(pid, "main.pid")

	fmt.Println("oldPid:", oldPid)
	sendSig(oldPid, syscall.SIGINT)

	// stopped is closed once listenStop has returned, i.e. after srv.Shutdown
	// has finished draining in-flight requests.
	stopped := make(chan struct{})
	go func() {
		defer close(stopped)
		listenStop()
	}()

	fmt.Println("start srv")

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Simulate endpoints with different latencies: 0 to 4.9s in 100ms steps.
		time.Sleep(time.Millisecond * 100 * time.Duration(rand.N(50)))
		_, _ = w.Write([]byte("hello, World!"))
	})

	for _, addr := range adds {
		srv = &http.Server{Addr: addr}
		fmt.Printf("try listen addr:%s\n", addr)
		if err := srv.ListenAndServe(); err != nil {
			// Port taken (normally by the previous instance): try the next one.
			if strings.Contains(err.Error(), "bind: address already in use") {
				continue
			}

			fmt.Printf("listen close: %s, msg:%s, err: %#v\n", addr, err.Error(), err)
			if err == http.ErrServerClosed {
				// Shutdown makes ListenAndServe return immediately, while
				// requests are still being served. Returning from main now
				// would kill them, so wait until the shutdown has completed.
				<-stopped
			}
			break
		}
	}

	fmt.Println("server stop...")
}

// getPid returns the PID stored in the file at path, or 0 when the file is
// missing, unreadable, or does not hold a non-negative decimal integer.
func getPid(path string) int {
	pidBytes, err := os.ReadFile(path)
	if err != nil {
		return 0
	}

	// Tolerate surrounding whitespace such as the trailing newline left by
	// editors or `echo`; without the trim Atoi rejects the content and a live
	// predecessor would go unnoticed.
	p, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
	if err != nil || p < 0 {
		return 0
	}
	return p
}

// recordPidToFile writes id, in decimal, to the file at path, creating or
// truncating it. A write failure is printed and otherwise ignored, because the
// server can still run without a PID file; it just cannot be found by the next
// instance.
func recordPidToFile(id int, path string) {
	if err := os.WriteFile(path, []byte(strconv.Itoa(id)), fs.ModePerm); err != nil {
		fmt.Println("record pid to file failed: ", err)
	}
}

// sendSig delivers sig to the process with the given pid. A delivery failure
// is printed and otherwise ignored.
//
// Only a strictly positive pid is accepted. kill(2) treats 0 as "my own
// process group", -1 as "every process I may signal" and other negative values
// as a process group, so a corrupt PID file must never reach the system call.
func sendSig(pid int, sig syscall.Signal) {
	if pid <= 0 {
		return
	}

	if err := syscall.Kill(pid, sig); err != nil {
		fmt.Printf("send sig fail %+v\n", err)
	}
}

// listenStop blocks until one of the subscribed signals arrives and then shuts
// srv down gracefully, allowing in-flight requests up to 60 seconds. A failed
// shutdown ends the process through log.Fatal.
func listenStop() {
	fmt.Println("wait stop signal")

	stopSignals := []os.Signal{
		syscall.SIGHUP,
		syscall.SIGINT,
		syscall.SIGTERM,
		syscall.SIGUSR1,
		syscall.SIGUSR2,
	}
	notified := make(chan os.Signal, 2)
	signal.Notify(notified, stopSignals...)

	<-notified
	fmt.Println("receive stop signal wait stop ")

	drainCtx, release := context.WithTimeout(context.Background(), 60*time.Second)
	defer release()

	err := srv.Shutdown(drainCtx)
	if err != nil {
		log.Fatal("Server Shutdown:", err)
	}

	log.Println("Server exiting")
}
# Pool of local ports for the Go demo process, all with equal weight.
upstream go_reload {
    server 127.0.0.1:8080 weight=1;
    server 127.0.0.1:8081 weight=1;
    server 127.0.0.1:8082 weight=1;
    server 127.0.0.1:8083 weight=1;
}

# Virtual host test.local: every request is proxied to the go_reload pool.
server {
  listen          80;
  server_name     test.local;
  client_max_body_size 100M;
  location / {
    proxy_pass      http://go_reload;
  }
}
正常压测
# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
  2 threads and 100 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     2.51s     1.45s    4.91s    57.24%
    Req/Sec    20.52     12.40    70.00     81.02%
  Latency Distribution
     50%    2.50s 
     75%    3.90s 
     90%    4.50s 
     99%    4.90s 
  739 requests in 20.02s, 127.02KB read
Requests/sec:     36.91
Transfer/sec:      6.34KB

正常压测符合预期,OK

重启压测
# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
  2 threads and 100 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency   558.86ms    1.00s    4.90s    84.55%
    Req/Sec    13.17k    17.70k   69.53k    77.62%
  Latency Distribution
     50%    1.37ms
     75%  744.76ms
     90%    2.17s 
     99%    4.02s 
  477754 requests in 20.08s, 143.00MB read
  Non-2xx or 3xx responses: 477275
Requests/sec:  23798.01
Transfer/sec:      7.12MB

我勒个去, 这是什么情况,数值这么大肯定有问题。一看,大量非20X,30X的响应。这我当场斯巴达了。

再想想,玩个花活,修改nginx weight值,让ng重载配置,试试

// ReplaceNgWeight rewrites the nginx config at path so that only the upstream
// server listening on addr stays enabled, then reloads nginx.
//
// Every `weight=N;` entry is turned into `down;`, after which the entry for
// addr is restored to `weight=1;`.
//
// It returns an error when the file cannot be read or written, when the config
// has no server entry for addr (the file is left untouched in that case), or
// when the reload command fails.
func ReplaceNgWeight(addr, path string) error {
	weightRe := regexp.MustCompile(`weight=\d+\s*;`)
	// addr is literal text, so escape it: an unescaped "." would match any
	// character and could select the wrong server line.
	addrRe := regexp.MustCompile(fmt.Sprintf(`server\s+%s\s+down\s*;`, regexp.QuoteMeta(addr)))

	b, err := os.ReadFile(path)
	if err != nil {
		return err
	}

	// Take every server out of rotation.
	b = weightRe.ReplaceAll(b, []byte("down;"))

	// Writing the file without re-enabling addr would leave nginx with no
	// usable upstream at all, so refuse instead.
	if !addrRe.Match(b) {
		return fmt.Errorf("no upstream server entry for %s in %s", addr, path)
	}

	// Re-enable only the address this process listens on. The literal variant
	// keeps "$" in the replacement from being read as a group reference.
	b = addrRe.ReplaceAllLiteral(b, []byte(fmt.Sprintf("server %s weight=1;", addr)))

	if err = os.WriteFile(path, b, 0644); err != nil {
		return err
	}

	// A Go program run as a normal user will not have permission for this.
	if err = exec.Command("systemctl", "reload", "nginx").Run(); err != nil {
		return fmt.Errorf("reload nginx: %w", err)
	}

	return nil
}
# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
  2 threads and 100 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency   837.52ms    1.17s    4.90s    82.33%
    Req/Sec   319.12      2.75k   31.11k    98.78%
  Latency Distribution
     50%    7.84ms
     75%    1.45s 
     90%    2.72s 
     99%    4.32s 
  10475 requests in 20.02s, 3.04MB read
  Socket errors: connect 0, read 100, write 0, timeout 0
  Non-2xx or 3xx responses: 9730
Requests/sec:    523.14
Transfer/sec:    155.40KB

看上去像是好一些。不过这数据依然不容乐观, read 100 和连接数对上了。 所以有同时100个的连接被遗弃, 同时出现近万的其他错误码, 感觉代码里应该有问题。 正常一个新服务上线后,另一个服务应该等所有链接处理完再下线的。 但是我操作的时候感觉是立马下线, 又得继续排查了~

新一版多端口
package main

import (
	"context"
	"fmt"
	"io/fs"
	"log"
	"math/rand/v2"
	"net/http"
	"os"
	"os/exec"
	"os/signal"
	"regexp"
	"runtime"
	"strconv"
	"strings"
	"sync/atomic"
	"syscall"
	"time"
)

const (
	// stopOld is the instruction sent over inst to have the previous process
	// signalled to stop.
	stopOld = "stopOld"
)

var (
	// adds lists the candidate listen addresses; startServ tries them in order.
	adds = [...]string{
		"127.0.0.1:8080",
		"127.0.0.1:8081",
		"127.0.0.1:8082",
		"127.0.0.1:8083",
		"127.0.0.1:8084",
	}
	// addr is the address currently being tried or served; the listen loop in
	// startServ assigns it.
	addr   = ""
	// srv is the server for addr; startServ creates it and listenStop shuts it
	// down.
	srv    *http.Server
	// pid is the ID of the running process; main records it in main.pid.
	pid           = os.Getpid()
	// oldPid is the PID read from main.pid at start-up (0 if none).
	oldPid        = 0
	// reqNum counts handled requests; the handler increments it atomically.
	reqNum uint64 = 0
	// inst is an unbuffered channel of instructions for listenInstructions.
	inst          = make(chan string)
)

func main() {
	go startServ()
	go listenInstructions()

	oldPid = getPid("main.pid")
	recordPidToFile(pid, "main.pid")

	fmt.Println("oldPid:", oldPid)

	listenStop()

}

// listenInstructions consumes instructions from inst forever. The only one
// acted on is stopOld, which sends SIGINT to the process recorded in oldPid;
// anything else is dropped.
func listenInstructions() {
	for {
		cmd := <-inst
		if cmd != stopOld {
			continue
		}
		sendSig(oldPid, syscall.SIGINT)
	}
}

// isMac reports whether the binary was built for macOS (GOOS "darwin").
func isMac() bool {
	switch runtime.GOOS {
	case "darwin":
		return true
	default:
		return false
	}
}

// getPid returns the PID stored in the file at path, or 0 when the file is
// missing, unreadable, or does not hold a non-negative decimal integer.
func getPid(path string) int {
	pidBytes, err := os.ReadFile(path)
	if err != nil {
		return 0
	}

	// Tolerate surrounding whitespace such as the trailing newline left by
	// editors or `echo`; without the trim Atoi rejects the content and a live
	// predecessor would go unnoticed.
	p, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
	if err != nil || p < 0 {
		return 0
	}
	return p
}

// recordPidToFile writes id, in decimal, to the file at path, creating or
// truncating it. A write failure is printed and otherwise ignored, because the
// server can still run without a PID file; it just cannot be found by the next
// instance.
func recordPidToFile(id int, path string) {
	if err := os.WriteFile(path, []byte(strconv.Itoa(id)), fs.ModePerm); err != nil {
		fmt.Println("record pid to file failed: ", err)
	}
}

// sendSig delivers sig to the process with the given pid. A delivery failure
// is printed and otherwise ignored.
//
// Only a strictly positive pid is accepted. kill(2) treats 0 as "my own
// process group", -1 as "every process I may signal" and other negative values
// as a process group, so a corrupt PID file must never reach the system call.
func sendSig(pid int, sig syscall.Signal) {
	if pid <= 0 {
		return
	}

	if err := syscall.Kill(pid, sig); err != nil {
		fmt.Printf("send sig fail %+v\n", err)
	}
}

// listenStop blocks until one of the subscribed signals arrives and then shuts
// srv down gracefully, allowing in-flight requests up to 60 seconds. On
// success it prints how long the shutdown took; on failure it prints the error
// and returns.
func listenStop() {
	fmt.Println("wait stop signal")

	watched := []os.Signal{
		syscall.SIGHUP,
		syscall.SIGINT,
		syscall.SIGTERM,
		syscall.SIGUSR1,
		syscall.SIGUSR2,
	}
	incoming := make(chan os.Signal, 2)
	signal.Notify(incoming, watched...)

	<-incoming
	fmt.Println("receive stop signal wait stop ")

	began := time.Now()
	drainCtx, release := context.WithTimeout(context.Background(), 60*time.Second)
	defer release()

	fmt.Println("Server offline ...")
	err := srv.Shutdown(drainCtx)
	if err == nil {
		fmt.Printf("shutdown cost: %v \n", time.Since(began))
		log.Println("Server exiting")
		return
	}

	fmt.Println("Server Shutdown err:", err)
}

// startServ serves the demo handler on the first address in adds that can be
// bound and blocks until that server stops. In the background it points nginx
// at the chosen address and then asks for the previous process to be stopped.
func startServ() {
	fmt.Println("start srv")

	go func() {
		confpath := "/etc/nginx/conf.d/test.local.conf"
		if isMac() {
			confpath = "/usr/local/etc/nginx/servers/test.loc.conf"
		}

		// Wait for the listen loop below to settle on an address. This is a
		// fixed delay, not a readiness check.
		time.Sleep(time.Second * 2)

		if err := ReplaceNgWeight(addr, confpath); err != nil {
			fmt.Println("ReplaceNgWeight failed, ", err)
			return
		}

		fmt.Println("restart nginx , stop instuction sending...")
		inst <- stopOld
	}()

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Simulate endpoints with different latencies: 0 to 4.9s in 100ms steps.
		sle := time.Millisecond * 100 * time.Duration(rand.N(50))
		time.Sleep(sle)
		_, _ = w.Write([]byte(fmt.Sprintf("hello, sleep%v!", sle)))
		atomic.AddUint64(&reqNum, 1)
	})

	// The loop variable is the package-level addr so the goroutine above can
	// see which address was finally bound.
	for _, addr = range adds {
		srv = &http.Server{Addr: addr}
		fmt.Printf("try listen addr:%s\n", addr)
		if err := srv.ListenAndServe(); err != nil {
			// Port taken (normally by the previous instance): try the next one.
			if strings.Contains(err.Error(), "bind: address already in use") {
				continue
			}

			fmt.Printf("listen close: %s, msg:%s, err: %#v\n", addr, err.Error(), err)
			break
		}
	}

	// Handlers may still be running (ListenAndServe returns as soon as a
	// shutdown starts), so the counter has to be read atomically. For the same
	// reason the number printed here can be lower than the final total.
	fmt.Println("hanlder request num", atomic.LoadUint64(&reqNum))
	fmt.Println("server stoped")
}

// ReplaceNgWeight rewrites the nginx config at path so that the upstream
// server listening on addr receives practically all traffic, then reloads
// nginx.
//
// Every `weight=N;` entry is reset to `weight=1;`, after which the entry for
// addr is raised to `weight=10000;`.
//
// It returns an error when the file cannot be read or written, when the config
// has no server entry for addr (the file is left untouched in that case), or
// when the reload command fails. Callers treat a nil result as "nginx now
// prefers addr", so a missing entry must not be reported as success.
func ReplaceNgWeight(addr, path string) error {
	weightRe := regexp.MustCompile(`weight=\d+\s*;`)
	// addr is literal text, so escape it: an unescaped "." would match any
	// character and could select the wrong server line.
	addrRe := regexp.MustCompile(fmt.Sprintf(`server\s+%s\s+weight=\d+\s*;`, regexp.QuoteMeta(addr)))

	b, err := os.ReadFile(path)
	if err != nil {
		return err
	}

	// Put every server back on the same minimal weight.
	b = weightRe.ReplaceAll(b, []byte("weight=1;"))

	ok := addrRe.Match(b)
	fmt.Println("match current server", ok, addrRe.String())
	if !ok {
		return fmt.Errorf("no upstream server entry for %s in %s", addr, path)
	}

	fmt.Println("replacing weight...")
	// Boost only the address this process listens on. The literal variant
	// keeps "$" in the replacement from being read as a group reference.
	b = addrRe.ReplaceAllLiteral(b, []byte(fmt.Sprintf("server %s weight=10000;", addr)))

	if err = os.WriteFile(path, b, 0644); err != nil {
		return err
	}

	// A Go program run as a normal user will not have permission for this.
	if err = exec.Command("nginx", "-s", "reload").Run(); err != nil {
		return fmt.Errorf("reload nginx: %w", err)
	}

	return nil
}
压测结果
./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
  2 threads and 100 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     2.35s     1.40s    4.90s    58.28%
    Req/Sec    21.12     12.69    60.00     78.95%
  Latency Distribution
     50%    2.30s 
     75%    3.60s 
     90%    4.30s 
     99%    4.90s 
  779 requests in 20.02s, 136.95KB read
  Socket errors: connect 0, read 100, write 0, timeout 0
Requests/sec:     38.91
Transfer/sec:      6.84KB

解决了大量进不来的请求,但是丢弃的连接似乎没有得到解决

nginx错误日志, error.log
2024/09/28 10:10:49 [notice] 11249#11249: signal process started
2024/09/28 10:10:49 [error] 10729#10729: *77219 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8081/", host: "test.local"
2024/09/28 10:10:49 [error] 10730#10730: *77071 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8081/", host: "test.local"
2024/09/28 10:10:49 [error] 10730#10730: *77151 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8081/", host: "test.local"
2024/09/28 10:10:49 [error] 10730#10730: *77151 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8082/", host: "test.local"
2024/09/28 10:10:49 [error] 10730#10730: *77151 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8083/", host: "test.local"
2024/09/28 10:10:49 [error] 10729#10729: *77099 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8082/", host: "test.local"
2024/09/28 10:10:49 [error] 10729#10729: *77099 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8083/", host: "test.local"
2024/09/28 10:10:49 [error] 10729#10729: *77179 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8083/", host: "test.local"

难顶,不知道有什么办法可以解决了,然后在自己的mac上试一下却是正常的, 不知道该怎么解决了。

./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
  2 threads and 100 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     2.30s     1.42s    4.91s    60.15%
    Req/Sec    21.72     13.25    80.00     79.64%
  Latency Distribution
     50%    2.30s 
     75%    3.50s 
     90%    4.31s 
     99%    4.81s 
  803 requests in 20.04s, 140.64KB read
Requests/sec:     40.06
Transfer/sec:      7.02KB

欢迎各位大神来评论区指点一下,单点服务有没有什么更好的办法?


	// Demo handler: sleeps for a random multiple of 100ms, writes a greeting
	// that includes the sleep duration, then bumps the request counter.
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Simulate endpoints with different latencies.
		sle := time.Millisecond * 100 * time.Duration(rand.N(50))
		time.Sleep(sle)
		_, _ = w.Write([]byte(fmt.Sprintf("hello, sleep%v!", sle)))
		// Counted only after the response body has been written.
		atomic.AddUint64(&reqNum, 1)
	})
-------------------- serv1 console ----------------
start srv
try listen addr:127.0.0.1:8080
oldPid: 15177
wait stop signal
match current server true server\s+127.0.0.1:8080\s+weight=\d+\s*;
replacing weight...
restart nginx , stop instuction sending...
send sig fail no such process
receive stop signal wait stop 
Server offline ...
listen close: 127.0.0.1:8080, msg:http: Server closed, err: &errors.errorString{s:"http: Server closed"}
hanlder request num 272
server stoped
shutdown cost: 4.689731015s 
2024/09/28 10:46:29 Server exiting

-------------------- serv2 console ----------------

start srv
try listen addr:127.0.0.1:8080
oldPid: 15489
wait stop signal
try listen addr:127.0.0.1:8081
match current server true server\s+127.0.0.1:8081\s+weight=\d+\s*;
replacing weight...
restart nginx , stop instuction sending...
^Creceive stop signal wait stop 
Server offline ...
listen close: 127.0.0.1:8081, msg:http: Server closed, err: &errors.errorString{s:"http: Server closed"}
hanlder request num 518
server stoped
shutdown cost: 182.804µs 
2024/09/28 10:46:42 Server exiting

-------------------- wrk console ----------------
Running 20s test @ http://test.local
  2 threads and 100 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     2.33s     1.42s    4.91s    56.46%
    Req/Sec    21.32     13.07    80.00     81.18%
  Latency Distribution
     50%    2.30s 
     75%    3.60s 
     90%    4.30s 
     99%    4.90s 
  790 requests in 20.03s, 138.87KB read
  Socket errors: connect 0, read 100, write 0, timeout 0
Requests/sec:     39.43
Transfer/sec:      6.93KB

从结果上看,518 + 272 = 790,应该确实把wrk的请求全部处理完了。wrk这个read 100的错误,有没有可能是wrk或者Go http在切换服务时,mac与linux行为不一致才导致的?