由于本人之前基本从事PHP开发,所以从来没有考虑过关于无损重启的问题。众所周知,PHP基本是通过FPM方式运行的。当然还有swoole之类的常驻进程方案,但常驻进程应该是殊途同归,处理方法应该是一样的。
道听途说
之前学习Go的时候,一直有个疑问,像单体之类的web应用,Go和Java之类常驻进程是怎么做到无损重启的。
如果只是重载配置之类还好说,像代码做变更的时候是怎么做到无损重启的。
我先尝试了之前听说的发送信号的方式,为了尽可能少地丢失连接,逻辑好像写复杂了。
信号重启大法
show code
package main
import (
"context"
"fmt"
"io/fs"
"math/rand/v2"
"net/http"
"os"
"os/signal"
"strconv"
"sync"
"syscall"
"time"
)
// Package-level state shared by main and the signal-handling helpers.
var (
// pid is the ID of the current process, captured once at start-up.
pid = os.Getpid()
// wg is incremented in main before listenSig is started and released by
// listenSig when it returns; main blocks on it before exiting.
wg = sync.WaitGroup{}
// addr is the address srv listens on.
addr = "0.0.0.0:8080"
// srv is the HTTP server. Its Handler is nil; startSrv registers the route
// through http.HandleFunc.
srv = &http.Server{
Addr: addr,
Handler: nil,
}
// reloadSig is the signal main sends to the PID read from main.pid.
reloadSig = syscall.SIGUSR2
// startSig is the signal listenSig sends to the PID found in main.pid after
// it has handled reloadSig and shut its own server down.
startSig = syscall.SIGUSR1
)
// main takes over from a previously started instance, if any.
//
// It reads the previous PID from main.pid, overwrites the file with its own
// PID, starts the signal listener, and then asks the previous process to
// reload. When no previous process could be signalled, the HTTP server is
// started right away; otherwise listenSig starts it once startSig arrives.
func main() {
	prevPid := getPid()
	recordPidToFile(pid, "main.pid")

	wg.Add(1)
	go listenSig(prevPid)
	// Give listenSig time to install its signal handlers before the previous
	// process can answer with startSig.
	time.Sleep(time.Second)

	delivered := sendSig(prevPid, reloadSig)
	fmt.Printf("send reloadSig to %d, hasOld:%v\n", prevPid, delivered)
	if !delivered {
		go startSrv()
	}

	// Block until listenSig returns.
	wg.Wait()
	fmt.Println("server is stop")
}
// startSrv registers the "/" handler on the default mux and serves on srv
// until ListenAndServe returns. A non-nil return is reported as a closed
// listener on addr.
func startSrv() {
	fmt.Println("start srv")

	hello := func(w http.ResponseWriter, r *http.Request) {
		// Simulate an endpoint with varying latency: a random multiple of
		// 100ms chosen via rand.N(50).
		delay := time.Millisecond * 100 * time.Duration(rand.N(50))
		time.Sleep(delay)
		_, _ = w.Write([]byte("hello, World!"))
	}
	http.HandleFunc("/", hello)

	err := srv.ListenAndServe()
	if err == nil {
		return
	}
	fmt.Printf("listen close: %s\n", addr)
}
// listenSig handles the restart hand-shake signals and releases wg when it
// returns.
//
// oldPid is the PID read from main.pid at start-up; a positive value means a
// previous instance was recorded. In that case two signals are awaited (the
// startSig from the old process plus one later signal), otherwise one.
//
//   - reloadSig: shut the local server down, then send startSig to the PID
//     currently stored in main.pid.
//   - startSig: start serving when an old instance was recorded, otherwise
//     shut the server down.
//   - any other subscribed signal: shut down, truncate main.pid and return.
func listenSig(oldPid int) {
	defer wg.Done()

	hasOld := oldPid > 0
	sigCh := make(chan os.Signal, 2)
	signal.Notify(sigCh, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR1, syscall.SIGUSR2)
	fmt.Printf("start listen sig pid:%d hasOld:%v\n", pid, hasOld)

	rounds := 1
	if hasOld {
		rounds = 2
	}

	for round := 0; round < rounds; round++ {
		fmt.Println("wait signal ", round)
		got := <-sigCh
		fmt.Printf("receive sig %v\n", got)

		// Dispatch on the concrete signal received.
		switch got {
		case reloadSig:
			fmt.Println("receive reloadSig case")
			servShutdown()
			successor := getPid()
			fmt.Printf("newPid %d , sendStart cond:%v\n", successor, successor > 0)
			if successor > 0 {
				fmt.Println("before send startSig")
				sent := sendSig(successor, startSig)
				fmt.Printf("send startSig %d send res:%v\n", successor, sent)
			}
		case startSig:
			fmt.Println("receive startSig case")
			if !hasOld {
				servShutdown()
				fmt.Println("Received SIGUSR1")
			} else {
				go startSrv()
				fmt.Println("received startSig from old process start serv")
			}
		default:
			servShutdown()
			fmt.Println("receive signal, existing...")
			// Empty the pid file; the write error is ignored as in the
			// original best-effort cleanup.
			_ = os.WriteFile("main.pid", []byte{}, os.ModePerm)
			return
		}
	}
}
// servShutdown asks srv to shut down, giving in-flight requests up to ten
// seconds to complete. The error returned by Shutdown is discarded.
func servShutdown() {
	fmt.Println("serv shutdown")

	// Let as many in-flight requests as possible finish within the window.
	const grace = 10 * time.Second
	drainCtx, release := context.WithTimeout(context.Background(), grace)
	defer release()

	_ = srv.Shutdown(drainCtx)
}
// recordPidToFile writes id, formatted as a decimal string, to the file at
// path, replacing any previous content. A write failure is printed rather than
// returned.
//
// The original formatted the package-level pid instead of the id parameter,
// so the argument was silently ignored.
func recordPidToFile(id int, path string) {
	if err := os.WriteFile(path, []byte(fmt.Sprintf("%d", id)), fs.ModePerm); err != nil {
		fmt.Println("record pid to file failed: ", err)
	}
}
// sendSig delivers sig to the process identified by pid and reports whether
// the signal was sent.
//
// Non-positive pids are rejected without calling kill(2): 0 and negative
// values address process groups (and -1 addresses every process the caller may
// signal), which is never what a value parsed from a pid file should do.
func sendSig(pid int, sig syscall.Signal) bool {
	if pid <= 0 {
		return false
	}
	if err := syscall.Kill(pid, sig); err != nil {
		fmt.Println("Error sending signal:", err)
		return false
	}
	fmt.Printf("Success sent sig to process: %d, signal:%v \n", pid, sig)
	return true
}
// getPid returns the PID stored in main.pid in the working directory.
// The raw file content is echoed to stdout. A missing file or content that is
// not a plain decimal integer yields 0, because both errors are ignored.
func getPid() int {
	raw, _ := os.ReadFile("main.pid")
	content := string(raw)
	fmt.Println("main.pid - ", content)

	parsed, _ := strconv.Atoi(content)
	return parsed
}
重启压测
然后开始wrk 压测,并在压测时重启服务
# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://127.0.0.1:8080"
Running 20s test @ http://127.0.0.1:8080
2 threads and 100 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 2.29s 1.48s 4.91s 55.31%
Req/Sec 19.54 13.57 80.00 78.47%
Latency Distribution
50% 2.20s
75% 3.61s
90% 4.40s
99% 4.90s
622 requests in 20.02s, 80.82KB read
Socket errors: connect 0, read 0, write 663530, timeout 0
Requests/sec: 31.08
Transfer/sec: 4.04KB
很遗憾,不知道是不是我太菜了, 在重启的一瞬间,大量请求进不来, 不过所幸的是所有在处理的请求没有失败的,勉强达到预期
正常压测
未重启的情况下也压测了一下
# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://127.0.0.1:8080"
Running 20s test @ http://127.0.0.1:8080
2 threads and 100 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 2.32s 1.45s 4.90s 57.63%
Req/Sec 21.03 12.90 70.00 81.20%
Latency Distribution
50% 2.30s
75% 3.60s
90% 4.40s
99% 4.90s
786 requests in 20.01s, 99.79KB read
Requests/sec: 39.27
Transfer/sec: 4.99KB
数据上来看区别不大,说明平常不会有错误发生
信号+Nginx负载多端口的方案
package main
import (
"context"
"fmt"
"io/fs"
"log"
"math/rand/v2"
"net/http"
"os"
"os/signal"
"strconv"
"strings"
"syscall"
"time"
)
// Package-level state shared by main and listenStop.
var (
// adds lists the candidate listen addresses; main tries them in order and
// moves on to the next one when a bind reports "address already in use".
adds = [...]string{
"0.0.0.0:8080",
"0.0.0.0:8081",
"0.0.0.0:8082",
"0.0.0.0:8083",
"0.0.0.0:8084",
}
// srv is the server for the address currently being tried; main assigns it
// and listenStop calls Shutdown on it.
srv *http.Server
// pid is the ID of the current process, captured once at start-up.
pid = os.Getpid()
)
// main signals the previously recorded process to stop, then serves on the
// first address in adds that can be bound.
//
// ListenAndServe returns http.ErrServerClosed as soon as Shutdown is called,
// not when Shutdown has finished. The original returned from main at that
// point, which terminated the process while listenStop was still draining
// in-flight requests. main now waits for listenStop to return before exiting.
func main() {
	oldPid := getPid("main.pid")
	recordPidToFile(pid, "main.pid")
	fmt.Println("oldPid:", oldPid)
	sendSig(oldPid, syscall.SIGINT)

	// drained is closed once listenStop has returned, i.e. after Shutdown
	// finished waiting for active connections.
	drained := make(chan struct{})
	go func() {
		defer close(drained)
		listenStop()
	}()

	fmt.Println("start srv")
	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Simulate an endpoint with varying latency.
		time.Sleep(time.Millisecond * 100 * time.Duration(rand.N(50)))
		_, _ = w.Write([]byte("hello, World!"))
	})

	for _, addr := range adds {
		srv = &http.Server{Addr: addr}
		fmt.Printf("try listen addr:%s\n", addr)
		if err := srv.ListenAndServe(); err != nil {
			// Port taken by another instance: fall through to the next one.
			if strings.Contains(err.Error(), "bind: address already in use") {
				continue
			}
			fmt.Printf("listen close: %s, msg:%s, err: %#v\n", addr, err.Error(), err)
			if err == http.ErrServerClosed {
				// A graceful shutdown is in progress; do not exit until it
				// has completed.
				<-drained
			}
			break
		}
	}
	fmt.Println("server stop...")
}
// getPid returns the PID stored in the file at path, or 0 when the file cannot
// be read or does not contain a valid decimal integer.
//
// Surrounding whitespace is trimmed first, so a pid file ending in a newline
// still parses. Out-of-range values yield 0 instead of strconv.Atoi's clamped
// result.
func getPid(path string) int {
	pidBytes, err := os.ReadFile(path)
	if err != nil {
		return 0
	}
	p, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
	if err != nil {
		return 0
	}
	return p
}
// recordPidToFile writes id, formatted as a decimal string, to the file at
// path, replacing any previous content.
//
// A failed write is printed instead of being silently discarded: without the
// pid file the next instance cannot find this process to signal it.
func recordPidToFile(id int, path string) {
	if err := os.WriteFile(path, []byte(fmt.Sprintf("%d", id)), fs.ModePerm); err != nil {
		fmt.Println("record pid to file failed: ", err)
	}
}
// sendSig delivers sig to the process identified by pid; a delivery failure is
// printed.
//
// Non-positive pids are ignored without calling kill(2): 0 and negative values
// address process groups (and -1 addresses every process the caller may
// signal), which is never what a value parsed from a pid file should do.
func sendSig(pid int, sig syscall.Signal) {
	if pid <= 0 {
		return
	}
	if err := syscall.Kill(pid, sig); err != nil {
		fmt.Printf("send sig fail %+v\n", err)
	}
}
// listenStop blocks until one of SIGHUP, SIGINT, SIGTERM, SIGUSR1 or SIGUSR2
// arrives and then shuts srv down, allowing up to 60 seconds for the shutdown
// to complete. A Shutdown error terminates the process via log.Fatal.
func listenStop() {
	fmt.Println("wait stop signal")

	sigs := make(chan os.Signal, 2)
	signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR1, syscall.SIGUSR2)
	<-sigs

	fmt.Println("receive stop signal wait stop ")
	drainCtx, release := context.WithTimeout(context.Background(), 60*time.Second)
	defer release()

	err := srv.Shutdown(drainCtx)
	if err != nil {
		log.Fatal("Server Shutdown:", err)
	}
	log.Println("Server exiting")
}
# Upstream pool for the Go instances, all with equal weight.
# NOTE(review): the Go program's address list also contains port 8084, which
# has no entry here; an instance that ends up on 8084 would receive no traffic.
upstream go_reload {
server 127.0.0.1:8080 weight=1;
server 127.0.0.1:8081 weight=1;
server 127.0.0.1:8082 weight=1;
server 127.0.0.1:8083 weight=1;
}
# Virtual host that proxies every request for test.local to the pool above.
server {
listen 80;
server_name test.local;
client_max_body_size 100M;
location / {
proxy_pass http://go_reload;
}
}
正常压测
# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
2 threads and 100 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 2.51s 1.45s 4.91s 57.24%
Req/Sec 20.52 12.40 70.00 81.02%
Latency Distribution
50% 2.50s
75% 3.90s
90% 4.50s
99% 4.90s
739 requests in 20.02s, 127.02KB read
Requests/sec: 36.91
Transfer/sec: 6.34KB
正常压测符合预期,OK
重启压测
# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
2 threads and 100 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 558.86ms 1.00s 4.90s 84.55%
Req/Sec 13.17k 17.70k 69.53k 77.62%
Latency Distribution
50% 1.37ms
75% 744.76ms
90% 2.17s
99% 4.02s
477754 requests in 20.08s, 143.00MB read
Non-2xx or 3xx responses: 477275
Requests/sec: 23798.01
Transfer/sec: 7.12MB
我勒个去, 这是什么情况,数值这么大肯定有问题。一看,大量非20X,30X的响应。这我当场斯巴达了。
再想想,玩个花活,修改nginx weight值,让ng重载配置,试试
// ReplaceNgWeight rewrites the nginx config at path so that addr is the only
// upstream server still accepting traffic, then reloads nginx.
//
// Every "weight=N;" parameter is replaced with "down;", after which the
// "server <addr> down;" entry is turned back into "server <addr> weight=1;".
//
// Changes from the original:
//   - addr is escaped with regexp.QuoteMeta, so the dots in an IP address only
//     match literal dots and cannot match a different server entry.
//   - replacements are literal, so a "$" in addr is not treated as a template
//     expansion.
//   - when addr has no entry in the file an error is returned and the file is
//     left untouched; the original wrote out a config with every server
//     marked down and reloaded nginx with it.
//
// It returns an error if the file cannot be read or written, if addr has no
// entry, or if the reload command fails.
func ReplaceNgWeight(addr, path string) error {
	weightRe := regexp.MustCompile(`weight=\d+\s*;`)
	addrRe, err := regexp.Compile(fmt.Sprintf(`server\s+%s\s+down\s*;`, regexp.QuoteMeta(addr)))
	if err != nil {
		return err
	}

	b, err := os.ReadFile(path)
	if err != nil {
		return err
	}

	// Mark every weighted server as down.
	b = weightRe.ReplaceAllLiteral(b, []byte("down;"))
	if !addrRe.Match(b) {
		return fmt.Errorf("upstream server %q not found in %s", addr, path)
	}
	// Re-enable only the address this process is listening on.
	b = addrRe.ReplaceAllLiteral(b, []byte(fmt.Sprintf("server %s weight=1;", addr)))

	if err = os.WriteFile(path, b, 0644); err != nil {
		return err
	}

	// A Go program run normally will not have permission for this command.
	return exec.Command("systemctl", "reload", "nginx").Run()
}
# ./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
2 threads and 100 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 837.52ms 1.17s 4.90s 82.33%
Req/Sec 319.12 2.75k 31.11k 98.78%
Latency Distribution
50% 7.84ms
75% 1.45s
90% 2.72s
99% 4.32s
10475 requests in 20.02s, 3.04MB read
Socket errors: connect 0, read 100, write 0, timeout 0
Non-2xx or 3xx responses: 9730
Requests/sec: 523.14
Transfer/sec: 155.40KB
看上去像是好一些。不过这数据依然不容乐观, read 100 和连接数对上了。 所以有同时100个的连接被遗弃, 同时出现近万的其他错误码, 感觉代码里应该有问题。 正常一个新服务上线后,另一个服务应该等所有链接处理完再下线的。 但是我操作的时候感觉是立马下线, 又得继续排查了~
新一版多端口
package main
import (
"context"
"fmt"
"io/fs"
"log"
"math/rand/v2"
"net/http"
"os"
"os/exec"
"os/signal"
"regexp"
"runtime"
"strconv"
"strings"
"sync/atomic"
"syscall"
"time"
)
const (
// stopOld is the instruction sent on inst that makes listenInstructions
// signal the previous process with SIGINT.
stopOld = "stopOld"
)
// Package-level state shared by the goroutines started from main.
var (
// adds lists the candidate listen addresses; startServ tries them in order.
adds = [...]string{
"127.0.0.1:8080",
"127.0.0.1:8081",
"127.0.0.1:8082",
"127.0.0.1:8083",
"127.0.0.1:8084",
}
// addr is the address currently being tried; startServ assigns it from adds
// and passes it to ReplaceNgWeight.
addr = ""
// srv is the server for addr; startServ assigns it and listenStop calls
// Shutdown on it.
srv *http.Server
// pid is the ID of the current process, captured once at start-up.
pid = os.Getpid()
// oldPid is the PID read from main.pid by main; listenInstructions sends
// SIGINT to it.
oldPid = 0
// reqNum counts requests handled by the "/" handler (updated atomically).
reqNum uint64 = 0
// inst is an unbuffered channel carrying instructions such as stopOld from
// startServ to listenInstructions.
inst = make(chan string)
)
func main() {
go startServ()
go listenInstructions()
oldPid = getPid("main.pid")
recordPidToFile(pid, "main.pid")
fmt.Println("oldPid:", oldPid)
listenStop()
}
// listenInstructions receives instructions from inst forever. On stopOld it
// sends SIGINT to oldPid; any other value is ignored.
func listenInstructions() {
	for {
		cmd := <-inst
		if cmd == stopOld {
			sendSig(oldPid, syscall.SIGINT)
		}
	}
}
// isMac reports whether the program was built for macOS (GOOS "darwin").
func isMac() bool {
	const darwin = "darwin"
	if runtime.GOOS != darwin {
		return false
	}
	return true
}
// getPid returns the PID stored in the file at path, or 0 when the file cannot
// be read or does not contain a valid decimal integer.
//
// Surrounding whitespace is trimmed first, so a pid file ending in a newline
// still parses. Out-of-range values yield 0 instead of strconv.Atoi's clamped
// result.
func getPid(path string) int {
	pidBytes, err := os.ReadFile(path)
	if err != nil {
		return 0
	}
	p, err := strconv.Atoi(strings.TrimSpace(string(pidBytes)))
	if err != nil {
		return 0
	}
	return p
}
// recordPidToFile writes id, formatted as a decimal string, to the file at
// path, replacing any previous content.
//
// A failed write is printed instead of being silently discarded: without the
// pid file the next instance cannot find this process to signal it.
func recordPidToFile(id int, path string) {
	if err := os.WriteFile(path, []byte(fmt.Sprintf("%d", id)), fs.ModePerm); err != nil {
		fmt.Println("record pid to file failed: ", err)
	}
}
// sendSig delivers sig to the process identified by pid; a delivery failure is
// printed.
//
// Non-positive pids are ignored without calling kill(2): 0 and negative values
// address process groups (and -1 addresses every process the caller may
// signal), which is never what a value parsed from a pid file should do.
func sendSig(pid int, sig syscall.Signal) {
	if pid <= 0 {
		return
	}
	if err := syscall.Kill(pid, sig); err != nil {
		fmt.Printf("send sig fail %+v\n", err)
	}
}
// listenStop blocks until one of SIGHUP, SIGINT, SIGTERM, SIGUSR1 or SIGUSR2
// arrives and then shuts srv down, allowing up to 60 seconds for the shutdown
// to complete. On success it prints how long the shutdown took; on failure it
// prints the error and returns.
func listenStop() {
	fmt.Println("wait stop signal")

	sigs := make(chan os.Signal, 2)
	signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGUSR1, syscall.SIGUSR2)
	<-sigs
	fmt.Println("receive stop signal wait stop ")

	began := time.Now()
	drainCtx, release := context.WithTimeout(context.Background(), 60*time.Second)
	defer release()

	fmt.Println("Server offline ...")
	err := srv.Shutdown(drainCtx)
	if err != nil {
		fmt.Println("Server Shutdown err:", err)
		return
	}

	fmt.Printf("shutdown cost: %v \n", time.Since(began))
	log.Println("Server exiting")
}
// startServ registers the "/" handler and serves on the first address in adds
// that can be bound, moving to the next address whenever ListenAndServe fails
// with "bind: address already in use". Any other error ends the loop, after
// which the number of handled requests is printed.
//
// In parallel, a goroutine waits two seconds, rewrites the nginx config via
// ReplaceNgWeight for addr, and then sends stopOld on inst so that
// listenInstructions signals the previous process.
func startServ() {
fmt.Println("start srv")
go func() {
// nginx config path; a different location is used on macOS.
confpath := "/etc/nginx/conf.d/test.local.conf"
if isMac() {
confpath = "/usr/local/etc/nginx/servers/test.loc.conf"
}
// Wait for the server to start successfully.
// NOTE(review): this is a fixed delay, not a readiness check. addr is the
// package-level variable assigned by the range loop below and is read
// here without any synchronisation visible in this function.
time.Sleep(time.Second * 2)
if err := ReplaceNgWeight(addr, confpath); err != nil {
fmt.Println("ReplaceNgWeight failed, ", err)
return
}
fmt.Println("restart nginx , stop instuction sending...")
// inst is unbuffered, so this blocks until listenInstructions receives.
inst <- stopOld
}()
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
// Simulate an endpoint with varying latency: a random multiple of 100ms
// chosen via rand.N(50).
sle := time.Millisecond * 100 * time.Duration(rand.N(50))
time.Sleep(sle)
_, _ = w.Write([]byte(fmt.Sprintf("hello, sleep%v!", sle)))
// Count the request only after the response has been written.
atomic.AddUint64(&reqNum, 1)
})
// The loop variable is the package-level addr (assignment, not :=), so the
// address being tried is visible to the goroutine above.
for _, addr = range adds {
srv = &http.Server{Addr: addr}
fmt.Printf("try listen addr:%s\n", addr)
if err := srv.ListenAndServe(); err != nil {
// Address taken: try the next candidate.
if strings.Contains(err.Error(), "bind: address already in use") {
continue
}
fmt.Printf("listen close: %s, msg:%s, err: %#v\n", addr, err.Error(), err)
break
}
}
fmt.Println("hanlder request num", reqNum)
fmt.Println("server stoped")
}
// ReplaceNgWeight rewrites the nginx config at path so that addr receives
// almost all traffic, then reloads nginx.
//
// Every "weight=N;" parameter is first reset to "weight=1;"; if a
// "server <addr> weight=N;" entry exists it is then set to weight=10000.
//
// Changes from the original:
//   - addr is escaped with regexp.QuoteMeta, so the dots in an IP address only
//     match literal dots and cannot match a different server entry.
//   - replacements are literal, so a "$" in addr is not treated as a template
//     expansion.
//
// It returns an error if the file cannot be read or written, or if the reload
// command fails.
func ReplaceNgWeight(addr, path string) error {
	weightRe := regexp.MustCompile(`weight=\d+\s*;`)
	addrRe, err := regexp.Compile(fmt.Sprintf(`server\s+%s\s+weight=\d+\s*;`, regexp.QuoteMeta(addr)))
	if err != nil {
		return err
	}

	b, err := os.ReadFile(path)
	if err != nil {
		return err
	}

	// Reset every server to the same baseline weight.
	b = weightRe.ReplaceAllLiteral(b, []byte("weight=1;"))

	ok := addrRe.Match(b)
	fmt.Println("match current server", ok, addrRe.String())
	if ok {
		fmt.Println("replacing weight...")
		// Boost only the address this process is listening on.
		b = addrRe.ReplaceAllLiteral(b, []byte(fmt.Sprintf("server %s weight=10000;", addr)))
	}

	if err = os.WriteFile(path, b, 0644); err != nil {
		return err
	}

	// A Go program run normally will not have permission for this command.
	return exec.Command("nginx", "-s", "reload").Run()
}
压测结果
./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
2 threads and 100 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 2.35s 1.40s 4.90s 58.28%
Req/Sec 21.12 12.69 60.00 78.95%
Latency Distribution
50% 2.30s
75% 3.60s
90% 4.30s
99% 4.90s
779 requests in 20.02s, 136.95KB read
Socket errors: connect 0, read 100, write 0, timeout 0
Requests/sec: 38.91
Transfer/sec: 6.84KB
解决了大量进不来的请求,但是丢弃的连接似乎没有得到解决
nginx错误日志, error.log
2024/09/28 10:10:49 [notice] 11249#11249: signal process started
2024/09/28 10:10:49 [error] 10729#10729: *77219 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8081/", host: "test.local"
2024/09/28 10:10:49 [error] 10730#10730: *77071 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8081/", host: "test.local"
2024/09/28 10:10:49 [error] 10730#10730: *77151 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8081/", host: "test.local"
2024/09/28 10:10:49 [error] 10730#10730: *77151 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8082/", host: "test.local"
2024/09/28 10:10:49 [error] 10730#10730: *77151 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8083/", host: "test.local"
2024/09/28 10:10:49 [error] 10729#10729: *77099 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8082/", host: "test.local"
2024/09/28 10:10:49 [error] 10729#10729: *77099 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8083/", host: "test.local"
2024/09/28 10:10:49 [error] 10729#10729: *77179 connect() failed (111: Connection refused) while connecting to upstream, client: 127.0.0.1, server: test.local, request: "GET / HTTP/1.1", upstream: "http://127.0.0.1:8083/", host: "test.local"
难顶,不知道有什么办法可以解决了,然后在自己的mac上试一下却是正常的, 不知道该怎么解决了。
./wrk -t2 -c100 -d20s --latency --timeout 30s "http://test.local"
Running 20s test @ http://test.local
2 threads and 100 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 2.30s 1.42s 4.91s 60.15%
Req/Sec 21.72 13.25 80.00 79.64%
Latency Distribution
50% 2.30s
75% 3.50s
90% 4.31s
99% 4.81s
803 requests in 20.04s, 140.64KB read
Requests/sec: 40.06
Transfer/sec: 7.02KB
欢迎各位大神来评论区指点一下,单点服务有没有什么更好的办法?
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
// Simulate an endpoint with varying latency: a random multiple of 100ms
// chosen via rand.N(50).
sle := time.Millisecond * 100 * time.Duration(rand.N(50))
time.Sleep(sle)
_, _ = w.Write([]byte(fmt.Sprintf("hello, sleep%v!", sle)))
// Count the request only after the response has been written.
atomic.AddUint64(&reqNum, 1)
})
-------------------- serv1 console ----------------
start srv
try listen addr:127.0.0.1:8080
oldPid: 15177
wait stop signal
match current server true server\s+127.0.0.1:8080\s+weight=\d+\s*;
replacing weight...
restart nginx , stop instuction sending...
send sig fail no such process
receive stop signal wait stop
Server offline ...
listen close: 127.0.0.1:8080, msg:http: Server closed, err: &errors.errorString{s:"http: Server closed"}
hanlder request num 272
server stoped
shutdown cost: 4.689731015s
2024/09/28 10:46:29 Server exiting
-------------------- serv2 console ----------------
start srv
try listen addr:127.0.0.1:8080
oldPid: 15489
wait stop signal
try listen addr:127.0.0.1:8081
match current server true server\s+127.0.0.1:8081\s+weight=\d+\s*;
replacing weight...
restart nginx , stop instuction sending...
^Creceive stop signal wait stop
Server offline ...
listen close: 127.0.0.1:8081, msg:http: Server closed, err: &errors.errorString{s:"http: Server closed"}
hanlder request num 518
server stoped
shutdown cost: 182.804µs
2024/09/28 10:46:42 Server exiting
-------------------- wrk console ----------------
Running 20s test @ http://test.local
2 threads and 100 connections
Thread Stats Avg Stdev Max +/- Stdev
Latency 2.33s 1.42s 4.91s 56.46%
Req/Sec 21.32 13.07 80.00 81.18%
Latency Distribution
50% 2.30s
75% 3.60s
90% 4.30s
99% 4.90s
790 requests in 20.03s, 138.87KB read
Socket errors: connect 0, read 100, write 0, timeout 0
Requests/sec: 39.43
Transfer/sec: 6.93KB
从结果上看,518 + 272 = 790,应该确实把wrk的请求全部处理完了。wrk这个read 100的错误,有没有可能是wrk或者Go http在切换服务时,mac与linux行为不一致才导致的?