主要逻辑是:程序启动时根据数据范围分配一个 pod 并上锁,锁设置了过期时间;随后由协程 keepPodAlive 对这把锁进行续期,直到该范围内的任务执行完毕。
// Redis key templates and range sizing shared by the pod-claiming logic.
var (
	setPodKey              = "pod_%v"            // distributed-lock key for pod slot %v
	internal               = 2000000             // size of the id range owned by one pod slot (name likely means "interval")
	cacheGetLastId         = "cm_lastid_%v"      // checkpoint: last processed id for pod %v
	cacheGetCmAPiErrLastID = "cm_apierr_lastid_%v" // checkpoint for API-error retries (not used in this excerpt)
	cacheNoDataPod         = "cm_nodata_pod_%v"  // marker: pod %v found no data on its first query
	cacheIsRun             = "cm_isrun"          // run flag (not used in this excerpt)
)
// keepPodAlive periodically renews the TTL of this pod's Redis lock key until
// the pod's id range is fully processed, the key disappears, or ctx is
// cancelled. It runs as a background goroutine started by HandleIncr.
//
// Fixes over the original: error branches used `continue`, which skipped the
// 20-second sleep and busy-looped against Redis; a transient Expire error left
// `set` false and silently ended the keep-alive; ctx cancellation was ignored.
func (l *CMCData) keepPodAlive(ctx context.Context, podId int) error {
	podKey := fmt.Sprintf(setPodKey, podId)
	endId := (podId + 1) * internal
	ticker := time.NewTicker(20 * time.Second)
	defer ticker.Stop()
	num := 0
	for {
		num++
		// Renew the lock's expiry. On a transient Redis error we keep the
		// loop alive and retry on the next tick instead of giving up.
		set, err := l.cmRDB.Expire(ctx, podKey, 2*time.Minute).Result()
		if err != nil {
			logger.Errorf("cmRDB Expire err: %v,podid:%+v", err, podKey)
		} else if !set {
			// Key no longer exists: the range finished or the lock was
			// released elsewhere; stop renewing.
			break
		}
		// Every third tick (~60s) check whether the range is done.
		if err == nil && num%3 == 0 {
			getLastIdKey := fmt.Sprintf(cacheGetLastId, podId)
			lastId, gerr := l.cmRDB.Get(ctx, getLastIdKey).Result()
			newLastId := cast.ToInt(lastId)
			logger.Infof("keepPodAlive endId:%+v,num:%+v,podId:%+v,newLastId:%+v", endId, num, podId, newLastId)
			if gerr != nil && gerr != redis.Nil {
				logger.Errorf("keepPodAlive setPod cmRDB.Get err: %v,podid:%+v", gerr, podId)
			} else {
				if newLastId >= endId && newLastId > 0 && endId > 0 {
					// Range finished: drop the pod lock so the slot is free.
					if derr := l.cmRDB.Del(ctx, podKey).Err(); derr != nil {
						logger.Errorf("cmRDB del err: %v,podid:%+v", derr, podKey)
					}
				}
				if newLastId == 0 {
					// The pod found no data on its very first query. The
					// no-data marker has a short TTL (must exceed 3*20s) so
					// the slot is released early, yet can be reclaimed if
					// data appears for this range later.
					noDataPod, nerr := l.cmRDB.Get(ctx, fmt.Sprintf(cacheNoDataPod, podId)).Result()
					if nerr != nil && nerr != redis.Nil {
						logger.Errorf("keeplive cmRDB.HGet err: %v,podid:%+v", nerr, podId)
					} else if noDataPod != "" {
						if derr := l.cmRDB.Del(ctx, podKey).Err(); derr != nil {
							logger.Errorf("cmRDB del err: %v,podid:%+v", derr, podKey)
						}
					}
				}
			}
		}
		if num >= 10000 {
			num = 0
		}
		// Always wait before the next renewal — even after an error branch
		// (the original `continue` skipped the sleep) — and honor ctx.
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
	return nil
}
启动时自动分配 key。5 分钟内若 pod 发生重启,旧 key 还存在的情况下,新 key 无法分配成功,需要等待旧 key 过期后才能再次分配;5 分钟后旧 key 过期,新的 key 可以再次分配成功。存量数据只走到 6kw(约 6000 万)。
// setPod claims one unprocessed id range by acquiring a distributed lock
// (SETNX with a 2-minute TTL) for a pod slot. It returns the claimed slot
// index, or -1 with an error when Redis fails or every slot is taken.
func (l *CMCData) setPod(ctx context.Context) (int, error) {
	for i := 0; i <= 100; i++ {
		// Slot i owns ids (i*internal, (i+1)*internal]; with internal=2000000:
		//   slot 0 -> 1..2000000
		//   slot 1 -> 2000001..4000000
		//   ...
		// (the original comments assumed a 1,000,000 interval and were wrong)
		podKey := fmt.Sprintf(setPodKey, i)
		endId := (i + 1) * internal
		getLastIdKey := fmt.Sprintf(cacheGetLastId, i)
		// Try to take the slot's distributed lock.
		set, err := l.cmRDB.SetNX(ctx, podKey, 1, 2*time.Minute).Result()
		if err != nil {
			logger.Errorf("SetNX err: %v,podid:%+v", err, i)
			return -1, err
		}
		if !set {
			// Slot already held by another pod.
			continue
		}
		lastId, err := l.cmRDB.Get(ctx, getLastIdKey).Result()
		newLastId := cast.ToInt(lastId)
		if err != nil && err != redis.Nil {
			logger.Errorf("setPod cmRDB.Get err: %v,podid:%+v", err, i)
			return -1, err
		}
		if newLastId >= endId && newLastId > 0 && endId > 0 {
			// Slot already fully processed: release the lock we just took
			// and move on to the next slot.
			if err = l.cmRDB.Del(ctx, podKey).Err(); err != nil {
				logger.Errorf("cmRDB del err: %v,podid:%+v", err, podKey)
				return -1, fmt.Errorf("del old pod key err podkey:%+v", podKey)
			}
			logger.Infof("pod continue podKey:%+v endId:%+v,newLastId:%+v", podKey, endId, newLastId)
			continue
		}
		return i, nil
	}
	return -1, errors.New("no pod")
}
主流程在这里进行
// HandleIncr is the main entry point: it claims a pod slot, starts the lock
// keep-alive goroutine, and drives doneIncr until the slot's range is fully
// processed or a fatal (non-retriable) error occurs.
//
// Fix over the original: a fatal error from doneIncr was shadowed by the
// loop-scoped `err`, so the function returned (0, nil) even on failure.
// The success log is also moved from Errorf to Infof.
func (l *CMCData) HandleIncr(ctx context.Context) (s int, err error) {
	podId, err := l.setPod(ctx)
	if err != nil {
		logger.Errorf("setPod err:%v", err)
		return -1, err
	}
	if podId < 0 {
		logger.Error("setPod failed")
		return -1, errors.New("setPod failed")
	}
	// Renew the slot lock in the background; the goroutine exits on its own
	// when the range finishes (errors are logged inside it).
	go l.keepPodAlive(ctx, podId)
	logger.Infof("handle run podid:%+v", podId)
	i := 0
	for {
		runErr := l.doneIncr(ctx, podId)
		if runErr == nil {
			logger.Infof("cm_clean run done podid:%+v", podId)
			break
		}
		if strings.Contains(runErr.Error(), "invalid connection") {
			// Transient DB connection failure: back off and retry.
			logger.Infof("Invalid connection error, retrying num:%+v,podId:%+v", i, podId)
			fmt.Println("Invalid connection error, retrying...")
			time.Sleep(time.Second * 20)
		} else {
			// Fatal error: keep it so the caller actually sees it.
			fmt.Println("Fatal error, not retrying:", runErr)
			logger.Errorf("cm_clean.Handle err: %v", runErr)
			err = runErr
			break
		}
		i++
		time.Sleep(time.Minute * 1)
	}
	return 0, err
}
具体的业务逻辑处理
// doneIncr processes the id range owned by podId in batches: it reads the
// resume checkpoint from Redis, pulls up to BatchNum pending rows, fans the
// batch out to worker goroutines, then persists the new checkpoint.
// It returns nil once the range is exhausted.
//
// Fixes over the original: eg.TryGo's ignored bool could silently drop a
// partition → use eg.Go (blocks until a slot is free); a recovered panic in a
// worker produced a nil error, so the panicked partition's rows were
// checkpointed as done → the panic is now surfaced as an error; partSize is
// computed after pn is adjusted; the inner `start` no longer shadows the
// run timer; the start-of-run log is Infof, not Errorf.
func (l *CMData) doneIncr(ctx context.Context, podId int) (err error) {
	start := time.Now()
	startId := podId*internal + 1
	endId := (podId + 1) * internal
	logger.Infof("start run cm_data startid:%v,endid:%v,podid:%+v", startId, endId, podId)
	for {
		setLastIdKey := fmt.Sprintf(cacheGetLastId, podId)
		getLastId, err := l.cmRDB.Get(ctx, setLastIdKey).Result()
		if err != nil && err != redis.Nil {
			logger.Errorf("cmRDB.Get err: %v,podid:%+v", err, podId)
			fmt.Printf("cmRDB.Get err:%+v,podid:%+v", err, podId)
			return err
		}
		newLastId := cast.ToInt(getLastId)
		if newLastId == 0 {
			// No checkpoint yet: start from the beginning of the range.
			newLastId = startId
		}
		var cmData = make([]models.CmData, 0, 100)
		err = l.queryDB.Raw("SELECT id,status,..... FROM cm_data FORCE INDEX (PRIMARY) WHERE id >= ? and id<=? and status = 0 order by id asc limit ? ", newLastId, endId, BatchNum).
			Scan(&cmData).Error
		if err != nil {
			fmt.Printf("find cm_data err:%+v\n", err)
			logger.Errorf("find cm_data err: %v,podid:%+v", err, podId)
			return err
		}
		if len(cmData) == 0 {
			if startId == newLastId {
				// The very first query returned nothing: mark this pod as
				// "no data" so keepPodAlive can release the slot early.
				fmt.Printf("md startId:%+v, newLastId:%+v", startId, newLastId)
				_, err = l.cmRDB.Set(ctx, fmt.Sprintf(cacheNoDataPod, podId), podId, 2*time.Minute).Result()
				if err != nil {
					logger.Errorf("cmRDB.cacheNoDataPod err: %v,podid:%+v", err, podId)
					fmt.Printf("cmRDB.cacheNoDataPod err:%+v,podid:%+v", err, podId)
				}
			}
			logger.Infof("run end done podid:%+v", podId)
			fmt.Printf("cmData zero\n")
			// Range exhausted: re-enter HandleIncr to claim the next free
			// pod slot (recursive hand-off).
			s, herr := l.HandleIncr(ctx)
			if herr != nil {
				fmt.Printf("done Handle err:%+v", herr)
			}
			if s < 0 {
				logger.Errorf("no pod set")
			}
			break
		}
		// Fan the batch out to at most pn workers.
		pn := 62
		cmDataLen := len(cmData)
		if cmDataLen < pn {
			pn = 1
		}
		partSize := cmDataLen / pn // computed after pn is fixed
		var eg errgroup.Group
		eg.SetLimit(pn)
		for i := 0; i < pn; i++ {
			first := i * partSize
			last := first + partSize
			if i == pn-1 {
				// The last worker takes the remainder.
				last = cmDataLen
			}
			part := cmData[first:last]
			eg.Go(func() (perr error) {
				defer func() {
					if r := recover(); r != nil {
						fmt.Println("recover error:", r)
						logger.Errorf("run recover error:%+v", r)
						// Surface the panic so the checkpoint is NOT advanced
						// past unprocessed rows.
						perr = fmt.Errorf("process cm data panic: %v", r)
					}
				}()
				// NOTE(review): called as processcmDataIncr here but the
				// method shown below is processCmIncr — confirm the real name.
				return l.processcmDataIncr(ctx, part, podId)
			})
		}
		if err = eg.Wait(); err != nil {
			fmt.Printf("processcmData err:%+v", err)
			logger.Errorf("processcmData err:%+v", err)
			return err
		}
		// Persist the checkpoint (no TTL) so a restart resumes after the
		// last processed row.
		lastId := cmData[len(cmData)-1].ID + 1
		_, err = l.cmRDB.Set(ctx, setLastIdKey, lastId, 0).Result()
		if err != nil {
			logger.Errorf("setLastIdKey err:%+v", err)
			return err
		}
	}
	diffTime := time.Since(start)
	fmt.Printf("\n耗时:%+v\n", diffTime)
	logger.Infof("run time:%+v,podId:%+v", diffTime, podId)
	return nil
}
// processCmIncr applies the per-row business logic to one partition of a
// batch. (Body is elided in this excerpt — placeholder only.)
// NOTE(review): doneIncr invokes l.processcmDataIncr — the names differ;
// confirm which one exists in the real file.
func (l *CMData) processCmIncr(ctx context.Context, cmDataList []models.CmData, podId int) error {
	...具体处理逻辑
}
如果任务执行时间过长,避免锁自动过期,导致任务被重复执行,锁的时间自动续期策略
import (
	"context"
	"time"

	"github.com/go-redis/redis/v8" // 假设使用的是go-redis库
)
// rdb is the process-wide Redis client shared by the lock helpers below.
var rdb *redis.Client

func init() {
	// Initialize the Redis connection once at program start.
	// NOTE(review): init() with side effects and a hard-coded address —
	// consider explicit setup so connection config/failures are controllable.
	rdb = redis.NewClient(&redis.Options{
		Addr:     "localhost:6379", // Redis address
		Password: "",               // password (none)
		DB:       0,                // use the default DB
	})
}
// acquireLockWithUniqueId attempts to take the distributed lock lockKey,
// storing uniqueId as the owner token, with a TTL of lockTimeout seconds.
// It reports whether the lock was acquired.
func acquireLockWithUniqueId(lockKey string, uniqueId string, lockTimeout int) bool {
	ctx := context.Background()
	ttl := time.Duration(lockTimeout) * time.Second
	// SETNX succeeds only when the key does not already exist.
	ok, err := rdb.SetNX(ctx, lockKey, uniqueId, ttl).Result()
	if err != nil {
		return false
	}
	return ok
}
// renewLock keeps extending the TTL of lockKey for as long as this instance
// (identified by uniqueId) still owns it. It wakes halfway through the
// timeout and stops as soon as the owner token no longer matches or any
// Redis call fails.
// NOTE(review): the Get+Expire pair is not atomic — the lock could change
// owners between the two calls; a Lua script would close that window.
func renewLock(lockKey string, uniqueId string, lockTimeout int) {
	ctx := context.Background()
	wakeEvery := time.Duration(lockTimeout/2) * time.Second
	ttl := time.Duration(lockTimeout) * time.Second
	for {
		// Renew well before the lock would expire.
		time.Sleep(wakeEvery)
		owner, err := rdb.Get(ctx, lockKey).Result()
		if err != nil || owner != uniqueId {
			// Lock lost, expired, or taken over: stop renewing.
			return
		}
		if _, err := rdb.Expire(ctx, lockKey, ttl).Result(); err != nil {
			// Renewal failed: stop.
			return
		}
	}
}
func releaseLock(lockKey string, uniqueId string) {
// 释放锁前检查唯一标识,确保不会误解别的实例的锁
ctx := context.Background()
val, err := rdb.Get(ctx, lockKey).Result()
if err == nil && val == uniqueId {
_, err = rdb.Del(ctx, lockKey).Result()
if err != nil {
// 处理删除失败的情况
}
}
}
// executeLongTask runs a long task under the distributed lock lockKey,
// renewing the lock in the background so it cannot expire mid-task.
func executeLongTask(lockKey string, uniqueId string) {
	const lockTimeoutSec = 30 // lock TTL in seconds
	if !acquireLockWithUniqueId(lockKey, uniqueId, lockTimeoutSec) {
		// Lock held by another instance: the task is already running.
		return
	}
	// Release even if the task panics; the original released only on the
	// happy path, forcing a full TTL wait after a crash.
	defer releaseLock(lockKey, uniqueId)
	// Renew the lock in a background goroutine; it stops by itself once
	// the owner token no longer matches (i.e. after release).
	go renewLock(lockKey, uniqueId, lockTimeoutSec)
	// Execute the long-running task.
	// ...
}