由于 Prometheus 的常规客户端接口无法指定样本的写入时间(时间戳),而我们的数据统计又需要指定时间戳,所以使用 Remote Write 协议直接写入。
示例代码
package prompush
import (
"bufio"
"bytes"
"context"
"fmt"
"github.com/gogf/gf/v2/os/glog"
"github.com/pkg/errors"
"io"
"io/ioutil"
"net/http"
"net/url"
"regexp"
"time"
"github.com/gogo/protobuf/proto"
"github.com/golang/snappy"
"github.com/opentracing-contrib/go-stdlib/nethttp"
opentracing "github.com/opentracing/opentracing-go"
"github.com/prometheus/common/model"
"github.com/prometheus/prometheus/prompb"
)
// RecoverableError wraps an error to mark it as one that may succeed on retry.
// remoteWritePost returns it for failures of Client.Do and for 5xx responses.
type RecoverableError struct {
error
}
// HttpClient posts remote-write payloads to a single endpoint.
type HttpClient struct {
// url is the remote-write endpoint every request is posted to.
url *url.URL
// Client is the underlying HTTP client used to execute requests.
Client *http.Client
// timeout bounds each request through a per-request context deadline.
timeout time.Duration
// User and Pwd are sent as HTTP basic auth when both are non-empty.
User, Pwd string
}
var (
// MetricNameRE is the pattern a metric name must match to be accepted by
// convertOne.
MetricNameRE = regexp.MustCompile(`^[a-zA-Z_:][a-zA-Z0-9_:]*$`)
// metricAddr is not referenced anywhere in the visible code.
metricAddr = ""
)
// MetricPoint is one data point to be converted into a remote-write series.
type MetricPoint struct {
Metric string `json:"metric"` // Metric name.
TagsMap map[string]string `json:"tags"` // Data labels (tag name -> tag value).
// Timestamp. The original note said "seconds", but convertOne copies this
// value unchanged into the sample timestamp and DataToPt fills it with
// UnixMilli(), so callers are expected to pass milliseconds.
Time int64 `json:"time"`
Value float64 `json:"value"` // Internal field: the final float64 value after conversion.
}
// remoteWritePost sends one snappy-compressed, protobuf-encoded remote-write
// payload to c.url with an HTTP POST.
//
// The request is bounded by c.timeout and carries basic-auth credentials when
// both c.User and c.Pwd are non-empty.
//
// It returns nil on any 2xx response. Failures of Client.Do and 5xx responses
// are wrapped in RecoverableError; a failure to build the request and any
// other non-2xx response are returned as plain errors.
func (c *HttpClient) remoteWritePost(req []byte) error {
	ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
	defer cancel()

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, c.url.String(), bytes.NewReader(req))
	if err != nil {
		// Errors from building the request (for example an unusable URL) will
		// not go away on retry, so they are not marked recoverable.
		return err
	}
	if c.User != "" && c.Pwd != "" {
		httpReq.SetBasicAuth(c.User, c.Pwd)
	}
	httpReq.Header.Add("Content-Encoding", "snappy")
	httpReq.Header.Set("Content-Type", "application/x-protobuf")
	httpReq.Header.Set("User-Agent", "opcai")
	httpReq.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")

	// NOTE(review): ctx is derived from context.Background(), so no parent span
	// is ever found here and this branch does not run as written.
	if parentSpan := opentracing.SpanFromContext(ctx); parentSpan != nil {
		var ht *nethttp.Tracer
		httpReq, ht = nethttp.TraceRequest(
			parentSpan.Tracer(),
			httpReq,
			nethttp.OperationName("Remote Store"),
			nethttp.ClientTrace(false),
		)
		defer ht.Finish()
	}

	httpResp, err := c.Client.Do(httpReq)
	if err != nil {
		// Errors from Client.Do are from (for example) network errors, so are
		// recoverable.
		return RecoverableError{err}
	}
	defer func() {
		// Best effort: drain whatever is left of the body before closing it;
		// failures here cannot be acted upon and are deliberately ignored.
		_, _ = io.Copy(ioutil.Discard, httpResp.Body)
		_ = httpResp.Body.Close()
	}()

	if httpResp.StatusCode/100 != 2 {
		// Include at most the first line (capped at 512 bytes) of the response
		// body in the error message.
		scanner := bufio.NewScanner(io.LimitReader(httpResp.Body, 512))
		line := ""
		if scanner.Scan() {
			line = scanner.Text()
		}
		err = errors.Errorf("server returned HTTP status %s: %s", httpResp.Status, line)
	}
	if httpResp.StatusCode/100 == 5 {
		return RecoverableError{err}
	}
	return err
}
// buildWriteRequest wraps the given series in a prompb.WriteRequest, marshals
// it to protobuf and returns the snappy-compressed bytes. A marshalling failure
// is returned unchanged together with a nil payload.
func buildWriteRequest(samples []prompb.TimeSeries) ([]byte, error) {
	raw, err := proto.Marshal(&prompb.WriteRequest{Timeseries: samples})
	if err != nil {
		return nil, err
	}
	return snappy.Encode(nil, raw), nil
}
// sample bundles a label set with one timestamp/value pair.
type sample struct {
labels []prompb.Label // labels attached to the data point
t int64 // timestamp of the data point
v float64 // value of the data point
}
// Label names that convertOne adds to every series.
const (
// LABEL_NAME is the label under which convertOne stores the metric name.
LABEL_NAME = "__name__"
// LabelJob is the label under which convertOne stores the job name.
LabelJob = "job"
)
// convertOne converts a single MetricPoint into a prompb.TimeSeries carrying
// exactly one sample.
//
// The label set always contains the metric name (LABEL_NAME) and the job
// (LabelJob), followed by every tag in item.TagsMap whose key matches
// model.LabelNameRE. Tags with an invalid key are dropped silently. Tags whose
// key equals LABEL_NAME or LabelJob are dropped too, so label names stay
// unique. The resulting labels are sorted by name, which makes the output
// deterministic regardless of map iteration order and matches what the
// remote-write protocol expects from senders.
//
// item.Time is copied unchanged into the sample timestamp, so it must already
// be a millisecond timestamp.
//
// It returns an error, together with a zero TimeSeries, when item.Metric does
// not match MetricNameRE.
func convertOne(item MetricPoint, job string) (prompb.TimeSeries, error) {
	if !MetricNameRE.MatchString(item.Metric) {
		return prompb.TimeSeries{}, errors.New("invalid metrics name")
	}

	labels := make([]prompb.Label, 0, len(item.TagsMap)+2)
	labels = append(labels,
		prompb.Label{Name: LABEL_NAME, Value: item.Metric},
		prompb.Label{Name: LabelJob, Value: job},
	)
	for k, v := range item.TagsMap {
		// The reserved labels above win over user tags of the same name;
		// emitting both would produce a duplicate label name.
		if k == LABEL_NAME || k == LabelJob {
			continue
		}
		if !model.LabelNameRE.MatchString(k) {
			continue
		}
		labels = append(labels, prompb.Label{Name: k, Value: v})
	}
	sortLabelsByName(labels)

	return prompb.TimeSeries{
		Labels:  labels,
		Samples: []prompb.Sample{{Timestamp: item.Time, Value: item.Value}},
	}, nil
}

// sortLabelsByName sorts labels in place by ascending label name. Label sets
// are small, so a plain insertion sort is sufficient here.
func sortLabelsByName(labels []prompb.Label) {
	for i := 1; i < len(labels); i++ {
		for j := i; j > 0 && labels[j].Name < labels[j-1].Name; j-- {
			labels[j], labels[j-1] = labels[j-1], labels[j]
		}
	}
}
// RemoteWrite encodes the given series and posts them to the remote-write
// endpoint. It returns the size in bytes of the compressed payload together
// with the result of the post. An empty input is a no-op that returns (0, nil),
// and an encoding failure returns (0, err) without sending anything.
func (c *HttpClient) RemoteWrite(pt []prompb.TimeSeries) (int, error) {
	if len(pt) == 0 {
		return 0, nil
	}

	payload, encodeErr := buildWriteRequest(pt)
	if encodeErr != nil {
		return 0, encodeErr
	}

	postErr := c.remoteWritePost(payload)
	return len(payload), postErr
}
// NewClient parses ur and returns an HttpClient that posts to it with the
// given basic-auth credentials and per-request timeout. When ur cannot be
// parsed, it returns a nil client and the parse error.
func NewClient(ur, user, pwd string, timeout time.Duration) (c *HttpClient, err error) {
	endpoint, parseErr := url.Parse(ur)
	if parseErr != nil {
		return nil, parseErr
	}

	client := &HttpClient{
		url:     endpoint,
		Client:  &http.Client{},
		timeout: timeout,
		User:    user,
		Pwd:     pwd,
	}
	return client, nil
}
// Push sends pt to the remote-write endpoint at url, making up to three
// attempts with a 300ms pause between them.
//
// user and pwd are used for basic auth when both are non-empty. job is only
// used in the success log line. Each attempt is limited to 10 seconds.
//
// It returns nil as soon as one attempt succeeds. If the client cannot be
// created, or every attempt fails, the (last) error is returned so that the
// caller can react to it.
func Push(ctx context.Context, url, user, pwd, job string, pt []prompb.TimeSeries) error {
	const (
		maxAttempts = 3
		retryDelay  = 300 * time.Millisecond
	)

	c, err := NewClient(url, user, pwd, 10*time.Second)
	if err != nil {
		return fmt.Errorf("create remote write client: %w", err)
	}

	for i := 0; i < maxAttempts; i++ {
		start := time.Now()
		var bytesNum int
		bytesNum, err = c.RemoteWrite(pt)
		reportUseTime := time.Since(start).Seconds()
		if err == nil {
			glog.Infof(ctx, "job:%s, 指标数量: %d,数据大小: %.3f M,用时: %.3f 秒", job, len(pt),
				float64(bytesNum)/float64(1<<20), reportUseTime)
			return nil
		}
		glog.Errorf(ctx, "err send metrics:%+v,retry:%d,用时: %.3f", err, i, reportUseTime)
		// No point in waiting after the final attempt.
		if i < maxAttempts-1 {
			time.Sleep(retryDelay)
		}
	}
	// Every attempt failed: surface the last error instead of reporting success.
	return err
}
使用示例:
1. 统计键(Keys)与计数值(Counter)的结构及内存缓存
package prompush
import (
"context"
"github.com/gogf/gf/v2/os/glog"
"github.com/prometheus/prometheus/prompb"
"sync"
"time"
)
// Statistics for the popularity ("heat") strategy groups.
type (
// Keys identifies one statistics bucket; it is used as the map key in
// promData.statMap.
Keys struct {
Env string
Uid string
}
// Counter holds the accumulated values of one bucket.
Counter struct {
Total, Sum, Speed int64
}
// promData is an in-memory cache of counters keyed by Keys.
promData struct {
// mu guards statMap.
mu sync.Mutex
// Job is passed to convertOne as the job label value.
Job string
// statMap holds the counters accumulated since the last GetAndNew call.
statMap map[Keys]*Counter
}
)
// SlaMetricNames lists the metrics derived from each Counter: the metric name,
// a help text, and an accessor that extracts the value as float64. DataToPt
// emits one data point per entry for every bucket.
var SlaMetricNames = []struct {
Name string
Help string
Field func(*Counter) float64
}{
{"total", "总计", func(s *Counter) float64 { return float64(s.Total) }},
{"sum", "总和", func(s *Counter) float64 { return float64(s.Sum) }},
{"speed", "速度", func(s *Counter) float64 { return float64(s.Speed) }},
}
// Sla is the package-level statistics cache, created with an empty map.
var Sla = promData{
Job: "test", // Best set explicitly; leaving it unset also works but makes data easy to mix up.
statMap: make(map[Keys]*Counter),
}
// ProcessLog adds the values of static to the counter stored under log,
// creating that counter first when the key has not been seen yet. The whole
// update runs while holding s.mu.
func (s *promData) ProcessLog(log Keys, static Counter) {
	s.mu.Lock()
	defer s.mu.Unlock()

	entry, found := s.statMap[log]
	if !found {
		entry = new(Counter)
		s.statMap[log] = entry
	}

	entry.Total = entry.Total + static.Total
	entry.Sum = entry.Sum + static.Sum
	entry.Speed = entry.Speed + static.Speed
}
// GetAndNew hands back the map of counters accumulated so far and installs a
// fresh empty map in its place, all while holding s.mu.
func (s *promData) GetAndNew() map[Keys]*Counter {
	s.mu.Lock()
	defer s.mu.Unlock()

	drained := s.statMap
	s.statMap = map[Keys]*Counter{}
	return drained
}
// DataToPt converts the accumulated counters into remote-write time series.
//
// For every bucket in data it emits one series per entry of SlaMetricNames,
// labelled with the bucket's env and uid and with s.Job as the job. All samples
// share a single timestamp: the current time truncated to the minute, minus one
// minute, in milliseconds.
//
// Points that convertOne rejects are logged and skipped, so the result may hold
// fewer series than len(data)*len(SlaMetricNames).
func (s *promData) DataToPt(ctx context.Context, data map[Keys]*Counter) []prompb.TimeSeries {
	t := time.Now().Truncate(time.Minute).UnixMilli() - time.Minute.Milliseconds()

	// Each bucket yields one series per metric, so size the result for that.
	ts := make([]prompb.TimeSeries, 0, len(data)*len(SlaMetricNames))
	for key, counter := range data {
		tagsMap := map[string]string{
			"env": key.Env,
			"uid": key.Uid,
		}
		for _, m := range SlaMetricNames {
			item := MetricPoint{
				Metric:  m.Name,
				TagsMap: tagsMap,
				Time:    t, // same timestamp for every point of this batch
				Value:   m.Field(counter),
			}
			series, err := convertOne(item, s.Job)
			if err != nil {
				glog.Errorf(ctx, "convertOne err:%+v,item:%+v", err, item)
				continue
			}
			ts = append(ts, series)
		}
	}
	return ts
}
2.数据上报
// PushProm drains the accumulated SLA counters and reports them through
// prompush.Push.
//
// It returns immediately when nothing has been accumulated. Otherwise the
// counters are converted to time series first (which fixes their timestamp),
// then the function sleeps for a random delay of up to 40 seconds before
// reading the endpoint and credentials from configuration and pushing. A push
// failure is logged, not returned.
func PushProm(ctx context.Context) {
	slaData := prompush.Sla.GetAndNew()
	if len(slaData) == 0 {
		return
	}
	slaPt := prompush.Sla.DataToPt(ctx, slaData)

	// Report after a random delay of up to 40 seconds.
	randTime := rand.Intn(40000)
	time.Sleep(time.Duration(randTime) * time.Millisecond)

	pushGatewayURL := gcfg.Instance().MustGet(ctx, "uri", "").String()
	user := gcfg.Instance().MustGet(ctx, "user", "").String()
	pwd := gcfg.Instance().MustGet(ctx, "pwd", "").String()
	err := prompush.Push(ctx, pushGatewayURL, user, pwd, prompush.Sla.Job, slaPt)
	if err != nil {
		glog.Errorf(ctx, "pusher err:%s", err)
	}
}