普罗米修斯Remote Write

76 阅读3分钟

由于 Prometheus 的 API 无法直接指定数据的写入时间(即样本时间戳),而我们的数据统计又需要指定写入时间,所以改用 Remote Write 协议直接写入。

​ 示例代码

package prompush

import (
	"bufio"
	"bytes"
	"context"
	"fmt"
	"io"
	"io/ioutil"
	"net/http"
	"net/url"
	"regexp"
	"sort"
	"time"

	"github.com/gogf/gf/v2/os/glog"
	"github.com/gogo/protobuf/proto"
	"github.com/golang/snappy"
	"github.com/opentracing-contrib/go-stdlib/nethttp"
	opentracing "github.com/opentracing/opentracing-go"
	"github.com/pkg/errors"
	"github.com/prometheus/common/model"
	"github.com/prometheus/prometheus/prompb"
)

// RecoverableError wraps an error to mark it as one that may succeed if the
// request is sent again. remoteWritePost returns it for transport failures
// and for HTTP 5xx responses.
type RecoverableError struct{ error }

// HttpClient sends remote-write requests to a single endpoint.
type HttpClient struct {
	// url is the endpoint every request is POSTed to.
	url *url.URL
	// Client is the underlying HTTP client used to perform requests.
	Client *http.Client
	// timeout bounds each individual request.
	timeout time.Duration
	// User is the basic-auth user name; auth is only sent when both User
	// and Pwd are non-empty.
	User string
	// Pwd is the basic-auth password.
	Pwd string
}

// MetricNameRE matches the metric names accepted by convertOne: a letter,
// underscore or colon followed by letters, digits, underscores or colons.
var MetricNameRE = regexp.MustCompile(`^[a-zA-Z_:][a-zA-Z0-9_:]*$`)

// metricAddr is declared empty and is not referenced by the code shown here.
var metricAddr string

// MetricPoint is one metric sample in the caller-facing form, before
// convertOne turns it into a prompb.TimeSeries.
type MetricPoint struct {
	Metric  string            `json:"metric"` // metric name
	TagsMap map[string]string `json:"tags"`   // data labels
	// Sample timestamp. NOTE(review): the original comment said the unit is
	// seconds, but convertOne copies this value unchanged into
	// prompb.Sample.Timestamp and DataToPt fills it from UnixMilli(), so the
	// value actually carried here is milliseconds.
	Time    int64             `json:"time"`
	Value   float64           `json:"value"`  // internal field: the final float64 value after conversion
}

// remoteWritePost POSTs an already snappy-compressed, protobuf-encoded write
// request to the client's URL.
//
// It returns nil on any 2xx response. Transport errors and 5xx responses are
// returned wrapped in RecoverableError so callers can decide to retry; a
// request that cannot be built, and any other non-2xx response, is returned
// as a plain error.
func (c *HttpClient) remoteWritePost(req []byte) error {
	httpReq, err := http.NewRequest("POST", c.url.String(), bytes.NewReader(req))
	if err != nil {
		// The request could not even be constructed; retrying the same
		// input cannot help, so this is not marked recoverable.
		return err
	}
	if c.User != "" && c.Pwd != "" {
		httpReq.SetBasicAuth(c.User, c.Pwd)
	}
	httpReq.Header.Add("Content-Encoding", "snappy")
	httpReq.Header.Set("Content-Type", "application/x-protobuf")
	httpReq.Header.Set("User-Agent", "opcai")
	httpReq.Header.Set("X-Prometheus-Remote-Write-Version", "0.1.0")

	// Bound the whole round trip by the client's configured timeout.
	ctx, cancel := context.WithTimeout(context.Background(), c.timeout)
	defer cancel()
	httpReq = httpReq.WithContext(ctx)

	// NOTE(review): ctx derives from context.Background(), so presumably no
	// parent span is ever found here as written — confirm whether a caller
	// context was meant to be threaded through.
	if parentSpan := opentracing.SpanFromContext(ctx); parentSpan != nil {
		var ht *nethttp.Tracer
		httpReq, ht = nethttp.TraceRequest(
			parentSpan.Tracer(),
			httpReq,
			nethttp.OperationName("Remote Store"),
			nethttp.ClientTrace(false),
		)
		defer ht.Finish()
	}

	httpResp, err := c.Client.Do(httpReq)
	if err != nil {
		// Errors from Client.Do are from (for example) network errors, so are
		// recoverable.
		return RecoverableError{err}
	}
	// Drain and close the body so the connection can be reused.
	defer func() {
		io.Copy(ioutil.Discard, httpResp.Body)
		httpResp.Body.Close()
	}()

	if httpResp.StatusCode/100 == 2 {
		return nil
	}

	// Include the first line (at most 512 bytes) of the response body in the
	// error to aid debugging.
	scanner := bufio.NewScanner(io.LimitReader(httpResp.Body, 512))
	line := ""
	if scanner.Scan() {
		line = scanner.Text()
	}
	err = errors.Errorf("server returned HTTP status %s: %s", httpResp.Status, line)
	if httpResp.StatusCode/100 == 5 {
		return RecoverableError{err}
	}
	return err
}

// buildWriteRequest wraps the given series in a prompb.WriteRequest,
// marshals it to protobuf and returns the snappy-encoded bytes. A marshal
// failure is returned as-is with a nil payload.
func buildWriteRequest(samples []prompb.TimeSeries) ([]byte, error) {
	raw, err := proto.Marshal(&prompb.WriteRequest{Timeseries: samples})
	if err != nil {
		return nil, err
	}
	return snappy.Encode(nil, raw), nil
}

// sample bundles a label set with one timestamped value.
type sample struct {
	labels []prompb.Label // labels attached to the value
	t      int64          // timestamp
	v      float64        // value
}

// Label names that convertOne adds to every series.
const (
	LABEL_NAME = "__name__" // label that carries the metric name
	LabelJob   = "job"      // label that carries the job passed to convertOne
)

// convertOne converts a MetricPoint into a prompb.TimeSeries holding exactly
// one sample.
//
// The series carries the "__name__" label (item.Metric), the "job" label
// (job) and every entry of item.TagsMap whose key is a valid label name.
// Tags whose key is invalid, or collides with "__name__" or "job", are
// dropped so the label set never contains a repeated name. Labels are sorted
// by name, as the remote-write protocol requires; Go map iteration order
// would otherwise make the order random.
//
// item.Time is copied unchanged into the sample timestamp, so it must
// already be a millisecond timestamp.
//
// An error is returned, together with a zero TimeSeries, when item.Metric is
// not a valid metric name.
func convertOne(item MetricPoint, job string) (prompb.TimeSeries, error) {
	if !MetricNameRE.MatchString(item.Metric) {
		return prompb.TimeSeries{}, errors.New("invalid metrics name")
	}

	labels := make([]prompb.Label, 0, len(item.TagsMap)+2)
	labels = append(labels,
		prompb.Label{Name: LABEL_NAME, Value: item.Metric},
		prompb.Label{Name: LabelJob, Value: job},
	)
	for k, v := range item.TagsMap {
		// The reserved labels set above win over same-named tags.
		if k == LABEL_NAME || k == LabelJob {
			continue
		}
		if !model.LabelNameRE.MatchString(k) {
			continue
		}
		labels = append(labels, prompb.Label{Name: k, Value: v})
	}
	sort.Slice(labels, func(i, j int) bool { return labels[i].Name < labels[j].Name })

	return prompb.TimeSeries{
		Labels: labels,
		Samples: []prompb.Sample{{
			Timestamp: item.Time, // milliseconds
			Value:     item.Value,
		}},
	}, nil
}

// RemoteWrite encodes pt as a remote-write request and sends it.
//
// It returns the size in bytes of the compressed payload together with the
// result of sending it. An empty pt sends nothing and returns (0, nil); an
// encoding failure returns (0, err).
func (c *HttpClient) RemoteWrite(pt []prompb.TimeSeries) (int, error) {
	if len(pt) == 0 {
		return 0, nil
	}

	payload, err := buildWriteRequest(pt)
	if err != nil {
		return 0, err
	}
	return len(payload), c.remoteWritePost(payload)
}

// NewClient builds an HttpClient that posts to the URL ur, using user and pwd
// for basic auth and timeout as the per-request time limit. It returns a nil
// client and the parse error when ur cannot be parsed.
func NewClient(ur, user, pwd string, timeout time.Duration) (c *HttpClient, err error) {
	endpoint, parseErr := url.Parse(ur)
	if parseErr != nil {
		return nil, parseErr
	}
	return &HttpClient{
		url:     endpoint,
		Client:  &http.Client{},
		timeout: timeout,
		User:    user,
		Pwd:     pwd,
	}, nil
}

// Push sends pt to the remote-write endpoint at url, trying up to three
// times with a 300ms pause between attempts.
//
// user and pwd are used for basic auth; job is only used in the success log
// line. Each attempt has a 10 second timeout.
//
// Only errors marked RecoverableError (transport failures, HTTP 5xx) are
// retried; any other error ends the loop immediately because resending the
// same payload cannot succeed. Push returns nil on success, the last send
// error when all attempts fail, or ctx.Err() if ctx is cancelled while
// waiting to retry.
func Push(ctx context.Context, url, user, pwd, job string, pt []prompb.TimeSeries) error {
	const (
		maxAttempts = 3
		retryDelay  = 300 * time.Millisecond
	)

	c, err := NewClient(url, user, pwd, 10*time.Second)
	if err != nil {
		return fmt.Errorf("creating remote write client: %w", err)
	}

	var lastErr error
	for i := 0; i < maxAttempts; i++ {
		start := time.Now()
		bytesNum, err := c.RemoteWrite(pt)
		reportUseTime := time.Since(start).Seconds()
		if err == nil {
			glog.Infof(ctx, "job:%s, 指标数量: %d,数据大小: %.3f M,用时: %.3f 秒", job, len(pt),
				float64(bytesNum)/float64(1<<20), reportUseTime)
			return nil
		}

		lastErr = err
		glog.Errorf(ctx, "err send metrics:%+v,retry:%d,用时: %.3f", err, i, reportUseTime)

		if _, recoverable := err.(RecoverableError); !recoverable {
			break
		}
		if i == maxAttempts-1 {
			// No attempt follows, so there is nothing to wait for.
			break
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(retryDelay):
		}
	}

	return lastErr
}

使用示例:

1. 键(Keys)、值(Counter)的结构及缓存

package prompush

import (
	"context"
	"github.com/gogf/gf/v2/os/glog"
	"github.com/prometheus/prometheus/prompb"
	"sync"
	"time"
)

// Types used to aggregate statistics for the popularity-strategy group.

// Keys identifies one aggregation bucket.
type Keys struct {
	Env string
	Uid string
}

// Counter holds the values accumulated for one Keys bucket.
type Counter struct {
	Total int64
	Sum   int64
	Speed int64
}

// promData is a mutex-guarded cache of counters keyed by Keys.
type promData struct {
	mu      sync.Mutex        // guards statMap
	Job     string            // job name used when the cached data is converted to series
	statMap map[Keys]*Counter // counters accumulated since the last GetAndNew
}

// SlaMetricNames lists the metrics exported for every Counter: the metric
// name, a help text, and the accessor that reads the value from a Counter.
var SlaMetricNames = []struct {
	Name  string
	Help  string
	Field func(*Counter) float64
}{
	{
		Name:  "total",
		Help:  "总计",
		Field: func(c *Counter) float64 { return float64(c.Total) },
	},
	{
		Name:  "sum",
		Help:  "总和",
		Field: func(c *Counter) float64 { return float64(c.Sum) },
	},
	{
		Name:  "speed",
		Help:  "速度",
		Field: func(c *Counter) float64 { return float64(c.Speed) },
	},
}

// Sla is the package-level statistics cache.
var Sla = promData{
	// Best set explicitly; it may be left empty, but the data is then easy
	// to mix up.
	Job:     "test",
	statMap: map[Keys]*Counter{},
}

// ProcessLog adds the values in static to the counter stored under log,
// creating that counter first if the key has not been seen yet. It holds
// the mutex for the duration of the update.
func (s *promData) ProcessLog(log Keys, static Counter) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if _, seen := s.statMap[log]; !seen {
		s.statMap[log] = new(Counter)
	}
	acc := s.statMap[log]
	acc.Total += static.Total
	acc.Sum += static.Sum
	acc.Speed += static.Speed
}

// GetAndNew returns the counters accumulated so far and replaces the internal
// map with a fresh empty one, all under the mutex. The returned map is no
// longer referenced by s.
func (s *promData) GetAndNew() map[Keys]*Counter {
	s.mu.Lock()
	snapshot := s.statMap
	s.statMap = map[Keys]*Counter{}
	s.mu.Unlock()
	return snapshot
}

// DataToPt converts every counter in data into time series, one series per
// entry of SlaMetricNames, labelled with the key's env and uid and with
// s.Job as the job.
//
// All samples share one timestamp: the start of the current minute minus
// 60000 ms, i.e. the start of the previous minute, in milliseconds. Points
// that convertOne rejects are logged and skipped.
func (s *promData) DataToPt(ctx context.Context, data map[Keys]*Counter) []prompb.TimeSeries {
	t := time.Now().Truncate(time.Minute).UnixMilli() - 60000

	// Each key yields up to len(SlaMetricNames) series.
	ts := make([]prompb.TimeSeries, 0, len(data)*len(SlaMetricNames))
	for key, counter := range data {
		tagsMap := map[string]string{
			"env": key.Env,
			"uid": key.Uid,
		}
		for _, m := range SlaMetricNames {
			item := MetricPoint{
				Metric:  m.Name,
				TagsMap: tagsMap,
				Time:    t, // same timestamp for every sample in this batch
				Value:   m.Field(counter),
			}
			series, err := convertOne(item, s.Job)
			if err != nil {
				glog.Errorf(ctx, "convertOne err:%+v,item:%+v", err, item)
				continue
			}
			ts = append(ts, series)
		}
	}
	return ts
}

2.数据上报

// PushProm takes the counters accumulated in prompush.Sla, converts them to
// time series and pushes them to the configured remote-write endpoint.
//
// It returns without doing anything when no data has been accumulated. The
// series (and their timestamp) are built before the random delay, so the
// delay does not affect the sample timestamps. A push failure is logged, not
// returned.
func PushProm(ctx context.Context) {
	slaData := prompush.Sla.GetAndNew()
	if len(slaData) == 0 {
		return
	}
	slaPt := prompush.Sla.DataToPt(ctx, slaData)

	// Wait a random time of up to 40 seconds before reporting.
	randTime := rand.Intn(40000)
	time.Sleep(time.Duration(randTime) * time.Millisecond)

	pushGatewayURL := gcfg.Instance().MustGet(ctx, "uri", "").String()
	user := gcfg.Instance().MustGet(ctx, "user", "").String()
	pwd := gcfg.Instance().MustGet(ctx, "pwd", "").String()

	err := prompush.Push(ctx, pushGatewayURL, user, pwd, prompush.Sla.Job, slaPt)
	if err != nil {
		glog.Errorf(ctx, "pusher err:%s", err)
	}
}