用图结构来监控业务故障关系报警

126 阅读2分钟

背景:

起因是有一次机房光纤被切断,各种业务的报警信息同时袭来,而服务之间又是多对多的关系。

#!/usr/bin/env python
# _*_ coding: UTF-8 _*_
from redis_class import *
import urllib
import os

class Vertex:
    """A single graph node together with its outgoing, weighted edges."""

    def __init__(self, key):
        # Identifier of this vertex, exactly as supplied by the caller.
        self.id = key
        # Maps each neighbouring Vertex object to the weight of the edge
        # that leads to it.
        self.connectedTo = dict()

    def __str__(self):
        """Render as ``<id> connectedTo: [<neighbour ids>]``."""
        neighbour_ids = [neighbour.id for neighbour in self.connectedTo]
        return ' connectedTo: '.join((str(self.id), str(neighbour_ids)))

    def getId(self):
        """Return the identifier given at construction time."""
        return self.id

    def addNeighbor(self, nbr, weight=0):
        """Add (or overwrite) the edge to vertex *nbr* with *weight*."""
        self.connectedTo[nbr] = weight

    def getWeight(self, nbr):
        """Return the weight of the edge to *nbr*.

        Raises KeyError when *nbr* is not a neighbour of this vertex.
        """
        return self.connectedTo[nbr]

    def getConnections(self):
        """Return the neighbouring Vertex objects (the keys of connectedTo)."""
        return self.connectedTo.keys()



class Graph:
    """Directed, weighted graph stored as a mapping of key -> Vertex."""

    def __init__(self):
        # Maps a vertex key to its Vertex object.
        self.vertList = {}
        # Number of distinct vertices created through addVertex.
        self.numVertices = 0

    def addVertex(self, key):
        """Return the vertex stored under *key*, creating it if necessary.

        An existing vertex is returned untouched: replacing it would drop
        its edges while other vertices still reference the old object, and
        would count the same key twice in numVertices.
        """
        if key in self.vertList:
            return self.vertList[key]
        vertex = Vertex(key)
        self.vertList[key] = vertex
        self.numVertices += 1
        return vertex

    def getVertex(self, n):
        """Return the vertex stored under *n*, or None if there is none."""
        return self.vertList.get(n)

    def __contains__(self, n):
        """Support ``key in graph``."""
        return n in self.vertList

    def addEdge(self, f, t, cost=0):
        """Add a directed edge from key *f* to key *t* with weight *cost*.

        Either endpoint is created first when it does not exist yet.
        """
        source = self.addVertex(f)
        target = self.addVertex(t)
        source.addNeighbor(target, cost)

    def getVertices(self):
        """Return the keys of all vertices."""
        return self.vertList.keys()

    def __iter__(self):
        """Iterate over the Vertex objects (not the keys)."""
        return iter(self.vertList.values())



def _log_notice_failure(channel, command):
    """Append one line describing a failed notification to the log."""
    import sys
    import time

    # '%S' is seconds; the lowercase '%s' is not a portable strftime code.
    stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    line = "send%s %s has failed at %s\n" % (channel, command, stamp)

    # NOTE(review): `logfile` is not defined in the visible part of this
    # file; it may be provided by the `redis_class` star import - confirm.
    # Fall back to stderr so a failed alert is never lost to a NameError.
    log_path = globals().get("logfile")
    if log_path is None:
        sys.stderr.write(line)
        return
    with open(log_path, "a") as log:
        log.write(line)


def _send_notice(channel, receiver, subject, content):
    """POST an alert to the notice gateway with curl.

    *channel* selects the endpoint (``send_<channel>``). Returns True when
    curl exits with status 0, otherwise logs the failure and returns False.
    """
    import subprocess

    url = "http://notice.ops.x.com/send_%s" % channel
    payload = "receiver=%s&subject=%s&content=%s" % (
        urllib.quote(str(receiver)), urllib.quote(subject),
        urllib.quote(content))
    # Human-readable form of the call, used for the console and the log.
    command = 'curl %s -d "%s"' % (url, payload)
    print(command)

    try:
        # Argument list instead of a shell string: nothing in the payload
        # can be interpreted by a shell.
        status = subprocess.call(["curl", url, "-d", payload])
    except OSError:
        # curl could not be started at all; treat it like a failed send.
        status = -1

    if status == 0:
        print('successfully')
        return True
    _log_notice_failure(channel, command)
    return False


def send_sms(revicer, subject, content):
    """Send *subject*/*content* as an SMS to the number *revicer*.

    The (misspelled) parameter name is kept so existing keyword callers
    keep working. Returns True on success, False on failure.
    """
    return _send_notice("sms", revicer, subject, content)


def send_voice(revicer, subject, content):
    """Send *subject*/*content* as a voice call to the number *revicer*.

    The (misspelled) parameter name is kept so existing keyword callers
    keep working. Returns True on success, False on failure.
    """
    return _send_notice("voice", revicer, subject, content)

# Nginx monitoring
# Connection settings passed to Redis_Conn.
redis_host, redis_port, redis_db = '192.168.x.x', 46389, 0

# Redis hash that getAlertInfo queries once per monitored domain.
nginx_alert_key = "data_nginx_domain_alert"

# Phone number that alert messages are sent to.
receiver = '186xxxxxxxx'
# Query the domains that are currently alerting
def getAlertInfo(L):
    """Return the entries of *L* whose alert flag in Redis is 'True'.

    For every domain name in *L* the field of that name is read from the
    hash ``nginx_alert_key`` through the module-level connection ``r1``;
    only domains whose stored value equals the string 'True' are kept,
    in their original order.
    """
    return [
        domain
        for domain in L
        if r1.hget(nginx_alert_key, domain) == 'True'
    ]




# Domains whose alert state is checked
monitor_domain_lists = [
    'recommend-hot.x.com',
    'score-server.hot-rec.x.com',
    'api.youxi.x.com',
    'scorer.x.com',
]

# Look up a node from alert information
def getNodeId(v, data):
    """Return the key paired with value *v* in *data*, or False if absent.

    *data* may be a mapping or any iterable of ``(key, value)`` pairs.
    Every pair is examined and the first match wins; the key is returned
    unchanged (no type conversion is applied).

    Note that a falsy key such as 0 cannot be told apart from "not found"
    with a plain truth test; compare the result with ``is False`` instead.
    """
    pairs = data.items() if hasattr(data, 'items') else data
    for key, value in pairs:
        if value == v:
            return key
    # Only report a miss once every pair has been checked.
    return False

# Record the state of a faulty node
def pushTroubleNode(V_id, currentTime):
    """Placeholder: not implemented yet, does nothing and returns None."""
    return None

# Send an alert
def sendAlert(msg):
    """Placeholder: not implemented yet, does nothing and returns None."""
    return None


# Holds the ids of a node's neighbours
v_Neighbor = list()

g = Graph()

# Printed before any edge is added, so this shows the empty vertex list.
print(g.getVertices())

# Domain topology: an edge f -> t is reported by the alert code below as
# "f failure affects t".
g.addEdge('recommend-hot.x.com', 'score-server.hot-rec.x.com', cost=4)
g.addEdge('api.youxi.x.com', 'scorer.x.com', cost=7)




def getTroubleNodeNei(g, id):
    """Return the ids of the direct neighbours of the vertex named *id*.

    *g* is iterated for vertices; for each one whose ``id`` attribute
    equals *id*, the ``getId()`` of every vertex in ``getConnections()``
    is collected. Falsy neighbour ids are skipped. An unknown *id* yields
    an empty list.

    A fresh list is built on every call, so results never accumulate
    across calls. The result is also bound to the module-level name
    ``v_Neighbor`` for code that reads the neighbours from there.
    """
    global v_Neighbor

    neighbours = []
    for vertex in g:
        if vertex.id != id:
            continue
        for neighbour in vertex.getConnections():
            neighbour_id = neighbour.getId()
            if neighbour_id:
                neighbours.append(neighbour_id)

    v_Neighbor = neighbours
    return neighbours


# Module-level Redis connection; getAlertInfo reads it as a global.
r1 = Redis_Conn(redis_host, redis_port, redis_db)

# Fetch the monitored domains that are currently failing
L = getAlertInfo(monitor_domain_lists)
for dn in L:
    # Start every lookup from an empty shared neighbour list so that
    # results for a previously handled domain never leak into this alert.
    del v_Neighbor[:]
    # Collect the sites affected by this failing domain
    failed_node = getTroubleNodeNei(g, dn)
    print(failed_node)
    if failed_node:
        subject = "%s failure affects %s" % (dn, str(failed_node))
        print(subject)
        content = subject
        send_sms(receiver, subject, content)