背景: 在大型系统中,线上出现异常是必然的,那么我们该如何尽早发现应用运行出了问题并及时地处理,以免扩大影响范围呢?
答案是 给应用加监控并告警错误信息。
本文借助SpringBoot +钉钉来实现应用的监控告警,当然也可以借助企业微信、短信通知等等
看了一下钉钉的文档,可发现通过调用Webhook地址可将告警消息发送到群聊里来实现消息通知的功能
实现步骤
获取调用地址
- 先创建一个钉钉群,并创建自定义机器人
选择其中一项安全设置,可加强安全性,防止Webhook地址泄露后被他人滥发消息
- 创建成功,复制Webhook地址,等下需要用到
创建SpringBoot应用
- 项目结构
WarnService
定义上报错误信息的接口
错误信息会被添加到MonitorMessageQueue队列里
/**
 * Entry point used by application code to report an error that should be alerted on.
 */
interface WarnService {
    /**
     * Reports a single error message.
     *
     * @param moduleName name of the module the error is reported for
     * @param msg the error text to report
     */
    fun reportErrorMsg(moduleName: String, msg: String)
}
/**
 * [WarnService] implementation that wraps each reported error in a [MessageDto]
 * and hands it to the [MonitorMessageQueue].
 */
@Service
class WarnServiceImpl : WarnService {
    @Autowired
    private lateinit var monitorMessageQueue: MonitorMessageQueue

    /**
     * Builds a [MessageDto] stamped with the current time and adds it to the queue.
     *
     * @param moduleName name of the module the error is reported for
     * @param msg the error text, stored as the message content
     */
    override fun reportErrorMsg(moduleName: String, msg: String) {
        val message = MessageDto().apply {
            this.moduleName = moduleName
            this.content = msg
            this.timestamp = System.currentTimeMillis()
        }
        monitorMessageQueue.add(message)
    }
}
MonitorMessageQueue队列
队列提供的方法
- start:启动守护线程
- drain:等待超时返回队列元素,这里设置30秒超时返回
- add:添加元素到队列里
/**
 * Holds reported [MessageDto] items in a blocking queue and starts the daemon
 * thread that runs [DataSendThread] against this queue.
 */
@Component
@Scope("singleton")
class MonitorMessageQueue {
    private val queue: BlockingQueue<MessageDto> = LinkedBlockingQueue()
    private val logger = LoggerFactory.getLogger(MonitorMessageQueue::class.java)

    @Autowired
    private lateinit var sendService: DataSendService

    /** Creates and starts the daemon thread named "monitor_thread_0". */
    @PostConstruct
    private fun start() {
        logger.info("MonitorMessageQueue start")
        val worker = Thread(DataSendThread(this, sendService), "monitor_thread_0")
        worker.isDaemon = true
        worker.start()
    }

    // Each robot may send at most 20 messages per minute to the group;
    // exceeding 20 results in being rate-limited for 10 minutes.
    /**
     * Drains queued messages into a new list, using a 30-second timeout.
     *
     * @return the messages collected by this call; may be empty
     */
    fun drain(): ArrayList<MessageDto> {
        val collected = ArrayList<MessageDto>()
        Queues.drain(queue, collected, Int.MAX_VALUE, 30, TimeUnit.SECONDS)
        return collected
    }

    /** Adds [message] to the queue. */
    fun add(message: MessageDto) {
        queue.add(message)
    }
}
告警线程
监控MonitorMessageQueue队列,并对消息进行分组汇总,并调用发送服务发送消息
/**
 * Worker that repeatedly drains [MonitorMessageQueue], groups the drained messages
 * by module name, and sends one summary text per batch through [DataSendService].
 *
 * @param queue the queue to drain batches from
 * @param sendService the service used to deliver the summary text
 */
class DataSendThread(private val queue: MonitorMessageQueue, private val sendService: DataSendService) : Runnable {
    private val sendCount = AtomicLong(0)

    // Never set to true in this class, so the loop below runs for the lifetime of the thread.
    private val stop = false
    private val logger = LoggerFactory.getLogger(DataSendThread::class.java)

    /**
     * Main loop: drain a batch, skip it if empty, otherwise send its summary.
     */
    override fun run() {
        while (!stop) {
            val batch = queue.drain()
            if (batch.isEmpty()) {
                logger.info("queue isEmpty")
                // Keep polling: returning here would end the worker after the first
                // idle window and every later alert would be silently dropped.
                continue
            }
            try {
                sendService.send(buildAlertText(batch))
                logger.info("send success:${sendCount.incrementAndGet()}")
            } catch (e: Exception) {
                // Broad catch is deliberate: one failed delivery must not terminate
                // the only thread that delivers alerts.
                logger.error("send failed", e)
            }
        }
    }

    /**
     * Builds the summary text for one batch: a timestamped header, then for each
     * module its message count and the content of its first message, then a detail link.
     */
    private fun buildAlertText(batch: List<MessageDto>): String {
        val format = SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        val mid = UUID.randomUUID().toString().replace("-", "")
        return buildString {
            append("[${format.format(System.currentTimeMillis())}][APP监控报警]")
            append("\n")
            batch.groupBy { it.moduleName }.forEach { (moduleName, messages) ->
                append("$moduleName(${messages.size}次)")
                append("\n")
                append(messages.firstOrNull()?.content ?: "")
                append("\n")
            }
            append("http://127.0.0.1/monitor/detail?mid=$mid")
        }
    }
}
DataSendService
处理签名,实际调用Webhook发送告警信息到钉钉群聊
/**
 * Delivers an alert text to the notification channel.
 */
interface DataSendService {
    /**
     * Sends [content] as one message.
     *
     * @param content the text to deliver
     */
    fun send(content: String)
}
/**
 * [DataSendService] implementation that posts a text message to the DingTalk
 * robot webhook, signing each request with a timestamp-based HMAC-SHA256 signature.
 */
@Service
class DataSendServiceImpl : DataSendService {
    @Autowired
    private lateinit var restTemplate: RestTemplate

    /**
     * Builds the webhook URL carrying the access token, timestamp and signature.
     * The access token here is a placeholder to be replaced with the real one.
     */
    private fun url(timestamp: Long, sign: String): String {
        // The query separator must be a literal "&timestamp="; an HTML-entity-mangled
        // form ("×tamp=") drops the timestamp parameter from the request.
        return "https://oapi.dingtalk.com/robot/send?access_token=xxxxxxx&timestamp=$timestamp&sign=$sign"
    }

    /**
     * Posts [content] as a "text" message and prints the raw response body.
     *
     * @param content the text to deliver
     */
    override fun send(content: String) {
        val timestamp = System.currentTimeMillis()
        val payload = mapOf(
            "msgtype" to "text",
            "text" to mapOf("content" to content),
        )
        val response = restTemplate.postForObject(
            url(timestamp, calcSign(timestamp)),
            payload,
            String::class.java,
        )
        println(response)
    }

    /**
     * Computes the request signature: HMAC-SHA256 over "timestamp\nsecret" keyed
     * with the secret, Base64-encoded, then URL-encoded.
     *
     * @param timestamp the millisecond timestamp that is also sent in the URL
     * @return the URL-encoded signature
     */
    private fun calcSign(timestamp: Long): String {
        // Placeholder secret; replace with the robot's real signing secret.
        val secret = "xxxxxxx"
        // Built directly rather than via a raw string + trimIndent(), which would
        // operate on the already-interpolated text.
        val stringToSign = "$timestamp\n$secret"
        val mac = Mac.getInstance("HmacSHA256")
        mac.init(SecretKeySpec(secret.toByteArray(Charsets.UTF_8), "HmacSHA256"))
        val signData = mac.doFinal(stringToSign.toByteArray(Charsets.UTF_8))
        return URLEncoder.encode(Base64.getEncoder().encodeToString(signData), "UTF-8")
    }
}
运行测试用例
/**
 * Manual demo driver for [WarnService]: keeps reporting random-sized bursts of
 * errors for two module names so the alerting pipeline can be observed end to end.
 */
@SpringBootTest
internal class WarnServiceImplTest {
    @Autowired
    private lateinit var warnService: WarnService

    /**
     * Loops forever; each round reports a random number (0..999) of errors for
     * "app-test1", then for "app-test2", then sleeps 30 seconds.
     */
    @Test
    fun reportErrorMsg() {
        while (true) {
            listOf("app-test1", "app-test2").forEach { moduleName ->
                val burstSize = (1000 * Math.random()).toInt()
                repeat(burstSize) {
                    warnService.reportErrorMsg(moduleName, "too many error")
                }
            }
            Thread.sleep(30_000L)
        }
    }
}