linux服务器显卡监控脚本

272 阅读2分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路。


  • 前期准备: pip install pynvml

1.watch_nvidia

#参数:nvidia_ids:显卡id min_memory:最小可用显存 GB

#遍历每块卡的剩余显存 ---print("card {} free memory is {}GB".format(i,meminfo.free * 1.0 /(1024**3)))


def watch_nvidia(nvidia_ids,min_memory):
  """Count how many of the given GPUs have more free memory than a threshold.

  For every id in ``nvidia_ids`` the free memory reported by pynvml is
  divided by 1024**3, printed, and compared against ``min_memory``.

  Args:
    nvidia_ids: Iterable of GPU indices passed to
      ``pynvml.nvmlDeviceGetHandleByIndex``.
    min_memory: Threshold in GB; a card qualifies only when its free
      memory is strictly greater than this value.

  Returns:
    The number of qualifying cards, or -1 when no card qualifies
    (including when ``nvidia_ids`` is empty).
  """
  free_num = 0
  # Count directly instead of writing into a list indexed by ``id - 1``:
  # that indexing broke for ids that are not a contiguous run (e.g. [2, 3]
  # raised IndexError, [0, 2] mapped both cards onto the same slot).
  for card_id in nvidia_ids:
    handle = pynvml.nvmlDeviceGetHandleByIndex(card_id)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    free_gb = meminfo.free * 1.0 / (1024**3)
    print("card {} free memory is {}GB".format(card_id, free_gb))
    if free_gb > min_memory:
      free_num += 1
  if free_num == 0:
    print("no free card!")
    # -1 is kept as the "nothing free" marker that callers compare against.
    return -1
  return free_num

2.send_msg

#发送邮箱信息

#target_email:接收信息的邮箱,msg:发送的消息

sender --- #发送信息的邮箱

receivers-------# 接收邮件,可设置为你的QQ邮箱或者其他邮箱

MIMEText函数 ----- # 三个参数:第一个为文本内容,第二个 plain 设置文本格式,第三个 utf-8 设置编码

smtplib.SMTP_SSL('smtp.163.com')---- #使用前需要先登录发件邮箱账号,开启IMAP/SMTP服务,并获取登录授权码


def send_msg(target_email,msg):
  """Send a plain-text notification e-mail through smtp.163.com over SSL.

  Args:
    target_email: Address that receives the message.
    msg: Text used as the message body (encoded as UTF-8).

  Any exception raised by smtplib while connecting, logging in or sending
  is not caught here and propagates to the caller.
  """
  sender = 'xxxxx@163.com'  # account the mail is sent from
  receivers = [target_email]

  # MIMEText arguments: body text, subtype ('plain'), charset.
  message = MIMEText(msg, 'plain', 'utf-8')
  subject = 'nvidia显卡监控'
  message['Subject'] = Header(subject, 'utf-8')
  # Provide the originator/recipient headers so the message is well formed.
  message['From'] = sender
  message['To'] = target_email

  # Passing host and port to the constructor opens the SSL connection once;
  # a separate connect() call would open a second socket and orphan the first.
  # The with-block closes the connection even when login/sendmail raises.
  with smtplib.SMTP_SSL('smtp.163.com', 465) as server:
    # The second argument is the authorization code generated in the
    # mailbox settings.
    server.login(sender, "xxxxxx")
    server.sendmail(sender, receivers, message.as_string())

在网易邮箱中的操作如下:

  • 1.首先要开启IMAP/SMTP服务
  • 2.新增一个授权码(原文此处为一张操作截图)

3.完整脚本nvidia.py

  • 这是完整的脚本代码,对上面的总结
  • 主要是watch_nvidia函数和send_msg函数
  • 这里默认有两块显卡,编号为0、1;只要其中至少有一块显卡的剩余显存大于8GB,就会向邮箱发送信息
#-*-coding:GBK -*- 
import pynvml
pynvml.nvmlInit()
import time
import os
#from send_email import send_msg

import smtplib
from email.mime.text import MIMEText
from email.header import Header
 
def send_msg(target_email,msg):
  """Send a plain-text notification e-mail through smtp.163.com over SSL.

  Args:
    target_email: Address that receives the message (for example a QQ
      mailbox or any other mailbox).
    msg: Text used as the message body (encoded as UTF-8).

  Any exception raised by smtplib while connecting, logging in or sending
  is not caught here and propagates to the caller.
  """
  sender = 'xxxxx@163.com'  # account the mail is sent from
  receivers = [target_email]

  # MIMEText arguments: body text, subtype ('plain'), charset.
  message = MIMEText(msg, 'plain', 'utf-8')
  subject = 'nvidia显卡监控'
  message['Subject'] = Header(subject, 'utf-8')
  # Provide the originator/recipient headers so the message is well formed.
  message['From'] = sender
  message['To'] = target_email

  # Passing host and port to the constructor opens the SSL connection once;
  # a separate connect() call would open a second socket and orphan the first.
  # The with-block closes the connection even when login/sendmail raises.
  with smtplib.SMTP_SSL('smtp.163.com', 465) as server:
    # The second argument is the authorization code generated in the
    # mailbox settings.
    server.login(sender, "xxxxxxx")
    server.sendmail(sender, receivers, message.as_string())

def watch_nvidia(nvidia_ids,min_memory):
  """Count how many of the given GPUs have more free memory than a threshold.

  For every id in ``nvidia_ids`` the free memory reported by pynvml is
  divided by 1024**3, printed, and compared against ``min_memory``.

  Args:
    nvidia_ids: Iterable of GPU indices passed to
      ``pynvml.nvmlDeviceGetHandleByIndex``.
    min_memory: Threshold in GB; a card qualifies only when its free
      memory is strictly greater than this value.

  Returns:
    The number of qualifying cards, or -1 when no card qualifies
    (including when ``nvidia_ids`` is empty).
  """
  free_num = 0
  # Count directly instead of writing into a list indexed by ``id - 1``:
  # that indexing broke for ids that are not a contiguous run (e.g. [2, 3]
  # raised IndexError, [0, 2] mapped both cards onto the same slot).
  for card_id in nvidia_ids:
    handle = pynvml.nvmlDeviceGetHandleByIndex(card_id)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
    free_gb = meminfo.free * 1.0 / (1024**3)
    print("card {} free memory is {}GB".format(card_id, free_gb))
    if free_gb > min_memory:
      free_num += 1
  if free_num == 0:
    print("no free card!")
    # -1 is kept as the "nothing free" marker that callers compare against.
    return -1
  return free_num

nvidia_ids = [0,1] # indices of the GPUs to monitor
min_memory = 8 # free-memory threshold in GB

# Poll every 10 seconds until watch_nvidia reports at least one free card
# (it returns -1 when none qualifies).
free_cards = watch_nvidia(nvidia_ids,min_memory)
while free_cards < 1:
  time.sleep(10)
  free_cards = watch_nvidia(nvidia_ids,min_memory)

# Send a single notification, after which the script ends.
send_msg("xxxxxxx@bjtu.edu.cn","{}张显卡空闲".format(free_cards))
# Hook for a follow-up command, e.g. os.system("sh veri.sh")

4.后台运行

# -u turns off Python's output buffering so print() lines reach nvidia.out right away
nohup python -u nvidia.py >nvidia.out&

注:nohup + &表示后台运行;>nvidia.out表示打印信息到该文件中。