🥇 Personal homepage: 500佰 | Big data developer + certified professional engineer (Tencent + Huawei)
#BigData #"I Wrote a Program: How to Control Business Concurrency When the Cluster Is Small or Resources Are Tight?" #Performance&Reliability #Java
Overview (requirements): this article presents a case study. When a big data cluster is small or short on resources, how do we control the number of concurrent business jobs, throttling the job count so that the cluster stays available, safe, flexible and reliable even when it hits a performance bottleneck?
Business flow (implementation plan)
Step 1----Obtain the alarm ID
- Call the cluster's back-end REST interface: /api/v2/alarms (this interface already exists inside the cluster; we only need to call it and read its return value. As a RESTful interface it follows the usual REST conventions, so the response is normally a JSON string; the example in this article assumes a RESTful interface.)
- From the response, take the alarms whose status is "not recovered", sorted by occurrence time in descending order (desc); a minimal query sketch follows this list
- Alarm filter conditions:
- CPU 12016 / memory 12018 (use the actual alarm IDs of your cluster)
- Not recovered
- CPU 12016 / memory 12018, with the number of alarming hosts >= 3 (tentatively capped at 3 hosts, because HDFS data becomes unreadable once more than 2 storage nodes are down)
- When the alarm conditions are met, generate the CPU alarm marker file:
- abnormal.ck
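For illustration, here is a minimal Python sketch of the Step 1 query. It assumes the manager exposes /api/v2/alarms with the same parameters the Java class below uses, that the response carries the alarm list under an "alarms" key (as the Java code also assumes), and that plain basic auth is enough for a quick test; the real program performs a full login flow first. Host, port, credentials and alarm IDs are placeholders.
# Sketch of Step 1 (not the production code): query unrecovered alarms and count
# the CPU/memory entries. Base URL, credentials and alarm IDs are placeholders.
import json
import requests

MANAGER = "https://manager-host:28443/web"      # placeholder manager address
ALARM_IDS = ("ALM-12016", "ALM-12018")          # CPU / memory alarm IDs (adjust to your cluster)

def query_unrecovered_alarms():
    url = MANAGER + "/api/v2/alarms?limit=50&offset=0&status=1&order=desc&order_by=occurTime"
    resp = requests.get(url, auth=("admin", "password"), verify=False)  # placeholder credentials
    resp.raise_for_status()
    return resp.json().get("alarms", [])

def count_cpu_memory_alarms(alarms):
    # mirror the Java code's "contains" check: match the alarm ID anywhere in the entry
    return sum(1 for a in alarms if any(alarm_id in json.dumps(a) for alarm_id in ALARM_IDS))

if __name__ == "__main__":
    n = count_cpu_memory_alarms(query_unrecovered_alarms())
    print(f"unrecovered CPU/memory alarm entries: {n}")
    # Step 1 rule: with 3 or more entries, drop the abnormal.ck marker file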
Step 2----Conditions for killing a task
- The marker file exists
- Exclude the cluster's own business users, e.g. csdn
- Query the currently RUNNING applications through the YARN API and take their appIds
- Sort the appIds by allocated vCores and pick the largest one (a minimal sketch follows this list)
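Below is a minimal sketch of this selection using the YARN ResourceManager REST API (/ws/v1/cluster/apps?state=RUNNING), whose per-application fields include id, user and allocatedVCores. The host, port and excluded-user list are placeholders, and on a kerberized cluster the request additionally needs SPNEGO authentication (the full script later in this article uses curl --negotiate for that).
# Sketch of Step 2: pick the RUNNING application holding the most vCores,
# skipping service/business users. RM address and the user list are placeholders.
import requests

RM = "https://resourcemanager-host:26001"           # placeholder ResourceManager address
EXCLUDED_USERS = ("csdn", "spark2x", "hetuserver")  # placeholder exclusion list

def pick_kill_candidate():
    resp = requests.get(RM + "/ws/v1/cluster/apps", params={"state": "RUNNING"}, verify=False)
    resp.raise_for_status()
    apps = (resp.json().get("apps") or {}).get("app", []) or []
    candidates = [a for a in apps if not a["user"].startswith(EXCLUDED_USERS)]
    if not candidates:
        return None
    # the application that currently holds the most allocated vCores
    return max(candidates, key=lambda a: a["allocatedVCores"])

if __name__ == "__main__":
    app = pick_kill_candidate()
    if app:
        print(app["id"], app["user"], app["allocatedVCores"])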
Step 3----Logging
- Record the ID and related details of each killed task
- Record the CPU and memory alarms
Program code (deployment and invocation)
Invocation example
(this shell script is the wrapper; it invokes the packaged Java program and the Python script):
#!/bin/bash
shell_dir=$(dirname "${BASH_SOURCE-$0}")
shell_dir=$(cd "$shell_dir"; pwd)
client="/opt/client2"
## usage: java -cp ./*.jar com.adtec.mrs.rest.BigdataClusterAlarms <alarm ID> <whether to check the alarming-host count (if true, at least 3 alarming hosts are required)>
java -cp ./*.jar com.adtec.mrs.rest.BigdataClusterAlarms ALM-12012 false
source ${client}/bigdata_env && kinit -kt ${client}/user.keytab csdn && python3 ${shell_dir}/yarn_running.py
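The article does not say how this wrapper is triggered. One straightforward option is to run it periodically from cron; the interval, the script name (check_and_kill.sh is a hypothetical name) and the directory are assumptions, the directory simply mirroring the abnormal/logs path used by the Python script below.
# Assumption: poll every 5 minutes; script name and directory are placeholders.
*/5 * * * * cd /opt/client2/Spark2x/spark/bin/abnormal && bash ./check_and_kill.sh >> ./logs/cron.log 2>&1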
Main class BigdataClusterAlarms:
It collects and monitors the big data cluster's alarm information in real time, as shown below.
public class BigdataClusterAlarms {
private static final Logger LOG = LoggerFactory.getLogger(BigdataClusterAlarms.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
/* Query alarms ordered by time descending, with status "not recovered".
Alarm status. Value range: 0: all (default)  1: unrecovered alarms  2: recovered alarms
Alarm level. 0: all (default)  1: Critical  2: Major  3: Minor  4: Warning */
private static final String QUERY_ALARMS_LIST_URL = "/api/v2/alarms?limit=50&offset=0&status=1&order=desc"
+ "&order_by=occurTime";
/**
* Program entry point
*
* @param args arguments
*/
public static void main(String[] args) {
LOG.info("Enter main.{}");
// 文件UserInfo.properties的路径
String userFilePath = "./conf/UserInfo.properties";
String logPath = System.getProperty("user.dir") + File.separator + "logs";
// String YARN_KILL_APP_SHELL = "source /opt/mrsClient/bigdata_env && /usr/bin/python3 "+System.getProperty("user.dir") + File.separator +"yarn_running.py";
// String YARN_KILL_APP_SHELL = args[0];
InputStream userInfo = null;
ResourceBundle resourceBundle = null;
try {
File file = new File(userFilePath);
if (!file.exists()) {
LOG.error("The user info file doesn't exist.");
return;
}
LOG.info("Get the web info and user info from file {} ", file);
userInfo = new BufferedInputStream(new FileInputStream(file));
resourceBundle = new PropertyResourceBundle(userInfo);
// Read the user name
String userName = resourceBundle.getString("userName");
LOG.info("The user name is : {}.", userName);
if (userName == null || userName.isEmpty()) {
LOG.error("The userName is empty.");
}
// Read the user password
String password = resourceBundle.getString("password");
if (password == null || password.isEmpty()) {
LOG.error("The password is empty.");
}
String webUrl = resourceBundle.getString("webUrl");
LOG.info("The webUrl is : {}.", webUrl);
if (webUrl == null || webUrl.isEmpty()) {
LOG.error("The webUrl is empty.");
}
// userTLSVersion is a required parameter; it matters when a JDK 1.6 side connects to a JDK 1.8 server. If you are running JDK 1.8, simply set it to an empty string.
String userTLSVersion = "";
// Call the firstAccess interface to complete login authentication
LOG.info("Begin to get httpclient and first access.");
BasicAuthAccess authAccess = new BasicAuthAccess();
HttpClient httpClient = authAccess.loginAndAccess(webUrl, userName, password, userTLSVersion);
LOG.info("Start to access REST API.");
HttpManager httpManager = new HttpManager();
String operationName = "";
String operationUrl = "";
// Call the cluster REST interface to query the alarms list
operationName = "QueryAlarmsList";
operationUrl = webUrl + QUERY_ALARMS_LIST_URL;
String responseLineContent = httpManager.sendHttpGetRequest(httpClient, operationUrl, operationName);
List<Map> alarmsList = jsonToMaps(responseLineContent);
LOG.info("未恢复的告警列表:");
LOG.info("The {} response is {}.", operationName, alarmsList);
String searchCpuId = args[0]; // alarm ID passed in as the first argument
String searchMemoryId = "ALM-12018";
File dir = new File(logPath);
if (!dir.exists()) {
dir.mkdirs();
}
if (!responseLineContent.contains(searchCpuId) && !responseLineContent.contains(searchMemoryId)) {
LOG.info("不存在CPU告警或存在内存告警");
File ck = new File(logPath + File.separator + "abnormal.ck");
if (ck.exists()) {
LOG.info("不存在CPU告警或存在内存告警 删除标志文件:abnormal.ck");
ck.delete(); //删除ck文件
}
} else {
LOG.info("存在CPU告警或存在内存告警");
File ck = new File(logPath + File.separator + "abnormal.ck");
// boolean isCreated = ck.createNewFile();
Date date = new Date();
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
int alarms_num = 0; // number of CPU/memory alarm entries (alarming hosts)
for (int i = 0; i < alarmsList.size(); i++) {
String alarm = alarmsList.get(i).toString();
LOG.info(alarm);
if (alarm.contains(searchCpuId) || alarm.contains(searchMemoryId)) {
try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(
new FileOutputStream(logPath + File.separator + "alarms_all.log", true), StandardCharsets.UTF_8))) {
pw.println(df.format(date) + " " + alarm);
LOG.info("未恢复的告警列表已成功写入到alarms_all.log文件中");
alarms_num++;
} catch (IOException e) {
LOG.error("写入文件时出现错误:" + e.getMessage());
}
}
}
boolean flag = Boolean.parseBoolean(args[1]);
if (flag){
LOG.info("关联告警主机数量!");
LOG.info("存在CPU告警或存在内存告警 告警主机数量:{}", alarms_num);
if (alarms_num >= 3) {
LOG.info("存在CPU告警或存在内存告警 告警主机数量 大于等于3台");
LOG.info("产生标志文件:abnormal.ck");
boolean isCreated = ck.createNewFile();
} else if (alarms_num >= 1 && alarms_num < 3) {
LOG.info("存在CPU告警或存在内存告警 告警主机数量 小于3台,集群正常可用,不杀业务!");
File ck1 = new File(logPath + File.separator + "abnormal.ck");
if (ck1.exists()) {
LOG.info("告警主机数量 小于3台 删除标志文件:abnormal.ck");
ck.delete(); //删除ck文件
}
}
}else {
LOG.info("不关联告警主机数量!");
LOG.info("产生标志文件:abnormal.ck");
boolean isCreated = ck.createNewFile();
}
}
File ff = new File(logPath);
File f = new File(ff, "alarms_all.log");
if (f.exists()) {
long fileSizeInBytes = f.length();
// convert bytes to MB
long fileSizeInMB = fileSizeInBytes / (1024 * 1024);
// delete the log file if it exceeds 100 MB
if (fileSizeInMB > 100) {
LOG.info("Log file exceeds 100 MB, deleting: alarms_all.log");
f.delete();
}
}
} catch (FileNotFoundException e) {
LOG.error("File not found exception.");
} catch (IOException e) {
e.printStackTrace();
LOG.error(e.getMessage());
} catch (Throwable e) {
e.printStackTrace();
LOG.error(e.getMessage());
} finally {
if (userInfo != null) {
try {
userInfo.close();
} catch (IOException e) {
LOG.error("IOException.");
}
}
}
}
public static List<Map> jsonToMaps(String json) throws IOException {
Map map = OBJECT_MAPPER.readValue(json, Map.class);
return (List) (map.get("alarms"));
}
}
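The main class reads its connection settings from ./conf/UserInfo.properties (see the userName, password and webUrl lookups above). A minimal example follows; all values are placeholders, and the manager address and port depend on your cluster.
# ./conf/UserInfo.properties (placeholder values)
userName=admin
password=your_password
webUrl=https://manager-host:28443/web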
The Python script that kills the corresponding business task inside the cluster is as follows:
import importlib, sys, os, json
importlib.reload(sys)
import time


def get_all_app_info():
    """
    :return: applications currently in RUNNING state (queried from the active ResourceManager)
    """
    start_end_timestamp = int(time.time()) * 1000
    start_begin_timestamp = start_end_timestamp - int(3 * 1000)
    # placeholder hostname of the active ResourceManager
    job_url = "https://resourceManager-active:26001/ws/v1/cluster/apps?state=running"
    all_app_info = json.loads(os.popen('curl -XGET --tlsv1.2 --negotiate -k -s -u : ' + job_url).read())["apps"]["app"]
    return all_app_info


def get_all_app_info2():
    """
    :return: applications currently in RUNNING state (queried from the standby ResourceManager)
    """
    start_end_timestamp = int(time.time()) * 1000
    start_begin_timestamp = start_end_timestamp - int(3 * 1000)
    # placeholder hostname of the standby ResourceManager
    job_url = "https://resourceManager-standby:26001/ws/v1/cluster/apps?state=running"
    all_app_info = json.loads(os.popen('curl -XGET --tlsv1.2 --negotiate -k -s -u : ' + job_url).read())["apps"]["app"]
    return all_app_info


def data_byte_to_B(file):
    # returns the file size in MB
    fileinfo = os.stat(file)
    size = fileinfo.st_size
    vsize = size / float(1024 * 1024)
    return round(vsize, 2)


def get_running_job(all_app_info):
    job_app_list = all_app_info
    job_process_time_dict = dict()
    for job_base_info in job_app_list:
        app_job_user = job_base_info['user']
        # exclude cluster service users (the startswith checks below: spark2x, hds, hetuserver, eoi)
        if not app_job_user.startswith('spark2x') and not app_job_user.startswith('hds') and not app_job_user.startswith('hetuserver') and not app_job_user.startswith('eoi'):
            app_id = job_base_info['id']
            start_time = int(job_base_info['startedTime'] / 1000)  # in seconds
            n_timt = int(time.time())
            process_time = float('%.2f' % (n_timt - start_time))
            # kill the task with the largest allocatedVCores, excluding the service users above
            app_job_name = job_base_info['name']
            app_job_type = job_base_info['applicationType']
            app_job_queue = job_base_info['queue']
            allocatedVCores = job_base_info['allocatedVCores']
            sep = ' '
            yarn_job_running = str(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime(start_time))) + sep + app_job_user \
                + sep + app_id + sep + app_job_name + sep + app_job_type + sep + app_job_queue + " running time (s): " + str(process_time)
            job_process_time_dict[yarn_job_running] = allocatedVCores
    # sort the jobs by allocated vCores, largest first
    job_sorted_list = sorted(job_process_time_dict.items(), key=lambda d: d[1], reverse=True)
    return job_sorted_list


def process():
    try:
        all_app_info = get_all_app_info()
    except Exception as e:
        all_app_info = get_all_app_info2()
    top_cpu_job_list = get_running_job(all_app_info)
    if os.path.exists(r"/opt/client2/Spark2x/spark/bin/abnormal/logs/abnormal.ck"):
        for i, j in top_cpu_job_list:
            # application id of the job holding the most allocated vCores
            max_vcore_app_id = str(top_cpu_job_list[0][0]).split(" ")[2]
            s = f"yarn application -kill {max_vcore_app_id}"
            os.system(s)
            kill_time = str(time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime(time.time())))
            with open(r"/opt/client2/Spark2x/spark/bin/abnormal/logs/killedjob.log", "a+") as fp:
                fp.write(f"[{kill_time}] [INFO] Killed to restore the cluster to a healthy state: {i} allocated vCores: {j}" + "\n")
            print(f"[{kill_time}] [INFO] Killed to restore the cluster to a healthy state: {i} allocated vCores: {j}")
            break
    # delete the saved log file once it exceeds 100 MB
    if os.path.exists(r"/opt/client2/Spark2x/spark/bin/abnormal/logs/killedjob.log"):
        if data_byte_to_B(r"/opt/client2/Spark2x/spark/bin/abnormal/logs/killedjob.log") > 100.0:
            os.remove(r"/opt/client2/Spark2x/spark/bin/abnormal/logs/killedjob.log")


if __name__ == "__main__":
    process()
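To make the vCore-based selection concrete, here is a tiny self-contained example (with made-up data) of the same sorted(..., reverse=True) pattern used in get_running_job(): the entry holding the most allocated vCores comes first, and its third space-separated field is the appId that gets killed.
# Made-up data demonstrating the vCore-based ordering used in get_running_job().
job_vcores = {
    "2024-01-01T10:00:00 userA application_001 etl SPARK default running time (s): 120.0": 8,
    "2024-01-01T10:05:00 userB application_002 adhoc SPARK default running time (s): 60.0": 32,
    "2024-01-01T10:10:00 userC application_003 report MAPREDUCE default running time (s): 300.0": 4,
}
job_sorted_list = sorted(job_vcores.items(), key=lambda d: d[1], reverse=True)
# the first element holds the most vCores; its third space-separated field is the appId to kill
top_entry, top_vcores = job_sorted_list[0]
print(top_entry.split(" ")[2], top_vcores)   # -> application_002 32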
Finally
Thanks, everyone. @500佰 Comments and guidance are welcome 🥇