不修改源代码的方案
系统内置的MaxJobId默认值为999999(DEF_MAX_JOBID),允许配置的下限MaxJobIdLow=999999,上限MaxJobIdHigh=9999999,即MAX_JOBID的取值必须落在闭区间[999999, 9999999]内。可通过lsb.params文件中的MAX_JOBID参数来设置MaxJobId,并通过bparams -l查看当前值。
修改配置文件
MAX_JOBID=9999999
重新读取配置
badmin mbdrestart
查看配置
bparams -l |grep MAX_JOBID
修改源代码的方案
主要相关代码分为3部分
代码片段:
1、定义片段 lsbatch\lsbatch.h
#define DEF_MAX_JOBID 999999
#define MAX_JOBID_LOW 999999
#define MAX_JOBID_HIGH 9999999
2、获取nextJobId片段 lsbatch\daemons\mbd.job.c
nextJobId = (nextJobId < maxJobId)? nextJobId : 1;
while ((getJobData (nextJobId) != NULL
&& i <= maxJobId)) {
nextJobId++;
i++;
if (nextJobId >= maxJobId)
nextJobId = 1;
}
3、读取配置文件的maxJobId lsbatch\lib\lsb.conf.c
else if (i == 31) {
int value = 0;
value = my_atoi(keylist[i].val, INFINIT_INT, 0);
if ( (value < MAX_JOBID_LOW)
|| (value > MAX_JOBID_HIGH) )
{
/*catgets 5062*/
ls_syslog( LOG_ERR,
I18N(5062, "%s: File%s in section Parameters ending at line %d: maxJobId value %s not in [%d, %d], use default value %d;"),
__func__,
fname,
*lineNum,
keylist[i].key,
MAX_JOBID_LOW,
MAX_JOBID_HIGH,
DEF_MAX_JOBID);
lsberrno = LSBE_CONF_WARNING;
pConf->param->maxJobId = DEF_MAX_JOBID;
}
else
{
pConf->param->maxJobId = value;
}
只要修改片段1中的DEF_MAX_JOBID和MAX_JOBID_HIGH即可,最大支持2^31-1=2147483647。
修改完后,重新编译打包
./configure && make && make install
替换相对应的程序
cp -f openlava1/lib/liblsbatch.* openlava4.0/lib/
cp openlava1/sbin/mbatchd openlava4.0/sbin/
cp openlava1/sbin/sbatchd openlava4.0/sbin/
# 其他组件也可能受到影响(例如 lib、daemons、cmd、bhist、man1、man5、man8 等目录下的文件),请视情况一并替换
受影响命令的测试
bhist
输入支持的最大值
[root@lava1 etc]# bhist 2147483647
No matching job found
[root@lava1 etc]# bhist 2147483648
2147483648: Illegal job ID.
最大值2^31-1 = 2147483647
输出
当bhist不以-l输出时,JOBID列的宽度为7个字符:普通格式使用%-7.7s,超过7位的jobId会被截断为前7位;宽格式(OPT_WIDEFORMAT)使用%-7s,jobId会完整输出,但后面的列会错位。
/lsbatch/bhist/bhist.c L1040
if (first == TRUE) {
printf((_i18n_msg_get(ls_catd, NL_SETN, 3197,
"Summary of time in seconds spent in various states:\n"))); /* catgets 3197 */
printf((_i18n_msg_get(ls_catd, NL_SETN, 3198,
"JOBID USER JOB_NAME PEND PSUSP RUN USUSP "))); /* catgets 3198 */
printf(" ");
printf((_i18n_msg_get(ls_catd, NL_SETN, 3199, "SSUSP UNKWN TOTAL\n"))); /* catgets 3199 */
first = FALSE;
}
if (bhistReq->options & OPT_WIDEFORMAT) {
char *jobName, *pos;
jobName = job->submit.jobName;
if ((pos = strchr(jobName, '[')) && LSB_ARRAY_IDX(job->jobId)) {
*pos = '\0';
sprintf(jobName, "%s[%d]", jobName, LSB_ARRAY_IDX(job->jobId));
}
printf("%-7s %-7s %-9s %-8.0f%-8.0f%-8.0f%-8.0f%-8.0f%-8.0f%-10.0f\n",
lsb_jobid2str(job->jobId), jobRecord->job->user,
jobName, NegtoZero(pendTime),
NegtoZero(pendSuspTime), NegtoZero(runTime),
NegtoZero(usrSuspTime), NegtoZero(sysSuspTime),
NegtoZero(unknownTime),
NegtoZero(pendTime) + NegtoZero(pendSuspTime) +
NegtoZero(runTime) + NegtoZero(usrSuspTime) +
NegtoZero(sysSuspTime) + NegtoZero(unknownTime));
} else {
char *jobName, *pos;
jobName = job->submit.jobName;
if ((pos = strchr(jobName, '[')) && LSB_ARRAY_IDX(job->jobId)) {
*pos = '\0';
sprintf(jobName, "%s[%d]", jobName, LSB_ARRAY_IDX(job->jobId)
);
}
TRUNC_STR(jobName, 8);
printf("%-7.7s %-7.7s %-9.9s %-8.0f%-8.0f%-8.0f%-8.0f%-8.0f%-8.0f%-10.0f\n",
lsb_jobid2str(job->jobId), jobRecord->job->user,
jobName, NegtoZero(pendTime),
NegtoZero(pendSuspTime), NegtoZero(runTime),
NegtoZero(usrSuspTime), NegtoZero(sysSuspTime),
NegtoZero(unknownTime),
NegtoZero(pendTime) + NegtoZero(pendSuspTime) +
NegtoZero(runTime) + NegtoZero(usrSuspTime) +
NegtoZero(sysSuspTime) + NegtoZero(unknownTime));
}
bjobs
输入支持的最大值
[root@lava1 etc]# bjobs 9999999999
Job <1410065407> is not found
[root@lava1 etc]# bjobs 2147483647
Job <2147483647> is not found
[root@lava1 etc]# bjobs 2147483648
2147483648: Illegal job ID.
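最大值2^31-1 = 2147483647
注意:超出32位范围的输入不一定报错,例如上面的9999999999被按32位截断为1410065407(9999999999 mod 2^32 = 1410065407),然后按该jobId查询。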
输出
当bjobs不以-l输出时,JOBID列按7位宽度左对齐(%-7d):超过7位的jobId仍会完整输出,不会被截断,但后面的列会错位。
/lsbatch/cmd/bjobs.c L1029
if (job->jType == JGRP_NODE_ARRAY) {
if (format != WIDE_FORMAT) {
printf("%-7d %-8.8s ", LSB_ARRAY_JOBID(job->jobId), job->submit.jobName);
printf("%8.8s ", job->user);
}
else {
printf("%-7d %s ", LSB_ARRAY_JOBID(job->jobId), job->submit.jobName);
printf("%s ", job->user);
}
printf(" %5d %4d %4d %4d %4d %5d %5d %5d\n",
job->counter[JGRP_COUNT_NJOBS],
job->counter[JGRP_COUNT_PEND],
job->counter[JGRP_COUNT_NDONE],
job->counter[JGRP_COUNT_NRUN],
job->counter[JGRP_COUNT_NEXIT],
job->counter[JGRP_COUNT_NSSUSP],
job->counter[JGRP_COUNT_NUSUSP],
job->counter[JGRP_COUNT_NPSUSP]);
return;
}
L1175
if (Wflag == true) {
printf("%-7d %-7s %-5.5s %-10s %-11s %-11s %-10s %-14.14s",
LSB_ARRAY_JOBID(job->jobId),
job->user,
status,
submitInfo->queue,
job->fromHost,
exec_host,
jobName,
Time2String(job->submitTime));
} else {
printf("%-7d %-7s %-5.5s %-10s %-11s %-11s %-10s %s",
LSB_ARRAY_JOBID(job->jobId),
job->user,
status,
submitInfo->queue,
job->fromHost,
exec_host,
jobName,
subtime);
}
bkill /bresume /bstop
[admin@lava1 ~]$ bkill 3333333333
3333333333: Illegal job ID.
[admin@lava1 ~]$ bkill 2147483647
Job <2147483647>: No matching job found
[admin@lava1 ~]$ bkill 2147483648
2147483648: Illegal job ID.
日志文件的写入
lsb.events / lsb.acct / lsb.events.index 中的jobId以%d(int)格式写入,只支持到int32,即最大值为2^31-1 = 2147483647
/lsbatch/lib/lsb.log.c
//以下代码以JobNew事件为例, L2010
if (fprintf(log_fp, " %d %d %d %d %d %d %d %d %d %d",
jobNewLog->jobId,
jobNewLog->userId,
jobNewLog->options,
jobNewLog->numProcessors,
(int) jobNewLog->submitTime,
(int) jobNewLog->beginTime,
(int) jobNewLog->termTime,
jobNewLog->sigValue,
(int) jobNewLog->chkpntPeriod,
jobNewLog->restartPid) < 0)
return LSBE_SYS_CALL;