Openlava4.0的MaxJobId扩展方案

147 阅读3分钟

不修改源代码的方案

系统内置的默认值为MaxJobId=999999,配置允许的下限MaxJobIdLow=999999、上限MaxJobIdHigh=9999999,即MAX_JOBID的取值范围为闭区间[999999, 9999999],超出该范围时会回退为默认值。可在lsb.params文件中通过MAX_JOBID设置MaxJobId,并通过bparams -l查看当前值。

修改配置文件

MAX_JOBID=9999999

重新读取配置

badmin mbdrestart

查看配置

bparams -l |grep  MAX_JOBID

修改源代码的方案

主要相关代码分为3部分

代码片段:

1、定义片段 lsbatch\lsbatch.h

#define DEF_MAX_JOBID  999999
#define MAX_JOBID_LOW  999999
#define MAX_JOBID_HIGH 9999999

2、获取nextJobId片段 lsbatch\daemons\mbd.job.c

nextJobId = (nextJobId < maxJobId)? nextJobId : 1;
​
while ((getJobData (nextJobId) != NULL
    && i <= maxJobId)) {
  nextJobId++;
  i++;
  if (nextJobId >= maxJobId)
    nextJobId = 1;
​
}

3、读取配置文件的maxJobId lsbatch\lib\lsb.conf.c

else if (i == 31) {
  int value = 0;
  value = my_atoi(keylist[i].val, INFINIT_INT, 0);
  if ( (value < MAX_JOBID_LOW)
     || (value > MAX_JOBID_HIGH) )
  {
    /*catgets 5062*/
    ls_syslog( LOG_ERR,
          I18N(5062, "%s: File%s in section Parameters ending at line %d: maxJobId value %s not in [%d, %d], use default value %d;"),
          __func__,
          fname,
          *lineNum,
          keylist[i].key,
          MAX_JOBID_LOW,
          MAX_JOBID_HIGH,
          DEF_MAX_JOBID);
​
    lsberrno = LSBE_CONF_WARNING;
    pConf->param->maxJobId = DEF_MAX_JOBID;
  }
  else
  {
    pConf->param->maxJobId = value;
  }

只要修改片段1中的DEF_MAX_JOBID和MAX_JOBID_HIGH即可,最大支持2^31-1=2147483647。

修改完后,重新编译打包

./configure && make && make install

替换相对应的程序

cp -f openlava1/lib/liblsbatch.* openlava4.0/lib/
cp openlava1/sbin/mbatchd openlava4.0/sbin/
cp openlava1/sbin/sbatchd openlava4.0/sbin/
​
#还有一些程序可能有影响,例如lib daemons cmd bhist man1 man5 man8

有影响的指令测试

bhist

输入支持的最大值

[root@lava1 etc]# bhist 2147483647
No matching job found
[root@lava1 etc]# bhist 2147483648
2147483648: Illegal job ID.
最大值2^31-1 = 2147483647

输出

当bhist不以-l输出时,JOBID列宽为7位:非宽格式(%-7.7s)下超过7位的jobId会被截断为前7个字符;宽格式(%-7s)下不会截断,但后面的列会错位。
/lsbatch/bhist/bhist.c   L1040
if (first == TRUE) {
                printf((_i18n_msg_get(ls_catd, NL_SETN, 3197,
                                      "Summary of time in seconds spent in various states:\n"))); /* catgets  3197  */
                printf((_i18n_msg_get(ls_catd, NL_SETN, 3198,
                                      "JOBID   USER    JOB_NAME  PEND    PSUSP   RUN     USUSP "))); /* catgets  3198  */
                printf("  ");
                printf((_i18n_msg_get(ls_catd, NL_SETN, 3199, "SSUSP   UNKWN   TOTAL\n"))); /* catgets  3199  */
                first = FALSE;
            }
            if (bhistReq->options & OPT_WIDEFORMAT) {
                char *jobName, *pos;
                jobName = job->submit.jobName;
                if ((pos = strchr(jobName, '[')) && LSB_ARRAY_IDX(job->jobId)) {
                    *pos = '\0';
                    sprintf(jobName, "%s[%d]", jobName, LSB_ARRAY_IDX(job->jobId));
                }

                printf("%-7s %-7s %-9s %-8.0f%-8.0f%-8.0f%-8.0f%-8.0f%-8.0f%-10.0f\n",
                       lsb_jobid2str(job->jobId), jobRecord->job->user,
                       jobName, NegtoZero(pendTime),
                       NegtoZero(pendSuspTime), NegtoZero(runTime),
                       NegtoZero(usrSuspTime), NegtoZero(sysSuspTime),
                       NegtoZero(unknownTime),
                       NegtoZero(pendTime) + NegtoZero(pendSuspTime) +
                       NegtoZero(runTime) + NegtoZero(usrSuspTime) +
                       NegtoZero(sysSuspTime) + NegtoZero(unknownTime));
            } else {
                char *jobName, *pos;
                jobName = job->submit.jobName;
                if ((pos = strchr(jobName, '[')) && LSB_ARRAY_IDX(job->jobId)) {
                    *pos = '\0';
                    sprintf(jobName, "%s[%d]", jobName, LSB_ARRAY_IDX(job->jobId)
                    );
                }

                TRUNC_STR(jobName, 8);
                printf("%-7.7s %-7.7s %-9.9s %-8.0f%-8.0f%-8.0f%-8.0f%-8.0f%-8.0f%-10.0f\n",
                       lsb_jobid2str(job->jobId), jobRecord->job->user,
                       jobName, NegtoZero(pendTime),
                       NegtoZero(pendSuspTime), NegtoZero(runTime),
                       NegtoZero(usrSuspTime), NegtoZero(sysSuspTime),
                       NegtoZero(unknownTime),
                       NegtoZero(pendTime) + NegtoZero(pendSuspTime) +
                       NegtoZero(runTime) + NegtoZero(usrSuspTime) +
                       NegtoZero(sysSuspTime) + NegtoZero(unknownTime));
            }

bjobs

输入支持的最大值

[root@lava1 etc]# bjobs 9999999999
Job <1410065407> is not found
[root@lava1 etc]# bjobs 2147483647
Job <2147483647> is not found
[root@lava1 etc]# bjobs 2147483648
2147483648: Illegal job ID.
最大值2^31-1 = 2147483647(注意:从输出看,上面的9999999999没有被判为非法,而是被按32位截断成了1410065407,即9999999999 mod 2^32 = 1410065407)

输出

当bjobs不以-l输出时,JOBID列按7位宽度左对齐(%-7d):超过7位的jobId仍会完整输出,但后面的列会错位。
/lsbatch/cmd/bjobs.c   L1029
  if (job->jType == JGRP_NODE_ARRAY) {
        if (format != WIDE_FORMAT) {
            printf("%-7d  %-8.8s ", LSB_ARRAY_JOBID(job->jobId), job->submit.jobName);
            printf("%8.8s ", job->user);
        }
        else {
            printf("%-7d  %s ", LSB_ARRAY_JOBID(job->jobId), job->submit.jobName);
            printf("%s ", job->user);
        }
        printf("  %5d %4d %4d %4d %4d %5d %5d %5d\n",
               job->counter[JGRP_COUNT_NJOBS],
               job->counter[JGRP_COUNT_PEND],
               job->counter[JGRP_COUNT_NDONE],
               job->counter[JGRP_COUNT_NRUN],
               job->counter[JGRP_COUNT_NEXIT],
               job->counter[JGRP_COUNT_NSSUSP],
               job->counter[JGRP_COUNT_NUSUSP],
               job->counter[JGRP_COUNT_NPSUSP]);
        return;
    }
    
/lsbatch/cmd/bjobs.c   L1175
 if (Wflag == true) {
            printf("%-7d %-7s %-5.5s %-10s %-11s %-11s %-10s %-14.14s",
                   LSB_ARRAY_JOBID(job->jobId),
                   job->user,
                   status,
                   submitInfo->queue,
                   job->fromHost,
                   exec_host,
                   jobName,
                   Time2String(job->submitTime));
        } else {
            printf("%-7d %-7s %-5.5s %-10s %-11s %-11s %-10s %s",
                   LSB_ARRAY_JOBID(job->jobId),
                   job->user,
                   status,
                   submitInfo->queue,
                   job->fromHost,
                   exec_host,
                   jobName,

                   subtime);
        }

bkill /bresume /bstop

[admin@lava1 ~]$ bkill 3333333333
3333333333: Illegal job ID.
[admin@lava1 ~]$ bkill 2147483647
Job <2147483647>: No matching job found
[admin@lava1 ~]$ bkill 2147483648
2147483648: Illegal job ID.

日志文件的写入

lsb.events / lsb.acct / lsb.events.index 只支持到int32,即最大值为2^31-1

/lsbatch/lib/lsb.log.c

//以下代码以JobNew事件为例, L2010
if (fprintf(log_fp, " %d %d %d %d %d %d %d %d %d %d",
                jobNewLog->jobId,
                jobNewLog->userId,
                jobNewLog->options,
                jobNewLog->numProcessors,
                (int) jobNewLog->submitTime,
                (int) jobNewLog->beginTime,
                (int) jobNewLog->termTime,
                jobNewLog->sigValue,
                (int) jobNewLog->chkpntPeriod,
                jobNewLog->restartPid) < 0)
        return LSBE_SYS_CALL;

编译打包