For mpiexec-spawned jobs to survive a mom restart, and to enable proper accounting for all jobs that continue running across a mom restart, this patch fixes some of mom's behavior when it is restarted with the "-p" flag. Note that this patch adds functionality to the machine-specific part of the mom code for Linux only. Users of other system types could copy that code into their own machine-specific directory without too much trouble, but as it stands, this patch will break the build on non-Linux systems, because mom_main.c now calls scan_non_child_tasks(), which is defined only in src/resmom/linux/mom_mach.c.

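This patch does four things (the original list is missing from this copy; the summary below is reconstructed from the diff that follows):

1. Changes signal handling so that SIGINT, SIGTERM and the other shutdown signals make mom exit while leaving jobs running, and SIGUSR1 (previously ignored) makes mom kill all running jobs and then exit.
2. Adds scan_non_child_tasks() to the Linux machine-dependent code. A recovering mom calls it from its main loop to find running tasks whose sessions no longer exist and mark them as exited, since such tasks are not mom's children and their exit would otherwise never be reported to tm clients.
3. Calls set_globid() on each job recovered in init_abort_jobs(), so that mom does not coredump in response to tm_spawn.
4. Calls check_pwd() in start_process(), because a restarted mom has not yet called it for the job and it is needed to spawn tasks (ji_grpcache).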

--- pbs-2.3.12/doc/man8/pbs_mom.8B | 10 ++-- pbs-2.3.12/src/include/mom_func.h | 1 pbs-2.3.12/src/resmom/catch_child.c | 4 + pbs-2.3.12/src/resmom/linux/mom_mach.c | 76 +++++++++++++++++++++++++++++- pbs-2.3.12/src/resmom/linux/mom_start.c | 3 - pbs-2.3.12/src/resmom/mom_main.c | 80 ++++++++++++++++++++------------ pbs-2.3.12/src/resmom/start_exec.c | 18 ++++++- 7 files changed, 152 insertions(+), 40 deletions(-) diff -puN doc/man8/pbs_mom.8B~mom-restart doc/man8/pbs_mom.8B --- pbs-2.3.12-orig/doc/man8/pbs_mom.8B~mom-restart 2004-08-06 17:52:06.000000000 -0400 +++ pbs-2.3.12/doc/man8/pbs_mom.8B 2004-08-06 17:52:06.000000000 -0400 @@ -320,10 +320,12 @@ file, and reinitialize resource structur results in a log file entry. The signal is used to limit the time taken by certain children processes, such as the prologue and epilogue. .IP "SIGINT and SIGTERM" -Result in pbs_mom terminating all running children and exiting. This is the -action for the following signals as well: SIGXCPU, SIGXFSZ, SIGCPULIM, -and SIGSHUTDN. -.IP "SIGPIPE, SIGUSR1, SIGUSR2, SIGINFO" +Result in pbs_mom exiting without terminating any running jobs. +This is the action for the following signals as well: SIGXCPU, SIGXFSZ, +SIGCPULIM, and SIGSHUTDN. +.IP SIGUSR1 +causes mom to kill all running jobs on the node, then exit. +.IP "SIGPIPE, SIGUSR2, SIGINFO" are ignored. .LP All other signals have their default behavior installed. 
diff -puN src/include/mom_func.h~mom-restart src/include/mom_func.h --- pbs-2.3.12-orig/src/include/mom_func.h~mom-restart 2004-08-06 17:52:06.000000000 -0400 +++ pbs-2.3.12/src/include/mom_func.h 2004-08-06 17:52:06.000000000 -0400 @@ -136,6 +136,7 @@ extern void mom_deljob A_((job *)); extern void mom_freenodes A_((job *)); extern void scan_for_exiting(); extern void scan_for_terminated(); +extern void scan_non_child_tasks(void); extern int set_job A_((job *, struct startjob_rtn *)); extern void set_globid A_((job *, struct startjob_rtn *)); extern int set_mach_vars A_((job *, struct var_table *)); diff -puN src/resmom/catch_child.c~mom-restart src/resmom/catch_child.c --- pbs-2.3.12-orig/src/resmom/catch_child.c~mom-restart 2004-08-06 17:52:06.000000000 -0400 +++ pbs-2.3.12/src/resmom/catch_child.c 2004-08-06 17:52:06.000000000 -0400 @@ -699,6 +699,10 @@ void init_abort_jobs(recover) if (pj == NULL) continue; + /* set the globid so mom does not coredump in response + * to tm_spawn */ + set_globid(pj, 0); + append_link(&svr_alljobs, &pj->ji_alljobs, pj); job_nodes(pj); task_recov(pj); diff -puN src/resmom/linux/mom_mach.c~mom-restart src/resmom/linux/mom_mach.c --- pbs-2.3.12-orig/src/resmom/linux/mom_mach.c~mom-restart 2004-08-06 17:52:06.000000000 -0400 +++ pbs-2.3.12/src/resmom/linux/mom_mach.c 2004-08-06 17:52:06.000000000 -0400 @@ -114,6 +114,7 @@ #include "job.h" #include "log.h" #include "mom_mach.h" +#include "mom_func.h" #include "resmon.h" #include "../rm_dep.h" @@ -1071,8 +1072,22 @@ kill_task(const task *ptask, int sig) } if (sesid == ps->session) { - (void)kill(ps->pid, sig); - ++ct; + if (ps->pid == 0) { + sprintf(log_buffer, + "%s: not killing pid 0 with sig %d", + __func__, sig); + log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, + ptask->ti_job->ji_qs.ji_jobid, log_buffer); + } else { + sprintf(log_buffer, + "%s: killing pid %d task %d with sig %d", + __func__, ps->pid, ptask->ti_qs.ti_task, sig); + log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, + 
ptask->ti_job->ji_qs.ji_jobid, log_buffer); + + (void)kill(ps->pid, sig); + ++ct; + } } } return ct; @@ -2216,3 +2231,60 @@ struct rm_attribute *attrib; return ret_string; } + +/* + * For a recovering (-p) mom, look through existing tasks in existing + * jobs for things that have exited that are not owned by us through a + * parent-child relationship. Otherwise we cannot report back to tm + * clients when tasks have exited. + */ +void +scan_non_child_tasks(void) +{ + job *job; + extern list_head svr_alljobs; + + for (job = GET_NEXT(svr_alljobs); job; job = GET_NEXT(job->ji_alljobs)) { + task *task; + for (task = GET_NEXT(job->ji_tasks); task; + task = GET_NEXT(task->ti_jobtask)) { + struct dirent *dent; + int found; + + /* only check on tasks that we think should still be around */ + if (task->ti_qs.ti_status != TI_STATE_RUNNING) + continue; + + /* look for processes with this session id */ + found = 0; + rewinddir(pdir); + while ((dent = readdir(pdir)) != NULL) { + proc_stat_t *ps; + if (!isdigit(dent->d_name[0])) + continue; + ps = get_proc_stat(atoi(dent->d_name)); + if (!ps) + continue; + + if (ps->session == task->ti_qs.ti_sid) { + ++found; + break; + } + } + if (!found) { + char buf[1024]; + extern int exiting_tasks; + sprintf(buf, + "found exited session %d for task %d in job %s", + task->ti_qs.ti_sid, task->ti_qs.ti_task, job->ji_qs.ji_jobid); + log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, buf); + + task->ti_qs.ti_exitstat = 0; /* actually unknown */ + task->ti_qs.ti_status = TI_STATE_EXITED; + task_save(task); + exiting_tasks = 1; + } + } + } +} + diff -puN src/resmom/linux/mom_start.c~mom-restart src/resmom/linux/mom_start.c --- pbs-2.3.12-orig/src/resmom/linux/mom_start.c~mom-restart 2004-08-06 17:52:06.000000000 -0400 +++ pbs-2.3.12/src/resmom/linux/mom_start.c 2004-08-06 17:52:06.000000000 -0400 @@ -264,7 +264,8 @@ void scan_for_terminated() ptask->ti_qs.ti_exitstat = exiteval; ptask->ti_qs.ti_status = TI_STATE_EXITED; task_save(ptask); - 
sprintf(log_buffer, "task %d terminated", ptask->ti_qs.ti_task); + sprintf(log_buffer, "%s: task %d terminated, sid %d", + __func__, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid); LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); diff -puN src/resmom/mom_main.c~mom-restart src/resmom/mom_main.c --- pbs-2.3.12-orig/src/resmom/mom_main.c~mom-restart 2004-08-06 17:52:06.000000000 -0400 +++ pbs-2.3.12/src/resmom/mom_main.c 2004-08-06 17:52:06.000000000 -0400 @@ -183,10 +183,11 @@ double wallfactor = 1.00; /* Local Data Items */ static char *log_file = (char *)0; -static int mom_run_state; +static enum { MOM_RUN_STATE_RUNNING, MOM_RUN_STATE_EXIT, MOM_RUN_STATE_KILLALL } mom_run_state; static int call_hup = 0; static int nconfig; static char *path_log; +static int recover = 0; struct config_list { struct config c; @@ -234,7 +235,7 @@ extern void scan_for_terminated(); /* Local public functions */ -void stop_me A_((int)); +static void stop_me A_((int sig)); /* Local private functions */ @@ -1549,9 +1550,7 @@ tcp_request(fd) * Kill a job. * Call with the job pointer and a signal number. 
*/ -int kill_job(pjob, sig) - job *pjob; - int sig; +int kill_job(job *pjob, int sig) { task *ptask; int ct = 0; @@ -1562,10 +1561,15 @@ int kill_job(pjob, sig) ptask = (task *)GET_NEXT(pjob->ji_tasks); while (ptask) { if (ptask->ti_qs.ti_status == TI_STATE_RUNNING) { + log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, + pjob->ji_qs.ji_jobid, + "kill_job found a task to kill"); ct += kill_task(ptask, sig); } ptask = (task *)GET_NEXT(ptask->ti_jobtask); } + log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, + pjob->ji_qs.ji_jobid, "kill_job done"); return ct; } @@ -1584,8 +1588,11 @@ static void finish_loop(waittime) if (termin_child) scan_for_terminated(); + /* if -p, must poll tasks inside jobs to look for completion */ + if (recover == 2) + scan_non_child_tasks(); if (exiting_tasks) - scan_for_exiting(); + scan_for_exiting(); /* unblock signals */ if (sigprocmask(SIG_UNBLOCK, &allsigs, NULL) == -1) @@ -1777,7 +1784,6 @@ main(int argc, char *argv[]) char *mom_home; task *ptask; char *ptr; - int recover = 0; int tryport; int rppfd; /* fd for rm and im comm */ int privfd; /* fd for sending job info */ @@ -2071,7 +2077,6 @@ main(int argc, char *argv[]) */ act.sa_handler = SIG_IGN; sigaction( SIGPIPE, &act, NULL); - sigaction( SIGUSR1, &act, NULL); sigaction( SIGUSR2, &act, NULL); #ifdef SIGINFO sigaction( SIGINFO, &act, NULL); @@ -2107,6 +2112,7 @@ main(int argc, char *argv[]) act.sa_handler = stop_me; /* shutdown for these */ sigaction( SIGINT, &act, NULL); sigaction( SIGTERM, &act, NULL); + sigaction( SIGUSR1, &act, NULL); #ifdef SIGXCPU sigaction(SIGXCPU, &act, NULL); #endif @@ -2223,7 +2229,8 @@ main(int argc, char *argv[]) * section constitutes the "main" loop of MOM */ - for (mom_run_state=1; mom_run_state; finish_loop(wait_time)) { + mom_run_state = MOM_RUN_STATE_RUNNING; + for (; mom_run_state == MOM_RUN_STATE_RUNNING; finish_loop(wait_time)) { if (call_hup) process_hup(); @@ -2385,25 +2392,27 @@ main(int argc, char *argv[]) } } - /* kill any running jobs */ - - pjob = 
(job *)GET_NEXT(svr_alljobs); - while (pjob) { - if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) { - (void)kill_job(pjob, SIGKILL); - pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING; - job_save(pjob, SAVEJOB_QUICK); - } - else - term_job(pjob); - - pjob = (job *)GET_NEXT(pjob->ji_alljobs); + if (mom_run_state == MOM_RUN_STATE_KILLALL) { + /* kill any running jobs */ + pjob = (job *)GET_NEXT(svr_alljobs); + while (pjob) { + if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) { + (void)kill_job(pjob, SIGKILL); + pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING; + job_save(pjob, SAVEJOB_QUICK); + } + else + term_job(pjob); + + pjob = (job *)GET_NEXT(pjob->ji_alljobs); + } + + if (termin_child) + scan_for_terminated(); + if (exiting_tasks) + scan_for_exiting(); } - if (termin_child) - scan_for_terminated(); - if (exiting_tasks) - scan_for_exiting(); (void)mom_close_poll(); rpp_shutdown(); @@ -2438,11 +2447,22 @@ static char *mk_dirs(base) * stop_me = signal handler for SIGTERM */ -void stop_me(sig) - int sig; +static void +stop_me(int sig) { - sprintf(log_buffer, "caught signal %d", sig); + const char *dowhat; + + if (sig == SIGUSR1) { + /* kill all jobs, then exit */ + mom_run_state = MOM_RUN_STATE_KILLALL; + dowhat = "killing all jobs then exiting"; + } else { + /* just exit, leaving jobs running */ + mom_run_state = MOM_RUN_STATE_EXIT; + dowhat = "leaving jobs running, just exiting"; + } + + sprintf(log_buffer, "caught signal %d: %s", sig, dowhat); log_record(PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buffer); - mom_run_state = 0; } diff -puN src/resmom/start_exec.c~mom-restart src/resmom/start_exec.c --- pbs-2.3.12-orig/src/resmom/start_exec.c~mom-restart 2004-08-06 17:52:06.000000000 -0400 +++ pbs-2.3.12/src/resmom/start_exec.c 2004-08-06 17:52:06.000000000 -0400 @@ -195,8 +195,7 @@ static void no_hang(sig) } struct passwd * -check_pwd(pjob) - job *pjob; +check_pwd(job *pjob) { struct passwd *pwdp; struct group *grpp; @@ 
-1503,6 +1502,17 @@ start_process(ptask, argv, envp) } /* + * A restarted mom will not have called this yet, but it is needed + * to spawn tasks (ji_grpcache). + */ + if (!check_pwd(pjob)) { + sprintf(log_buffer, "job %s task %d check_pwd failed", + pjob->ji_qs.ji_jobid, ptask->ti_qs.ti_task); + log_err(-1, id, log_buffer); + return -1; + } + + /* ** Begin a new process for the fledgling task. */ if ((pid = fork_me(-1)) == -1) @@ -1551,7 +1561,9 @@ start_process(ptask, argv, envp) pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING; job_save(pjob, SAVEJOB_QUICK); } - (void)sprintf(log_buffer, "task started, %s", argv[0]); + (void)sprintf(log_buffer, + "%s: task started, tid %d, sid %d, cmd %s", + __func__, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid, argv[0]); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buffer); return 0; _