For mpiexec-spawned jobs to survive across a mom restart, and to enable
proper accounting for all jobs which continue across a mom restart, this
patch fixes some behavior of mom when restarted with the "-p" flag. Note
that this patch adds functionality to the machine-specific part of the mom
code for linux only. Users of other system types could cut-n-paste that code
without too much problem, but as it stands, this patch will break compilation
on non-linux systems.
This patch does four things:
- Fix coredump resulting from tm_spawn to restarted pbs_mom
- Avoid race condition by which pbs_mom would sometimes kill itself
as tasks exit.
- Make a restarted pbs_mom search for and report exiting tasks
from jobs which were started before the old mom was killed.
- Change response of pbs_mom to various signals. Now the default
is to leave all jobs running. If you want to stop all jobs,
USR1 can be used to achieve the old behavior.
---
pbs-2.3.12/doc/man8/pbs_mom.8B | 10 ++--
pbs-2.3.12/src/include/mom_func.h | 1
pbs-2.3.12/src/resmom/catch_child.c | 4 +
pbs-2.3.12/src/resmom/linux/mom_mach.c | 76 +++++++++++++++++++++++++++++-
pbs-2.3.12/src/resmom/linux/mom_start.c | 3 -
pbs-2.3.12/src/resmom/mom_main.c | 80 ++++++++++++++++++++------------
pbs-2.3.12/src/resmom/start_exec.c | 18 ++++++-
7 files changed, 152 insertions(+), 40 deletions(-)
diff -puN doc/man8/pbs_mom.8B~mom-restart doc/man8/pbs_mom.8B
--- pbs-2.3.12-orig/doc/man8/pbs_mom.8B~mom-restart 2004-08-06 17:52:06.000000000 -0400
+++ pbs-2.3.12/doc/man8/pbs_mom.8B 2004-08-06 17:52:06.000000000 -0400
@@ -320,10 +320,12 @@ file, and reinitialize resource structur
results in a log file entry. The signal is used to limit the time taken by
certain children processes, such as the prologue and epilogue.
.IP "SIGINT and SIGTERM"
-Result in pbs_mom terminating all running children and exiting. This is the
-action for the following signals as well: SIGXCPU, SIGXFSZ, SIGCPULIM,
-and SIGSHUTDN.
-.IP "SIGPIPE, SIGUSR1, SIGUSR2, SIGINFO"
+Result in pbs_mom exiting without terminating any running jobs.
+This is the action for the following signals as well: SIGXCPU, SIGXFSZ,
+SIGCPULIM, and SIGSHUTDN.
+.IP SIGUSR1
+causes mom to kill all running jobs on the node, then exit.
+.IP "SIGPIPE, SIGUSR1, SIGINFO"
are ignored.
.LP
All other signals have their default behavior installed.
diff -puN src/include/mom_func.h~mom-restart src/include/mom_func.h
--- pbs-2.3.12-orig/src/include/mom_func.h~mom-restart 2004-08-06 17:52:06.000000000 -0400
+++ pbs-2.3.12/src/include/mom_func.h 2004-08-06 17:52:06.000000000 -0400
@@ -136,6 +136,7 @@ extern void mom_deljob A_((job *));
extern void mom_freenodes A_((job *));
extern void scan_for_exiting();
extern void scan_for_terminated();
+extern void scan_non_child_tasks(void);
extern int set_job A_((job *, struct startjob_rtn *));
extern void set_globid A_((job *, struct startjob_rtn *));
extern int set_mach_vars A_((job *, struct var_table *));
diff -puN src/resmom/catch_child.c~mom-restart src/resmom/catch_child.c
--- pbs-2.3.12-orig/src/resmom/catch_child.c~mom-restart 2004-08-06 17:52:06.000000000 -0400
+++ pbs-2.3.12/src/resmom/catch_child.c 2004-08-06 17:52:06.000000000 -0400
@@ -699,6 +699,10 @@ void init_abort_jobs(recover)
if (pj == NULL)
continue;
+ /* set the globid so mom does not coredump in response
+ * to tm_spawn */
+ set_globid(pj, 0);
+
append_link(&svr_alljobs, &pj->ji_alljobs, pj);
job_nodes(pj);
task_recov(pj);
diff -puN src/resmom/linux/mom_mach.c~mom-restart src/resmom/linux/mom_mach.c
--- pbs-2.3.12-orig/src/resmom/linux/mom_mach.c~mom-restart 2004-08-06 17:52:06.000000000 -0400
+++ pbs-2.3.12/src/resmom/linux/mom_mach.c 2004-08-06 17:52:06.000000000 -0400
@@ -114,6 +114,7 @@
#include "job.h"
#include "log.h"
#include "mom_mach.h"
+#include "mom_func.h"
#include "resmon.h"
#include "../rm_dep.h"
@@ -1071,8 +1072,22 @@ kill_task(const task *ptask, int sig)
}
if (sesid == ps->session) {
- (void)kill(ps->pid, sig);
- ++ct;
+ if (ps->pid == 0) {
+ sprintf(log_buffer,
+ "%s: not killing pid 0 with sig %d",
+ __func__, sig);
+ log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
+ ptask->ti_job->ji_qs.ji_jobid, log_buffer);
+ } else {
+ sprintf(log_buffer,
+ "%s: killing pid %d task %d with sig %d",
+ __func__, ps->pid, ptask->ti_qs.ti_task, sig);
+ log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
+ ptask->ti_job->ji_qs.ji_jobid, log_buffer);
+
+ (void)kill(ps->pid, sig);
+ ++ct;
+ }
}
}
return ct;
@@ -2216,3 +2231,60 @@ struct rm_attribute *attrib;
return ret_string;
}
+
+/*
+ * For a recovering (-p) mom, look through existing tasks in existing
+ * jobs for things that have exited that are not owned by us through a
+ * parent-child relationship. Otherwise we cannot report back to tm
+ * clients when tasks have exited.
+ */
+void
+scan_non_child_tasks(void)
+{
+ job *job;
+ extern list_head svr_alljobs;
+
+ for (job = GET_NEXT(svr_alljobs); job; job = GET_NEXT(job->ji_alljobs)) {
+ task *task;
+ for (task = GET_NEXT(job->ji_tasks); task;
+ task = GET_NEXT(task->ti_jobtask)) {
+ struct dirent *dent;
+ int found;
+
+ /* only check on tasks that we think should still be around */
+ if (task->ti_qs.ti_status != TI_STATE_RUNNING)
+ continue;
+
+ /* look for processes with this session id */
+ found = 0;
+ rewinddir(pdir);
+ while ((dent = readdir(pdir)) != NULL) {
+ proc_stat_t *ps;
+ if (!isdigit(dent->d_name[0]))
+ continue;
+ ps = get_proc_stat(atoi(dent->d_name));
+ if (!ps)
+ continue;
+
+ if (ps->session == task->ti_qs.ti_sid) {
+ ++found;
+ break;
+ }
+ }
+ if (!found) {
+ char buf[1024];
+ extern int exiting_tasks;
+ sprintf(buf,
+ "found exited session %d for task %d in job %s",
+ task->ti_qs.ti_sid, task->ti_qs.ti_task, job->ji_qs.ji_jobid);
+ log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, buf);
+
+ task->ti_qs.ti_exitstat = 0; /* actually unknown */
+ task->ti_qs.ti_status = TI_STATE_EXITED;
+ task_save(task);
+ exiting_tasks = 1;
+ }
+ }
+ }
+}
+
diff -puN src/resmom/linux/mom_start.c~mom-restart src/resmom/linux/mom_start.c
--- pbs-2.3.12-orig/src/resmom/linux/mom_start.c~mom-restart 2004-08-06 17:52:06.000000000 -0400
+++ pbs-2.3.12/src/resmom/linux/mom_start.c 2004-08-06 17:52:06.000000000 -0400
@@ -264,7 +264,8 @@ void scan_for_terminated()
ptask->ti_qs.ti_exitstat = exiteval;
ptask->ti_qs.ti_status = TI_STATE_EXITED;
task_save(ptask);
- sprintf(log_buffer, "task %d terminated", ptask->ti_qs.ti_task);
+ sprintf(log_buffer, "%s: task %d terminated, sid %d",
+ __func__, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid);
LOG_EVENT(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB,
pjob->ji_qs.ji_jobid, log_buffer);
diff -puN src/resmom/mom_main.c~mom-restart src/resmom/mom_main.c
--- pbs-2.3.12-orig/src/resmom/mom_main.c~mom-restart 2004-08-06 17:52:06.000000000 -0400
+++ pbs-2.3.12/src/resmom/mom_main.c 2004-08-06 17:52:06.000000000 -0400
@@ -183,10 +183,11 @@ double wallfactor = 1.00;
/* Local Data Items */
static char *log_file = (char *)0;
-static int mom_run_state;
+static enum { MOM_RUN_STATE_RUNNING, MOM_RUN_STATE_EXIT, MOM_RUN_STATE_KILLALL } mom_run_state;
static int call_hup = 0;
static int nconfig;
static char *path_log;
+static int recover = 0;
struct config_list {
struct config c;
@@ -234,7 +235,7 @@ extern void scan_for_terminated();
/* Local public functions */
-void stop_me A_((int));
+static void stop_me A_((int sig));
/* Local private functions */
@@ -1549,9 +1550,7 @@ tcp_request(fd)
* Kill a job.
* Call with the job pointer and a signal number.
*/
-int kill_job(pjob, sig)
- job *pjob;
- int sig;
+int kill_job(job *pjob, int sig)
{
task *ptask;
int ct = 0;
@@ -1562,10 +1561,15 @@ int kill_job(pjob, sig)
ptask = (task *)GET_NEXT(pjob->ji_tasks);
while (ptask) {
if (ptask->ti_qs.ti_status == TI_STATE_RUNNING) {
+ log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
+ pjob->ji_qs.ji_jobid,
+ "kill_job found a task to kill");
ct += kill_task(ptask, sig);
}
ptask = (task *)GET_NEXT(ptask->ti_jobtask);
}
+ log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
+ pjob->ji_qs.ji_jobid, "kill_job done");
return ct;
}
@@ -1584,8 +1588,11 @@ static void finish_loop(waittime)
if (termin_child)
scan_for_terminated();
+ /* if -p, must poll tasks inside jobs to look for completion */
+ if (recover == 2)
+ scan_non_child_tasks();
if (exiting_tasks)
- scan_for_exiting();
+ scan_for_exiting();
/* unblock signals */
if (sigprocmask(SIG_UNBLOCK, &allsigs, NULL) == -1)
@@ -1777,7 +1784,6 @@ main(int argc, char *argv[])
char *mom_home;
task *ptask;
char *ptr;
- int recover = 0;
int tryport;
int rppfd; /* fd for rm and im comm */
int privfd; /* fd for sending job info */
@@ -2071,7 +2077,6 @@ main(int argc, char *argv[])
*/
act.sa_handler = SIG_IGN;
sigaction( SIGPIPE, &act, NULL);
- sigaction( SIGUSR1, &act, NULL);
sigaction( SIGUSR2, &act, NULL);
#ifdef SIGINFO
sigaction( SIGINFO, &act, NULL);
@@ -2107,6 +2112,7 @@ main(int argc, char *argv[])
act.sa_handler = stop_me; /* shutdown for these */
sigaction( SIGINT, &act, NULL);
sigaction( SIGTERM, &act, NULL);
+ sigaction( SIGUSR1, &act, NULL);
#ifdef SIGXCPU
sigaction(SIGXCPU, &act, NULL);
#endif
@@ -2223,7 +2229,8 @@ main(int argc, char *argv[])
* section constitutes the "main" loop of MOM
*/
- for (mom_run_state=1; mom_run_state; finish_loop(wait_time)) {
+ mom_run_state = MOM_RUN_STATE_RUNNING;
+ for (; mom_run_state == MOM_RUN_STATE_RUNNING; finish_loop(wait_time)) {
if (call_hup)
process_hup();
@@ -2385,25 +2392,27 @@ main(int argc, char *argv[])
}
}
- /* kill any running jobs */
-
- pjob = (job *)GET_NEXT(svr_alljobs);
- while (pjob) {
- if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) {
- (void)kill_job(pjob, SIGKILL);
- pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
- job_save(pjob, SAVEJOB_QUICK);
- }
- else
- term_job(pjob);
-
- pjob = (job *)GET_NEXT(pjob->ji_alljobs);
+ if (mom_run_state == MOM_RUN_STATE_KILLALL) {
+ /* kill any running jobs */
+ pjob = (job *)GET_NEXT(svr_alljobs);
+ while (pjob) {
+ if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) {
+ (void)kill_job(pjob, SIGKILL);
+ pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;
+ job_save(pjob, SAVEJOB_QUICK);
+ }
+ else
+ term_job(pjob);
+
+ pjob = (job *)GET_NEXT(pjob->ji_alljobs);
+ }
+
+ if (termin_child)
+ scan_for_terminated();
+ if (exiting_tasks)
+ scan_for_exiting();
}
- if (termin_child)
- scan_for_terminated();
- if (exiting_tasks)
- scan_for_exiting();
(void)mom_close_poll();
rpp_shutdown();
@@ -2438,11 +2447,22 @@ static char *mk_dirs(base)
* stop_me = signal handler for SIGTERM
*/
-void stop_me(sig)
- int sig;
+static void
+stop_me(int sig)
{
- sprintf(log_buffer, "caught signal %d", sig);
+ const char *dowhat;
+
+ if (sig == SIGUSR1) {
+ /* kill all jobs, then exit */
+ mom_run_state = MOM_RUN_STATE_KILLALL;
+ dowhat = "killing all jobs then exiting";
+ } else {
+ /* just exit, leaving jobs running */
+ mom_run_state = MOM_RUN_STATE_EXIT;
+ dowhat = "leaving jobs running, just exiting";
+ }
+
+ sprintf(log_buffer, "caught signal %d: %s", sig, dowhat);
log_record(PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER,
msg_daemonname, log_buffer);
- mom_run_state = 0;
}
diff -puN src/resmom/start_exec.c~mom-restart src/resmom/start_exec.c
--- pbs-2.3.12-orig/src/resmom/start_exec.c~mom-restart 2004-08-06 17:52:06.000000000 -0400
+++ pbs-2.3.12/src/resmom/start_exec.c 2004-08-06 17:52:06.000000000 -0400
@@ -195,8 +195,7 @@ static void no_hang(sig)
}
struct passwd *
-check_pwd(pjob)
- job *pjob;
+check_pwd(job *pjob)
{
struct passwd *pwdp;
struct group *grpp;
@@ -1503,6 +1502,17 @@ start_process(ptask, argv, envp)
}
/*
+ * A restarted mom will not have called this yet, but it is needed
+ * to spawn tasks (ji_grpcache).
+ */
+ if (!check_pwd(pjob)) {
+ sprintf(log_buffer, "job %s task %d check_pwd failed",
+ pjob->ji_qs.ji_jobid, ptask->ti_qs.ti_task);
+ log_err(-1, id, log_buffer);
+ return -1;
+ }
+
+ /*
** Begin a new process for the fledgling task.
*/
if ((pid = fork_me(-1)) == -1)
@@ -1551,7 +1561,9 @@ start_process(ptask, argv, envp)
pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
job_save(pjob, SAVEJOB_QUICK);
}
- (void)sprintf(log_buffer, "task started, %s", argv[0]);
+ (void)sprintf(log_buffer,
+ "%s: task started, tid %d, sid %d, cmd %s",
+ __func__, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid, argv[0]);
log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB,
pjob->ji_qs.ji_jobid, log_buffer);
return 0;
_