Add a few fixes to the TM interface and some functionality enhancements for the MPI parallel code launcher, mpiexec (http://www.osc.edu/~pw/mpiexec/). Copied from mpiexec/patch/pbs-2.3.12-mpiexec.diff. --- pbs-2.3.12-pw/README.mpiexec | 24 +++++++ pbs-2.3.12-pw/src/cmds/pbs_demux.c | 25 ------- pbs-2.3.12-pw/src/lib/Libifl/tm.c | 3 pbs-2.3.12-pw/src/resmom/start_exec.c | 113 ++++++++++++++++++++++++++-------- 4 files changed, 115 insertions(+), 50 deletions(-) diff -puN /dev/null README.mpiexec --- /dev/null 2003-09-15 09:02:32.000000000 -0400 +++ pbs-2.3.12-pw/README.mpiexec 2004-04-17 09:43:07.000000000 -0400 @@ -0,0 +1,24 @@ + +Documentation of changes applied as part of the mpiexec patch. +http://www.osc.edu/~pw/mpiexec/ +Last changed 04 Dec 2001. + + +General bug fix: + + src/lib/Libifl/tm.c + + Do not call DIS_tcp_setup(-1) if connection to the local mom failed + after 5 tries. + +Functionality additions: + + src/resmom/start_exec.c + + Added extension to get stdin/out/err files from the environment rather + than always using /dev/null during tm_spawn. + + src/cmds/pbs_demux.c + + Never expect a PBS_JOBCOOKIE to be delivered from a process. + diff -puN src/cmds/pbs_demux.c~mpiexec src/cmds/pbs_demux.c --- pbs-2.3.12/src/cmds/pbs_demux.c~mpiexec 2004-04-17 09:43:07.000000000 -0400 +++ pbs-2.3.12-pw/src/cmds/pbs_demux.c 2004-04-17 09:43:07.000000000 -0400 @@ -106,10 +106,8 @@ enum rwhere {invalid, new_out, new_err, struct routem { enum rwhere r_where; short r_nl; - short r_first; }; fd_set readset; -char *cookie = 0; void readit(int sock, struct routem *prm) @@ -127,18 +125,6 @@ void readit(int sock, struct routem *prm i = 0; if ((amt = read(sock, buf, 256)) > 0) { - if (prm->r_first == 1) { - - /* first data on connection must be the cookie to validate it */ - - i = strlen(cookie); - if (strncmp(buf, cookie, i) != 0) { - (void)close(sock); - prm->r_where = invalid; - FD_CLR(sock, &readset); - } - prm->r_first = 0; - } for (pc = buf+i; pc < buf+amt; ++pc) { #ifdef DEBUG if (prm->r_nl) { @@ -176,22 +162,12 @@ main(int argc, char *argv[]) parent = getppid(); - cookie = getenv("PBS_JOBCOOKIE"); - if (cookie == 0) { - fprintf(stderr, "%s: no PBS_JOBCOOKIE found in the env\n", - argv[0]); - exit(3); - } -#ifdef DEBUG - printf("Cookie found in environment: %s\n", cookie); -#endif maxfd = sysconf(_SC_OPEN_MAX); routem = (struct routem *)malloc(maxfd*sizeof(struct routem)); for (i=0; ir_where = invalid; (routem+i)->r_nl = 1; - (routem+i)->r_first = 0; } (routem+main_sock_out)->r_where = new_out; (routem+main_sock_err)->r_where = new_err; @@ -244,7 +220,6 @@ main(int argc, char *argv[]) newsock = accept(i, 0, 0); (routem+newsock)->r_where = (routem+i)->r_where== new_out ? old_out : old_err; FD_SET(newsock, &readset); - (routem+newsock)->r_first = 1; break; case old_out: case old_err: diff -puN src/lib/Libifl/tm.c~mpiexec src/lib/Libifl/tm.c --- pbs-2.3.12/src/lib/Libifl/tm.c~mpiexec 2004-04-17 09:43:07.000000000 -0400 +++ pbs-2.3.12-pw/src/lib/Libifl/tm.c 2004-04-17 09:43:07.000000000 -0400 @@ -451,7 +451,8 @@ localmom() } } - DIS_tcp_setup(local_conn); + if (local_conn >= 0) + DIS_tcp_setup(local_conn); return (local_conn); } diff -puN src/resmom/start_exec.c~mpiexec src/resmom/start_exec.c --- pbs-2.3.12/src/resmom/start_exec.c~mpiexec 2004-04-17 09:43:07.000000000 -0400 +++ pbs-2.3.12-pw/src/resmom/start_exec.c 2004-04-17 09:43:07.000000000 -0400 @@ -1389,6 +1389,42 @@ finish_exec(pjob) } /* + * Look for a certain environment variable which has a port# which should + * be opened on the MS to establish communication for one of the 3 stdio + * streams. >=0 return is that valid fd, -1 means no env var found, + * -2 means malformed env value or failure to connect. + */ +static int +search_env_and_open(const char *envname, u_long ipaddr) +{ + static char *id = "search_env_and_open"; + int i, len = strlen(envname); + + for (i=0; i 0) - (void)close(fd); + (void)dup2(fd0, 0); + if (fd0 > 0) + (void)close(fd0); } + /* look through env for a port# on MS we should use for stdout/err */ + if ((fd1 = search_env_and_open("MPIEXEC_STDOUT_PORT", ipaddr)) == -2) + starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); + if ((fd2 = search_env_and_open("MPIEXEC_STDERR_PORT", ipaddr)) == -2) + starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); + if (pjob->ji_numnodes > 1) { /* ** Open sockets to demux proc for stdout and stderr. */ - if ((fd = open_demux(ipaddr, pjob->ji_stdout)) == -1) + if (fd1 < 0 && (fd1 = open_demux(ipaddr,pjob->ji_stdout)) == -1) starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); - (void)dup2(fd, 1); - if (fd > 1) - (void)close(fd); - if ((fd = open_demux(ipaddr, pjob->ji_stderr)) == -1) + (void)dup2(fd1, 1); + if (fd1 > 1) + (void)close(fd1); + if (fd2 < 0 && (fd2 = open_demux(ipaddr,pjob->ji_stderr)) == -1) starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr); - (void)dup2(fd, 2); - if (fd > 2) - (void)close(fd); - - (void)write(1,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str, - strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str)); - (void)write(2,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str, - strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str)); + (void)dup2(fd2, 2); + if (fd2 > 2) + (void)close(fd2); } else if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags&ATR_VFLAG_SET) && (pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long > 0)) { /* interactive job, single node, write to pty */ - if ((pts = open_pty(pjob)) < 0) { + pts = -1; + if (fd1 < 0 || fd2 < 0) { + if ((pts = open_pty(pjob)) < 0) { log_err(errno, id,"cannot open slave"); starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr); - } - (void)dup2(pts, 1); - (void)dup2(pts, 2); - + } + if (fd1 < 0) + fd1 = pts; + if (fd2 < 0) + fd2 = pts; + } + (void)dup2(fd1, 1); + (void)dup2(fd2, 2); + if (fd1 != pts) + (void) close(fd1); + if (fd2 != pts) + (void) close(fd2); } else { /* normal batch job, single node, write straight to files */ - if (open_std_out_err(pjob) == -1) { + pts = -1; + if (fd1 < 0 || fd2 < 0) { + if (open_std_out_err(pjob) == -1) starter_return(kid_write, kid_read,JOB_EXEC_FAIL1,&sjr); } + if (fd1 >= 0) { + (void) close(1); + (void) dup2(fd1, 1); + if (fd1 > 1) + (void) close(fd1); + } + if (fd2 >= 0) { + (void) close(2); + (void) dup2(fd2, 2); + if (fd2 > 2) + (void) close(fd2); + } } log_close(0); _