Skip to content

Commit

Permalink
[FIXED JENKINS-22249] Detect if the wrapper shell script is dead, for…
Browse files Browse the repository at this point in the history
… example because the machine was rebooted.
  • Loading branch information
jglick committed Oct 8, 2014
1 parent d2e7c8b commit 41a11fb
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 4 deletions.
Expand Up @@ -67,12 +67,12 @@ public String getScript() {
shf.write(s, "UTF-8");
shf.chmod(0755);


String cmd = String.format("'%s' > '%s' 2>&1; echo $? > '%s'",
String cmd = String.format("echo $$ > '%s'; '%s' > '%s' 2>&1; echo $? > '%s'",
c.pidFile(ws),
shf,
c.getLogFile(ws),
c.getResultFile(ws)
);
)./* escape against EnvVars jobEnv in LocalLauncher.launch */replace("$", "$$");

Launcher.ProcStarter ps = launcher.launch().cmds("nohup", "sh", "-c", cmd).envs(envVars).pwd(ws);
try {
Expand All @@ -83,11 +83,15 @@ public String getScript() {
} catch (Exception x) { // ?
x.printStackTrace(listener.getLogger());
}
ps.stdout(listener); // for diagnosis in case wrapper script fails
ps.start();
return c;
}

/*package*/ static final class ShellController extends FileMonitoringController {

private int pid;

/**
 * Creates a controller; the body does nothing beyond invoking the superclass constructor.
 * @param ws passed unchanged to the superclass constructor
 */
private ShellController(FilePath ws) throws IOException, InterruptedException {
super(ws);
}
Expand All @@ -96,6 +100,37 @@ public FilePath getScriptFile(FilePath ws) {
return controlDir(ws).child("script.sh");
}

/**
 * Locates the file named {@code pid} inside the control directory for the given workspace.
 * @param ws the workspace whose control directory is consulted
 * @return the {@code pid} child of that control directory
 */
FilePath pidFile(FilePath ws) {
    final String pidFileName = "pid";
    return controlDir(ws).child(pidFileName);
}

/**
 * Looks up the process ID recorded in {@link #pidFile}, remembering it in the {@code pid} field
 * once a value has been read so that the file is not consulted again.
 * <p>When a value is first read, the {@link ProcessLiveness} cache for the workspace's channel
 * is cleared.
 * @param ws the workspace whose control directory holds the pid file
 * @return the recorded process ID, or 0 if it is not (yet) available
 * @throws IOException if the pid file has non-blank content which is not an integer
 */
private synchronized int pid(FilePath ws) throws IOException, InterruptedException {
    if (pid != 0) {
        // Already read on an earlier call.
        return pid;
    }
    FilePath pidFile = pidFile(ws);
    if (!pidFile.exists()) {
        return 0;
    }
    String text = pidFile.readToString().trim();
    if (text.isEmpty()) {
        // A shell redirection creates the file before the command writes into it, so the file
        // may briefly exist with no content. Treat that as "not yet available", not corruption.
        return 0;
    }
    try {
        pid = Integer.parseInt(text);
    } catch (NumberFormatException x) {
        throw new IOException("corrupted content in " + pidFile + ": " + x, x);
    }
    ProcessLiveness.reset(ws.getChannel());
    return pid;
}

/**
 * Reports the exit status, additionally detecting a process which is no longer running.
 * A non-null status from the superclass is returned unchanged. Otherwise, if a process ID is
 * known and {@link ProcessLiveness} reports that the process is not alive, -1 is returned.
 * @param workspace the workspace in which the task was launched
 * @return the superclass status, -1 if the process is believed dead, or null if neither applies
 */
@Override public Integer exitStatus(FilePath workspace) throws IOException, InterruptedException {
    final Integer recorded = super.exitStatus(workspace);
    if (recorded == null) {
        final int wrapperPid = pid(workspace);
        final boolean pidKnown = wrapperPid > 0;
        if (pidKnown && !ProcessLiveness.isAlive(workspace.getChannel(), wrapperPid)) {
            // arbitrary code to distinguish from 0 (success) and 1+ (observed failure)
            return -1;
        }
        return null;
    }
    return recorded;
}

private static final long serialVersionUID = 1L;
}

Expand Down
Expand Up @@ -127,7 +127,7 @@ private static class WriteLog implements FilePath.FileCallable<Long> {
}

// TODO would be more efficient to allow API to consolidate writeLog with exitStatus (save an RPC call)
@Override public final Integer exitStatus(FilePath workspace) throws IOException, InterruptedException {
@Override public Integer exitStatus(FilePath workspace) throws IOException, InterruptedException {
FilePath status = getResultFile(workspace);
if (status.exists()) {
try {
Expand Down
103 changes: 103 additions & 0 deletions src/main/java/org/jenkinsci/plugins/durabletask/ProcessLiveness.java
@@ -0,0 +1,103 @@
/*
* The MIT License
*
* Copyright 2014 Jesse Glick.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package org.jenkinsci.plugins.durabletask;

import hudson.remoting.Callable;
import hudson.remoting.VirtualChannel;
import hudson.util.ProcessTree;
import java.io.IOException;
import java.util.Map;
import java.util.WeakHashMap;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * Utility class to track whether a given process is still alive.
 * Since loading a complete {@link ProcessTree} may be expensive, this is done only once per {@link #CACHE_EXPIRY_NANOS}.
 * Might be more efficient and reliable to use JNA to look up this information with a direct system call,
 * but this would be a longer-term project with more platform-specific code.
 */
final class ProcessLiveness {

    /**
     * Maximum age of a cached process tree, in nanoseconds. By default, one minute.
     * Expressed in nanoseconds because age is measured with {@link System#nanoTime},
     * which is unaffected by adjustments to the wall clock.
     */
    private static final long CACHE_EXPIRY_NANOS = TimeUnit.MINUTES.toNanos(1);

    private static final Logger LOGGER = Logger.getLogger(ProcessLiveness.class.getName());

    /** Most recently loaded process tree for one channel; guarded by its own monitor. */
    private static final class ProcessTreeCache {
        /** Null until the first successful load. */
        ProcessTree tree;
        /** {@link System#nanoTime} reading taken when {@link #tree} was last loaded; meaningless while {@code tree} is null. */
        long lastLoadedNanos;
        ProcessTreeCache() {}
    }

    /** One cache per channel; all access is synchronized on the map itself. */
    private static final Map<VirtualChannel,ProcessTreeCache> processTrees = new WeakHashMap<VirtualChannel,ProcessTreeCache>();

    /**
     * Determines whether a process is believed to still be alive.
     * The process tree for the channel is (re)loaded if none is cached or the cached one is older
     * than {@link #CACHE_EXPIRY_NANOS}; otherwise the cached tree is consulted.
     * @param channel a connection to the machine on which it would be running
     * @param pid a process ID
     * @return true if it is probably still alive (but might have recently died); false if it is believed to not be running
     * @throws IOException if loading the process tree over the channel fails
     * @throws InterruptedException if interrupted while loading the process tree
     */
    public static boolean isAlive(VirtualChannel channel, int pid) throws IOException, InterruptedException {
        ProcessTreeCache cache;
        synchronized (processTrees) {
            cache = processTrees.get(channel);
            if (cache == null) {
                cache = new ProcessTreeCache();
                processTrees.put(channel, cache);
            }
        }
        synchronized (cache) {
            // Read the clock only after acquiring the lock, so that time spent waiting for
            // another thread's reload is not counted against the tree it just loaded.
            long now = System.nanoTime();
            // Subtraction (rather than comparing absolute readings) is the overflow-safe way to compare nanoTime values.
            if (cache.tree == null || now - cache.lastLoadedNanos > CACHE_EXPIRY_NANOS) {
                LOGGER.log(Level.FINE, "(re)loading process tree on {0}", channel);
                cache.tree = channel.call(new LoadProcessTree());
                cache.lastLoadedNanos = now;
            }
            return cache.tree.get(pid) != null;
        }
    }

    /** Task sent over the channel; returns {@link ProcessTree#get()} from wherever it runs. */
    private static final class LoadProcessTree implements Callable<ProcessTree,RuntimeException> {
        @Override public ProcessTree call() throws RuntimeException {
            return ProcessTree.get();
        }
    }

    /**
     * Clears any cache for a given machine.
     * Should be done when a new process has been started.
     * @param channel a connection to a machine
     */
    public static void reset(VirtualChannel channel) {
        synchronized (processTrees) {
            processTrees.remove(channel);
        }
    }

    /** Not instantiable: static utility methods only. */
    private ProcessLiveness() {}

}

0 comments on commit 41a11fb

Please sign in to comment.