Skip to content

Commit

Permalink
Need to hold a lock when disconnecting or you can trigger JENKINS-286…
Browse files Browse the repository at this point in the history
…90 style deadlocks
  • Loading branch information
stephenc committed Jul 9, 2015
1 parent a37a078 commit 3346565
Show file tree
Hide file tree
Showing 2 changed files with 132 additions and 25 deletions.
Expand Up @@ -45,12 +45,26 @@ public class MansionComputer extends AbstractCloudComputer<MansionSlave> {
private final MansionSlave slave;
private long creationTime = System.currentTimeMillis();
private long onlineTime = 0;
private boolean disconnectInProgress;

MansionComputer(MansionSlave slave) {
super(slave);
this.slave = slave;
}

public synchronized boolean isDisconnectInProgress() {
return disconnectInProgress;

This comment has been minimized.

Copy link
@KostyaSha

KostyaSha Jul 19, 2015

Member

Wouldn't such logic duplicate OnceRetentionStrategy?

This comment has been minimized.

Copy link
@stephenc

stephenc Jul 20, 2015

Author Member

Don't get me started on how messed up this is. The specific retention strategy is not configurable... and gets replaced in certain circumstances, so I cannot maintain state in the retention strategy...

This comment has been minimized.

Copy link
@KostyaSha

KostyaSha Jul 20, 2015

Member

I also had ideas add disconnect var in computer. Shouldn't be better add it in CloudAPI?

This comment has been minimized.

Copy link
@stephenc

stephenc Jul 20, 2015

Author Member

Well given that I want to replace the (searches for a suitable word...) impedance-mismatched-cloud-api with one that is better suited for implementers and users alike... hacks will have to live until I get the time to fix.

}

/*package*/ synchronized void setDisconnectInProgress(boolean disconnectInProgress) {
this.disconnectInProgress = disconnectInProgress;
}

@Override
public boolean isAcceptingTasks() {
return !isDisconnectInProgress() && super.isAcceptingTasks();
}

/**
* {@link MansionComputer} is not configurable.
*
Expand Down
Expand Up @@ -27,11 +27,15 @@
import hudson.model.Computer;
import hudson.model.Executor;
import hudson.model.Node;
import hudson.model.Queue;
import hudson.slaves.CloudSlaveRetentionStrategy;
import hudson.slaves.OfflineCause;
import hudson.util.TimeUnit2;
import jenkins.model.Jenkins;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.lang.reflect.Modifier;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
Expand All @@ -40,6 +44,8 @@

public class MansionRetentionStrategy <T extends MansionComputer> extends CloudSlaveRetentionStrategy<T> {

private transient volatile boolean disconnectInProgress;

@Override
public long check(T c) {
long nextCheck = super.check(c);
Expand Down Expand Up @@ -72,32 +78,68 @@ private boolean shouldHaveConnectedByNow(T c) {
}

@Override
protected void kill(Node n) throws IOException {
MansionComputer computer = (MansionComputer) n.toComputer();
// attempt to heuristically detect a race condition
// where an executor accepted a task after we checked for
// idleness,
// but before we marked it as unavailable for tasks
// See JENKINS-23676
LOGGER.log(Level.FINE, "Taking node offline since it seems to be idle" + n.getNodeName());
protected void kill(final Node n) throws IOException {
final MansionComputer computer = (MansionComputer) n.toComputer();

final String nodeName = n.getNodeName();
LOGGER.log(Level.FINE, "Taking node {0} offline since it seems to be idle", n.getNodeName());
// we need to have a private block as other threads could try to turn on accepting tasks
computer.setDisconnectInProgress(true);
// set accepting tasks so that others can see this flag if expecting it
computer.setAcceptingTasks(false);
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
LOGGER.log(Level.FINE, "Interrupted while sleeping before removing node " + n.getNodeName());
return;
}
if (!computer.isIdle() && computer.isOnline()) {
LOGGER.log(FINE, computer.getName() + " is no longer idle, aborting termination.");
// we lost the race -- mark it as back online
computer.setAcceptingTasks(true);
return;
}
for (Executor e : computer.getExecutors()) {
e.interrupt();
synchronized (this) {
if (disconnectInProgress) {
// if we already have a background thread running, then we can return immediately
return;
}
disconnectInProgress = true;
}
LOGGER.log(Level.FINE, "Finally removing node " + n.getNodeName());
super.kill(n);
Computer.threadPoolForRemoting.submit(new Runnable() {
public void run() {
// TODO once Jenkins 1.607+ we can remove the sleep as the withLock will give atomic guarantee

// attempt to heuristically detect a race condition
// where an executor accepted a task after we checked for
// idleness,
// but before we marked it as unavailable for tasks
// See JENKINS-23676
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
LOGGER.log(Level.FINE, "Interrupted while sleeping before removing node {0}", nodeName);
return;
}

queueLock.withLock(new Runnable() {
public void run() {
if (!computer.isIdle() && computer.isOnline()) {
LOGGER.log(FINE, computer.getName() + " is no longer idle, aborting termination.");
// we lost the race -- mark it as back online
computer.setAcceptingTasks(true);
computer.setDisconnectInProgress(false);
synchronized (MansionRetentionStrategy.this) {
disconnectInProgress = false;
}
return;
}
// TODO figure out why this cannot just be computer.getNode().terminate()
try {
for (Executor e : computer.getExecutors()) {
e.interrupt();
}
LOGGER.log(Level.FINE, "Finally removing node " + n.getNodeName());
MansionRetentionStrategy.super.kill(n);
} catch (IOException e) {
computer.setAcceptingTasks(true);
computer.setDisconnectInProgress(false);
synchronized (MansionRetentionStrategy.this) {
disconnectInProgress = false;
}
}
}
});
}
});
}

/**
Expand All @@ -114,4 +156,55 @@ protected long getIdleMaxTime() {


private static Logger LOGGER = Logger.getLogger(MansionRetentionStrategy.class.getName());

private static final QueueLock queueLock = newQueueLock();

private static QueueLock newQueueLock() {
try {
return new ReflectionPost592QueueLock();
} catch (NoSuchMethodException e) {
return new Pre592QueueLock();
}
}

// TODO replace all this with Queue.withLock once Jenkins 1.592+
interface QueueLock {
void withLock(Runnable runnable);
}

private static class Pre592QueueLock implements QueueLock {
@SuppressWarnings("SynchronizationOnLocalVariableOrMethodParameter")
public void withLock(Runnable runnable) {
final Jenkins jenkins = Jenkins.getInstance();
final Object monitor = jenkins == null ? getClass() : jenkins.getQueue();
synchronized (monitor) {
runnable.run();
}
}
}

private static class ReflectionPost592QueueLock implements QueueLock {
private final Method withLock;

private ReflectionPost592QueueLock() throws NoSuchMethodException {
this.withLock = Queue.class.getMethod("withLock", Runnable.class);
if (!Modifier.isStatic(withLock.getModifiers())) {
throw new NoSuchMethodException("Expecting withLock(Runnable) to be static");
}
if (!Modifier.isPublic(withLock.getModifiers())) {
throw new NoSuchMethodException("Expecting withLock(Runnable) to be static");
}
}

public void withLock(Runnable runnable) {
try {
withLock.invoke(null, runnable);
} catch (IllegalAccessException e) {
// fall back, but should never get here as the constructor will blow up first
new Pre592QueueLock().withLock(runnable);
} catch (InvocationTargetException e) {
// ignore
}
}
}
}

0 comments on commit 3346565

Please sign in to comment.