Skip to content

Commit

Permalink
[JENKINS-46680] Disconnect computer on ping timeout (#3005)
Browse files Browse the repository at this point in the history
* [JENKINS-46680] Reproduce in unittest

* [FIX JENKINS-46680] Reset SlaveComputer channel before closing it on ping timeout

* [JENKINS-46680] Attach channel termination offline cause on ping timeouts
  • Loading branch information
olivergondza authored and oleg-nenashev committed Sep 15, 2017
1 parent 2ae3721 commit dbb5e44
Show file tree
Hide file tree
Showing 3 changed files with 141 additions and 24 deletions.
50 changes: 34 additions & 16 deletions core/src/main/java/hudson/slaves/ChannelPinger.java
Expand Up @@ -23,6 +23,7 @@
*/
package hudson.slaves;

import com.google.common.annotations.VisibleForTesting;
import hudson.Extension;
import hudson.FilePath;
import jenkins.util.SystemProperties;
Expand All @@ -34,6 +35,7 @@
import jenkins.security.MasterToSlaveCallable;
import jenkins.slaves.PingFailureAnalyzer;

import javax.annotation.CheckForNull;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
Expand Down Expand Up @@ -86,28 +88,38 @@ public ChannelPinger() {

@Override
public void preOnline(Computer c, Channel channel, FilePath root, TaskListener listener) {
install(channel);
SlaveComputer slaveComputer = null;
if (c instanceof SlaveComputer) {
slaveComputer = (SlaveComputer) c;
}
install(channel, slaveComputer);
}

public void install(Channel channel) {
install(channel, null);
}

@VisibleForTesting
/*package*/ void install(Channel channel, @CheckForNull SlaveComputer c) {
if (pingTimeoutSeconds < 1 || pingIntervalSeconds < 1) {
LOGGER.warning("Agent ping is disabled");
return;
}

// set up ping from both directions, so that in case of a router dropping a connection,
// both sides can notice it and take compensation actions.
try {
channel.call(new SetUpRemotePing(pingTimeoutSeconds, pingIntervalSeconds));
LOGGER.fine("Set up a remote ping for " + channel.getName());
} catch (Exception e) {
LOGGER.severe("Failed to set up a ping for " + channel.getName());
LOGGER.log(Level.SEVERE, "Failed to set up a ping for " + channel.getName(), e);
}

// set up ping from both directions, so that in case of a router dropping a connection,
// both sides can notice it and take compensation actions.
setUpPingForChannel(channel, pingTimeoutSeconds, pingIntervalSeconds, true);
setUpPingForChannel(channel, c, pingTimeoutSeconds, pingIntervalSeconds, true);
}

static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> {
@VisibleForTesting
/*package*/ static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> {
private static final long serialVersionUID = -2702219700841759872L;
@Deprecated
private transient int pingInterval;
Expand All @@ -121,7 +133,7 @@ static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> {

@Override
public Void call() throws IOException {
setUpPingForChannel(Channel.current(), pingTimeoutSeconds, pingIntervalSeconds, false);
setUpPingForChannel(Channel.current(), null, pingTimeoutSeconds, pingIntervalSeconds, false);
return null;
}

Expand Down Expand Up @@ -163,30 +175,36 @@ protected Object readResolve() {
}
}

static void setUpPingForChannel(final Channel channel, int timeoutSeconds, int intervalSeconds, final boolean analysis) {
@VisibleForTesting
/*package*/ static void setUpPingForChannel(final Channel channel, final SlaveComputer computer, int timeoutSeconds, int intervalSeconds, final boolean analysis) {
LOGGER.log(Level.FINE, "setting up ping on {0} with a {1} seconds interval and {2} seconds timeout", new Object[] {channel.getName(), intervalSeconds, timeoutSeconds});
final AtomicBoolean isInClosed = new AtomicBoolean(false);
final PingThread t = new PingThread(channel, timeoutSeconds * 1000L, intervalSeconds * 1000L) {
@Override
protected void onDead(Throwable cause) {
try {
if (analysis) {
analyze(cause);
}
if (isInClosed.get()) {
boolean inClosed = isInClosed.get();
// Disassociate computer channel before closing it
if (computer != null) {
Exception exception = cause instanceof Exception ? (Exception) cause: new IOException(cause);
computer.disconnect(new OfflineCause.ChannelTermination(exception));
}
if (inClosed) {
LOGGER.log(Level.FINE,"Ping failed after the channel "+channel.getName()+" is already partially closed.",cause);
} else {
LOGGER.log(Level.INFO,"Ping failed. Terminating the channel "+channel.getName()+".",cause);
channel.close(cause);
}
} catch (IOException e) {
LOGGER.log(Level.SEVERE,"Failed to terminate the channel "+channel.getName(),e);
}
}
/** Keep in a separate method so we do not even try to do class loading on {@link PingFailureAnalyzer} from an agent JVM. */
private void analyze(Throwable cause) throws IOException {
private void analyze(Throwable cause) {
for (PingFailureAnalyzer pfa : PingFailureAnalyzer.all()) {
pfa.onPingFailure(channel,cause);
try {
pfa.onPingFailure(channel, cause);
} catch (IOException ex) {
LOGGER.log(Level.WARNING, "Ping failure analyzer " + pfa.getClass().getName() + " failed for " + channel.getName(), ex);
}
}
}
@Deprecated
Expand Down
16 changes: 8 additions & 8 deletions core/src/test/java/hudson/slaves/ChannelPingerTest.java
Expand Up @@ -59,12 +59,12 @@ private void preserveSystemProperty(String propertyName) {
@Test
public void testDefaults() throws Exception {
ChannelPinger channelPinger = new ChannelPinger();
channelPinger.install(mockChannel);
channelPinger.install(mockChannel, null);

verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT,
ChannelPinger.PING_INTERVAL_SECONDS_DEFAULT)));
verifyStatic();
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT,
ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT,
ChannelPinger.PING_INTERVAL_SECONDS_DEFAULT, true);
}

Expand All @@ -74,23 +74,23 @@ public void testFromSystemProperties() throws Exception {
System.setProperty("hudson.slaves.ChannelPinger.pingIntervalSeconds", "73");

ChannelPinger channelPinger = new ChannelPinger();
channelPinger.install(mockChannel);
channelPinger.install(mockChannel, null);

verify(mockChannel).call(new ChannelPinger.SetUpRemotePing(42, 73));
verifyStatic();
ChannelPinger.setUpPingForChannel(mockChannel, 42, 73, true);
ChannelPinger.setUpPingForChannel(mockChannel, null, 42, 73, true);
}

@Test
public void testFromOldSystemProperty() throws Exception {
System.setProperty("hudson.slaves.ChannelPinger.pingInterval", "7");

ChannelPinger channelPinger = new ChannelPinger();
channelPinger.install(mockChannel);
channelPinger.install(mockChannel, null);

verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420)));
verifyStatic();
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420, true);
ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420, true);
}

@Test
Expand All @@ -99,11 +99,11 @@ public void testNewSystemPropertyTrumpsOld() throws Exception {
System.setProperty("hudson.slaves.ChannelPinger.pingInterval", "7");

ChannelPinger channelPinger = new ChannelPinger();
channelPinger.install(mockChannel);
channelPinger.install(mockChannel, null);

verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73)));
verifyStatic();
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73, true);
ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73, true);
}

@Test
Expand Down
99 changes: 99 additions & 0 deletions test/src/test/java/hudson/slaves/PingThreadTest.java
@@ -0,0 +1,99 @@
/*
* The MIT License
*
* Copyright (c) Red Hat, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package hudson.slaves;

import hudson.Functions;
import hudson.model.Computer;
import hudson.remoting.Channel;
import hudson.remoting.ChannelClosedException;
import hudson.remoting.PingThread;
import jenkins.security.MasterToSlaveCallable;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;

import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.lang.reflect.Method;
import java.util.Date;
import java.util.concurrent.TimeoutException;

import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.fail;
import static org.junit.Assume.assumeFalse;

/**
* @author ogondza.
*/
public class PingThreadTest {

@Rule
public JenkinsRule j = new JenkinsRule();

@Test
public void failedPingThreadResetsComputerChannel() throws Exception {
assumeFalse("We simulate hung agent by sending the SIGTSTP signal", Functions.isWindows());

DumbSlave slave = j.createOnlineSlave();
Computer computer = slave.toComputer();
Channel channel = (Channel) slave.getChannel();
String pid = channel.call(new GetPid());

PingThread pingThread = null;
for (Thread it: Thread.getAllStackTraces().keySet()) {
if (it instanceof PingThread && it.getName().endsWith(channel.toString())) {
pingThread = (PingThread) it;
}
}
assertNotNull(pingThread);

// Simulate lost connection
assert new ProcessBuilder("kill", "-TSTP", pid).start().waitFor() == 0;
try {
// ... do not wait for Ping Thread to notice
Method onDead = PingThread.class.getDeclaredMethod("onDead", Throwable.class);
onDead.setAccessible(true);
onDead.invoke(pingThread, new TimeoutException("No ping"));

try {
channel.call(new GetPid());
fail();
} catch (ChannelClosedException ex) {
// Expected
}

assertNull(slave.getComputer().getChannel());
assertNull(computer.getChannel());
} finally {
assert new ProcessBuilder("kill", "-CONT", pid).start().waitFor() == 0;
}
}

private static final class GetPid extends MasterToSlaveCallable<String, IOException> {
@Override public String call() throws IOException {
return ManagementFactory.getRuntimeMXBean().getName().replaceAll("@.*", "");
}
}
}

0 comments on commit dbb5e44

Please sign in to comment.