Commit
Node monitoring should run in parallel to reduce the total round-trip time in large instances.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
package hudson.node_monitors; | ||
|
||
import hudson.model.Computer; | ||
import hudson.remoting.Callable; | ||
import hudson.remoting.VirtualChannel; | ||
import jenkins.model.Jenkins; | ||
|
||
import javax.annotation.CheckForNull; | ||
import java.io.IOException; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
import java.util.Map.Entry; | ||
import java.util.concurrent.ExecutionException; | ||
import java.util.concurrent.Future; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.TimeoutException; | ||
import java.util.logging.Logger; | ||
|
||
import static java.util.concurrent.TimeUnit.MILLISECONDS; | ||
import static java.util.logging.Level.WARNING; | ||
|
||
/** | ||
* Sophisticated version of {@link AbstractNodeMonitorDescriptor} that | ||
* performs monitoring on all slaves concurrently and asynchronously. | ||
* | ||
* @param <T> | ||
* represents the result of the monitoring. | ||
* @author Kohsuke Kawaguchi | ||
*/ | ||
public abstract class AbstractAsyncNodeMonitorDescriptor<T> extends AbstractNodeMonitorDescriptor<T> { | ||
protected AbstractAsyncNodeMonitorDescriptor() { | ||
} | ||
|
||
protected AbstractAsyncNodeMonitorDescriptor(long interval) { | ||
super(interval); | ||
} | ||
|
||
protected AbstractAsyncNodeMonitorDescriptor(Class<? extends NodeMonitor> clazz) { | ||
super(clazz); | ||
} | ||
|
||
protected AbstractAsyncNodeMonitorDescriptor(Class<? extends NodeMonitor> clazz, long interval) { | ||
super(clazz, interval); | ||
} | ||
|
||
/** | ||
* Creates a {@link Callable} that performs the monitoring when executed. | ||
*/ | ||
protected abstract @CheckForNull Callable<T,IOException> createCallable(Computer c); | ||
|
||
@Override | ||
protected T monitor(Computer c) throws IOException, InterruptedException { | ||
VirtualChannel ch = c.getChannel(); | ||
if (ch != null) { | ||
Callable<T,IOException> cc = createCallable(c); | ||
if (cc!=null) | ||
return ch.call(cc); | ||
} | ||
return null; | ||
} | ||
|
||
/** | ||
* Performs all monitoring concurrently. | ||
*/ | ||
@Override | ||
protected Map<Computer, T> monitor() throws InterruptedException { | ||
Map<Computer,Future<T>> futures = new HashMap<Computer,Future<T>>(); | ||
|
||
for (Computer c : Jenkins.getInstance().getComputers()) { | ||
try { | ||
VirtualChannel ch = c.getChannel(); | ||
futures.put(c,null); // sentinel value | ||
if (ch!=null) { | ||
Callable<T, ?> cc = createCallable(c); | ||
if (cc!=null) | ||
futures.put(c,ch.callAsync(cc)); | ||
} | ||
} catch (RuntimeException e) { | ||
LOGGER.log(WARNING, "Failed to monitor "+c.getDisplayName()+" for "+getDisplayName(), e); | ||
} catch (IOException e) { | ||
LOGGER.log(WARNING, "Failed to monitor "+c.getDisplayName()+" for "+getDisplayName(), e); | ||
} | ||
} | ||
|
||
final long now = System.currentTimeMillis(); | ||
final long end = now + getMonitoringTimeOut(); | ||
|
||
final Map<Computer,T> data = new HashMap<Computer,T>(); | ||
|
||
for (Entry<Computer, Future<T>> e : futures.entrySet()) { | ||
Computer c = e.getKey(); | ||
Future<T> f = futures.get(c); | ||
data.put(c, null); // sentinel value | ||
|
||
if (f!=null) { | ||
try { | ||
data.put(c,f.get(Math.max(0,end-System.currentTimeMillis()), MILLISECONDS)); | ||
} catch (RuntimeException x) { | ||
LOGGER.log(WARNING, "Failed to monitor " + c.getDisplayName() + " for " + getDisplayName(), x); | ||
} catch (ExecutionException x) { | ||
LOGGER.log(WARNING, "Failed to monitor " + c.getDisplayName() + " for " + getDisplayName(), x); | ||
} catch (TimeoutException x) { | ||
LOGGER.log(WARNING, "Failed to monitor " + c.getDisplayName() + " for " + getDisplayName(), x); | ||
} | ||
} | ||
} | ||
|
||
return data; | ||
} | ||
|
||
/** | ||
* Controls the time out of monitoring. | ||
*/ | ||
protected long getMonitoringTimeOut() { | ||
return TimeUnit.SECONDS.toMillis(30); | ||
} | ||
|
||
private static final Logger LOGGER = Logger.getLogger(AbstractAsyncNodeMonitorDescriptor.class.getName()); | ||
} |
4 comments
on commit 7357138
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Something is wrong within this commit.
On my setup, I have 2 slaves located remotely, 200ms ping and clock synchronized with master.
With this commit, slaves are reported with 27 days ahead, and between 2000 and 3000ms ping.
Additionally, I seem to get more disconnections than before.
Same screenshot, a few commits back in time.
To me it looks like the reported response time is some kind of average between all nodes...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't really understand what is happening in these Steps 1-3, but maybe it's related to System.nanoTime (which is only monotonic within a single VM) and its values somehow being compared between different nodes?
BTW: some documentation of what these Steps actually do - i.e. the workflow - would be nice!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW: is there a testcase for this? Doesn't look like there is
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The clock comparisons are definitely broken. See https://issues.jenkins-ci.org/browse/JENKINS-18671
I agree with the comments from @kutzi: System.nanoTime is used to compute the time differences.
remoteTime is from a different JVM to startTime/endTime so they cannot be directly compared.
See http://docs.oracle.com/javase/1.5.0/docs/api/java/lang/System.html#nanoTime%28%29