[FIXED JENKINS-18438]

Node monitoring should run in parallel to reduce the total round-trip time in large instances.
jenkinsci · Jun 26, 2013 · 7357138 · oldelvet · Jul 11, 2013 · 7357138
1 parent 208cf9c
commit 7357138
Show file tree

Hide file tree

Showing 15 changed files with 376 additions and 84 deletions.
diff --git a/changelog.html b/changelog.html
@@ -55,7 +55,9 @@
 <!-- Record your changes in the trunk here. -->
 <div id="trunk" style="display:none"><!--=TRUNK-BEGIN=-->
 <ul class=image>
-  <li class=>
+  <li class=rfe>
+    Node monitoring now happens concurrently across all the slaves, so it'll be affected less by problematic slaves.
+    (<a href="https://issues.jenkins-ci.org/browse/JENKINS-18438">issue 18438</a>)
 </ul>
 </div><!--=TRUNK-END=-->
 

diff --git a/core/src/main/java/hudson/FilePath.java b/core/src/main/java/hudson/FilePath.java
@@ -70,6 +70,7 @@
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InterruptedIOException;
 import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
 import java.io.OutputStream;
@@ -998,6 +999,27 @@ public <V,E extends Throwable> V act(Callable<V,E> callable) throws IOException,
         }
     }
 
+    /**
+     * Takes a {@link FilePath}+{@link FileCallable} pair and returns the equivalent {@link Callable}.
+     * When executing the resulting {@link Callable}, it executes {@link FileCallable#act(FileCallable)}
+     * on this {@link FilePath}.
+     *
+     * @since 1.522
+     */
+    public <V> Callable<V,IOException> asCallableWith(final FileCallable<V> task) {
+        return new Callable<V,IOException>() {
+            @Override
+            public V call() throws IOException {
+                try {
+                    return act(task);
+                } catch (InterruptedException e) {
+                    throw (IOException)new InterruptedIOException().initCause(e);
+                }
+            }
+            private static final long serialVersionUID = 1L;
+        };
+    }
+
     /**
      * Converts this file to the URI, relative to the machine
      * on which this file is available.

diff --git a/core/src/main/java/hudson/model/Node.java b/core/src/main/java/hudson/model/Node.java
@@ -35,6 +35,7 @@
 import hudson.model.labels.LabelAtom;
 import hudson.model.queue.CauseOfBlockage;
 import hudson.node_monitors.NodeMonitor;
+import hudson.remoting.Callable;
 import hudson.remoting.VirtualChannel;
 import hudson.security.ACL;
 import hudson.security.AccessControlled;
@@ -445,7 +446,22 @@ public Object onConvert(Type targetType, Class targetTypeErasure, Object jsonSou
      * @throws InterruptedException
      *      if the operation is aborted.
      */
-    public abstract ClockDifference getClockDifference() throws IOException, InterruptedException;
+    public ClockDifference getClockDifference() throws IOException, InterruptedException {
+        VirtualChannel channel = getChannel();
+        if(channel==null)
+            throw new IOException(getNodeName()+" is offline");
+
+        return channel.call(getClockDifferenceCallable());
+    }
+
+    /**
+     * Returns a {@link Callable} that when run on the channel, estimates the clock difference.
+     *
+     * @return
+     *      always non-null.
+     * @sine 1.522
+     */
+    public abstract Callable<ClockDifference,IOException> getClockDifferenceCallable();
 
     /**
      * Constants that control how Hudson allocates jobs to slaves.

diff --git a/core/src/main/java/hudson/model/Slave.java b/core/src/main/java/hudson/model/Slave.java
@@ -30,7 +30,6 @@
 import hudson.Launcher.RemoteLauncher;
 import hudson.model.Descriptor.FormException;
 import hudson.remoting.Callable;
-import hudson.remoting.VirtualChannel;
 import hudson.slaves.CommandLauncher;
 import hudson.slaves.ComputerLauncher;
 import hudson.slaves.DumbSlave;
@@ -57,6 +56,7 @@
 
 import javax.servlet.ServletException;
 
+import hudson.util.TimeUnit2;
 import jenkins.model.Jenkins;
 import jenkins.slaves.WorkspaceLocator;
 
@@ -252,16 +252,9 @@ public void setLabelString(String labelString) throws IOException {
         getAssignedLabels();
     }
 
-    public ClockDifference getClockDifference() throws IOException, InterruptedException {
-        VirtualChannel channel = getChannel();
-        if(channel==null)
-            throw new IOException(getNodeName()+" is offline");
-
-        long startTime = System.currentTimeMillis();
-        long slaveTime = channel.call(new GetSystemTime());
-        long endTime = System.currentTimeMillis();
-
-        return new ClockDifference((startTime+endTime)/2 - slaveTime);
+    @Override
+    public Callable<ClockDifference,IOException> getClockDifferenceCallable() {
+        return new GetClockDifference1();
     }
 
     public Computer createComputer() {
@@ -432,16 +425,59 @@ public FormValidation doCheckRemoteFS(@QueryParameter String value) throws IOExc
     private transient String agentCommand;
 
     /**
-     * Obtains the system clock.
+     * Obtains the clock difference between this side and that side of the channel.
+     *
+     * <p>
+     * This is a hack to wrap the whole thing into a simple {@link Callable}.
+     *
+     * <ol>
+     *     <li>When the callable is sent to remote, we capture the time (on this side) in {@link GetClockDifference2#startTime}
+     *     <li>When the other side receives the callable it is {@link GetClockDifference2}.
+     *     <li>We capture the time on the other side and {@link GetClockDifference3} gets sent from the other side
+     *     <li>When it's read on this side as a return value, it morphs itself into {@link ClockDifference}.
+     * </ol>
      */
-    private static final class GetSystemTime implements Callable<Long,RuntimeException> {
-        public Long call() {
-            return System.currentTimeMillis();
+    private static final class GetClockDifference1 implements Callable<ClockDifference,IOException> {
+        public ClockDifference call() {
+            // this method must be being invoked locally, which means the clock is in sync
+            return new ClockDifference(0);
+        }
+
+        private Object writeReplace() {
+            return new GetClockDifference2();
         }
 
         private static final long serialVersionUID = 1L;
     }
 
+    private static final class GetClockDifference2 implements Callable<GetClockDifference3,IOException> {
+        /**
+         * Capture the time on the master when this object is sent to remote, which is when
+         * {@link GetClockDifference1#writeReplace()} is run.
+         */
+        private final long startTime = System.nanoTime();
+
+        public GetClockDifference3 call() {
+            return new GetClockDifference3(startTime);
+        }
+
+        private static final long serialVersionUID = 1L;
+    }
+
+    private static final class GetClockDifference3 implements Serializable {
+        private final long remoteTime = System.nanoTime();
+        private final long startTime;
+
+        public GetClockDifference3(long startTime) {
+            this.startTime = startTime;
+        }
+
+        private Object readResolve() {
+            long endTime = System.nanoTime();
+            return new ClockDifference(TimeUnit2.NANOSECONDS.toMillis((startTime + endTime)/2-remoteTime));
+        }
+    }
+
     /**
      * Determines the workspace root file name for those who really really need the shortest possible path name.
      */

diff --git a/core/src/main/java/hudson/node_monitors/AbstractAsyncNodeMonitorDescriptor.java b/core/src/main/java/hudson/node_monitors/AbstractAsyncNodeMonitorDescriptor.java
@@ -0,0 +1,119 @@
+package hudson.node_monitors;
+
+import hudson.model.Computer;
+import hudson.remoting.Callable;
+import hudson.remoting.VirtualChannel;
+import jenkins.model.Jenkins;
+
+import javax.annotation.CheckForNull;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.logging.Logger;
+
+import static java.util.concurrent.TimeUnit.MILLISECONDS;
+import static java.util.logging.Level.WARNING;
+
+/**
+ * Sophisticated version of {@link AbstractNodeMonitorDescriptor} that
+ * performs monitoring on all slaves concurrently and asynchronously.
+ *
+ * @param <T>
+ *     represents the the result of the monitoring.
+ * @author Kohsuke Kawaguchi
+ */
+public abstract class AbstractAsyncNodeMonitorDescriptor<T> extends AbstractNodeMonitorDescriptor<T> {
+    protected AbstractAsyncNodeMonitorDescriptor() {
+    }
+
+    protected AbstractAsyncNodeMonitorDescriptor(long interval) {
+        super(interval);
+    }
+
+    protected AbstractAsyncNodeMonitorDescriptor(Class<? extends NodeMonitor> clazz) {
+        super(clazz);
+    }
+
+    protected AbstractAsyncNodeMonitorDescriptor(Class<? extends NodeMonitor> clazz, long interval) {
+        super(clazz, interval);
+    }
+
+    /**
+     * Creates a {@link Callable} that performs the monitoring when executed.
+     */
+    protected abstract @CheckForNull Callable<T,IOException> createCallable(Computer c);
+
+    @Override
+    protected T monitor(Computer c) throws IOException, InterruptedException {
+        VirtualChannel ch = c.getChannel();
+        if (ch != null) {
+            Callable<T,IOException> cc = createCallable(c);
+            if (cc!=null)
+                return ch.call(cc);
+        }
+        return null;
+    }
+
+    /**
+     * Performs all monitoring concurrently.
+     */
+    @Override
+    protected Map<Computer, T> monitor() throws InterruptedException {
+        Map<Computer,Future<T>> futures = new HashMap<Computer,Future<T>>();
+
+        for (Computer c : Jenkins.getInstance().getComputers()) {
+            try {
+                VirtualChannel ch = c.getChannel();
+                futures.put(c,null);    // sentinel value
+                if (ch!=null) {
+                    Callable<T, ?> cc = createCallable(c);
+                    if (cc!=null)
+                        futures.put(c,ch.callAsync(cc));
+                }
+            } catch (RuntimeException e) {
+                LOGGER.log(WARNING, "Failed to monitor "+c.getDisplayName()+" for "+getDisplayName(), e);
+            } catch (IOException e) {
+                LOGGER.log(WARNING, "Failed to monitor "+c.getDisplayName()+" for "+getDisplayName(), e);
+            }
+        }
+
+        final long now = System.currentTimeMillis();
+        final long end = now + getMonitoringTimeOut();
+
+        final Map<Computer,T> data = new HashMap<Computer,T>();
+
+        for (Entry<Computer, Future<T>> e : futures.entrySet()) {
+            Computer c = e.getKey();
+            Future<T> f = futures.get(c);
+            data.put(c, null);  // sentinel value
+
+            if (f!=null) {
+                try {
+                    data.put(c,f.get(Math.max(0,end-System.currentTimeMillis()), MILLISECONDS));
+                } catch (RuntimeException x) {
+                    LOGGER.log(WARNING, "Failed to monitor " + c.getDisplayName() + " for " + getDisplayName(), x);
+                } catch (ExecutionException x) {
+                    LOGGER.log(WARNING, "Failed to monitor " + c.getDisplayName() + " for " + getDisplayName(), x);
+                } catch (TimeoutException x) {
+                    LOGGER.log(WARNING, "Failed to monitor " + c.getDisplayName() + " for " + getDisplayName(), x);
+                }
+            }
+        }
+
+        return data;
+    }
+
+    /**
+     * Controls the time out of monitoring.
+     */
+    protected long getMonitoringTimeOut() {
+        return TimeUnit.SECONDS.toMillis(30);
+    }
+
+    private static final Logger LOGGER = Logger.getLogger(AbstractAsyncNodeMonitorDescriptor.class.getName());
+}