Skip to content

Commit

Permalink
FIXED JENKINS-15331 by changing Util.deleteContentsRecursive, Util.de…
Browse files Browse the repository at this point in the history
…leteFile and Util.deleteRecursive so that they can retry failed deletions.

The number of deletion attempts and the time it waits between deletes are configurable via system properties (like hudson.Util.noSymlink etc).
Util.DELETION_MAX is set by -Dhudson.Util.deletionMax.  Default is 3 attempts.
Util.WAIT_BETWEEN_DELETION_RETRIES is set by -Dhudson.Util.deletionRetryWait.  Defaults is 100 milliseconds.
Util.GC_AFTER_FAILED_DELETE is set by -Dhudson.Util.performGCOnFailedDelete.  Default is false.

Added unit-tests for new functionality.
  • Loading branch information
pjdarton committed Feb 12, 2016
1 parent be2787a commit 310c674
Show file tree
Hide file tree
Showing 2 changed files with 353 additions and 14 deletions.
239 changes: 225 additions & 14 deletions core/src/main/java/hudson/Util.java
Expand Up @@ -43,6 +43,9 @@
import org.apache.tools.ant.taskdefs.Copy;
import org.apache.tools.ant.types.FileSet;

import org.kohsuke.accmod.Restricted;
import org.kohsuke.accmod.restrictions.NoExternalUse;

import jnr.posix.FileStat;
import jnr.posix.POSIX;

Expand Down Expand Up @@ -208,24 +211,54 @@ public static String loadFile(@Nonnull File logfile, @Nonnull Charset charset) t
/**
* Deletes the contents of the given directory (but not the directory itself)
* recursively.
* It does not take no for an answer - if necessary, it will have multiple
* attempts at deleting things.
*
* @throws IOException
* if the operation fails.
*/
public static void deleteContentsRecursive(@Nonnull File file) throws IOException {
File[] files = file.listFiles();
if(files==null)
return; // the directory didn't exist in the first place
for (File child : files)
deleteRecursive(child);
for( int numberOfAttempts=1 ; ; numberOfAttempts++ ) {
try {
tryOnceDeleteContentsRecursive(file);
break; // success
} catch (IOException ex) {
boolean threadWasInterrupted = pauseBetweenDeletes(numberOfAttempts);
if( numberOfAttempts>= DELETION_MAX || threadWasInterrupted)
throw new IOException(deleteFailExceptionMessage(file, numberOfAttempts, threadWasInterrupted), ex);
}
}
}

/**
* Deletes this file (and does not take no for an answer).
* If necessary, it will have multiple attempts at deleting things.
*
* @param f a file to delete
* @throws IOException if it exists but could not be successfully deleted
*/
public static void deleteFile(@Nonnull File f) throws IOException {
for( int numberOfAttempts=1 ; ; numberOfAttempts++ ) {
try {
tryOnceDeleteFile(f);
break; // success
} catch (IOException ex) {
boolean threadWasInterrupted = pauseBetweenDeletes(numberOfAttempts);
if( numberOfAttempts>= DELETION_MAX || threadWasInterrupted)
throw new IOException(deleteFailExceptionMessage(f, numberOfAttempts, threadWasInterrupted), ex);
}
}
}

/**
* Deletes this file, working around most problems which might make
* this difficult.
*
* @param f
* What to delete. If a directory, it'll need to be empty.
* @throws IOException if it exists but could not be successfully deleted
*/
private static void tryOnceDeleteFile(File f) throws IOException {
if (!f.delete()) {
if(!f.exists())
// we are trying to delete a file that no longer exists, so this is not an error
Expand Down Expand Up @@ -304,19 +337,144 @@ private static void makeWritable(@Nonnull File f) {

}

/**
* Deletes the given directory (including its contents) recursively.
* It does not take no for an answer - if necessary, it will have multiple
* attempts at deleting things.
*
* @throws IOException
* if the operation fails.
*/
public static void deleteRecursive(@Nonnull File dir) throws IOException {
for( int numberOfAttempts=1 ; ; numberOfAttempts++ ) {
try {
tryOnceDeleteRecursive(dir);
break; // success
} catch (IOException ex) {
boolean threadWasInterrupted = pauseBetweenDeletes(numberOfAttempts);
if( numberOfAttempts>= DELETION_MAX || threadWasInterrupted)
throw new IOException(deleteFailExceptionMessage(dir, numberOfAttempts, threadWasInterrupted), ex);
}
}
}

/**
* Deletes a file or folder, throwing the first exception encountered, but
* having a go at deleting everything. i.e. it does not <em>stop</em> on the
* first exception, but tries (to delete) everything once.
*
* @param dir
* What to delete. If a directory, the contents will be deleted
* too.
* @throws The first exception encountered.
*/
private static void tryOnceDeleteRecursive(File dir) throws IOException {
if(!isSymlink(dir))
deleteContentsRecursive(dir);
tryOnceDeleteContentsRecursive(dir);
tryOnceDeleteFile(dir);
}

/**
* Deletes a folder's contents, throwing the first exception encountered,
* but having a go at deleting everything. i.e. it does not <em>stop</em>
* on the first exception, but tries (to delete) everything once.
*
* @param directory
* The directory whose contents will be deleted.
* @throws The first exception encountered.
*/
private static void tryOnceDeleteContentsRecursive(File directory) throws IOException {
File[] directoryContents = directory.listFiles();
if(directoryContents==null)
return; // the directory didn't exist in the first place
IOException firstCaught = null;
for (File child : directoryContents) {
try {
tryOnceDeleteRecursive(child);
} catch (IOException justCaught) {
if( firstCaught==null) {
firstCaught = justCaught;
}
}
}
if( firstCaught!=null )
throw firstCaught;
}

/**
* Pauses between delete attempts, and says if it's ok to try again.
* This does not wait if the wait time is zero or if we have tried
* too many times already.
* <p>
* See {@link #WAIT_BETWEEN_DELETION_RETRIES} for details of
* the pause duration.<br/>
* See {@link #GC_AFTER_FAILED_DELETE} for when {@link System#gc()} is called.
*
* @return false if it is ok to continue trying to delete things, true if
* we were interrupted (and should stop now).
*/
private static boolean pauseBetweenDeletes(int numberOfAttemptsSoFar) {
long delayInMs;
if( numberOfAttemptsSoFar>=DELETION_MAX ) return false;
/* If the Jenkins process had the file open earlier, and it has not
* closed it then Windows won't let us delete it until the Java object
* with the open stream is Garbage Collected, which can result in builds
* failing due to "file in use" on Windows despite working perfectly
* well on other OSs. */
if (GC_AFTER_FAILED_DELETE) {
System.gc();
}
if (WAIT_BETWEEN_DELETION_RETRIES>=0) {
delayInMs = WAIT_BETWEEN_DELETION_RETRIES;
} else {
delayInMs = -numberOfAttemptsSoFar*WAIT_BETWEEN_DELETION_RETRIES;
}
if (delayInMs<=0)
return Thread.interrupted();
try {
deleteFile(dir);
} catch (IOException e) {
// if some of the child directories are big, it might take long enough to delete that
// it allows others to create new files, causing problemsl ike JENKINS-10113
// so give it one more attempt before we give up.
if(!isSymlink(dir))
deleteContentsRecursive(dir);
deleteFile(dir);
Thread.sleep(delayInMs);
return false;
} catch (InterruptedException e) {
return true;
}
}

/**
* Creates a "couldn't delete file" message that explains how hard we tried.
* See {@link #DELETION_MAX}, {@link #WAIT_BETWEEN_DELETION_RETRIES}
* and {@link #GC_AFTER_FAILED_DELETE} for more details.
*/
private static String deleteFailExceptionMessage(File whatWeWereTryingToRemove, int retryCount, boolean wasInterrupted) {
StringBuilder sb = new StringBuilder();
sb.append("Unable to delete '");
sb.append(whatWeWereTryingToRemove);
sb.append("'. Tried ");
sb.append(retryCount);
sb.append(" time");
if( retryCount!=1 ) sb.append('s');
if( DELETION_MAX>1 ) {
sb.append(" (of a maximum of ");
sb.append(DELETION_MAX);
sb.append(')');
if( GC_AFTER_FAILED_DELETE )
sb.append(" garbage-collecting");
if( WAIT_BETWEEN_DELETION_RETRIES!=0 && GC_AFTER_FAILED_DELETE )
sb.append(" and");
if( WAIT_BETWEEN_DELETION_RETRIES!=0 ) {
sb.append(" waiting ");
sb.append(getTimeSpanString(Math.abs(WAIT_BETWEEN_DELETION_RETRIES)));
if( WAIT_BETWEEN_DELETION_RETRIES<0 ) {
sb.append("-");
sb.append(getTimeSpanString(Math.abs(WAIT_BETWEEN_DELETION_RETRIES)*DELETION_MAX));
}
}
if( WAIT_BETWEEN_DELETION_RETRIES!=0 || GC_AFTER_FAILED_DELETE)
sb.append(" between attempts");
}
if( wasInterrupted )
sb.append(". The delete operation was interrupted before it completed successfully");
sb.append('.');
return sb.toString();
}

/*
Expand Down Expand Up @@ -1528,4 +1686,57 @@ public static Properties loadProperties(@Nonnull String properties) throws IOExc
public static boolean NO_SYMLINK = Boolean.getBoolean(Util.class.getName()+".noSymLink");

public static boolean SYMLINK_ESCAPEHATCH = Boolean.getBoolean(Util.class.getName()+".symlinkEscapeHatch");

/**
* The number of times we will attempt to delete files/directory trees
* before giving up and throwing an exception.<br/>
* Specifying a value less than 1 is invalid and will be treated as if
* a value of 1 (i.e. one attempt, no retries) was specified.
* <p>
* e.g. if some of the child directories are big, it might take long enough
* to delete that it allows others to create new files in the directory we
* are trying to empty, causing problems like JENKINS-10113.
* Or, if we're on Windows, then deletes can fail for transient reasons
* regardless of external activity; see JENKINS-15331.
* Whatever the reason, this allows us to do multiple attempts before we
* give up, thus improving build reliability.
*/
@Restricted(value = NoExternalUse.class)
static int DELETION_MAX = Math.max(1, Integer.getInteger(Util.class.getName() + ".deletionMax", 3).intValue());

/**
* The time (in milliseconds) that we will wait between attempts to
* delete files when retrying.<br>
* This has no effect unless {@link #DELETION_MAX} is non-zero.
* <p>
* If zero, we will not delay between attempts.<br>
* If negative, we will wait an (linearly) increasing multiple of this value
* between attempts.
*/
@Restricted(value = NoExternalUse.class)
static int WAIT_BETWEEN_DELETION_RETRIES = Integer.getInteger(Util.class.getName() + ".deletionRetryWait", 100).intValue();

/**
* If this flag is set to true then we will request a garbage collection
* after a deletion failure before we next retry the delete.<br>
* It defaults to <code>false</code> and is ignored unless
* {@link #DELETION_MAX} is greater than 1.
* <p>
* Setting this flag to true <i>may</i> resolve some problems on Windows,
* and also for directory trees residing on an NFS share, <b>but</b> it can
* have a negative impact on performance and may have no effect at all (GC
* behavior is JVM-specific).
* <p>
* Warning: This should only ever be used if you find that your builds are
* failing because Jenkins is unable to delete files, that this failure is
* because Jenkins itself has those files locked "open", and even then it
* should only be used on slaves with relatively few executors (because the
* garbage collection can impact the performance of all job executors on
* that slave).<br/>
* i.e. Setting this flag is a act of last resort - it is <em>not</em>
* recommended, and should not be used on the main Jenkins server
* unless you can tolerate the performance impact.
*/
@Restricted(value = NoExternalUse.class)
static boolean GC_AFTER_FAILED_DELETE = Boolean.getBoolean(Util.class.getName() + ".performGCOnFailedDelete");
}

0 comments on commit 310c674

Please sign in to comment.