Skip to content

Commit

Permalink
Commit a giant wad of changes that makes CpsFlowExecution more bullet…
Browse files Browse the repository at this point in the history
…proof against flownode storage failures and solves JENKINS-48824
  • Loading branch information
svanoort committed Jan 12, 2018
1 parent 795500f commit b24dbd0
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 33 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Expand Up @@ -66,7 +66,7 @@
<java.level>8</java.level>
<no-test-jar>false</no-test-jar>
<git-plugin.version>3.1.0</git-plugin.version>
<workflow-support-plugin.version>2.17-durability-beta-1</workflow-support-plugin.version>
<workflow-support-plugin.version>2.17-durability-beta-2</workflow-support-plugin.version>
<scm-api-plugin.version>2.0.8</scm-api-plugin.version>
<groovy-cps.version>1.21</groovy-cps.version>
<!-- Just until https://github.com/jenkinsci/jenkins-test-harness/pull/91 is released -->
Expand Down
Expand Up @@ -269,6 +269,9 @@ public class CpsFlowExecution extends FlowExecution implements BlockableResume {

boolean resumeBlocked = false;

/** Subdirectory string where we store {@link FlowNode}s */
private String storageDir = null;

/**
* Start nodes that have been created, whose {@link BlockEndNode} is not yet created.
*/
Expand Down Expand Up @@ -387,10 +390,10 @@ public CpsFlowExecution(@Nonnull String script, boolean sandbox, @Nonnull FlowE
this.script = script;
this.sandbox = sandbox;
this.durabilityHint = durabilityHint;
this.storage = createStorage();
this.storage.setAvoidAtomicWrite(!this.getDurabilityHint().isAtomicWrite());
Authentication auth = Jenkins.getAuthentication();
this.user = auth.equals(ACL.SYSTEM) ? null : auth.getName();
this.storage = createStorage();
this.storage.setAvoidAtomicWrite(!this.getDurabilityHint().isAtomicWrite());
}

public CpsFlowExecution(String script, boolean sandbox, FlowExecutionOwner owner) throws IOException {
Expand Down Expand Up @@ -505,7 +508,8 @@ private TimingFlowNodeStorage createStorage() throws IOException {
* Directory where workflow stores its state.
*/
public File getStorageDir() throws IOException {
return new File(this.owner.getRootDir(),"workflow");
return new File(this.owner.getRootDir(),
(this.storageDir != null) ? this.storageDir : "workflow");
}

@Override
Expand Down Expand Up @@ -613,44 +617,59 @@ private void rebuildEmptyGraph() {
LOGGER.log(Level.WARNING, "Failed to persist", e);
}
persistedClean = false;
startNodesSerial = null;
headsSerial = null;
}
}

protected void initializeStorage() throws IOException {
storage = createStorage();
synchronized (this) {
// heads could not be restored in unmarshal, so doing that now:
heads = new TreeMap<Integer,FlowHead>();
for (Map.Entry<Integer,String> entry : headsSerial.entrySet()) {
FlowHead h = new FlowHead(this, entry.getKey());

FlowNode n = storage.getNode(entry.getValue());
if (n != null) {
h.setForDeserialize(storage.getNode(entry.getValue()));
heads.put(h.getId(), h);
} else {
// FlowNodeStorage not in synch with the FlowExecution -- hard failure of master with storage not persisted cleanly
// Or files deleted (maybe build rotation).
// Need to find a way to mimick up the heads and fail cleanly, far enough to let the canResume do its thing
rebuildEmptyGraph();
return;
protected synchronized void initializeStorage() throws IOException {
boolean storageErrors = false; // Maybe storage didn't get to persist properly or files were deleted.
try {
storage = createStorage();

heads = new TreeMap<Integer,FlowHead>();
for (Map.Entry<Integer,String> entry : headsSerial.entrySet()) {
FlowHead h = new FlowHead(this, entry.getKey());

FlowNode n = storage.getNode(entry.getValue());
if (n != null) {
h.setForDeserialize(storage.getNode(entry.getValue()));
heads.put(h.getId(), h);
} else {
storageErrors = true;
break;
}
}
}

headsSerial = null;
// Same for startNodes:
startNodes = new Stack<BlockStartNode>();
for (String id : startNodesSerial) {
FlowNode node = storage.getNode(id);
if (node != null) {
startNodes.add((BlockStartNode) storage.getNode(id));
} else {
// TODO if possible, consider trying to close out unterminated blocks using heads, to keep existing graph history
rebuildEmptyGraph();
return;

if (!storageErrors) {
// Same for startNodes:
storageErrors = false;
startNodes = new Stack<BlockStartNode>();
for (String id : startNodesSerial) {
FlowNode node = storage.getNode(id);
if (node != null) {
startNodes.add((BlockStartNode) storage.getNode(id));
} else {
// TODO if possible, consider trying to close out unterminated blocks using heads, to keep existing graph history
storageErrors = true;
break;
}
}
}
startNodesSerial = null;

} catch (IOException ioe) {
LOGGER.log(Level.WARNING, "Error initializing storage and loading nodes", ioe);
storageErrors = true;
}

if (storageErrors) { //
this.storageDir = getStorageDir()+"-fallback"; // Avoid overwriting data
this.storage = createStorage(); // Empty storage
// Need to find a way to mimic up the heads and fail cleanly, far enough to let the canResume do its thing
rebuildEmptyGraph();
}
}

Expand Down Expand Up @@ -1505,6 +1524,10 @@ public void marshal(Object source, HierarchicalStreamWriter w, MarshallingContex
}
}
writeChild(w, context, "resumeBlocked", e.resumeBlocked, Boolean.class);

if (e.storageDir != null) {
writeChild(w, context, "storageDir", e.storageDir, String.class);
}
}

private <T> void writeChild(HierarchicalStreamWriter w, MarshallingContext context, String name, @Nonnull T v, Class<T> staticType) {
Expand Down Expand Up @@ -1575,6 +1598,9 @@ public Object unmarshal(HierarchicalStreamReader reader, final UnmarshallingCont
} else if (nodeName.equals("resumeBlocked")) {
Boolean val = readChild(reader, context, Boolean.class, result);
setField(result, "resumeBlocked", val.booleanValue());
} else if (nodeName.equals("storageDir")) {
String val = readChild(reader, context, String.class, result);
setField(result, "storageDir", val);
}

reader.moveUp();
Expand Down
Expand Up @@ -40,6 +40,7 @@
import org.junit.runner.Description;
import org.junit.runners.model.Statement;
import org.jvnet.hudson.test.BuildWatcher;
import org.jvnet.hudson.test.Issue;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.RestartableJenkinsRule;

Expand Down Expand Up @@ -523,6 +524,7 @@ public void evaluate() throws Throwable {
/** Verify that if the master dies messily and FlowNode storage is lost entirely we fail the build cleanly.
*/
@Test
@Issue("JENKINS-48824")
public void testDurableAgainstCleanRestartFailsWithBogusStorageFile() throws Exception {
final String[] logStart = new String[1];
story.addStepWithDirtyShutdown(new Statement() {
Expand Down

0 comments on commit b24dbd0

Please sign in to comment.