diff --git a/helper/restore_helper.go b/helper/restore_helper.go
index 8a488fb8f..b78c301d3 100644
--- a/helper/restore_helper.go
+++ b/helper/restore_helper.go
@@ -327,13 +327,22 @@ func doRestoreAgent() error {
 			log(fmt.Sprintf("Oid %d: Copied %d bytes into the pipe", oid, bytesRead))
 
 			log(fmt.Sprintf("Closing pipe for oid %d: %s", oid, currentPipe))
-			err = flushAndCloseRestoreWriter(currentPipe, oid)
-			if err != nil {
-				log(fmt.Sprintf("Oid %d: Failed to flush and close pipe", oid))
-				goto LoopEnd
-			}
 
 			// Recreate pipe to prevent data holdover bug
+			// The bug is likely caused by an unsynchronized reader and
+			// writer. If the reader is slow enough, the writer can append
+			// another batch of data to the same pipe on the next loop
+			// cycle; in the worst case, the reader then consumes both
+			// batches in a single loop cycle.
+			// Recreating the pipe before flushing is essential. Otherwise,
+			// a fast enough reader can open a pipe that has already been
+			// flushed but not yet deleted. That race causes the writer to
+			// wait endlessly in the next ENXIO cycle above and the reader
+			// to wait for data on the other side of the pipe.
+			// TODO: The whole batches loop looks heavier than necessary.
+			// It could likely be simplified and sped up by using a single
+			// writer with a single pipe, which would make the problem
+			// described above moot.
 			err = deletePipe(currentPipe)
 			if err != nil {
 				log(fmt.Sprintf("Error deleting pipe %s at end of batch loop: %v", currentPipe, err))
@@ -345,6 +354,12 @@ func doRestoreAgent() error {
 				goto LoopEnd
 			}
 
+			err = flushAndCloseRestoreWriter(currentPipe, oid)
+			if err != nil {
+				log(fmt.Sprintf("Oid %d: Failed to flush and close pipe", oid))
+				goto LoopEnd
+			}
+
 			contentToRestore += *destSize
 		}
 		log(fmt.Sprintf("Oid %d: Successfully flushed and closed pipe", oid))
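
For context, here is a minimal, self-contained Go sketch of the ordering this patch moves to: write the batch, unlink and recreate the FIFO, and only then flush and close the writer. It is Unix-only, and every name in it (`openPipeForWrite`, `writeBatch`, the pipe path) is a hypothetical stand-in rather than a real gpbackup helper, but the retry-on-ENXIO open loop and the teardown order mirror the behavior the comment above describes.

```go
// Unix-only sketch of the reordering in this patch: unlink and recreate
// the FIFO *before* flushing/closing the writer. All names here are
// hypothetical stand-ins, not the real gpbackup helpers.
package main

import (
	"errors"
	"fmt"
	"os"
	"syscall"
	"time"
)

// openPipeForWrite mimics the helper's non-blocking open loop: opening a
// FIFO with O_NONBLOCK for writing fails with ENXIO until some reader has
// opened the other end, so the writer retries until one shows up.
func openPipeForWrite(path string) (*os.File, error) {
	for {
		f, err := os.OpenFile(path, os.O_WRONLY|syscall.O_NONBLOCK, 0)
		if err == nil {
			return f, nil
		}
		if errors.Is(err, syscall.ENXIO) {
			time.Sleep(50 * time.Millisecond) // no reader yet; retry
			continue
		}
		return nil, err
	}
}

// writeBatch writes one batch, then tears the pipe down in the fixed order.
func writeBatch(pipePath string, batch []byte) error {
	writer, err := openPipeForWrite(pipePath)
	if err != nil {
		return err
	}
	if _, err := writer.Write(batch); err != nil {
		writer.Close()
		return err
	}
	// Fixed ordering: remove and recreate the FIFO *before* closing the
	// writer. A reader that raced us and already opened the old inode
	// still drains it and sees EOF when we close; any later reader can
	// only open the fresh, empty pipe, so no data carries over and the
	// writer's next ENXIO loop is satisfied by the right reader.
	if err := os.Remove(pipePath); err != nil {
		writer.Close()
		return err
	}
	if err := syscall.Mkfifo(pipePath, 0700); err != nil {
		writer.Close()
		return err
	}
	return writer.Close() // the flush-and-close step, now last
}

func main() {
	pipePath := "/tmp/restore_helper_example_pipe" // hypothetical path
	_ = os.Remove(pipePath)
	if err := syscall.Mkfifo(pipePath, 0700); err != nil {
		panic(err)
	}
	defer os.Remove(pipePath)

	// A toy reader that drains one batch per open, like a restore session.
	done := make(chan struct{})
	go func() {
		defer close(done)
		for i := 0; i < 2; i++ {
			data, err := os.ReadFile(pipePath) // blocks until the writer closes
			if err != nil {
				panic(err)
			}
			fmt.Printf("reader got batch %d: %q\n", i, data)
		}
	}()

	for i := 0; i < 2; i++ {
		if err := writeBatch(pipePath, []byte(fmt.Sprintf("batch-%d", i))); err != nil {
			panic(err)
		}
	}
	<-done
}
```

The property this ordering buys, under the sketch's assumptions: once the path points at a fresh inode, a reader that raced the teardown can only be holding the old inode, which reaches EOF when the writer closes, so neither side can end up waiting on a pipe the other will never touch.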