From e138dabecc89e1d614df515b395f38521e9d6c7d Mon Sep 17 00:00:00 2001 From: milen <94537774+taratorio@users.noreply.github.com> Date: Wed, 4 Sep 2024 21:05:25 +0100 Subject: [PATCH] astrid: fix stage integration cached fork choice logic (#11870) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Next chain tip error caught and fixed for astrid stage integration: ``` append with gap blockNum=11561329, but current height=11561327 ``` happens after unwind due to a fork change in the corresponding fork choice update. This is due to a bug in the logic of handling fork choice updates in the stage integration. The issue is that when processing the `cachedForkChoice` after we have done the unwind, `fixCanonicalChain` returns empty `newNodes` (correctly, since the chain was fixed before we cached the fork choice). The solution is to cache the new nodes as the `cachedForkChoice` so that when we process the cached fork choice in the next iteration we can correctly update the tx nums for the new nodes. Full logs: ``` INFO[09-04|16:14:31.018] [2/6 PolygonSync] update fork choice block=11561328 age=0 hash=0x41ebb5e01406c1f013f06ee4e53ab68b125f071717d50bcdcfa4597a0a052cfe INFO[09-04|16:14:31.019] [2/6 PolygonSync] new fork - unwinding and caching fork choice DBUG[09-04|16:14:31.021] UnwindTo block=11561327 block_hash=0xc8ba20e1e4dc312bda4aadc5108722205693783b1c2d6103cb70949bda58a460 err=nil stack="[sync.go:171 stage_polygon_sync.go:1391 stage_polygon_sync.go:1356 stage_polygon_sync.go:1478 stage_polygon_sync.go:501 stage_polygon_sync.go:175 default_stages.go:479 sync.go:531 sync.go:410 stageloop.go:249 stageloop.go:101 asm_arm64.s:1222]" DBUG[09-04|16:14:31.021] [2/6 PolygonSync] DONE in=5.45216175s DBUG[09-04|16:14:31.021] [1/6 OtterSync] DONE in=21.167µs INFO[09-04|16:14:31.021] [2/6 PolygonSync] forward progress=11561327 INFO[09-04|16:14:31.021] [2/6 PolygonSync] new fork - processing cached fork choice after unwind INFO[09-04|16:14:31.022] [2/6 PolygonSync] update fork choice block=11561328 age=0 hash=0x41ebb5e01406c1f013f06ee4e53ab68b125f071717d50bcdcfa4597a0a052cfe DBUG[09-04|16:14:31.022] [2/6 PolygonSync] DONE in=186.792µs DBUG[09-04|16:14:31.022] [3/6 Senders] DONE in=236.458µs INFO[09-04|16:14:31.024] [4/6 Execution] Done Commit every block blk=11561327 blks=1 blk/s=1125.7 txs=2 tx/s=2.25k gas/s=0 buf=0B/512.0MB stepsInDB=0.00 step=24.3 alloc=600.4MB sys=1.7GB DBUG[09-04|16:14:31.024] [4/6 Execution] DONE in=2.020375ms DBUG[09-04|16:14:31.024] [5/6 TxLookup] DONE in=74.292µs DBUG[09-04|16:14:31.024] [6/6 Finish] DONE in=2.958µs INFO[09-04|16:14:31.024] Timings (slower than 50ms) PolygonSync=5.452s alloc=600.5MB sys=1.7GB DBUG[09-04|16:14:31.025] [6/6 Finish] Prune done in=5.625µs DBUG[09-04|16:14:31.025] [5/6 TxLookup] Prune done in=237.084µs DBUG[09-04|16:14:31.025] [4/6 Execution] Prune done in=65.958µs DBUG[09-04|16:14:31.025] [3/6 Senders] Prune done in=2.75µs DBUG[09-04|16:14:31.025] [2/6 PolygonSync] Prune done in=2.25µs DBUG[09-04|16:14:31.025] [snapshots] Prune Blocks to=11559976 limit=10 DBUG[09-04|16:14:31.026] [snapshots] Prune Bor Blocks to=11559976 limit=10 DBUG[09-04|16:14:31.026] [1/6 OtterSync] Prune done in=1.334833ms DBUG[09-04|16:14:31.154] [1/6 OtterSync] DONE in=6.792µs INFO[09-04|16:14:31.154] [2/6 PolygonSync] forward progress=11561327 DBUG[09-04|16:14:33.030] [bridge] processing new blocks from=11561329 to=11561329 lastProcessedBlockNum=11561328 lastProcessedBlockTime=1725462871 lastProcessedEventID=2688 DBUG[09-04|16:14:33.030] [sync] inserted blocks len=1 duration=1.184125ms DBUG[09-04|16:14:33.030] [bor.heimdall] synchronizing spans... blockNum=11561329 DBUG[09-04|16:14:33.031] [bridge] synchronizing events... blockNum=11561329 lastProcessedBlockNum=11561328 INFO[09-04|16:14:33.031] [2/6 PolygonSync] update fork choice block=11561329 age=0 hash=0x298f72d6fbbfdc8d3df098828867dea7e8e7bba787c1eb17f6c6025afa9ac3d1 WARN[09-04|16:14:33.032] [bor.heimdall] an error while fetching path=bor/latest-span queryParams= attempt=1 err="Get \"https://heimdall-api-amoy.polygon.technology/bor/latest-span\": context canceled" DBUG[09-04|16:14:33.032] [bor.heimdall] request canceled reason="context canceled" path=bor/latest-span queryParams= attempt=1 EROR[09-04|16:14:36.032] [2/6 PolygonSync] stopping node err="append with gap blockNum=11561329, but current height=11561327, stack: [txnum.go:149 accessors_chain.go:703 stage_polygon_sync.go:1398 stage_polygon_sync.go:1356 stage_polygon_sync.go:1478 stage_polygon_sync.go:501 stage_polygon_sync.go:175 default_stages.go:479 sync.go:531 sync.go:410 stageloop.go:249 stageloop.go:101 asm_arm64.s:1222]" DBUG[09-04|16:14:36.032] Error while executing stage err="[2/6 PolygonSync] stopped: append with gap blockNum=11561329, but current height=11561327, stack: [txnum.go:149 accessors_chain.go:703 stage_polygon_sync.go:1398 stage_polygon_sync.go:1356 stage_polygon_sync.go:1478 stage_polygon_sync.go:501 stage_polygon_sync.go:175 default_stages.go:479 sync.go:531 sync.go:410 stageloop.go:249 stageloop.go:101 asm_arm64.s:1222]" DBUG[09-04|16:14:36.033] rpcdaemon: the subscription to pending blocks channel was closed ``` --- eth/stagedsync/stage_polygon_sync.go | 90 +++++++++++++++++++++++----- 1 file changed, 75 insertions(+), 15 deletions(-) diff --git a/eth/stagedsync/stage_polygon_sync.go b/eth/stagedsync/stage_polygon_sync.go index a6b1cf5ffb3..616ffd8764a 100644 --- a/eth/stagedsync/stage_polygon_sync.go +++ b/eth/stagedsync/stage_polygon_sync.go @@ -470,15 +470,8 @@ func (s *polygonSyncStageService) Run(ctx context.Context, tx kv.RwTx, stageStat s.runBgComponentsOnce(ctx) - if s.executionEngine.cachedForkChoice != nil { - s.logger.Info(s.appendLogPrefix("new fork - processing cached fork choice after unwind")) - err := s.executionEngine.updateForkChoice(tx, s.executionEngine.cachedForkChoice) - if err != nil { - return err - } - - s.executionEngine.cachedForkChoice = nil - return nil + if err := s.executionEngine.processCachedForkChoiceIfNeeded(tx); err != nil { + return err } for { @@ -1261,6 +1254,36 @@ func (s polygonSyncStageBridgeStore) Close() { // no-op } +func newPolygonSyncStageForkChoice(newNodes []chainNode) *polygonSyncStageForkChoice { + if len(newNodes) == 0 { + panic("unexpected newNodes to be 0") + } + + return &polygonSyncStageForkChoice{newNodes: newNodes} +} + +type polygonSyncStageForkChoice struct { + // note newNodes contains tip first and its new ancestors after it (oldest is last) + // we assume len(newNodes) is never 0, guarded by panic in newPolygonSyncStageForkChoice + newNodes []chainNode +} + +func (fc polygonSyncStageForkChoice) tipBlockNum() uint64 { + return fc.newNodes[0].number +} + +func (fc polygonSyncStageForkChoice) tipBlockHash() common.Hash { + return fc.newNodes[0].hash +} + +func (fc polygonSyncStageForkChoice) oldestNewAncestorBlockNum() uint64 { + return fc.newNodes[len(fc.newNodes)-1].number +} + +func (fc polygonSyncStageForkChoice) numNodes() int { + return len(fc.newNodes) +} + type polygonSyncStageExecutionEngine struct { blockReader services.FullBlockReader txActionStream chan<- polygonSyncStageTxAction @@ -1269,7 +1292,7 @@ type polygonSyncStageExecutionEngine struct { appendLogPrefix func(string) string stageState *StageState unwinder Unwinder - cachedForkChoice *types.Header + cachedForkChoice *polygonSyncStageForkChoice } func (e *polygonSyncStageExecutionEngine) GetHeader(ctx context.Context, blockNum uint64) (*types.Header, error) { @@ -1385,21 +1408,38 @@ func (e *polygonSyncStageExecutionEngine) updateForkChoice(tx kv.RwTx, tip *type } if len(badNodes) > 0 { - e.logger.Info(e.appendLogPrefix("new fork - unwinding and caching fork choice")) badNode := badNodes[len(badNodes)-1] - e.cachedForkChoice = tip - return e.unwinder.UnwindTo(badNode.number, ForkReset(badNode.hash), tx) + unwindNumber := badNode.number - 1 + badHash := badNode.hash + e.cachedForkChoice = newPolygonSyncStageForkChoice(newNodes) + + e.logger.Info( + e.appendLogPrefix("new fork - unwinding and caching fork choice"), + "unwindNumber", unwindNumber, + "badHash", badHash, + "cachedTipNumber", e.cachedForkChoice.tipBlockNum(), + "cachedTipHash", e.cachedForkChoice.tipBlockHash(), + "cachedNewNodes", e.cachedForkChoice.numNodes(), + ) + + return e.unwinder.UnwindTo(unwindNumber, ForkReset(badHash), tx) } if len(newNodes) == 0 { return nil } - if err := rawdb.AppendCanonicalTxNums(tx, newNodes[len(newNodes)-1].number); err != nil { + return e.updateForkChoiceForward(tx, newPolygonSyncStageForkChoice(newNodes)) +} + +func (e *polygonSyncStageExecutionEngine) updateForkChoiceForward(tx kv.RwTx, fc *polygonSyncStageForkChoice) error { + tipBlockNum := fc.tipBlockNum() + + if err := rawdb.AppendCanonicalTxNums(tx, fc.oldestNewAncestorBlockNum()); err != nil { return err } - if err := rawdb.WriteHeadHeaderHash(tx, tipHash); err != nil { + if err := rawdb.WriteHeadHeaderHash(tx, fc.tipBlockHash()); err != nil { return err } @@ -1422,6 +1462,26 @@ func (e *polygonSyncStageExecutionEngine) updateForkChoice(tx kv.RwTx, tip *type return nil } +func (e *polygonSyncStageExecutionEngine) processCachedForkChoiceIfNeeded(tx kv.RwTx) error { + if e.cachedForkChoice == nil { + return nil + } + + e.logger.Info( + e.appendLogPrefix("new fork - processing cached fork choice after unwind"), + "cachedTipNumber", e.cachedForkChoice.tipBlockNum(), + "cachedTipHash", e.cachedForkChoice.tipBlockHash(), + "cachedNewNodes", e.cachedForkChoice.numNodes(), + ) + + if err := e.updateForkChoiceForward(tx, e.cachedForkChoice); err != nil { + return err + } + + e.cachedForkChoice = nil + return nil +} + func (e *polygonSyncStageExecutionEngine) CurrentHeader(ctx context.Context) (*types.Header, error) { type response struct { result *types.Header