diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel index 4ff94cee62db..52f12da0e13a 100644 --- a/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel @@ -30,7 +30,7 @@ go_library( "//pkg/util/randutil", "//pkg/util/syncutil", "//pkg/util/timeutil", - "@com_github_pkg_errors//:errors", + "@com_github_cockroachdb_errors//:errors", "@org_golang_x_exp//maps", ], ) diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go index cfbabb7f0453..b1d9ce93ed1d 100644 --- a/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go @@ -23,6 +23,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/errors" ) // Helper is the struct passed to `stepFunc`s (user-provided or @@ -111,8 +112,8 @@ func (h *Helper) Background( return err } - desc := fmt.Sprintf("error in background function %s: %s", name, err) - return h.runner.testFailure(desc, bgLogger, nil) + err := errors.Wrapf(err, "error in background function %s", name) + return h.runner.testFailure(err, bgLogger, nil) } return nil diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go index ebaca0d096c0..6505fcdab057 100644 --- a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go @@ -89,7 +89,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils/release" "github.com/cockroachdb/cockroach/pkg/util/randutil" - "github.com/pkg/errors" + "github.com/cockroachdb/errors" ) const ( diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go index 558c988db41a..ad363b15bd70 100644 --- a/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/runner.go @@ -35,6 +35,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/ctxgroup" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/errors" ) type ( @@ -54,9 +55,7 @@ type ( stopFuncs []StopFunc } - testFailure struct { - summarized bool - description string + testFailureDetails struct { seed int64 testContext *Context binaryVersions []roachpb.Version @@ -171,7 +170,7 @@ func (tr *testRunner) run() (retErr error) { return fmt.Errorf("background step `%s` returned error: %w", event.Name, event.Err) case err := <-tr.monitor.Err(): - return tr.testFailure(err.Error(), tr.logger, nil) + return tr.testFailure(err, tr.logger, nil) } } } @@ -303,18 +302,20 @@ func (tr *testRunner) startBackgroundStep(ss *singleStep, l *logger.Logger, stop // cluster version before and after the step (in case the failure // happened *while* the cluster version was updating). func (tr *testRunner) stepError(err error, step *singleStep, l *logger.Logger) error { - desc := fmt.Sprintf("mixed-version test failure while running step %d (%s): %s", - step.ID, step.impl.Description(), err, + stepErr := errors.Wrapf( + err, + "mixed-version test failure while running step %d (%s)", + step.ID, step.impl.Description(), ) - return tr.testFailure(desc, l, &step.context) + return tr.testFailure(stepErr, l, &step.context) } -// testFailure generates a `testFailure` with the given -// description. It logs the error to the logger passed, and renames -// the underlying file to include the "FAILED" prefix to help in -// debugging. -func (tr *testRunner) testFailure(desc string, l *logger.Logger, testContext *Context) error { +// testFailure generates a `testFailure` for failures that happened +// due to the given error. It logs the error to the logger passed, +// and renames the underlying file to include the "FAILED" prefix to +// help in debugging. +func (tr *testRunner) testFailure(err error, l *logger.Logger, testContext *Context) error { clusterVersionsBefore := tr.clusterVersions var clusterVersionsAfter atomic.Value if tr.connCacheInitialized() { @@ -325,8 +326,7 @@ func (tr *testRunner) testFailure(desc string, l *logger.Logger, testContext *Co } } - tf := &testFailure{ - description: desc, + tf := &testFailureDetails{ seed: tr.seed, testContext: testContext, binaryVersions: loadAtomicVersions(tr.binaryVersions), @@ -334,15 +334,19 @@ func (tr *testRunner) testFailure(desc string, l *logger.Logger, testContext *Co clusterVersionsAfter: loadAtomicVersions(clusterVersionsAfter), } + // failureErr wraps the original error, adding mixed-version state + // information as error details. + failureErr := errors.WithDetailf(err, "%s", tf.Format()) + // Print the test failure on the step's logger for convenience, and // to reduce cross referencing of logs. - l.Printf("%v", tf) + l.Printf("%+v", failureErr) if err := renameFailedLogger(l); err != nil { tr.logger.Printf("could not rename failed step logger: %v", err) } - return tf + return failureErr } // teardown groups together all tasks that happen once a test finishes. @@ -635,30 +639,25 @@ func (br *backgroundRunner) CompletedEvents() <-chan backgroundEvent { return br.events } -func (tf *testFailure) Error() string { - if tf.summarized { - return tf.description - } - tf.summarized = true - +func (tfd *testFailureDetails) Format() string { lines := []string{ - tf.description, - fmt.Sprintf("test random seed: %d\n", tf.seed), + "test failed:", + fmt.Sprintf("test random seed: %d\n", tfd.seed), } - tw := newTableWriter(len(tf.binaryVersions)) - if tf.testContext != nil { - releasedVersions := make([]*clusterupgrade.Version, 0, len(tf.testContext.CockroachNodes)) - for _, node := range tf.testContext.CockroachNodes { - releasedVersions = append(releasedVersions, tf.testContext.NodeVersion(node)) + tw := newTableWriter(len(tfd.binaryVersions)) + if tfd.testContext != nil { + releasedVersions := make([]*clusterupgrade.Version, 0, len(tfd.testContext.CockroachNodes)) + for _, node := range tfd.testContext.CockroachNodes { + releasedVersions = append(releasedVersions, tfd.testContext.NodeVersion(node)) } tw.AddRow("released versions", toString(releasedVersions)...) } - tw.AddRow("logical binary versions", toString(tf.binaryVersions)...) - tw.AddRow("cluster versions before failure", toString(tf.clusterVersionsBefore)...) + tw.AddRow("logical binary versions", toString(tfd.binaryVersions)...) + tw.AddRow("cluster versions before failure", toString(tfd.clusterVersionsBefore)...) - if cv := tf.clusterVersionsAfter; cv != nil { + if cv := tfd.clusterVersionsAfter; cv != nil { tw.AddRow("cluster versions after failure", toString(cv)...) } diff --git a/pkg/kv/kvserver/replica_proposal_buf.go b/pkg/kv/kvserver/replica_proposal_buf.go index 0bff983e0fe9..207d713cfb48 100644 --- a/pkg/kv/kvserver/replica_proposal_buf.go +++ b/pkg/kv/kvserver/replica_proposal_buf.go @@ -125,19 +125,20 @@ type propBuf struct { } type rangeLeaderInfo struct { + // leader is the Raft group's leader. Equals 0 [roachpb.ReplicaID(raft.None)] + // if the leader is not known/set, in which case other fields are unset too. + leader roachpb.ReplicaID // iAmTheLeader is set if the local replica is the leader. iAmTheLeader bool - // leaderKnown is set if the local Raft machinery knows who the leader is. If - // not set, all other fields are empty. - leaderKnown bool - // leader represents the Raft group's leader. Not set if leaderKnown is not - // set. - leader roachpb.ReplicaID // leaderEligibleForLease is set if the leader is known and its type of // replica allows it to acquire a lease. leaderEligibleForLease bool } +func (r rangeLeaderInfo) leaderKnown() bool { + return r.leader != roachpb.ReplicaID(raft.None) +} + type admitEntHandle struct { handle *kvflowcontrolpb.RaftAdmissionMeta pCtx context.Context @@ -682,11 +683,16 @@ func (b *propBuf) maybeRejectUnsafeProposalLocked( // Thus, we do one of two things: // - if the leader is known, we reject this proposal and make sure the // request that needed the lease is redirected to the leaseholder; - // - if the leader is not known, we don't do anything special here to + // - if the leader is not known [^1], we don't do anything special here to // terminate the proposal, but we know that Raft will reject it with a // ErrProposalDropped. We'll eventually re-propose it once a leader is // known, at which point it will either go through or be rejected based on - // whether or not it is this replica that became the leader. + // whether it is this replica that became the leader. + // + // [^1]: however, if the leader is not known and RejectLeaseOnLeaderUnknown + // cluster setting is true, we reject the proposal. + // TODO(pav-kv): make this behaviour default. Right now, it is hidden behind + // the experimental cluster setting. See #120073 and #118435. // // A special case is when the leader is known, but is ineligible to get the // lease. In that case, we have no choice but to continue with the proposal. @@ -698,11 +704,25 @@ func (b *propBuf) maybeRejectUnsafeProposalLocked( if li.iAmTheLeader { return false } - leaderKnownAndEligible := li.leaderKnown && li.leaderEligibleForLease - ownsCurrentLease := b.p.ownsValidLease(ctx, b.clock.NowAsClockTimestamp()) - if leaderKnownAndEligible && !ownsCurrentLease && !b.testing.allowLeaseProposalWhenNotLeader { + if b.p.ownsValidLease(ctx, b.clock.NowAsClockTimestamp()) { + log.VEventf(ctx, 2, "proposing lease extension even though we're not the leader; we hold the current lease") + return false + } + + reject := false + if !li.leaderKnown() && RejectLeaseOnLeaderUnknown.Get(&b.settings.SV) { + log.VEventf(ctx, 2, "not proposing lease acquisition because we're not the leader; the leader is unknown") + reject = true + } + // TODO(pav-kv): the testing knob logic below doesn't exactly correspond to + // its name. Clean it up, potentially replace by the cluster setting above. + if li.leaderEligibleForLease && !b.testing.allowLeaseProposalWhenNotLeader { log.VEventf(ctx, 2, "not proposing lease acquisition because we're not the leader; replica %d is", li.leader) + reject = true + } + if reject { + // NB: li.leader can be None. b.p.rejectProposalWithRedirectLocked(ctx, p, li.leader) if b.p.shouldCampaignOnRedirect(raftGroup) { const format = "campaigning because Raft leader (id=%d) not live in node liveness map" @@ -715,12 +735,9 @@ func (b *propBuf) maybeRejectUnsafeProposalLocked( } return true } - // If the leader is not known, or if it is known but it's ineligible - // for the lease, continue with the proposal as explained above. We - // also send lease extensions for an existing leaseholder. - if ownsCurrentLease { - log.VEventf(ctx, 2, "proposing lease extension even though we're not the leader; we hold the current lease") - } else if !li.leaderKnown { + // If the leader is not known, or if it is known but is ineligible for the + // lease, continue with the proposal as explained above. + if !li.leaderKnown() { log.VEventf(ctx, 2, "proposing lease acquisition even though we're not the leader; the leader is unknown") } else { log.VEventf(ctx, 2, "proposing lease acquisition even though we're not the leader; the leader is ineligible") @@ -810,8 +827,7 @@ func (b *propBuf) maybeRejectUnsafeProposalLocked( func (b *propBuf) leaderStatusRLocked(ctx context.Context, raftGroup proposerRaft) rangeLeaderInfo { leaderInfo := b.p.leaderStatus(ctx, raftGroup) // Sanity check. - if leaderInfo.leaderKnown && leaderInfo.leader == b.p.getReplicaID() && - !leaderInfo.iAmTheLeader { + if leaderInfo.leader == b.p.getReplicaID() && !leaderInfo.iAmTheLeader { log.Fatalf(ctx, "inconsistent Raft state: state %s while the current replica is also the lead: %d", raftGroup.BasicStatus().RaftState, leaderInfo.leader) @@ -1402,7 +1418,6 @@ func (rp *replicaProposer) leaderStatus( } return rangeLeaderInfo{ iAmTheLeader: iAmTheLeader, - leaderKnown: leaderKnown, leader: roachpb.ReplicaID(leader), leaderEligibleForLease: leaderEligibleForLease, } @@ -1444,6 +1459,14 @@ func (rp *replicaProposer) rejectProposalWithRedirectLocked( rangeDesc := r.descRLocked() storeID := r.store.StoreID() r.store.metrics.LeaseRequestErrorCount.Inc(1) + if redirectTo == roachpb.ReplicaID(raft.None) { + // We don't know the leader, so pass Lease{} to give no hint. + rp.rejectProposalWithErrLocked(ctx, prop, kvpb.NewError( + kvpb.NewNotLeaseHolderError(roachpb.Lease{}, storeID, rangeDesc, + "refusing to acquire lease on follower"))) + return + } + redirectRep, _ /* ok */ := rangeDesc.GetReplicaDescriptorByID(redirectTo) log.VEventf(ctx, 2, "redirecting proposal to node %s; request: %s", redirectRep.NodeID, prop.Request) rp.rejectProposalWithErrLocked(ctx, prop, kvpb.NewError( diff --git a/pkg/kv/kvserver/replica_proposal_buf_test.go b/pkg/kv/kvserver/replica_proposal_buf_test.go index 282de7680a9d..9a761f6be039 100644 --- a/pkg/kv/kvserver/replica_proposal_buf_test.go +++ b/pkg/kv/kvserver/replica_proposal_buf_test.go @@ -235,7 +235,6 @@ func (t *testProposer) leaderStatus(ctx context.Context, raftGroup proposerRaft) } return rangeLeaderInfo{ iAmTheLeader: iAmTheLeader, - leaderKnown: leaderKnown, leader: leaderRep, leaderEligibleForLease: leaderEligibleForLease, } diff --git a/pkg/kv/kvserver/replica_range_lease.go b/pkg/kv/kvserver/replica_range_lease.go index e12057ae9c16..e4c3b3f3a958 100644 --- a/pkg/kv/kvserver/replica_range_lease.go +++ b/pkg/kv/kvserver/replica_range_lease.go @@ -144,6 +144,18 @@ var LeaseCheckPreferencesOnAcquisitionEnabled = settings.RegisterBoolSetting( true, ) +// RejectLeaseOnLeaderUnknown controls whether a replica that does not know the +// current raft leader rejects a lease request. +// +// TODO(pav-kv): flip the default to true, and remove this setting when this +// becomes the only behaviour. +var RejectLeaseOnLeaderUnknown = settings.RegisterBoolSetting( + settings.SystemOnly, + "kv.lease.reject_on_leader_unknown.enabled", + "reject lease requests on a replica that does not know the raft leader", + false, +) + var leaseStatusLogLimiter = func() *log.EveryN { e := log.Every(15 * time.Second) e.ShouldLog() // waste the first shot diff --git a/pkg/sql/opt/exec/execbuilder/testdata/select_index b/pkg/sql/opt/exec/execbuilder/testdata/select_index index 896cc4d7a153..5967568e4a90 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/select_index +++ b/pkg/sql/opt/exec/execbuilder/testdata/select_index @@ -1466,16 +1466,16 @@ EXPLAIN (VERBOSE) SELECT * FROM noncover ORDER BY c LIMIT 5 OFFSET 5 distribution: local vectorized: true · -• limit +• index join │ columns: (a, b, c, d) -│ offset: 5 +│ ordering: +c +│ estimated row count: 5 (missing stats) +│ table: noncover@noncover_pkey +│ key columns: a │ -└── • index join - │ columns: (a, b, c, d) - │ ordering: +c - │ estimated row count: 10 (missing stats) - │ table: noncover@noncover_pkey - │ key columns: a +└── • limit + │ columns: (a, c) + │ offset: 5 │ └── • scan columns: (a, c) diff --git a/pkg/sql/opt/xform/rules/limit.opt b/pkg/sql/opt/xform/rules/limit.opt index 80752a5b2c84..010567088419 100644 --- a/pkg/sql/opt/xform/rules/limit.opt +++ b/pkg/sql/opt/xform/rules/limit.opt @@ -37,8 +37,6 @@ # PushLimitIntoIndexJoin pushes a limit through an index join. Since index # lookup can be expensive, it's always better to discard rows beforehand. -# -# TODO(radu): we can similarly push Offset too. [PushLimitIntoIndexJoin, Explore] (Limit (IndexJoin $input:* $indexJoinPrivate:*) & @@ -56,6 +54,25 @@ $indexJoinPrivate ) +# PushOffsetIntoIndexJoin pushes an offset through an index join. Since an index +# lookup can be expensive, it's always better to discard rows beforehand. +[PushOffsetIntoIndexJoin, Explore] +(Offset + (IndexJoin $input:* $indexJoinPrivate:*) & + (IndexJoinPreservesRows $indexJoinPrivate) + $offsetExpr:(Const $offset:* & (IsPositiveInt $offset)) + $ordering:* & + (OrderingCanProjectCols + $ordering + $cols:(OutputCols $input) + ) +) +=> +(IndexJoin + (Offset $input $offsetExpr (PruneOrdering $ordering $cols)) + $indexJoinPrivate +) + # SplitLimitedScanIntoUnionScans splits a non-inverted scan under a limit into a # union-all of limited scans over disjoint intervals. Example: # diff --git a/pkg/sql/opt/xform/testdata/coster/limit b/pkg/sql/opt/xform/testdata/coster/limit index a292f882c795..6b739e59eb99 100644 --- a/pkg/sql/opt/xform/testdata/coster/limit +++ b/pkg/sql/opt/xform/testdata/coster/limit @@ -17,50 +17,50 @@ SELECT * FROM a WHERE y = 10 ORDER BY s, x DESC LIMIT 20 OFFSET 1000 ---- -offset +index-join a ├── columns: x:1!null y:2!null z:3 s:4!null - ├── internal-ordering: +4,-1 opt(2) ├── cardinality: [0 - 20] ├── stats: [rows=1] - ├── cost: 89.9929301 + ├── cost: 35.4529295 ├── key: (1) ├── fd: ()-->(2), (1)-->(3,4) ├── ordering: +4,-1 opt(2) [actual: +4,-1] - ├── index-join a - │ ├── columns: x:1!null y:2!null z:3 s:4!null - │ ├── cardinality: [0 - 1020] - │ ├── stats: [rows=10, distinct(4)=9.56179, null(4)=0] - │ ├── cost: 89.9729301 - │ ├── key: (1) - │ ├── fd: ()-->(2), (1)-->(3,4) - │ ├── ordering: +4,-1 opt(2) [actual: +4,-1] - │ └── limit - │ ├── columns: x:1!null y:2!null s:4!null - │ ├── internal-ordering: +4,-1 opt(2) - │ ├── cardinality: [0 - 1020] - │ ├── stats: [rows=10, distinct(4)=9.56179, null(4)=0] - │ ├── cost: 29.3629295 - │ ├── key: (1) - │ ├── fd: ()-->(2), (1)-->(4) - │ ├── ordering: +4,-1 opt(2) [actual: +4,-1] - │ ├── sort (segmented) - │ │ ├── columns: x:1!null y:2!null s:4!null - │ │ ├── stats: [rows=10, distinct(2)=1, null(2)=0, distinct(4)=9.56179, null(4)=0] - │ │ ├── cost: 29.2529295 - │ │ ├── key: (1) - │ │ ├── fd: ()-->(2), (1)-->(4) - │ │ ├── ordering: +4,-1 opt(2) [actual: +4,-1] - │ │ ├── limit hint: 1020.00 - │ │ └── scan a@a_y_s_idx - │ │ ├── columns: x:1!null y:2!null s:4!null - │ │ ├── constraint: /2/4/1: [/10 - /10] - │ │ ├── stats: [rows=10, distinct(2)=1, null(2)=0, distinct(4)=9.56179, null(4)=0] - │ │ ├── cost: 28.6200001 - │ │ ├── key: (1) - │ │ ├── fd: ()-->(2), (1)-->(4) - │ │ └── ordering: +4 opt(2) [actual: +4] - │ └── 1020 - └── 1000 + └── offset + ├── columns: x:1!null y:2!null s:4!null + ├── internal-ordering: +4,-1 opt(2) + ├── cardinality: [0 - 20] + ├── stats: [rows=1] + ├── cost: 29.3829295 + ├── key: (1) + ├── fd: ()-->(2), (1)-->(4) + ├── ordering: +4,-1 opt(2) [actual: +4,-1] + ├── limit + │ ├── columns: x:1!null y:2!null s:4!null + │ ├── internal-ordering: +4,-1 opt(2) + │ ├── cardinality: [0 - 1020] + │ ├── stats: [rows=10, distinct(4)=9.56179, null(4)=0] + │ ├── cost: 29.3629295 + │ ├── key: (1) + │ ├── fd: ()-->(2), (1)-->(4) + │ ├── ordering: +4,-1 opt(2) [actual: +4,-1] + │ ├── sort (segmented) + │ │ ├── columns: x:1!null y:2!null s:4!null + │ │ ├── stats: [rows=10, distinct(2)=1, null(2)=0, distinct(4)=9.56179, null(4)=0] + │ │ ├── cost: 29.2529295 + │ │ ├── key: (1) + │ │ ├── fd: ()-->(2), (1)-->(4) + │ │ ├── ordering: +4,-1 opt(2) [actual: +4,-1] + │ │ ├── limit hint: 1020.00 + │ │ └── scan a@a_y_s_idx + │ │ ├── columns: x:1!null y:2!null s:4!null + │ │ ├── constraint: /2/4/1: [/10 - /10] + │ │ ├── stats: [rows=10, distinct(2)=1, null(2)=0, distinct(4)=9.56179, null(4)=0] + │ │ ├── cost: 28.6200001 + │ │ ├── key: (1) + │ │ ├── fd: ()-->(2), (1)-->(4) + │ │ └── ordering: +4 opt(2) [actual: +4] + │ └── 1020 + └── 1000 exec-ddl ALTER TABLE a INJECT STATISTICS '[ diff --git a/pkg/sql/opt/xform/testdata/rules/limit b/pkg/sql/opt/xform/testdata/rules/limit index 2ea49a45ddee..cd027b15021f 100644 --- a/pkg/sql/opt/xform/testdata/rules/limit +++ b/pkg/sql/opt/xform/testdata/rules/limit @@ -815,6 +815,110 @@ Final best expression │ └── (a:1 >= 20) AND (a:1 <= 30) [outer=(1), constraints=(/1: [/20 - /30]; tight)] └── 5 +# -------------------------------------------------- +# PushOffsetIntoIndexJoin +# -------------------------------------------------- + +opt expect=PushOffsetIntoIndexJoin +SELECT * FROM kuv WHERE k = 1 OR k = 2 ORDER BY u OFFSET 5 +---- +index-join kuv + ├── columns: k:1!null u:2 v:3 + ├── ordering: +2 + └── offset + ├── columns: k:1!null u:2 rowid:4!null + ├── internal-ordering: +2 + ├── key: (4) + ├── fd: (4)-->(1,2) + ├── ordering: +2 + ├── sort + │ ├── columns: k:1!null u:2 rowid:4!null + │ ├── key: (4) + │ ├── fd: (4)-->(1,2) + │ ├── ordering: +2 + │ └── scan kuv@kuv_k_u_idx + │ ├── columns: k:1!null u:2 rowid:4!null + │ ├── constraint: /1/2/4: [/1 - /2] + │ ├── key: (4) + │ └── fd: (4)-->(1,2) + └── 5 + +# Both LIMIT and OFFSET can be pushed below a join. +opt expect=(PushLimitIntoIndexJoin,PushOffsetIntoIndexJoin) +SELECT * FROM kuv WHERE k = 1 OR k = 2 ORDER BY u LIMIT 10 OFFSET 5 +---- +index-join kuv + ├── columns: k:1!null u:2 v:3 + ├── cardinality: [0 - 10] + ├── ordering: +2 + └── offset + ├── columns: k:1!null u:2 rowid:4!null + ├── internal-ordering: +2 + ├── cardinality: [0 - 10] + ├── key: (4) + ├── fd: (4)-->(1,2) + ├── ordering: +2 + ├── top-k + │ ├── columns: k:1!null u:2 rowid:4!null + │ ├── internal-ordering: +2 + │ ├── k: 15 + │ ├── cardinality: [0 - 15] + │ ├── key: (4) + │ ├── fd: (4)-->(1,2) + │ ├── ordering: +2 + │ └── scan kuv@kuv_k_u_idx + │ ├── columns: k:1!null u:2 rowid:4!null + │ ├── constraint: /1/2/4: [/1 - /2] + │ ├── key: (4) + │ └── fd: (4)-->(1,2) + └── 5 + +# Ensure that the offset is not pushed down when the ordering requires columns +# produced by the IndexJoin. +opt expect-not=PushOffsetIntoIndexJoin +SELECT * FROM kuv WHERE u > 1 AND u < 10 ORDER BY u, v OFFSET 5 +---- +offset + ├── columns: k:1 u:2!null v:3 + ├── internal-ordering: +2,+3 + ├── ordering: +2,+3 + ├── sort + │ ├── columns: k:1 u:2!null v:3 + │ ├── ordering: +2,+3 + │ └── select + │ ├── columns: k:1 u:2!null v:3 + │ ├── scan kuv + │ │ └── columns: k:1 u:2 v:3 + │ └── filters + │ └── (u:2 > 1) AND (u:2 < 10) [outer=(2), constraints=(/2: [/2 - /9]; tight)] + └── 5 + +# Ensure that the offset is not pushed down when using SKIP LOCKED. +opt expect-not=PushOffsetIntoIndexJoin +SELECT * FROM kuv WHERE k = 1 ORDER BY u OFFSET 5 FOR UPDATE SKIP LOCKED +---- +offset + ├── columns: k:1!null u:2 v:3 + ├── internal-ordering: +2 opt(1) + ├── volatile + ├── fd: ()-->(1) + ├── ordering: +2 opt(1) [actual: +2] + ├── index-join kuv + │ ├── columns: k:1!null u:2 v:3 + │ ├── locking: for-update,skip-locked + │ ├── volatile + │ ├── fd: ()-->(1) + │ ├── ordering: +2 opt(1) [actual: +2] + │ └── scan kuv@kuv_k_u_idx + │ ├── columns: k:1!null u:2 rowid:4!null + │ ├── constraint: /1/2/4: [/1 - /1] + │ ├── locking: for-update,skip-locked + │ ├── volatile + │ ├── key: (4) + │ ├── fd: ()-->(1), (4)-->(2) + │ └── ordering: +2 opt(1) [actual: +2] + └── 5 + # -------------------------------------------------- # PushLimitIntoOffset + GenerateLimitedScans # -------------------------------------------------- @@ -825,51 +929,51 @@ Final best expression opt SELECT * from a ORDER BY s LIMIT 10 OFFSET 10 ---- -offset +index-join a ├── columns: k:1!null i:2 f:3 s:4 j:5 - ├── internal-ordering: +4 ├── cardinality: [0 - 10] ├── key: (1) ├── fd: (1)-->(2-5) ├── ordering: +4 - ├── index-join a - │ ├── columns: k:1!null i:2 f:3 s:4 j:5 - │ ├── cardinality: [0 - 20] - │ ├── key: (1) - │ ├── fd: (1)-->(2-5) - │ ├── ordering: +4 - │ └── scan a@s_idx - │ ├── columns: k:1!null i:2 f:3 s:4 - │ ├── limit: 20 - │ ├── key: (1) - │ ├── fd: (1)-->(2-4) - │ └── ordering: +4 - └── 10 + └── offset + ├── columns: k:1!null i:2 f:3 s:4 + ├── internal-ordering: +4 + ├── cardinality: [0 - 10] + ├── key: (1) + ├── fd: (1)-->(2-4) + ├── ordering: +4 + ├── scan a@s_idx + │ ├── columns: k:1!null i:2 f:3 s:4 + │ ├── limit: 20 + │ ├── key: (1) + │ ├── fd: (1)-->(2-4) + │ └── ordering: +4 + └── 10 # The right index is used for the limited scan based on the order. opt SELECT * from a ORDER BY s DESC LIMIT 10 OFFSET 10 ---- -offset +index-join a ├── columns: k:1!null i:2 f:3 s:4 j:5 - ├── internal-ordering: -4 ├── cardinality: [0 - 10] ├── key: (1) ├── fd: (1)-->(2-5) ├── ordering: -4 - ├── index-join a - │ ├── columns: k:1!null i:2 f:3 s:4 j:5 - │ ├── cardinality: [0 - 20] - │ ├── key: (1) - │ ├── fd: (1)-->(2-5) - │ ├── ordering: -4 - │ └── scan a@si_idx - │ ├── columns: k:1!null i:2 s:4 j:5 - │ ├── limit: 20 - │ ├── key: (1) - │ ├── fd: (1)-->(2,4,5) - │ └── ordering: -4 - └── 10 + └── offset + ├── columns: k:1!null i:2 s:4 j:5 + ├── internal-ordering: -4 + ├── cardinality: [0 - 10] + ├── key: (1) + ├── fd: (1)-->(2,4,5) + ├── ordering: -4 + ├── scan a@si_idx + │ ├── columns: k:1!null i:2 s:4 j:5 + │ ├── limit: 20 + │ ├── key: (1) + │ ├── fd: (1)-->(2,4,5) + │ └── ordering: -4 + └── 10 # PushLimitIntoIndexJoin propagates row-level locking information. opt