diff --git a/changelog/22.0/22.0.0/summary.md b/changelog/22.0/22.0.0/summary.md
index 958ad8ce5a3..2fb66ea8969 100644
--- a/changelog/22.0/22.0.0/summary.md
+++ b/changelog/22.0/22.0.0/summary.md
@@ -102,9 +102,9 @@ You can control idle connection retention for the query server’s query pool, s
This feature ensures that, during traffic spikes, idle connections are available for faster responses, while minimizing overhead in low-traffic periods by limiting the number of idle connections retained. It helps strike a balance between performance, efficiency, and cost.
### Stalled Disk Recovery in VTOrc
-VTOrc has been augmented to be able to identify and recover from stalled disk errors. This is done by polling that the disk is writable by the vttablets and they send this information in the full status output to VTOrc. If the disk is not writable on the primary tablet, VTOrc will attempt to recover the cluster by reparenting to a different primary. This is useful in scenarios where the disk is stalled and the primary vttablet is unable to accept writes because of it.
+VTOrc can now identify and recover from stalled disk errors. VTTablets test whether the disk is writable and send this information in the full status output to VTOrc. If the disk is not writable on the primary tablet, VTOrc will attempt to recover the cluster by promoting a new primary. This is useful when a stalled disk leaves the primary vttablet unable to accept writes.
-To opt into this feature, `--enable-stalled-disk-primary-recovery` flag has to be specified on VTOrc, and `--stalled-disk-write-dir` flag has to be specified on the vttablets. `--stalled-disk-write-interval` and `--stalled-disk-write-timeout` flags can be used to configure the polling interval and timeout respectively.
+To opt into this feature, the `--enable-primary-disk-stalled-recovery` flag has to be specified on VTOrc, and the `--disk-write-dir` flag has to be specified on the vttablets. The `--disk-write-interval` and `--disk-write-timeout` flags can be used to configure the polling interval and timeout respectively.
## Minor Changes
diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt
index a6140263ce6..76c8e894347 100644
--- a/go/flags/endtoend/vtcombo.txt
+++ b/go/flags/endtoend/vtcombo.txt
@@ -102,6 +102,9 @@ Flags:
--ddl_strategy string Set default strategy for DDL statements. Override with @@ddl_strategy session variable (default "direct")
--default_tablet_type topodatapb.TabletType The default tablet type to set for queries, when one is not explicitly selected. (default PRIMARY)
--degraded_threshold duration replication lag after which a replica is considered degraded (default 30s)
+ --disk-write-dir string if provided, tablet will attempt to write a file to this directory to check if the disk is stalled
+ --disk-write-interval duration how often to write to the disk to check whether it is stalled (default 5s)
+ --disk-write-timeout duration if writes exceed this duration, the disk is considered stalled (default 30s)
--emit_stats If set, emit stats to push-based monitoring and stats backends
--enable-consolidator Synonym to -enable_consolidator (default true)
--enable-consolidator-replicas Synonym to -enable_consolidator_replicas
@@ -328,9 +331,6 @@ Flags:
--srv_topo_cache_refresh duration how frequently to refresh the topology for cached entries (default 1s)
--srv_topo_cache_ttl duration how long to use cached entries for topology (default 1s)
--srv_topo_timeout duration topo server timeout (default 5s)
- --stalled-disk-write-dir string if provided, tablet will attempt to write a file to this directory to check if the disk is stalled
- --stalled-disk-write-interval duration how often to write to the disk to check whether it is stalled (default 5s)
- --stalled-disk-write-timeout duration if writes exceed this duration, the disk is considered stalled (default 30s)
--start_mysql Should vtcombo also start mysql
--stats_backend string The name of the registered push-based monitoring/stats backend to use
--stats_combine_dimensions string List of dimensions to be combined into a single "all" value in exported stats vars
diff --git a/go/flags/endtoend/vtorc.txt b/go/flags/endtoend/vtorc.txt
index b4ec79996f4..ca8083709e5 100644
--- a/go/flags/endtoend/vtorc.txt
+++ b/go/flags/endtoend/vtorc.txt
@@ -33,7 +33,7 @@ Flags:
--config-type string Config file type (omit to infer config type from file extension).
--consul_auth_static_file string JSON File to read the topos/tokens from.
--emit_stats If set, emit stats to push-based monitoring and stats backends
- --enable-stalled-disk-primary-recovery Whether VTOrc should be analyzing and recovering stalled disk primary failures
+ --enable-primary-disk-stalled-recovery Whether VTOrc should detect a stalled disk on the primary and failover
--grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024)
--grpc_auth_static_client_creds string When using grpc_static_auth in the server, this file provides the credentials to use to authenticate with server.
--grpc_compression string Which protocol to use for compressing gRPC. Default: nothing. Supported: snappy
diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt
index 44e242c5ae5..955823f7322 100644
--- a/go/flags/endtoend/vttablet.txt
+++ b/go/flags/endtoend/vttablet.txt
@@ -133,6 +133,9 @@ Flags:
--dba_idle_timeout duration Idle timeout for dba connections (default 1m0s)
--dba_pool_size int Size of the connection pool for dba connections (default 20)
--degraded_threshold duration replication lag after which a replica is considered degraded (default 30s)
+ --disk-write-dir string if provided, tablet will attempt to write a file to this directory to check if the disk is stalled
+ --disk-write-interval duration how often to write to the disk to check whether it is stalled (default 5s)
+ --disk-write-timeout duration if writes exceed this duration, the disk is considered stalled (default 30s)
--emit_stats If set, emit stats to push-based monitoring and stats backends
--enable-consolidator Synonym to -enable_consolidator (default true)
--enable-consolidator-replicas Synonym to -enable_consolidator_replicas
@@ -328,9 +331,6 @@ Flags:
--srv_topo_cache_refresh duration how frequently to refresh the topology for cached entries (default 1s)
--srv_topo_cache_ttl duration how long to use cached entries for topology (default 1s)
--srv_topo_timeout duration topo server timeout (default 5s)
- --stalled-disk-write-dir string if provided, tablet will attempt to write a file to this directory to check if the disk is stalled
- --stalled-disk-write-interval duration how often to write to the disk to check whether it is stalled (default 5s)
- --stalled-disk-write-timeout duration if writes exceed this duration, the disk is considered stalled (default 30s)
--stats_backend string The name of the registered push-based monitoring/stats backend to use
--stats_combine_dimensions string List of dimensions to be combined into a single "all" value in exported stats vars
--stats_common_tags strings Comma-separated list of common tags for the stats backend. It provides both label and values. Example: label1:value1,label2:value2
diff --git a/go/vt/vtorc/config/config.go b/go/vt/vtorc/config/config.go
index 6a4b40503e1..db367673aeb 100644
--- a/go/vt/vtorc/config/config.go
+++ b/go/vt/vtorc/config/config.go
@@ -175,10 +175,10 @@ var (
},
)
- enableStalledDiskPrimaryRecovery = viperutil.Configure(
- "enable-stalled-disk-primary-recovery",
+ enablePrimaryDiskStalledRecovery = viperutil.Configure(
+ "enable-primary-disk-stalled-recovery",
viperutil.Options[bool]{
- FlagName: "enable-stalled-disk-primary-recovery",
+ FlagName: "enable-primary-disk-stalled-recovery",
Default: false,
Dynamic: true,
},
@@ -206,7 +206,7 @@ func registerFlags(fs *pflag.FlagSet) {
fs.Duration("recovery-poll-duration", recoveryPollDuration.Default(), "Timer duration on which VTOrc polls its database to run a recovery")
fs.Bool("allow-emergency-reparent", ersEnabled.Default(), "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary")
fs.Bool("change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs.Default(), "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED")
- fs.Bool("enable-stalled-disk-primary-recovery", enableStalledDiskPrimaryRecovery.Default(), "Whether VTOrc should be analyzing and recovering stalled disk primary failures")
+ fs.Bool("enable-primary-disk-stalled-recovery", enablePrimaryDiskStalledRecovery.Default(), "Whether VTOrc should detect a stalled disk on the primary and failover")
viperutil.BindFlags(fs,
instancePollTime,
@@ -224,7 +224,7 @@ func registerFlags(fs *pflag.FlagSet) {
recoveryPollDuration,
ersEnabled,
convertTabletsWithErrantGTIDs,
- enableStalledDiskPrimaryRecovery,
+ enablePrimaryDiskStalledRecovery,
)
}
@@ -345,7 +345,7 @@ func SetConvertTabletWithErrantGTIDs(val bool) {
// GetStalledDiskPrimaryRecovery reports whether VTOrc is allowed to check for and recovery stalled disk problems.
func GetStalledDiskPrimaryRecovery() bool {
- return enableStalledDiskPrimaryRecovery.Get()
+ return enablePrimaryDiskStalledRecovery.Get()
}
// MarkConfigurationLoaded is called once configuration has first been loaded.
diff --git a/go/vt/vtorc/db/generate_base.go b/go/vt/vtorc/db/generate_base.go
index f4a62577ee8..8baa9a12476 100644
--- a/go/vt/vtorc/db/generate_base.go
+++ b/go/vt/vtorc/db/generate_base.go
@@ -105,7 +105,7 @@ CREATE TABLE database_instance (
semi_sync_primary_status TINYint NOT NULL DEFAULT 0,
semi_sync_replica_status TINYint NOT NULL DEFAULT 0,
semi_sync_primary_clients int NOT NULL DEFAULT 0,
- stalled_disk TINYint NOT NULL DEFAULT 0,
+ is_disk_stalled TINYint NOT NULL DEFAULT 0,
PRIMARY KEY (alias)
)`,
`
diff --git a/go/vt/vtorc/inst/analysis.go b/go/vt/vtorc/inst/analysis.go
index 06435b4a6d7..6a800e5ee0b 100644
--- a/go/vt/vtorc/inst/analysis.go
+++ b/go/vt/vtorc/inst/analysis.go
@@ -56,7 +56,7 @@ const (
LockedSemiSyncPrimaryHypothesis AnalysisCode = "LockedSemiSyncPrimaryHypothesis"
LockedSemiSyncPrimary AnalysisCode = "LockedSemiSyncPrimary"
ErrantGTIDDetected AnalysisCode = "ErrantGTIDDetected"
- StalledDiskPrimary AnalysisCode = "StalledDiskPrimary"
+ PrimaryDiskStalled AnalysisCode = "PrimaryDiskStalled"
)
type StructureAnalysisCode string
@@ -130,7 +130,7 @@ type ReplicationAnalysis struct {
MaxReplicaGTIDMode string
MaxReplicaGTIDErrant string
IsReadOnly bool
- IsStalledDisk bool
+ IsDiskStalled bool
}
func (replicationAnalysis *ReplicationAnalysis) MarshalJSON() ([]byte, error) {
diff --git a/go/vt/vtorc/inst/analysis_dao.go b/go/vt/vtorc/inst/analysis_dao.go
index a1a92baa20e..d41b5677a81 100644
--- a/go/vt/vtorc/inst/analysis_dao.go
+++ b/go/vt/vtorc/inst/analysis_dao.go
@@ -234,7 +234,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
DISTINCT case when replica_instance.log_bin
AND replica_instance.log_replica_updates then replica_instance.major_version else NULL end
) AS count_distinct_logging_major_versions,
- primary_instance.stalled_disk != 0 AS is_stalled_disk
+ primary_instance.is_disk_stalled != 0 AS is_disk_stalled
FROM
vitess_tablet
JOIN vitess_keyspace ON (
@@ -355,7 +355,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
a.HeartbeatInterval = m.GetFloat64("heartbeat_interval")
a.IsReadOnly = m.GetUint("read_only") == 1
- a.IsStalledDisk = m.GetBool("is_stalled_disk")
+ a.IsDiskStalled = m.GetBool("is_disk_stalled")
if !a.LastCheckValid {
analysisMessage := fmt.Sprintf("analysis: Alias: %+v, Keyspace: %+v, Shard: %+v, IsPrimary: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v",
@@ -403,11 +403,10 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
} else if isInvalid {
a.Analysis = InvalidReplica
a.Description = "VTOrc hasn't been able to reach the replica even once since restart/shutdown"
- } else if a.IsClusterPrimary && !a.LastCheckValid && a.IsStalledDisk {
- a.Analysis = StalledDiskPrimary
+ } else if a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled {
+ a.Analysis = PrimaryDiskStalled
a.Description = "Primary has a stalled disk"
ca.hasClusterwideAction = true
- //
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0 {
a.Analysis = DeadPrimaryWithoutReplicas
a.Description = "Primary cannot be reached by vtorc and has no replica"
diff --git a/go/vt/vtorc/inst/analysis_dao_test.go b/go/vt/vtorc/inst/analysis_dao_test.go
index c04938e378c..baa1121b776 100644
--- a/go/vt/vtorc/inst/analysis_dao_test.go
+++ b/go/vt/vtorc/inst/analysis_dao_test.go
@@ -118,7 +118,7 @@ func TestGetReplicationAnalysisDecision(t *testing.T) {
}},
keyspaceWanted: "ks",
shardWanted: "0",
- codeWanted: StalledDiskPrimary,
+ codeWanted: PrimaryDiskStalled,
}, {
name: "DeadPrimary",
info: []*test.InfoForRecoveryAnalysis{{
diff --git a/go/vt/vtorc/inst/instance_dao.go b/go/vt/vtorc/inst/instance_dao.go
index 6fde60bcf36..2a24caf0a0b 100644
--- a/go/vt/vtorc/inst/instance_dao.go
+++ b/go/vt/vtorc/inst/instance_dao.go
@@ -879,7 +879,7 @@ func mkInsertForInstances(instances []*Instance, instanceWasActuallyFound bool,
"semi_sync_primary_clients",
"semi_sync_replica_status",
"last_discovery_latency",
- "stalled_disk",
+ "is_disk_stalled",
}
values := make([]string, len(columns))
@@ -1011,7 +1011,7 @@ func UpdateInstanceLastChecked(tabletAlias string, partialSuccess bool, stalledD
SET
last_checked = DATETIME('now'),
last_check_partial_success = ?,
- stalled_disk = ?
+ is_disk_stalled = ?
WHERE
alias = ?
`,
diff --git a/go/vt/vtorc/inst/instance_dao_test.go b/go/vt/vtorc/inst/instance_dao_test.go
index 235c9b2664e..c3b99455741 100644
--- a/go/vt/vtorc/inst/instance_dao_test.go
+++ b/go/vt/vtorc/inst/instance_dao_test.go
@@ -64,7 +64,7 @@ func TestMkInsertSingle(t *testing.T) {
version, major_version, version_comment, binlog_server, read_only, binlog_format,
binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, replica_net_timeout, heartbeat_interval,
replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant,
- source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, last_discovery_latency, stalled_disk, last_seen)
+ source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, last_discovery_latency, is_disk_stalled, last_seen)
VALUES
(?, ?, ?, DATETIME('now'), DATETIME('now'), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, DATETIME('now'))
`
@@ -87,7 +87,7 @@ func TestMkInsertThree(t *testing.T) {
version, major_version, version_comment, binlog_server, read_only, binlog_format,
binlog_row_image, log_bin, log_replica_updates, binary_log_file, binary_log_pos, source_host, source_port, replica_net_timeout, heartbeat_interval,
replica_sql_running, replica_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, source_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant,
- source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, last_discovery_latency, stalled_disk, last_seen)
+ source_log_file, read_source_log_pos, relay_source_log_file, exec_source_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, replication_lag_seconds, replica_lag_seconds, sql_delay, data_center, region, physical_environment, replication_depth, is_co_primary, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_primary_enabled, semi_sync_primary_timeout, semi_sync_primary_wait_for_replica_count, semi_sync_replica_enabled, semi_sync_primary_status, semi_sync_primary_clients, semi_sync_replica_status, last_discovery_latency, is_disk_stalled, last_seen)
VALUES
(?, ?, ?, DATETIME('now'), DATETIME('now'), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, DATETIME('now')),
(?, ?, ?, DATETIME('now'), DATETIME('now'), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, DATETIME('now')),
@@ -515,19 +515,19 @@ func TestUpdateInstanceLastChecked(t *testing.T) {
tabletAlias: "zone1-0000000100",
partialSuccess: false,
stalledDisk: false,
- conditionToCheck: "last_checked >= DATETIME('now', '-30 second') and last_check_partial_success = false and stalled_disk = false",
+ conditionToCheck: "last_checked >= DATETIME('now', '-30 second') and last_check_partial_success = false and is_disk_stalled = false",
}, {
name: "Verify partial success",
tabletAlias: "zone1-0000000100",
partialSuccess: true,
stalledDisk: false,
- conditionToCheck: "last_checked >= datetime('now', '-30 second') and last_check_partial_success = true and stalled_disk = false",
+ conditionToCheck: "last_checked >= datetime('now', '-30 second') and last_check_partial_success = true and is_disk_stalled = false",
}, {
name: "Verify stalled disk",
tabletAlias: "zone1-0000000100",
partialSuccess: false,
stalledDisk: true,
- conditionToCheck: "last_checked >= DATETIME('now', '-30 second') and last_check_partial_success = false and stalled_disk = true",
+ conditionToCheck: "last_checked >= DATETIME('now', '-30 second') and last_check_partial_success = false and is_disk_stalled = true",
}, {
name: "Verify no error on unknown tablet",
tabletAlias: "unknown tablet",
diff --git a/go/vt/vtorc/logic/topology_recovery.go b/go/vt/vtorc/logic/topology_recovery.go
index 1bf69098b65..ab41d1fa988 100644
--- a/go/vt/vtorc/logic/topology_recovery.go
+++ b/go/vt/vtorc/logic/topology_recovery.go
@@ -285,7 +285,7 @@ func checkAndRecoverGenericProblem(ctx context.Context, analysisEntry *inst.Repl
func getCheckAndRecoverFunctionCode(analysisCode inst.AnalysisCode, tabletAlias string) recoveryFunction {
switch analysisCode {
// primary
- case inst.DeadPrimary, inst.DeadPrimaryAndSomeReplicas, inst.StalledDiskPrimary:
+ case inst.DeadPrimary, inst.DeadPrimaryAndSomeReplicas, inst.PrimaryDiskStalled:
// If ERS is disabled, we have no way of repairing the cluster.
if !config.ERSEnabled() {
log.Infof("VTOrc not configured to run ERS, skipping recovering %v", analysisCode)
diff --git a/go/vt/vtorc/logic/topology_recovery_test.go b/go/vt/vtorc/logic/topology_recovery_test.go
index e539468dda4..ca164d78836 100644
--- a/go/vt/vtorc/logic/topology_recovery_test.go
+++ b/go/vt/vtorc/logic/topology_recovery_test.go
@@ -45,7 +45,7 @@ func TestAnalysisEntriesHaveSameRecovery(t *testing.T) {
}, {
// DeadPrimary and StalledDiskPrimary have the same recovery
prevAnalysisCode: inst.DeadPrimary,
- newAnalysisCode: inst.StalledDiskPrimary,
+ newAnalysisCode: inst.PrimaryDiskStalled,
shouldBeEqual: true,
}, {
// DeadPrimary and PrimaryTabletDeleted are different recoveries.
@@ -223,12 +223,12 @@ func TestGetCheckAndRecoverFunctionCode(t *testing.T) {
}, {
name: "StalledDiskPrimary with ERS enabled",
ersEnabled: true,
- analysisCode: inst.StalledDiskPrimary,
+ analysisCode: inst.PrimaryDiskStalled,
wantRecoveryFunction: recoverDeadPrimaryFunc,
}, {
name: "StalledDiskPrimary with ERS disabled",
ersEnabled: false,
- analysisCode: inst.StalledDiskPrimary,
+ analysisCode: inst.PrimaryDiskStalled,
wantRecoveryFunction: noRecoveryFunc,
}, {
name: "PrimaryTabletDeleted with ERS enabled",
diff --git a/go/vt/vtorc/test/recovery_analysis.go b/go/vt/vtorc/test/recovery_analysis.go
index 3a0bdb70b03..bb6e4132243 100644
--- a/go/vt/vtorc/test/recovery_analysis.go
+++ b/go/vt/vtorc/test/recovery_analysis.go
@@ -146,7 +146,7 @@ func (info *InfoForRecoveryAnalysis) ConvertToRowMap() sqlutils.RowMap {
rowMap["semi_sync_replica_enabled"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.SemiSyncReplicaEnabled), Valid: true}
res, _ := prototext.Marshal(info.TabletInfo)
rowMap["tablet_info"] = sqlutils.CellData{String: string(res), Valid: true}
- rowMap["is_stalled_disk"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsStalledDisk), Valid: true}
+ rowMap["is_disk_stalled"] = sqlutils.CellData{String: fmt.Sprintf("%v", info.IsStalledDisk), Valid: true}
return rowMap
}
diff --git a/go/vt/vttablet/tabletmanager/disk_health_monitor_test.go b/go/vt/vttablet/tabletmanager/disk_health_monitor_test.go
index fcb3b0ed258..68930f3061d 100644
--- a/go/vt/vttablet/tabletmanager/disk_health_monitor_test.go
+++ b/go/vt/vttablet/tabletmanager/disk_health_monitor_test.go
@@ -46,7 +46,7 @@ func TestDiskHealthMonitor_stallAndRecover(t *testing.T) {
}
}
-func TestDiskHealthMonitor_errorIsStall(t *testing.T) {
+func TestDiskHealthMonitor_stallDetected(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
mockFileWriter := &sequencedMockWriter{defaultWriteFunction: delayedWriteFunction(10*time.Millisecond, errors.New("test error"))}
diff --git a/go/vt/vttablet/tabletmanager/rpc_replication.go b/go/vt/vttablet/tabletmanager/rpc_replication.go
index bce172b52d4..b27b25d87c6 100644
--- a/go/vt/vttablet/tabletmanager/rpc_replication.go
+++ b/go/vt/vttablet/tabletmanager/rpc_replication.go
@@ -63,7 +63,7 @@ func (tm *TabletManager) FullStatus(ctx context.Context) (*replicationdatapb.Ful
}
// Return error if the disk is stalled or rejecting writes.
- // Noop by default, must be enabled with the flag "stalled-disk-write-dir".
+ // Noop by default, must be enabled with the flag "disk-write-dir".
if tm.dhMonitor.IsDiskStalled() {
return nil, errors.New("stalled disk")
}
diff --git a/go/vt/vttablet/tabletmanager/tm_init.go b/go/vt/vttablet/tabletmanager/tm_init.go
index 6da165ab330..c22ea0a6e51 100644
--- a/go/vt/vttablet/tabletmanager/tm_init.go
+++ b/go/vt/vttablet/tabletmanager/tm_init.go
@@ -112,9 +112,9 @@ func registerInitFlags(fs *pflag.FlagSet) {
fs.Var(&initTags, "init_tags", "(init parameter) comma separated list of key:value pairs used to tag the tablet")
fs.DurationVar(&initTimeout, "init_timeout", initTimeout, "(init parameter) timeout to use for the init phase.")
fs.DurationVar(&mysqlShutdownTimeout, "mysql-shutdown-timeout", mysqlShutdownTimeout, "timeout to use when MySQL is being shut down.")
- fs.StringVar(&stalledDiskWriteDir, "stalled-disk-write-dir", stalledDiskWriteDir, "if provided, tablet will attempt to write a file to this directory to check if the disk is stalled")
- fs.DurationVar(&stalledDiskWriteTimeout, "stalled-disk-write-timeout", stalledDiskWriteTimeout, "if writes exceed this duration, the disk is considered stalled")
- fs.DurationVar(&stalledDiskWriteInterval, "stalled-disk-write-interval", stalledDiskWriteInterval, "how often to write to the disk to check whether it is stalled")
+ fs.StringVar(&stalledDiskWriteDir, "disk-write-dir", stalledDiskWriteDir, "if provided, tablet will attempt to write a file to this directory to check if the disk is stalled")
+ fs.DurationVar(&stalledDiskWriteTimeout, "disk-write-timeout", stalledDiskWriteTimeout, "if writes exceed this duration, the disk is considered stalled")
+ fs.DurationVar(&stalledDiskWriteInterval, "disk-write-interval", stalledDiskWriteInterval, "how often to write to the disk to check whether it is stalled")
}
var (