From e48177366a6f1a0c6a410095d4dcace27a15002b Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Wed, 29 Apr 2026 13:53:53 +0800 Subject: [PATCH 1/3] pd: use physically destroyed when tiup using --force Signed-off-by: lhy1024 --- pkg/cluster/api/pdapi.go | 13 +++++++++++++ pkg/cluster/operation/scale_in.go | 29 ++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/pkg/cluster/api/pdapi.go b/pkg/cluster/api/pdapi.go index d61450451b..8e2e26b0a5 100644 --- a/pkg/cluster/api/pdapi.go +++ b/pkg/cluster/api/pdapi.go @@ -800,6 +800,16 @@ func (pc *PDClient) IsUp(host string) (bool, error) { // DelStore deletes stores from a (TiKV) host // The host parameter should be in format of IP:Port, that matches store's address func (pc *PDClient) DelStore(host string, retryOpt *utils.RetryOption) error { + return pc.delStore(host, retryOpt, false) +} + +// DelStorePhysicallyDestroyed deletes a store and tells PD that the store has +// been physically destroyed, so PD can bypass replica-count checks. +func (pc *PDClient) DelStorePhysicallyDestroyed(host string, retryOpt *utils.RetryOption) error { + return pc.delStore(host, retryOpt, true) +} + +func (pc *PDClient) delStore(host string, retryOpt *utils.RetryOption, physicallyDestroyed bool) error { // get info of current stores storeInfo, err := pc.GetCurrentStore(host) if err != nil { @@ -813,6 +823,9 @@ func (pc *PDClient) DelStore(host string, retryOpt *utils.RetryOption) error { storeID := storeInfo.Store.Id cmd := fmt.Sprintf("%s/%d", pdStoreURI, storeID) + if physicallyDestroyed { + cmd += "?force=true" + } endpoints := pc.getEndpoints(cmd) logger := pc.l() diff --git a/pkg/cluster/operation/scale_in.go b/pkg/cluster/operation/scale_in.go index 1c04b3f8c9..121751dcb2 100644 --- a/pkg/cluster/operation/scale_in.go +++ b/pkg/cluster/operation/scale_in.go @@ -192,7 +192,7 @@ func ScaleInCluster( compName := component.Name() if compName != spec.ComponentPump && compName != spec.ComponentDrainer { - if err := deleteMember(ctx, component, instance, pdClient, binlogClient, options.APITimeout); err != nil { + if err := deletePhysicallyDestroyedMember(ctx, component, instance, pdClient, binlogClient, options.APITimeout); err != nil { logger.Warnf("failed to delete %s: %v", compName, err) } } @@ -387,6 +387,29 @@ func deleteMember( pdClient *api.PDClient, binlogClient *api.BinlogClient, timeoutSecond uint64, +) error { + return deleteMemberWithStoreDelete(ctx, component, instance, pdClient, pdClient.DelStore, binlogClient, timeoutSecond) +} + +func deletePhysicallyDestroyedMember( + ctx context.Context, + component spec.Component, + instance spec.Instance, + pdClient *api.PDClient, + binlogClient *api.BinlogClient, + timeoutSecond uint64, +) error { + return deleteMemberWithStoreDelete(ctx, component, instance, pdClient, pdClient.DelStorePhysicallyDestroyed, binlogClient, timeoutSecond) +} + +func deleteMemberWithStoreDelete( + ctx context.Context, + component spec.Component, + instance spec.Instance, + pdClient *api.PDClient, + delStore func(string, *utils.RetryOption) error, + binlogClient *api.BinlogClient, + timeoutSecond uint64, ) error { timeoutOpt := &utils.RetryOption{ Timeout: time.Second * time.Duration(timeoutSecond), @@ -395,12 +418,12 @@ func deleteMember( switch component.Name() { case spec.ComponentTiKV: - if err := pdClient.DelStore(instance.ID(), timeoutOpt); err != nil { + if err := delStore(instance.ID(), timeoutOpt); err != nil { return err } case spec.ComponentTiFlash: addr := utils.JoinHostPort(instance.GetHost(), instance.(*spec.TiFlashInstance).GetServicePort()) - if err := pdClient.DelStore(addr, timeoutOpt); err != nil { + if err := delStore(addr, timeoutOpt); err != nil { return err } case spec.ComponentPD: From 3430a625b73a9d4c24c15be35b341e0959e37026 Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Wed, 29 Apr 2026 14:08:50 +0800 Subject: [PATCH 2/3] pd: fix delete store lint and add test Signed-off-by: lhy1024 --- pkg/cluster/api/pdapi.go | 6 +-- pkg/cluster/api/pdapi_test.go | 89 +++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 pkg/cluster/api/pdapi_test.go diff --git a/pkg/cluster/api/pdapi.go b/pkg/cluster/api/pdapi.go index 8e2e26b0a5..b06cad64f3 100644 --- a/pkg/cluster/api/pdapi.go +++ b/pkg/cluster/api/pdapi.go @@ -800,16 +800,16 @@ func (pc *PDClient) IsUp(host string) (bool, error) { // DelStore deletes stores from a (TiKV) host // The host parameter should be in format of IP:Port, that matches store's address func (pc *PDClient) DelStore(host string, retryOpt *utils.RetryOption) error { - return pc.delStore(host, retryOpt, false) + return pc.deleteStore(host, retryOpt, false) } // DelStorePhysicallyDestroyed deletes a store and tells PD that the store has // been physically destroyed, so PD can bypass replica-count checks. func (pc *PDClient) DelStorePhysicallyDestroyed(host string, retryOpt *utils.RetryOption) error { - return pc.delStore(host, retryOpt, true) + return pc.deleteStore(host, retryOpt, true) } -func (pc *PDClient) delStore(host string, retryOpt *utils.RetryOption, physicallyDestroyed bool) error { +func (pc *PDClient) deleteStore(host string, retryOpt *utils.RetryOption, physicallyDestroyed bool) error { // get info of current stores storeInfo, err := pc.GetCurrentStore(host) if err != nil { diff --git a/pkg/cluster/api/pdapi_test.go b/pkg/cluster/api/pdapi_test.go new file mode 100644 index 0000000000..867da1d739 --- /dev/null +++ b/pkg/cluster/api/pdapi_test.go @@ -0,0 +1,89 @@ +// Copyright 2026 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package api + +import ( + "context" + "fmt" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + "time" + + "github.com/pingcap/kvproto/pkg/metapb" + logprinter "github.com/pingcap/tiup/pkg/logger/printer" + "github.com/pingcap/tiup/pkg/utils" + "github.com/stretchr/testify/require" +) + +func TestDelStorePhysicallyDestroyedQuery(t *testing.T) { + tests := []struct { + name string + delStore func(*PDClient, string, *utils.RetryOption) error + expectedDeleteRawQuery string + }{ + { + name: "regular delete", + delStore: (*PDClient).DelStore, + expectedDeleteRawQuery: "", + }, + { + name: "physically destroyed delete", + delStore: (*PDClient).DelStorePhysicallyDestroyed, + expectedDeleteRawQuery: "force=true", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var deleted atomic.Bool + var deleteRawQuery string + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/pd/api/v1/stores": + state := metapb.StoreState_Up + if deleted.Load() { + state = metapb.StoreState_Offline + } + fmt.Fprintf(w, `{"count":1,"stores":[{"store":{"id":42,"address":"store-1:20160","state":%d}}]}`, state) + case r.Method == http.MethodDelete && r.URL.Path == "/pd/api/v1/store/42": + deleteRawQuery = r.URL.RawQuery + deleted.Store(true) + fmt.Fprint(w, `"The store is set as Offline."`) + default: + http.NotFound(w, r) + } + })) + defer server.Close() + + logger := logprinter.NewLogger("") + logger.SetStdout(io.Discard) + logger.SetStderr(io.Discard) + ctx := context.WithValue(context.Background(), logprinter.ContextKeyLogger, logger) + client := NewPDClient(ctx, []string{strings.TrimPrefix(server.URL, "http://")}, time.Second, nil) + + err := tt.delStore(client, "store-1:20160", &utils.RetryOption{ + Delay: time.Millisecond, + Timeout: time.Second, + }) + + require.NoError(t, err) + require.Equal(t, tt.expectedDeleteRawQuery, deleteRawQuery) + }) + } +} From 77a4237377f11ef9a7a8c6f447319bea8611f3fa Mon Sep 17 00:00:00 2001 From: lhy1024 Date: Thu, 30 Apr 2026 15:43:30 +0800 Subject: [PATCH 3/3] pd: abort force scale-in on delete failure Signed-off-by: lhy1024 --- pkg/cluster/api/pdapi.go | 13 ----- pkg/cluster/api/pdapi_test.go | 83 ++++++++++++------------------- pkg/cluster/operation/scale_in.go | 31 ++---------- 3 files changed, 35 insertions(+), 92 deletions(-) diff --git a/pkg/cluster/api/pdapi.go b/pkg/cluster/api/pdapi.go index b06cad64f3..d61450451b 100644 --- a/pkg/cluster/api/pdapi.go +++ b/pkg/cluster/api/pdapi.go @@ -800,16 +800,6 @@ func (pc *PDClient) IsUp(host string) (bool, error) { // DelStore deletes stores from a (TiKV) host // The host parameter should be in format of IP:Port, that matches store's address func (pc *PDClient) DelStore(host string, retryOpt *utils.RetryOption) error { - return pc.deleteStore(host, retryOpt, false) -} - -// DelStorePhysicallyDestroyed deletes a store and tells PD that the store has -// been physically destroyed, so PD can bypass replica-count checks. -func (pc *PDClient) DelStorePhysicallyDestroyed(host string, retryOpt *utils.RetryOption) error { - return pc.deleteStore(host, retryOpt, true) -} - -func (pc *PDClient) deleteStore(host string, retryOpt *utils.RetryOption, physicallyDestroyed bool) error { // get info of current stores storeInfo, err := pc.GetCurrentStore(host) if err != nil { @@ -823,9 +813,6 @@ func (pc *PDClient) deleteStore(host string, retryOpt *utils.RetryOption, physic storeID := storeInfo.Store.Id cmd := fmt.Sprintf("%s/%d", pdStoreURI, storeID) - if physicallyDestroyed { - cmd += "?force=true" - } endpoints := pc.getEndpoints(cmd) logger := pc.l() diff --git a/pkg/cluster/api/pdapi_test.go b/pkg/cluster/api/pdapi_test.go index 867da1d739..f44acb1901 100644 --- a/pkg/cluster/api/pdapi_test.go +++ b/pkg/cluster/api/pdapi_test.go @@ -30,60 +30,39 @@ import ( "github.com/stretchr/testify/require" ) -func TestDelStorePhysicallyDestroyedQuery(t *testing.T) { - tests := []struct { - name string - delStore func(*PDClient, string, *utils.RetryOption) error - expectedDeleteRawQuery string - }{ - { - name: "regular delete", - delStore: (*PDClient).DelStore, - expectedDeleteRawQuery: "", - }, - { - name: "physically destroyed delete", - delStore: (*PDClient).DelStorePhysicallyDestroyed, - expectedDeleteRawQuery: "force=true", - }, - } +func TestDelStoreDoesNotUseForceQuery(t *testing.T) { + var deleted atomic.Bool + var deleteRawQuery string - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - var deleted atomic.Bool - var deleteRawQuery string + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/pd/api/v1/stores": + state := metapb.StoreState_Up + if deleted.Load() { + state = metapb.StoreState_Offline + } + fmt.Fprintf(w, `{"count":1,"stores":[{"store":{"id":42,"address":"store-1:20160","state":%d}}]}`, state) + case r.Method == http.MethodDelete && r.URL.Path == "/pd/api/v1/store/42": + deleteRawQuery = r.URL.RawQuery + deleted.Store(true) + fmt.Fprint(w, `"The store is set as Offline."`) + default: + http.NotFound(w, r) + } + })) + defer server.Close() - server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - switch { - case r.Method == http.MethodGet && r.URL.Path == "/pd/api/v1/stores": - state := metapb.StoreState_Up - if deleted.Load() { - state = metapb.StoreState_Offline - } - fmt.Fprintf(w, `{"count":1,"stores":[{"store":{"id":42,"address":"store-1:20160","state":%d}}]}`, state) - case r.Method == http.MethodDelete && r.URL.Path == "/pd/api/v1/store/42": - deleteRawQuery = r.URL.RawQuery - deleted.Store(true) - fmt.Fprint(w, `"The store is set as Offline."`) - default: - http.NotFound(w, r) - } - })) - defer server.Close() + logger := logprinter.NewLogger("") + logger.SetStdout(io.Discard) + logger.SetStderr(io.Discard) + ctx := context.WithValue(context.Background(), logprinter.ContextKeyLogger, logger) + client := NewPDClient(ctx, []string{strings.TrimPrefix(server.URL, "http://")}, time.Second, nil) - logger := logprinter.NewLogger("") - logger.SetStdout(io.Discard) - logger.SetStderr(io.Discard) - ctx := context.WithValue(context.Background(), logprinter.ContextKeyLogger, logger) - client := NewPDClient(ctx, []string{strings.TrimPrefix(server.URL, "http://")}, time.Second, nil) + err := client.DelStore("store-1:20160", &utils.RetryOption{ + Delay: time.Millisecond, + Timeout: time.Second, + }) - err := tt.delStore(client, "store-1:20160", &utils.RetryOption{ - Delay: time.Millisecond, - Timeout: time.Second, - }) - - require.NoError(t, err) - require.Equal(t, tt.expectedDeleteRawQuery, deleteRawQuery) - }) - } + require.NoError(t, err) + require.Empty(t, deleteRawQuery) } diff --git a/pkg/cluster/operation/scale_in.go b/pkg/cluster/operation/scale_in.go index 121751dcb2..f7c66b09dd 100644 --- a/pkg/cluster/operation/scale_in.go +++ b/pkg/cluster/operation/scale_in.go @@ -192,8 +192,8 @@ func ScaleInCluster( compName := component.Name() if compName != spec.ComponentPump && compName != spec.ComponentDrainer { - if err := deletePhysicallyDestroyedMember(ctx, component, instance, pdClient, binlogClient, options.APITimeout); err != nil { - logger.Warnf("failed to delete %s: %v", compName, err) + if err := deleteMember(ctx, component, instance, pdClient, binlogClient, options.APITimeout); err != nil { + return errors.Trace(err) } } @@ -387,29 +387,6 @@ func deleteMember( pdClient *api.PDClient, binlogClient *api.BinlogClient, timeoutSecond uint64, -) error { - return deleteMemberWithStoreDelete(ctx, component, instance, pdClient, pdClient.DelStore, binlogClient, timeoutSecond) -} - -func deletePhysicallyDestroyedMember( - ctx context.Context, - component spec.Component, - instance spec.Instance, - pdClient *api.PDClient, - binlogClient *api.BinlogClient, - timeoutSecond uint64, -) error { - return deleteMemberWithStoreDelete(ctx, component, instance, pdClient, pdClient.DelStorePhysicallyDestroyed, binlogClient, timeoutSecond) -} - -func deleteMemberWithStoreDelete( - ctx context.Context, - component spec.Component, - instance spec.Instance, - pdClient *api.PDClient, - delStore func(string, *utils.RetryOption) error, - binlogClient *api.BinlogClient, - timeoutSecond uint64, ) error { timeoutOpt := &utils.RetryOption{ Timeout: time.Second * time.Duration(timeoutSecond), @@ -418,12 +395,12 @@ func deleteMemberWithStoreDelete( switch component.Name() { case spec.ComponentTiKV: - if err := delStore(instance.ID(), timeoutOpt); err != nil { + if err := pdClient.DelStore(instance.ID(), timeoutOpt); err != nil { return err } case spec.ComponentTiFlash: addr := utils.JoinHostPort(instance.GetHost(), instance.(*spec.TiFlashInstance).GetServicePort()) - if err := delStore(addr, timeoutOpt); err != nil { + if err := pdClient.DelStore(addr, timeoutOpt); err != nil { return err } case spec.ComponentPD: