From e1e035aa79a360a33c99430a99b2dda8c38e4af3 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Wed, 10 Jun 2026 12:25:32 +0100 Subject: [PATCH 1/7] DAOS-19008 control: Erase formatting after failed format --replace Signed-off-by: Tom Nabarro --- src/control/cmd/dmg/storage_test.go | 19 ++++++++ src/control/server/ctl_storage_rpc.go | 23 ++++----- src/control/server/ctl_storage_rpc_test.go | 2 +- src/control/server/instance.go | 20 ++++++-- src/control/server/instance_storage.go | 54 ++++++++++++++++++++++ src/control/system/membership.go | 3 -- src/control/system/membership_test.go | 15 +++++- 7 files changed, 116 insertions(+), 20 deletions(-) diff --git a/src/control/cmd/dmg/storage_test.go b/src/control/cmd/dmg/storage_test.go index 124b46e984a..2f0fdb50588 100644 --- a/src/control/cmd/dmg/storage_test.go +++ b/src/control/cmd/dmg/storage_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2019-2022 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -154,6 +155,24 @@ func TestStorageCommands(t *testing.T) { printRequest(t, nvmeAddDeviceReq().WithStorageTierIndex(0)), nil, }, + { + "Format with replace; no hosts in hostlist", + "storage format --replace", + "", + errors.New("expects a single host"), + }, + { + "Format with replace; multiple hosts in hostlist", + "storage format --replace -l foo[1,2].com", + "", + errors.New("expects a single host"), + }, + { + "Format with replace and force", + "storage format --replace --force", + "", + errors.New("may not be mixed with --force"), + }, { "Nonexistent subcommand", "storage quack", diff --git a/src/control/server/ctl_storage_rpc.go b/src/control/server/ctl_storage_rpc.go index 4efefd6c32f..14eda4d4fd4 100644 --- a/src/control/server/ctl_storage_rpc.go +++ b/src/control/server/ctl_storage_rpc.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // (C) Copyright 2025 Google LLC // // SPDX-License-Identifier: BSD-2-Clause-Patent @@ -856,10 +856,10 @@ type formatScmReq struct { } func formatScm(ctx context.Context, req formatScmReq, resp *ctlpb.StorageFormatResp) (map[int]string, map[int]bool, error) { - needFormat := make(map[int]bool) + needScmFormat := make(map[int]bool) emptyTmpfs := make(map[int]bool) scmCfgs := make(map[int]*storage.TierConfig) - allNeedFormat := true + allNeedScmFormat := true for idx, ei := range req.instances { needs, err := ei.GetStorage().ScmNeedsFormat() @@ -867,9 +867,9 @@ func formatScm(ctx context.Context, req formatScmReq, resp *ctlpb.StorageFormatR return nil, nil, errors.Wrap(err, "detecting if SCM format is needed") } if needs { - needFormat[idx] = true + needScmFormat[idx] = true } else { - allNeedFormat = false + allNeedScmFormat = false } scmCfg, err := ei.GetStorage().GetScmConfig() @@ -882,19 +882,20 @@ func formatScm(ctx context.Context, req formatScmReq, resp *ctlpb.StorageFormatR if scmCfg.Class == storage.ClassRam && !needs { info, err := ei.GetStorage().GetScmUsage() if err != nil { - return nil, nil, errors.Wrapf(err, "failed to check SCM usage for instance %d", idx) + return nil, nil, errors.Wrapf(err, + "failed to check SCM usage for instance %d", idx) } emptyTmpfs[idx] = info.TotalBytes-info.AvailBytes == 0 } } - if req.replace && len(needFormat) == 0 { + if req.replace && len(needScmFormat) == 0 { // Only valid if at least one engine requires format. - return nil, nil, errors.New("format replace option only valid if at " + - "least one engine requires format but no engines need format") + return nil, nil, errors.New("format replace option only valid if at least one " + + "engine requires scm-format but currently no engines need scm-format") } - if allNeedFormat { + if allNeedScmFormat { // Check available RAM is sufficient before formatting SCM on engines. if err := checkTmpfsMem(req.log, scmCfgs, req.getSysMemInfo); err != nil { return nil, nil, err @@ -907,7 +908,7 @@ func formatScm(ctx context.Context, req formatScmReq, resp *ctlpb.StorageFormatR formatting := 0 for idx, ei := range req.instances { - if needFormat[idx] || req.reformat { + if needScmFormat[idx] || req.reformat { formatting++ go func(e Engine) { scmChan <- e.StorageFormatSCM(ctx, req.reformat) diff --git a/src/control/server/ctl_storage_rpc_test.go b/src/control/server/ctl_storage_rpc_test.go index cde9c4387f8..e9f6ca70432 100644 --- a/src/control/server/ctl_storage_rpc_test.go +++ b/src/control/server/ctl_storage_rpc_test.go @@ -2360,7 +2360,7 @@ func TestServer_CtlSvc_StorageFormat(t *testing.T) { }, }, }, - expErr: errors.New("only valid if at least one engine requires format"), + expErr: errors.New("only valid if at least one engine requires scm-format"), expResp: &ctlpb.StorageFormatResp{ Crets: []*ctlpb.NvmeControllerResult{ { diff --git a/src/control/server/instance.go b/src/control/server/instance.go index c763ae1b7a9..02be3b4151a 100644 --- a/src/control/server/instance.go +++ b/src/control/server/instance.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -216,9 +216,24 @@ func (ei *EngineInstance) determineRank(ctx context.Context, ready *srvpb.Notify Replace: ei.replaceRank.Load(), } + // Reset replaceRank state for instance after joinSystem() has been attempted. + defer ei.replaceRank.SetFalse() + resp, err := ei.joinSystem(ctx, joinReq) if err != nil { ei.log.Errorf("join failed: %s", err) + + // If this is a replace operation and join failed, clean up the formatted storage to + // prevent leaving the rank in a formatted state. This prevents the engine + // inadvertently being joined later with a new rank. + if ei.replaceRank.Load() { + ei.log.Infof("cleaning up after join failure during replace") + if cleanupErr := ei.cleanupFailedJoinReplace(ctx); cleanupErr != nil { + ei.log.Errorf("failed to cleanup after join failure: %v", cleanupErr) + // Don't override the original join error + } + } + return ranklist.NilRank, false, 0, err } switch resp.State { @@ -237,9 +252,6 @@ func (ei *EngineInstance) determineRank(ctx context.Context, ready *srvpb.Notify } r = ranklist.Rank(resp.Rank) - // Reset replaceRank state for instance after joinSystem() has returned. - ei.replaceRank.SetFalse() - if !superblock.ValidRank || ready.Uri != superblock.URI { ei.log.Noticef("updating rank %d URI to %s", resp.Rank, ready.Uri) superblock.Rank = new(ranklist.Rank) diff --git a/src/control/server/instance_storage.go b/src/control/server/instance_storage.go index ab8f8adcb98..ec258b868a9 100644 --- a/src/control/server/instance_storage.go +++ b/src/control/server/instance_storage.go @@ -12,6 +12,7 @@ import ( "context" "fmt" "os" + "syscall" "github.com/dustin/go-humanize" "github.com/pkg/errors" @@ -76,6 +77,59 @@ func (ei *EngineInstance) NotifyStorageReady(replaceRank bool) { }() } +// cleanupFailedJoinReplace cleans up storage after a join failure during replace operation. +// This is called when format succeeded but the join to the system failed, leaving +// the storage in a partially initialized state. +func (ei *EngineInstance) cleanupFailedJoinReplace(ctx context.Context) error { + idx := ei.Index() + ei.log.Infof("instance %d: cleaning up after join failure during replace", idx) + + storageProv := ei.GetStorage() + + // Get SCM config to access mount point and class + scmCfg, err := storageProv.GetScmConfig() + if err != nil { + return errors.Wrap(err, "failed to get SCM config") + } + + if scmCfg == nil { + ei.log.Debugf("instance %d: no SCM config, nothing to clean", idx) + return nil + } + + if ei.IsStarted() { + ei.log.Infof("instance %d: stopping engine before cleanup", idx) + if err := ei.Stop(syscall.SIGKILL); err != nil { + return errors.Wrap(err, "failed to stop engine") + } + + pollFn := func(e Engine) bool { return !e.IsStarted() } + if err := pollInstanceState(ctx, []Engine{ei}, pollFn); err != nil { + return errors.Wrap(err, "waiting for engine to stop") + } + ei.log.Debugf("instance %d: engine stopped successfully", idx) + } + + // For RAM-based SCM (tmpfs), unmount to reset state + if scmCfg.Class == storage.ClassRam { + ei.log.Debugf("instance %d: unmounting tmpfs at %s", idx, scmCfg.Scm.MountPoint) + if err := storageProv.UnmountTmpfs(); err != nil { + ei.log.Errorf("instance %d: unmount failed: %v", idx, err) + // Continue anyway - log the error but don't fail the cleanup + } else { + ei.log.Debugf("instance %d: tmpfs unmounted successfully", idx) + } + } + + // Removing superblock prevents subsequent join without reformat. + if err := ei.RemoveSuperblock(); err != nil { + return err + } + + ei.log.Infof("instance %d: cleanup after join failure complete", idx) + return nil +} + func (ei *EngineInstance) checkScmNeedFormat() (bool, error) { msgIdx := fmt.Sprintf("instance %d", ei.Index()) diff --git a/src/control/system/membership.go b/src/control/system/membership.go index 691243c0682..d7773870ea0 100644 --- a/src/control/system/membership.go +++ b/src/control/system/membership.go @@ -198,9 +198,6 @@ func (m *Membership) joinReplace(req *JoinRequest) (*JoinResponse, error) { return nil, err } - if cm.State == MemberStateAdminExcluded { - return nil, ErrJoinAdminExcluded(cm.UUID, cm.Rank) - } memberToReplace := &Member{} *memberToReplace = *cm diff --git a/src/control/system/membership_test.go b/src/control/system/membership_test.go index 7d3e2db8473..e6a599d4c82 100644 --- a/src/control/system/membership_test.go +++ b/src/control/system/membership_test.go @@ -1029,7 +1029,20 @@ func TestSystem_Membership_Join(t *testing.T) { FabricContexts: curMember.PrimaryFabricContexts, FaultDomain: curMember.FaultDomain, }, - expErr: ErrJoinAdminExcluded(adminExcludedMember.UUID, 0), + expResp: &JoinResponse{ + Created: false, + Member: func() *Member { + cm := *adminExcludedMember + cm.Rank = 0 + cm.UUID = curMember.UUID + cm.State = MemberStateJoined + cm.Info = "" + return &cm + }(), + PrevState: MemberStateAdminExcluded, + // Extra map increment because of remove and add operations. + MapVersion: 3, + }, }, "successful replace; different UUID but otherwise identical member": { req: &JoinRequest{ From f343e59cbc940674abffc3f580f997dd787e3b83 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Wed, 10 Jun 2026 13:34:00 +0100 Subject: [PATCH 2/7] consolidate test engine setup logic into helpers Signed-off-by: Tom Nabarro --- src/control/server/ctl_firmware_test.go | 15 +++----- src/control/server/ctl_ranks_rpc_test.go | 11 ------ src/control/server/ctl_svc_test.go | 14 +------ src/control/server/instance_drpc_test.go | 16 +++----- src/control/server/server_utils_test.go | 16 -------- src/control/server/util_test.go | 47 +++++++++++++++++++++++- 6 files changed, 58 insertions(+), 61 deletions(-) diff --git a/src/control/server/ctl_firmware_test.go b/src/control/server/ctl_firmware_test.go index cab959063ec..7cda1fcfe28 100644 --- a/src/control/server/ctl_firmware_test.go +++ b/src/control/server/ctl_firmware_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -15,10 +16,8 @@ import ( "github.com/daos-stack/daos/src/control/common/proto/convert" ctlpb "github.com/daos-stack/daos/src/control/common/proto/ctl" "github.com/daos-stack/daos/src/control/common/test" - "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/server/config" - "github.com/daos-stack/daos/src/control/server/engine" "github.com/daos-stack/daos/src/control/server/storage" "github.com/daos-stack/daos/src/control/server/storage/bdev" "github.com/daos-stack/daos/src/control/server/storage/scm" @@ -803,14 +802,10 @@ func TestCtlSvc_FirmwareUpdate(t *testing.T) { cfg := config.DefaultServer() cs := mockControlService(t, log, cfg, tc.bmbc, tc.smbc, nil) for i := 0; i < 2; i++ { - rCfg := new(engine.TestRunnerConfig) - rCfg.Running.Store(tc.enginesRunning) - runner := engine.NewTestRunner(rCfg, engine.MockConfig()) - instance := NewEngineInstance(log, nil, nil, runner, nil) - if !tc.noRankEngines { - instance._superblock = &Superblock{} - instance._superblock.ValidRank = true - instance._superblock.Rank = ranklist.NewRankPtr(uint32(i)) + instance := NewEngineInstance(log, nil, nil, nil, nil) + setupTestEngine(t, instance, uint32(i), !tc.enginesRunning) + if tc.noRankEngines { + instance._superblock = nil } if err := cs.harness.AddInstance(instance); err != nil { t.Fatal(err) diff --git a/src/control/server/ctl_ranks_rpc_test.go b/src/control/server/ctl_ranks_rpc_test.go index 8057325710f..8be48353fb7 100644 --- a/src/control/server/ctl_ranks_rpc_test.go +++ b/src/control/server/ctl_ranks_rpc_test.go @@ -76,17 +76,6 @@ func checkUnorderedRankResults(t *testing.T, expResults, gotResults []*sharedpb. } } -func setupTestEngine(t *testing.T, ei *EngineInstance, rank uint32, stopped ...bool) { - ei._superblock.Rank = ranklist.NewRankPtr(rank) - - trc := &engine.TestRunnerConfig{} - if len(stopped) == 0 || !stopped[0] { - trc.Running.SetTrue() - ei.ready.SetTrue() - } - ei.runner = engine.NewTestRunner(trc, engine.MockConfig()) -} - func TestServer_CtlSvc_PrepShutdownRanks(t *testing.T) { for name, tc := range map[string]struct { missingSB bool diff --git a/src/control/server/ctl_svc_test.go b/src/control/server/ctl_svc_test.go index 9c6bd03f043..09d7b96378b 100644 --- a/src/control/server/ctl_svc_test.go +++ b/src/control/server/ctl_svc_test.go @@ -12,7 +12,6 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/events" - "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/provider/system" "github.com/daos-stack/daos/src/control/server/config" @@ -72,19 +71,10 @@ func newMockControlServiceFromBackends(t *testing.T, log logging.Logger, cfg *co } for idx, ec := range cfg.Engines { - trc := new(engine.TestRunnerConfig) - if started[idx] { - trc.Running.SetTrue() - } - runner := engine.NewTestRunner(trc, ec) storProv := storage.MockProvider(log, 0, &ec.Storage, syp, sp, bp, nil) - - ei := NewEngineInstance(log, storProv, nil, runner, nil) - ei.setSuperblock(&Superblock{ - Rank: ranklist.NewRankPtr(uint32(idx)), - }) + ei := NewEngineInstance(log, storProv, nil, nil, nil) + setupTestEngineWithConfig(t, ei, uint32(idx), ec, !started[idx]) if started[idx] { - ei.ready.SetTrue() ei.setDrpcSocket("/dontcare") } if err := cs.harness.AddInstance(ei); err != nil { diff --git a/src/control/server/instance_drpc_test.go b/src/control/server/instance_drpc_test.go index ecfd395ed68..8fec113f0a6 100644 --- a/src/control/server/instance_drpc_test.go +++ b/src/control/server/instance_drpc_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -26,7 +26,6 @@ import ( "github.com/daos-stack/daos/src/control/lib/daos" . "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" - "github.com/daos-stack/daos/src/control/server/engine" . "github.com/daos-stack/daos/src/control/system" ) @@ -90,10 +89,8 @@ func TestEngineInstance_CallDrpc(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) defer test.ShowBufferOnFailure(t, buf) - trc := engine.TestRunnerConfig{} - trc.Running.Store(!tc.notStarted) - runner := engine.NewTestRunner(&trc, engine.MockConfig()) - instance := NewEngineInstance(log, nil, nil, runner, nil) + instance := NewEngineInstance(log, nil, nil, nil, nil) + setupTestEngine(t, instance, 0, tc.notStarted) instance.ready.Store(!tc.notReady) if !tc.noSocket { @@ -188,11 +185,8 @@ func TestEngineInstance_CallDrpc_Parallel(t *testing.T) { }(t) t.Log("setting up engine...") - trc := engine.TestRunnerConfig{} - trc.Running.Store(true) - runner := engine.NewTestRunner(&trc, engine.MockConfig()) - instance := NewEngineInstance(log, nil, nil, runner, nil) - instance.ready.Store(true) + instance := NewEngineInstance(log, nil, nil, nil, nil) + setupTestEngine(t, instance, 0) instance.getDrpcClientFn = func(s string) drpc.DomainSocketClient { t.Log("fetching drpc client") diff --git a/src/control/server/server_utils_test.go b/src/control/server/server_utils_test.go index e9de4583ba0..7954d9df862 100644 --- a/src/control/server/server_utils_test.go +++ b/src/control/server/server_utils_test.go @@ -2104,22 +2104,6 @@ const ( testProcessingDelay = 100 * time.Millisecond ) -func setupAddTestEngine(t *testing.T, log logging.Logger, h *EngineHarness, isRunning bool, ranks ...uint32) { - t.Helper() - - rank := uint32(1) - if len(ranks) != 0 { - rank = ranks[0] - } - - ei := newTestEngine(log, false, storage.MockProvider(log, 0, nil, nil, nil, nil, nil)) - setupTestEngine(t, ei, rank, !isRunning) - - if err := h.AddInstance(ei); err != nil { - t.Fatal(err) - } -} - func TestServer_handleEngineSelfTerminated(t *testing.T) { testRank := ranklist.Rank(1) testIncarnation := uint64(42) diff --git a/src/control/server/util_test.go b/src/control/server/util_test.go index 44f06c4375b..75dcdb8438b 100644 --- a/src/control/server/util_test.go +++ b/src/control/server/util_test.go @@ -1,6 +1,6 @@ // // (C) Copyright 2019-2024 Intel Corporation. -// (C) Copyright 2025 Hewlett Packard Enterprise Development LP +// (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -290,3 +290,48 @@ func newTestMgmtSvcNonReplica(t *testing.T, log logging.Logger) *mgmtSvc { svc.sysdb = raft.MockDatabaseWithAddr(t, log, nil) return svc } + +// setupTestEngineWithConfig configures an EngineInstance with a custom engine config. +// If cfg is nil, uses engine.MockConfig(). If rank is 0, no rank is assigned. +func setupTestEngineWithConfig(t *testing.T, ei *EngineInstance, rank uint32, cfg *engine.Config, stopped ...bool) { + if ei._superblock == nil { + ei._superblock = &Superblock{} + } + ei._superblock.Rank = ranklist.NewRankPtr(rank) + ei._superblock.ValidRank = true + + trc := &engine.TestRunnerConfig{} + if len(stopped) == 0 || !stopped[0] { + trc.Running.SetTrue() + ei.ready.SetTrue() + } + + if cfg == nil { + cfg = engine.MockConfig() + } + ei.runner = engine.NewTestRunner(trc, cfg) +} + +// setupTestEngine configures an EngineInstance for testing with the specified rank and state. +// If stopped is true (or provided), the engine is marked as stopped, otherwise as running. +// If rank is 0, no rank is assigned to the superblock. +func setupTestEngine(t *testing.T, ei *EngineInstance, rank uint32, stopped ...bool) { + setupTestEngineWithConfig(t, ei, rank, nil, stopped...) +} + +// setupAddTestEngine adds test engine to harness with specified ranks and running state. +func setupAddTestEngine(t *testing.T, log logging.Logger, h *EngineHarness, isRunning bool, ranks ...uint32) { + t.Helper() + + rank := uint32(1) + if len(ranks) != 0 { + rank = ranks[0] + } + + ei := newTestEngine(log, false, storage.MockProvider(log, 0, nil, nil, nil, nil, nil)) + setupTestEngine(t, ei, rank, !isRunning) + + if err := h.AddInstance(ei); err != nil { + t.Fatal(err) + } +} From 550d8530bdb09cfb51d05ada9440b5bbe78ea311 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Fri, 12 Jun 2026 21:05:51 +0100 Subject: [PATCH 3/7] add unit tests for cleanup function Signed-off-by: Tom Nabarro --- src/control/server/ctl_ranks_rpc.go | 4 +- src/control/server/instance_storage.go | 46 ++++--- src/control/server/instance_storage_test.go | 134 ++++++++++++++++++++ 3 files changed, 162 insertions(+), 22 deletions(-) diff --git a/src/control/server/ctl_ranks_rpc.go b/src/control/server/ctl_ranks_rpc.go index 5fcb092f3ea..3907effb794 100644 --- a/src/control/server/ctl_ranks_rpc.go +++ b/src/control/server/ctl_ranks_rpc.go @@ -29,12 +29,14 @@ const ( instanceUpdateDelay = 500 * time.Millisecond ) +type pollValidateFn func(Engine) bool + // pollInstanceState waits for either context to be cancelled/timeout or for the // provided validate function to return true for each of the provided instances. // // Returns true if all instances return true from the validate function, false // if context is cancelled before. -func pollInstanceState(ctx context.Context, instances []Engine, validate func(Engine) bool) error { +func pollInstanceState(ctx context.Context, instances []Engine, validate pollValidateFn) error { ready := make(chan struct{}) go func() { for { diff --git a/src/control/server/instance_storage.go b/src/control/server/instance_storage.go index ec258b868a9..206b0648392 100644 --- a/src/control/server/instance_storage.go +++ b/src/control/server/instance_storage.go @@ -77,10 +77,7 @@ func (ei *EngineInstance) NotifyStorageReady(replaceRank bool) { }() } -// cleanupFailedJoinReplace cleans up storage after a join failure during replace operation. -// This is called when format succeeded but the join to the system failed, leaving -// the storage in a partially initialized state. -func (ei *EngineInstance) cleanupFailedJoinReplace(ctx context.Context) error { +func (ei *EngineInstance) clearFormat(ctx context.Context, stopEngineFn func(context.Context, *EngineInstance) error) error { idx := ei.Index() ei.log.Infof("instance %d: cleaning up after join failure during replace", idx) @@ -99,27 +96,13 @@ func (ei *EngineInstance) cleanupFailedJoinReplace(ctx context.Context) error { if ei.IsStarted() { ei.log.Infof("instance %d: stopping engine before cleanup", idx) - if err := ei.Stop(syscall.SIGKILL); err != nil { - return errors.Wrap(err, "failed to stop engine") - } - - pollFn := func(e Engine) bool { return !e.IsStarted() } - if err := pollInstanceState(ctx, []Engine{ei}, pollFn); err != nil { - return errors.Wrap(err, "waiting for engine to stop") + if err := stopEngineFn(ctx, ei); err != nil { + return err } ei.log.Debugf("instance %d: engine stopped successfully", idx) } - // For RAM-based SCM (tmpfs), unmount to reset state - if scmCfg.Class == storage.ClassRam { - ei.log.Debugf("instance %d: unmounting tmpfs at %s", idx, scmCfg.Scm.MountPoint) - if err := storageProv.UnmountTmpfs(); err != nil { - ei.log.Errorf("instance %d: unmount failed: %v", idx, err) - // Continue anyway - log the error but don't fail the cleanup - } else { - ei.log.Debugf("instance %d: tmpfs unmounted successfully", idx) - } - } + // On RAM-based SCM (tmpfs) unmount here unnecessary as will be done on engine exit // Removing superblock prevents subsequent join without reformat. if err := ei.RemoveSuperblock(); err != nil { @@ -130,6 +113,27 @@ func (ei *EngineInstance) cleanupFailedJoinReplace(ctx context.Context) error { return nil } +// Production implementation of stopEngineFn. +func stopEngine(ctx context.Context, ei *EngineInstance) error { + if err := ei.Stop(syscall.SIGKILL); err != nil { + return errors.Wrap(err, "failed to stop engine") + } + + pollFn := func(e Engine) bool { return !e.IsStarted() } + if err := pollInstanceState(ctx, []Engine{ei}, pollFn); err != nil { + return errors.Wrap(err, "waiting for engine to stop") + } + + return nil +} + +// cleanupFailedJoinReplace cleans up storage after a join failure during replace operation. +// This is called when format succeeded but the join to the system failed, leaving +// the storage in a partially initialized state. +func (ei *EngineInstance) cleanupFailedJoinReplace(ctx context.Context) error { + return ei.clearFormat(ctx, stopEngine) +} + func (ei *EngineInstance) checkScmNeedFormat() (bool, error) { msgIdx := fmt.Sprintf("instance %d", ei.Index()) diff --git a/src/control/server/instance_storage_test.go b/src/control/server/instance_storage_test.go index 3b4ebf82ae0..99f4d611709 100644 --- a/src/control/server/instance_storage_test.go +++ b/src/control/server/instance_storage_test.go @@ -1,5 +1,6 @@ // // (C) Copyright 2020-2024 Intel Corporation. +// (C) Copyright 2026 Hewlett Packard Enterprise Development LP // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -11,6 +12,7 @@ import ( "fmt" "os" "path/filepath" + "strings" "sync" "testing" "time" @@ -23,6 +25,7 @@ import ( "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" "github.com/daos-stack/daos/src/control/provider/system" + sysprov "github.com/daos-stack/daos/src/control/provider/system" "github.com/daos-stack/daos/src/control/server/engine" "github.com/daos-stack/daos/src/control/server/storage" "github.com/daos-stack/daos/src/control/server/storage/scm" @@ -558,3 +561,134 @@ func TestIOEngineInstance_awaitStorageReady(t *testing.T) { }) } } + +func TestEngineInstance_clearFormat(t *testing.T) { + defStopEngine := func(_ context.Context, _ *EngineInstance) error { + return errors.New("stop failed") + } + + for name, tc := range map[string]struct { + scmClass storage.Class + getSCMErr error + unmountErr error + engineStarted bool + stopEngine func(context.Context, *EngineInstance) error + expErr error + expSBCreated bool + expSBRemoved bool + }{ + "no SCM config": { + scmClass: "", + expErr: errors.New("expected exactly 1 SCM tier"), + }, + "get SCM config fails": { + getSCMErr: errors.New("mock config error"), + expErr: errors.New("failed to get SCM config"), + }, + "RAM class not started": { + scmClass: storage.ClassRam, + expErr: nil, + expSBRemoved: true, + }, + "DCPM class not started": { + scmClass: storage.ClassDcpm, + expErr: nil, + }, + "RAM class engine started": { + scmClass: storage.ClassRam, + engineStarted: true, + stopEngine: func(_ context.Context, _ *EngineInstance) error { + return nil + }, + expErr: nil, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + testDir, cleanup := test.CreateTestDir(t) + defer cleanup() + + mnt := "/mnt/daos0" + testMountPoint := filepath.Join(testDir, mnt) + if err := os.MkdirAll(testMountPoint, 0777); err != nil { + t.Fatal(err) + } + + storageCfg := storage.Config{ + Tiers: storage.TierConfigs{}, + } + + if tc.scmClass != "" { + storageCfg.Tiers = append(storageCfg.Tiers, &storage.TierConfig{ + Class: tc.scmClass, + Scm: storage.ScmConfig{ + MountPoint: mnt, //testMountPoint, + }, + }) + if tc.scmClass == storage.ClassRam { + storageCfg.ControlMetadata = storage.ControlMetadata{ + Path: "control_meta", + } + } + } + + sysCfg := &sysprov.MockSysConfig{} + sysProv := sysprov.NewMockSysProvider(log, sysCfg) + scmProv := scm.DefaultMockProvider(log) + metaProv := &storage.MockMetadataProvider{ + UnmountErr: tc.unmountErr, + } + + ec := engine.MockConfig().WithStorage(storageCfg.Tiers...) + storProv := storage.MockProvider(log, 0, &storageCfg, sysProv, scmProv, nil, + metaProv) + + ei := NewEngineInstance(log, storProv, nil, nil, nil) + ei.fsRoot = testDir + setupTestEngineWithConfig(t, ei, uint32(0), ec, !tc.engineStarted) + + testSBPath := ei.superblockPath() + t.Logf("SB path: %s", testSBPath) + if err := os.MkdirAll(filepath.Dir(testSBPath), 0777); err != nil { + t.Fatal(err) + } + + if _, err := os.Stat(testSBPath); err == nil { + t.Fatalf("unexpected superblock exists (%s)", err) + } else if !os.IsNotExist(err) { + t.Fatalf("unexpected error %s", err) + } + if len(storageCfg.Tiers) != 0 { + if err := ei.WriteSuperblock(); err != nil { + //storage.FormatControlMetadata([]uint{0}); err != nil { + t.Fatal(err) + } + } + if _, err := os.Stat(testSBPath); tc.expSBCreated && err != nil { + t.Fatalf("superblock missing (%s)", err) + } + + if tc.stopEngine == nil { + tc.stopEngine = defStopEngine + } + + test.CmpErr(t, tc.expErr, ei.clearFormat(test.Context(t), tc.stopEngine)) + + // Check log output for unexpected unmount call + logOutput := buf.String() + unmountCalled := strings.Contains(logOutput, "unmounting tmpfs") + if unmountCalled { + t.Error("unexpected unmount call") + } + + if !tc.expSBCreated { + return + } + + _, err := os.Stat(testSBPath) + test.AssertEqual(t, tc.expSBRemoved, os.IsNotExist(err), "is superblock removed") + }) + } +} From 65ff26ea66511e3856f70142b9888f0c1ee43a1a Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Tue, 16 Jun 2026 14:07:20 +0100 Subject: [PATCH 4/7] fix unit tests and address review comments Features: control Signed-off-by: Tom Nabarro --- src/control/server/instance_storage.go | 13 -- src/control/server/instance_storage_test.go | 127 +++++++++----------- 2 files changed, 57 insertions(+), 83 deletions(-) diff --git a/src/control/server/instance_storage.go b/src/control/server/instance_storage.go index 206b0648392..35f37b9a972 100644 --- a/src/control/server/instance_storage.go +++ b/src/control/server/instance_storage.go @@ -81,19 +81,6 @@ func (ei *EngineInstance) clearFormat(ctx context.Context, stopEngineFn func(con idx := ei.Index() ei.log.Infof("instance %d: cleaning up after join failure during replace", idx) - storageProv := ei.GetStorage() - - // Get SCM config to access mount point and class - scmCfg, err := storageProv.GetScmConfig() - if err != nil { - return errors.Wrap(err, "failed to get SCM config") - } - - if scmCfg == nil { - ei.log.Debugf("instance %d: no SCM config, nothing to clean", idx) - return nil - } - if ei.IsStarted() { ei.log.Infof("instance %d: stopping engine before cleanup", idx) if err := stopEngineFn(ctx, ei); err != nil { diff --git a/src/control/server/instance_storage_test.go b/src/control/server/instance_storage_test.go index 99f4d611709..6039e3827b0 100644 --- a/src/control/server/instance_storage_test.go +++ b/src/control/server/instance_storage_test.go @@ -12,7 +12,6 @@ import ( "fmt" "os" "path/filepath" - "strings" "sync" "testing" "time" @@ -24,7 +23,6 @@ import ( "github.com/daos-stack/daos/src/control/events" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" - "github.com/daos-stack/daos/src/control/provider/system" sysprov "github.com/daos-stack/daos/src/control/provider/system" "github.com/daos-stack/daos/src/control/server/engine" "github.com/daos-stack/daos/src/control/server/storage" @@ -57,17 +55,17 @@ var mockRamCfg = storage.Config{ func TestIOEngineInstance_MountControlMetadata(t *testing.T) { for name, tc := range map[string]struct { meta *storage.MockMetadataProvider - sysCfg *system.MockSysConfig + sysCfg *sysprov.MockSysConfig expErr error }{ "check mounted fails": { - sysCfg: &system.MockSysConfig{ + sysCfg: &sysprov.MockSysConfig{ IsMountedErr: errors.New("mock IsMounted"), }, expErr: errors.New("mock IsMounted"), }, "already mounted": { - sysCfg: &system.MockSysConfig{ + sysCfg: &sysprov.MockSysConfig{ IsMountedBool: true, }, meta: &storage.MockMetadataProvider{ @@ -98,7 +96,7 @@ func TestIOEngineInstance_MountControlMetadata(t *testing.T) { WithStorageControlMetadataPath(mockRamCfg.ControlMetadata.Path). WithStorageControlMetadataDevice(mockRamCfg.ControlMetadata.DevicePath) runner := engine.NewRunner(log, ec) - sysProv := system.NewMockSysProvider(log, tc.sysCfg) + sysProv := sysprov.NewMockSysProvider(log, tc.sysCfg) provider := storage.MockProvider(log, 0, &mockRamCfg, sysProv, nil, nil, tc.meta) instance := NewEngineInstance(log, provider, nil, runner, nil) @@ -141,7 +139,7 @@ func TestIOEngineInstance_MountScmDevice(t *testing.T) { for name, tc := range map[string]struct { cfg *storage.Config - msCfg *system.MockSysConfig + msCfg *sysprov.MockSysConfig expErr error }{ "empty config": { @@ -149,14 +147,14 @@ func TestIOEngineInstance_MountScmDevice(t *testing.T) { }, "IsMounted fails": { cfg: ramCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ IsMountedErr: errors.New("failed to check mount"), }, expErr: errors.New("failed to check mount"), }, "already mounted": { cfg: ramCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ IsMountedBool: true, }, }, @@ -165,7 +163,7 @@ func TestIOEngineInstance_MountScmDevice(t *testing.T) { }, "mount ramdisk fails": { cfg: ramCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ MountErr: errors.New("mount failed"), }, expErr: errors.New("mount failed"), @@ -175,7 +173,7 @@ func TestIOEngineInstance_MountScmDevice(t *testing.T) { }, "mount dcpm fails": { cfg: dcpmCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ MountErr: errors.New("mount failed"), }, expErr: errors.New("mount failed"), @@ -204,7 +202,7 @@ func TestIOEngineInstance_MountScmDevice(t *testing.T) { ec := engine.MockConfig().WithStorage(tc.cfg.Tiers...) runner := engine.NewRunner(log, ec) - sys := system.NewMockSysProvider(log, tc.msCfg) + sys := sysprov.NewMockSysProvider(log, tc.msCfg) scm := scm.NewMockProvider(log, nil, tc.msCfg) provider := storage.MockProvider(log, 0, tc.cfg, sys, scm, nil, nil) instance := NewEngineInstance(log, provider, nil, runner, nil) @@ -238,7 +236,7 @@ func TestEngineInstance_NeedsScmFormat(t *testing.T) { for name, tc := range map[string]struct { engineCfg *engine.Config mbCfg *scm.MockBackendConfig - msCfg *system.MockSysConfig + msCfg *sysprov.MockSysConfig expNeedsFormat bool expErr error }{ @@ -247,14 +245,14 @@ func TestEngineInstance_NeedsScmFormat(t *testing.T) { }, "check ramdisk fails (IsMounted fails)": { engineCfg: ramCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ IsMountedErr: errors.New("failed to check mount"), }, expErr: errors.New("failed to check mount"), }, "check ramdisk (mounted)": { engineCfg: ramCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ IsMountedBool: true, }, expNeedsFormat: false, @@ -265,35 +263,35 @@ func TestEngineInstance_NeedsScmFormat(t *testing.T) { }, "check ramdisk (unmounted, mountpoint doesn't exist)": { engineCfg: ramCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ IsMountedErr: os.ErrNotExist, }, expNeedsFormat: true, }, "check dcpm (mounted)": { engineCfg: dcpmCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ IsMountedBool: true, }, expNeedsFormat: false, }, "check dcpm (unmounted, unformatted)": { engineCfg: dcpmCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ GetfsStr: "none", }, expNeedsFormat: true, }, "check dcpm (unmounted, formatted)": { engineCfg: dcpmCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ GetfsStr: "ext4", }, expNeedsFormat: false, }, "check dcpm (unmounted, formatted, mountpoint doesn't exist)": { engineCfg: dcpmCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ IsMountedErr: os.ErrNotExist, GetfsStr: "ext4", }, @@ -302,7 +300,7 @@ func TestEngineInstance_NeedsScmFormat(t *testing.T) { }, "check dcpm fails (IsMounted fails)": { engineCfg: dcpmCfg, - msCfg: &system.MockSysConfig{ + msCfg: &sysprov.MockSysConfig{ IsMountedErr: errors.New("failed to check mount"), }, expErr: errors.New("failed to check mount"), @@ -325,7 +323,7 @@ func TestEngineInstance_NeedsScmFormat(t *testing.T) { runner := engine.NewRunner(log, tc.engineCfg) mp := storage.NewProvider(log, 0, &tc.engineCfg.Storage, - system.NewMockSysProvider(log, tc.msCfg), + sysprov.NewMockSysProvider(log, tc.msCfg), scm.NewMockProvider(log, tc.mbCfg, tc.msCfg), nil, nil) instance := NewEngineInstance(log, mp, nil, runner, nil) @@ -495,7 +493,7 @@ func TestIOEngineInstance_awaitStorageReady(t *testing.T) { tc.storageCfg = &dcpmCfg.Storage } - msc := system.MockSysConfig{ + msc := sysprov.MockSysConfig{ IsMountedBool: tc.isMounted, IsMountedErr: tc.isMountedErr, MountErr: tc.mountErr, @@ -520,7 +518,7 @@ func TestIOEngineInstance_awaitStorageReady(t *testing.T) { } mp := storage.NewProvider(log, 0, tc.storageCfg, - system.NewMockSysProvider(log, &msc), + sysprov.NewMockSysProvider(log, &msc), scm.NewMockProvider(log, &smbc, &msc), nil, mmp) engine := NewEngineInstance(log, mp, nil, runner, nil) @@ -568,39 +566,34 @@ func TestEngineInstance_clearFormat(t *testing.T) { } for name, tc := range map[string]struct { - scmClass storage.Class - getSCMErr error - unmountErr error - engineStarted bool - stopEngine func(context.Context, *EngineInstance) error - expErr error - expSBCreated bool - expSBRemoved bool + scmClass storage.Class + engineStarted bool + missingSuperblock bool + expErr error }{ - "no SCM config": { - scmClass: "", - expErr: errors.New("expected exactly 1 SCM tier"), + "RAM class missing superblock": { + scmClass: storage.ClassRam, + missingSuperblock: true, + expErr: errors.New("no such file or directory"), }, - "get SCM config fails": { - getSCMErr: errors.New("mock config error"), - expErr: errors.New("failed to get SCM config"), + "DCPM class missing superblock": { + scmClass: storage.ClassDcpm, + missingSuperblock: true, + expErr: errors.New("no such file or directory"), }, "RAM class not started": { - scmClass: storage.ClassRam, - expErr: nil, - expSBRemoved: true, + scmClass: storage.ClassRam, }, "DCPM class not started": { scmClass: storage.ClassDcpm, - expErr: nil, }, "RAM class engine started": { scmClass: storage.ClassRam, engineStarted: true, - stopEngine: func(_ context.Context, _ *EngineInstance) error { - return nil - }, - expErr: nil, + }, + "DCPM class engine started": { + scmClass: storage.ClassDcpm, + engineStarted: true, }, } { t.Run(name, func(t *testing.T) { @@ -624,7 +617,7 @@ func TestEngineInstance_clearFormat(t *testing.T) { storageCfg.Tiers = append(storageCfg.Tiers, &storage.TierConfig{ Class: tc.scmClass, Scm: storage.ScmConfig{ - MountPoint: mnt, //testMountPoint, + MountPoint: mnt, }, }) if tc.scmClass == storage.ClassRam { @@ -637,9 +630,7 @@ func TestEngineInstance_clearFormat(t *testing.T) { sysCfg := &sysprov.MockSysConfig{} sysProv := sysprov.NewMockSysProvider(log, sysCfg) scmProv := scm.DefaultMockProvider(log) - metaProv := &storage.MockMetadataProvider{ - UnmountErr: tc.unmountErr, - } + metaProv := &storage.MockMetadataProvider{} ec := engine.MockConfig().WithStorage(storageCfg.Tiers...) storProv := storage.MockProvider(log, 0, &storageCfg, sysProv, scmProv, nil, @@ -650,7 +641,6 @@ func TestEngineInstance_clearFormat(t *testing.T) { setupTestEngineWithConfig(t, ei, uint32(0), ec, !tc.engineStarted) testSBPath := ei.superblockPath() - t.Logf("SB path: %s", testSBPath) if err := os.MkdirAll(filepath.Dir(testSBPath), 0777); err != nil { t.Fatal(err) } @@ -660,35 +650,32 @@ func TestEngineInstance_clearFormat(t *testing.T) { } else if !os.IsNotExist(err) { t.Fatalf("unexpected error %s", err) } - if len(storageCfg.Tiers) != 0 { - if err := ei.WriteSuperblock(); err != nil { - //storage.FormatControlMetadata([]uint{0}); err != nil { - t.Fatal(err) + if !tc.missingSuperblock { + if len(storageCfg.Tiers) != 0 { + if err := ei.WriteSuperblock(); err != nil { + t.Fatal(err) + } + } + if _, err := os.Stat(testSBPath); err != nil { + t.Fatalf("superblock missing (%s)", err) } - } - if _, err := os.Stat(testSBPath); tc.expSBCreated && err != nil { - t.Fatalf("superblock missing (%s)", err) } - if tc.stopEngine == nil { - tc.stopEngine = defStopEngine + stopEngine := defStopEngine + if tc.engineStarted { + stopEngine = func(_ context.Context, _ *EngineInstance) error { + return nil + } } - test.CmpErr(t, tc.expErr, ei.clearFormat(test.Context(t), tc.stopEngine)) - - // Check log output for unexpected unmount call - logOutput := buf.String() - unmountCalled := strings.Contains(logOutput, "unmounting tmpfs") - if unmountCalled { - t.Error("unexpected unmount call") - } + test.CmpErr(t, tc.expErr, ei.clearFormat(test.Context(t), stopEngine)) - if !tc.expSBCreated { + if tc.expErr != nil { return } _, err := os.Stat(testSBPath) - test.AssertEqual(t, tc.expSBRemoved, os.IsNotExist(err), "is superblock removed") + test.AssertTrue(t, os.IsNotExist(err), "superblock not removed") }) } } From 710d497cb742672921a559ce63c643296a2e6105 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Tue, 16 Jun 2026 16:04:43 +0100 Subject: [PATCH 5/7] reinstate Adminexcluded check for joinReplace Features: control Signed-off-by: Tom Nabarro --- src/control/system/membership.go | 3 +++ src/control/system/membership_test.go | 15 +-------------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/control/system/membership.go b/src/control/system/membership.go index d7773870ea0..691243c0682 100644 --- a/src/control/system/membership.go +++ b/src/control/system/membership.go @@ -198,6 +198,9 @@ func (m *Membership) joinReplace(req *JoinRequest) (*JoinResponse, error) { return nil, err } + if cm.State == MemberStateAdminExcluded { + return nil, ErrJoinAdminExcluded(cm.UUID, cm.Rank) + } memberToReplace := &Member{} *memberToReplace = *cm diff --git a/src/control/system/membership_test.go b/src/control/system/membership_test.go index e6a599d4c82..7d3e2db8473 100644 --- a/src/control/system/membership_test.go +++ b/src/control/system/membership_test.go @@ -1029,20 +1029,7 @@ func TestSystem_Membership_Join(t *testing.T) { FabricContexts: curMember.PrimaryFabricContexts, FaultDomain: curMember.FaultDomain, }, - expResp: &JoinResponse{ - Created: false, - Member: func() *Member { - cm := *adminExcludedMember - cm.Rank = 0 - cm.UUID = curMember.UUID - cm.State = MemberStateJoined - cm.Info = "" - return &cm - }(), - PrevState: MemberStateAdminExcluded, - // Extra map increment because of remove and add operations. - MapVersion: 3, - }, + expErr: ErrJoinAdminExcluded(adminExcludedMember.UUID, 0), }, "successful replace; different UUID but otherwise identical member": { req: &JoinRequest{ From 4f87d31d08f1490eb399fdc51d318bbc3faa7b7d Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Thu, 18 Jun 2026 12:35:05 +0100 Subject: [PATCH 6/7] consistency improvement Signed-off-by: Tom Nabarro --- src/control/system/membership.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/control/system/membership.go b/src/control/system/membership.go index 691243c0682..3218c5249b9 100644 --- a/src/control/system/membership.go +++ b/src/control/system/membership.go @@ -102,7 +102,7 @@ func (m *Membership) FindRankFromJoinRequest(req *JoinRequest) (Rank, error) { return NilRank, errors.New("unexpected rank in replace-rank request") } - currentMembers, err := m.Members(nil, AllMemberFilter) + currentMembers, err := m.db.AllMembers() if err != nil { return NilRank, errors.Wrap(err, "failed to get all system members") } From 8f3ef7fce091ef247a1a1de96dd259eba63744d7 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Thu, 18 Jun 2026 13:54:23 +0100 Subject: [PATCH 7/7] address review comments from knard38 Signed-off-by: Tom Nabarro --- docs/admin/administration.md | 17 +++++++++++++---- src/control/server/instance_storage.go | 8 ++++++-- src/control/server/instance_storage_test.go | 9 ++++++++- src/control/server/util_test.go | 4 ++-- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/docs/admin/administration.md b/docs/admin/administration.md index 78c5d316b7f..1fe6fb0d559 100644 --- a/docs/admin/administration.md +++ b/docs/admin/administration.md @@ -1152,10 +1152,19 @@ An examples workflow would be: 8. Formatted engine will join using the existing (old) rank which is mapped to the engine's hardware. !!! note - `dmg storage format --replace` can be used to replace a rank in `AdminExcluded` state. The - subsequent state of the rank will then no longer be `AdminExcluded`. This special case reduces - a chance that a duplicate rank entry is introduced inadvertently because the rank to be replaced - is in the `AdminExcluded` state and so is recreated rather than replaced. + `dmg storage format --replace` can not be used to replace a rank in `AdminExcluded` state. An + administrator is required to run `dmg system clear-exclude` to remove the `AdminExcluded` state + before being able to assign an engine it's previous rank. + +!!! note + If `dmg storage format --replace` succeeds at the format level but the subsequent rank join + fails (e.g. due to a misconfigured fabric URI), the engine's superblock is automatically + removed to prevent a new rank from being unintentionally created on the next restart. On DCPM + systems the PMem device (`/dev/pmemX`) will remain mounted after the failure; this is + intentional so that the next `dmg storage format --replace` can write the new superblock onto + the PMEM correctly. Resolve the configuration issue and re-run `dmg storage format --replace` + or run `dmg storage format` without `--replace` to create a new rank with a different fabric + URI (for example). ### System Erase diff --git a/src/control/server/instance_storage.go b/src/control/server/instance_storage.go index 35f37b9a972..286f298560b 100644 --- a/src/control/server/instance_storage.go +++ b/src/control/server/instance_storage.go @@ -89,9 +89,13 @@ func (ei *EngineInstance) clearFormat(ctx context.Context, stopEngineFn func(con ei.log.Debugf("instance %d: engine stopped successfully", idx) } - // On RAM-based SCM (tmpfs) unmount here unnecessary as will be done on engine exit + // SCM is intentionally not unmounted: + // - RAM/tmpfs: auto-unmounted by the control-plane when the engine exits above; + // ramdisk is unconditionally recreated on next startup. + // - DCPM: must stay mounted so createSuperblock() writes onto the PMEM in a "Metadata + // format" rather than requiring a full "SCM format". - // Removing superblock prevents subsequent join without reformat. + // Removing superblock prevents subsequent join without an explicit format. if err := ei.RemoveSuperblock(); err != nil { return err } diff --git a/src/control/server/instance_storage_test.go b/src/control/server/instance_storage_test.go index 6039e3827b0..13a88eeb925 100644 --- a/src/control/server/instance_storage_test.go +++ b/src/control/server/instance_storage_test.go @@ -568,6 +568,7 @@ func TestEngineInstance_clearFormat(t *testing.T) { for name, tc := range map[string]struct { scmClass storage.Class engineStarted bool + stopEngineFails bool missingSuperblock bool expErr error }{ @@ -595,6 +596,12 @@ func TestEngineInstance_clearFormat(t *testing.T) { scmClass: storage.ClassDcpm, engineStarted: true, }, + "DCPM class engine started, stop fails": { + scmClass: storage.ClassDcpm, + engineStarted: true, + stopEngineFails: true, + expErr: errors.New("stop failed"), + }, } { t.Run(name, func(t *testing.T) { log, buf := logging.NewTestLogger(t.Name()) @@ -662,7 +669,7 @@ func TestEngineInstance_clearFormat(t *testing.T) { } stopEngine := defStopEngine - if tc.engineStarted { + if tc.engineStarted && !tc.stopEngineFails { stopEngine = func(_ context.Context, _ *EngineInstance) error { return nil } diff --git a/src/control/server/util_test.go b/src/control/server/util_test.go index 75dcdb8438b..a5bfed9ea49 100644 --- a/src/control/server/util_test.go +++ b/src/control/server/util_test.go @@ -292,7 +292,7 @@ func newTestMgmtSvcNonReplica(t *testing.T, log logging.Logger) *mgmtSvc { } // setupTestEngineWithConfig configures an EngineInstance with a custom engine config. -// If cfg is nil, uses engine.MockConfig(). If rank is 0, no rank is assigned. +// If cfg is nil, uses engine.MockConfig(). Note that rank 0 is a valid rank. func setupTestEngineWithConfig(t *testing.T, ei *EngineInstance, rank uint32, cfg *engine.Config, stopped ...bool) { if ei._superblock == nil { ei._superblock = &Superblock{} @@ -314,7 +314,7 @@ func setupTestEngineWithConfig(t *testing.T, ei *EngineInstance, rank uint32, cf // setupTestEngine configures an EngineInstance for testing with the specified rank and state. // If stopped is true (or provided), the engine is marked as stopped, otherwise as running. -// If rank is 0, no rank is assigned to the superblock. +// Note that rank 0 is a valid rank. func setupTestEngine(t *testing.T, ei *EngineInstance, rank uint32, stopped ...bool) { setupTestEngineWithConfig(t, ei, rank, nil, stopped...) }