Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,17 @@ private boolean deRefOrDeleteResource(DevelopersApi api, String rscName, String
ApiCallRcList answers = api.resourceDefinitionDelete(rd.getName());
checkLinstorAnswersThrow(answers);
deleted = true;

// LINSTOR can return success here while the resource lingers in DELETING state
// on the controller (down peer, lost quorum, etc.). Confirm it's actually gone
// — if not, log a WARN so operators can clear it manually. Don't throw: the
// CloudStack-side accounting has already moved on.
if (!LinstorUtil.waitForResourceDefinitionDeleted(api, rd.getName(),
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS)) {
logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
"may be stuck in DELETING. Check the LINSTOR controller (linstor resource list).",
rd.getName(), LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
}
}
}
return deleted;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,20 @@ private void deleteResourceDefinition(StoragePoolVO storagePoolVO, String rscDef
throw new CloudRuntimeException("Linstor: Unable to delete resource definition: " + rscDefName);
}
logger.info("Linstor: Deleted resource {}", rscDefName);

// LINSTOR can return success on the delete API call while the resource lingers in
// DELETING state (peer issues, lost quorum, satellite down). Verify the resource is
// actually gone — if not, log a WARN so operators see it. We deliberately do NOT
// throw here: the volume is already considered gone on the CloudStack side, and
// throwing would leave the CS DB and LINSTOR in different states.
if (!LinstorUtil.waitForResourceDefinitionDeleted(linstorApi, rscDefName,
LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS))
{
logger.warn("Linstor: resource {} still present {}ms after delete returned success — " +
"may be stuck in DELETING. Check the LINSTOR controller (linstor resource list) " +
"and clear manually if the resource has no live peers.",
rscDefName, LinstorUtil.DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS);
}
} catch (ApiException apiEx)
{
logger.error("Linstor: ApiEx - " + apiEx.getMessage());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,58 @@ public static List<ResourceDefinition> getRDListStartingWith(DevelopersApi api,
.collect(Collectors.toList());
}

/**
* Default per-call timeout for {@link #waitForResourceDefinitionDeleted}. Long enough for a
* healthy LINSTOR controller to finish a normal delete; short enough not to block the calling
* thread for too long if the delete is genuinely stuck. Used both from the management server
* (e.g. {@code LinstorPrimaryDataStoreDriverImpl}) and from KVM agent paths.
*/
public static final long DEFAULT_RD_DELETE_VERIFY_TIMEOUT_MILLIS = 30_000L;

/**
* Returns {@code true} if the named resource definition is no longer present on the LINSTOR
* controller. Used after a {@code resourceDefinitionDelete} to verify the delete actually
* completed (LINSTOR can return success on the API call while the resource lingers in
* DELETING state due to peer issues, lost quorum, or down satellites). Uses the
* controller-side name filter rather than scanning every RD on the cluster (cheap even
* when polled once per second from {@link #waitForResourceDefinitionDeleted}).
*/
public static boolean isResourceDefinitionGone(DevelopersApi api, String rscName) throws ApiException {
List<ResourceDefinition> matching =
api.resourceDefinitionList(Collections.singletonList(rscName), false, null, null, null);
return matching == null || matching.isEmpty();
}

/**
* Polls the controller until the named resource definition is gone or the timeout elapses.
* Returns {@code true} if the resource was confirmed gone, {@code false} if it was still
* present (or the controller kept erroring) at the deadline. Callers should NOT throw on a
* {@code false} return — the upstream API call already reported success and the operator
* may need to investigate manually. Log a WARN with the resource name instead.
*/
public static boolean waitForResourceDefinitionDeleted(DevelopersApi api, String rscName, long timeoutMillis) {
final long deadline = System.currentTimeMillis() + timeoutMillis;
while (true) {
try {
if (isResourceDefinitionGone(api, rscName)) {
return true;
}
} catch (ApiException e) {
LOGGER.debug("LINSTOR delete-verify poll failed for {}: {}", rscName, e.getMessage());
// Keep polling — controller may be transiently unavailable.
}
if (System.currentTimeMillis() >= deadline) {
return false;
}
try {
Thread.sleep(1_000L);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
return false;
}
}
Comment on lines +433 to +453
Copy link

Copilot AI Apr 28, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New delete-verification behavior in waitForResourceDefinitionDeleted / isResourceDefinitionGone isn’t covered by the existing LinstorUtilTest suite. Please add unit tests for (a) confirmed-gone case, and (b) timeout/exception paths (ideally without introducing real sleeps) so regressions in the polling logic are caught in CI.

Copilot uses AI. Check for mistakes.
}

/**
* Returns a pair list of resource-definitions with ther 1:1 mapped resource-group objects that start with the
* resource name `startWith`
Expand Down
Loading