Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
import org.apache.ignite.lang.IgniteBiTuple;
import org.apache.ignite.lang.IgniteFuture;
import org.apache.ignite.lang.IgniteFutureCancelledException;
import org.apache.ignite.lang.IgniteFutureTimeoutException;
import org.apache.ignite.lang.IgnitePredicate;
import org.apache.ignite.spi.discovery.DiscoverySpiCustomMessage;
import org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi;
Expand Down Expand Up @@ -903,6 +904,18 @@ private void checkIncrementalSnapshotWalRecords(IgniteEx node, IncrementalSnapsh
}
}

/**
 * Runs the given action and prints a thread dump to the test log if the action fails because of
 * an {@code IgniteFutureTimeoutException}. The original failure is always rethrown unchanged.
 * <p>
 * The whole cause chain of the failure is inspected, so a timeout that reaches this method wrapped
 * into another exception or error (for example, an assertion failure raised by an assertion helper
 * that the action calls) still produces a thread dump.
 *
 * @param action Action to run.
 */
protected void runWithLoggedThreadDump(Runnable action) {
    try {
        action.run();
    }
    catch (RuntimeException | Error ex) {
        // Runnable cannot throw checked exceptions, so these two types cover every possible failure.
        for (Throwable t = ex; t != null; t = t.getCause()) {
            if (t instanceof IgniteFutureTimeoutException) {
                U.dumpThreads(log);

                break;
            }
        }

        throw ex;
    }
}

/**
* @param ignite Ignite instance to resolve discovery spi to.
* @return BlockingCustomMessageDiscoverySpi instance.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ public class IgniteClusterSnapshotHandlerTest extends IgniteClusterSnapshotResto
/** Custom snapshot handlers. */
private final List<SnapshotHandler<?>> handlers = new ArrayList<>();

/** Timeout in milliseconds to wait for a snapshot operation to complete. */
protected static final long TIMEOUT = 60_000;
Copy link
Copy Markdown
Contributor

@maksaska maksaska May 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you clarify the scope of this PR? The ticket description mentions covering only Disk 4 tests, but this PR modifies IgniteClusterSnapshotHandlerTest, which belongs to the Disk 5 suite.

  • If we are including Disk 5: Please update the ticket description to avoid confusion. Also, can we update the timeouts for IgniteSnapshotMXBeanTest while we are at it?
  • If Disk 5 is out of scope: I suggest reverting the changes to IgniteClusterSnapshotHandlerTest to keep this PR focused.


/** Extensions plugin provider. */
private final PluginProvider<PluginConfiguration> pluginProvider = new AbstractTestPluginProvider() {
@Override public String name() {
Expand All @@ -84,7 +87,7 @@ public class IgniteClusterSnapshotHandlerTest extends IgniteClusterSnapshotResto

/** {@inheritDoc} */
@Override protected Function<Integer, Object> valueBuilder() {
return Integer::new;
return Integer::valueOf;
}

/**
Expand Down Expand Up @@ -142,11 +145,13 @@ public void testClusterSnapshotHandlers() throws Exception {

IgniteFuture<Void> fut = ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, null);

GridTestUtils.assertThrowsAnyCause(log, () -> fut.get(TIMEOUT), IgniteCheckedException.class, expMsg);
runWithLoggedThreadDump(() ->
GridTestUtils.assertThrowsAnyCause(log, () -> fut.get(TIMEOUT), IgniteCheckedException.class, expMsg));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This timeout comes from AbstractSnapshotSelfTest, where it is 15 sec. I see you had only 1 run for Disk Page Compressions 5, which might not be enough. The timeout is not changed, so I anticipate further failures. The ticket is for Disk 4, but the test fails on Disk 5; we should either fix Disk 5 as well and add it to the ticket description, or remove this change.


changeMetadataRequestIdOnDisk(reqIdRef.get());

ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, null).get(TIMEOUT);
runWithLoggedThreadDump(() ->
ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, null).get(TIMEOUT));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above


assertCacheKeys(ignite.cache(DEFAULT_CACHE_NAME), CACHE_KEYS_RANGE);
}
Expand Down Expand Up @@ -212,7 +217,8 @@ public void testClusterSnapshotHandlerFailure() throws Exception {

IgniteFuture<Void> fut = snp(ignite).createSnapshot(SNAPSHOT_NAME, null, false, onlyPrimary);

GridTestUtils.assertThrowsAnyCause(log, () -> fut.get(TIMEOUT), IgniteCheckedException.class, expMsg);
runWithLoggedThreadDump(() ->
GridTestUtils.assertThrowsAnyCause(log, () -> fut.get(TIMEOUT), IgniteCheckedException.class, expMsg));

failCreateFlag.set(false);

Expand All @@ -224,11 +230,13 @@ public void testClusterSnapshotHandlerFailure() throws Exception {

IgniteFuture<Void> fut0 = ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, null);

GridTestUtils.assertThrowsAnyCause(log, () -> fut0.get(TIMEOUT), IgniteCheckedException.class, expMsg);
runWithLoggedThreadDump(() ->
GridTestUtils.assertThrowsAnyCause(log, () -> fut0.get(TIMEOUT), IgniteCheckedException.class, expMsg));

failRestoreFlag.set(false);

ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, null).get(TIMEOUT);
runWithLoggedThreadDump(() ->
ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, null).get(TIMEOUT));

assertCacheKeys(ignite.cache(DEFAULT_CACHE_NAME), CACHE_KEYS_RANGE);
}
Expand Down Expand Up @@ -406,7 +414,8 @@ public void testHandlerSnapshotLocation() throws Exception {
ignite.destroyCache(DEFAULT_CACHE_NAME);
awaitPartitionMapExchange();

snpMgr.restoreSnapshot(snpName, snpDir.getAbsolutePath(), null).get(TIMEOUT);
runWithLoggedThreadDump(() ->
snpMgr.restoreSnapshot(snpName, snpDir.getAbsolutePath(), null).get(TIMEOUT));
}
finally {
U.delete(snpDir);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ public class IgniteClusterSnapshotRestoreSelfTest extends IgniteClusterSnapshotR
/** Reset consistent ID flag. */
private boolean resetConsistentId;

/** Timeout in milliseconds to wait for a snapshot operation to complete. */
protected static final long TIMEOUT = 60_000;

/** {@inheritDoc} */
@Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception {
IgniteConfiguration cfg = super.getConfiguration(igniteInstanceName);
Expand All @@ -117,7 +120,8 @@ public void testRestoreWithEmptyPartitions() throws Exception {
// Skip check because some partitions will be empty - keysCnt == parts/2.
Ignite ignite = startGridsWithSnapshot(1, keysCnt, false, true);

ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, null).get(TIMEOUT);
runWithLoggedThreadDump(() ->
ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, null).get(TIMEOUT));

assertCacheKeys(ignite.cache(DEFAULT_CACHE_NAME), keysCnt);
}
Expand Down Expand Up @@ -235,7 +239,8 @@ private void doRestoreAllGroups() throws Exception {
TestRecordingCommunicationSpi.spi(g).record(SnapshotFilesRequestMessage.class);

// Restore all cache groups.
grid(0).snapshot().restoreSnapshot(SNAPSHOT_NAME, null).get(TIMEOUT);
runWithLoggedThreadDump(() ->
grid(0).snapshot().restoreSnapshot(SNAPSHOT_NAME, null).get(TIMEOUT));

awaitPartitionMapExchange(true, true, null, true);

Expand Down Expand Up @@ -277,8 +282,9 @@ private void checkStartClusterSnapshotRestoreMultithreaded(IntSupplier nodeIdxSu

IgniteInternalFuture<Long> fut = GridTestUtils.runMultiThreadedAsync(() -> {
try {
grid(nodeIdxSupplier.getAsInt()).snapshot().restoreSnapshot(
SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT);
runWithLoggedThreadDump(() ->
grid(nodeIdxSupplier.getAsInt()).snapshot().restoreSnapshot(
SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT));

successCnt.incrementAndGet();
}
Expand Down Expand Up @@ -444,7 +450,8 @@ public void testClusterSnapshotRestoreOnSmallerTopology() throws Exception {

resetBaselineTopology();

grid(0).snapshot().restoreSnapshot(SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT);
runWithLoggedThreadDump(() ->
grid(0).snapshot().restoreSnapshot(SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT));

assertCacheKeys(grid(0).cache(DEFAULT_CACHE_NAME), CACHE_KEYS_RANGE);
waitForEvents(EVT_CLUSTER_SNAPSHOT_RESTORE_STARTED, EVT_CLUSTER_SNAPSHOT_RESTORE_FINISHED);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ public class IgniteSnapshotMXBeanTest extends AbstractSnapshotSelfTest {
/** Snapshot group name. */
private static final String SNAPSHOT_GROUP = "Snapshot";

/** Timeout in milliseconds to wait for a snapshot operation to complete. */
protected static final long TIMEOUT = 60_000;

/** {@inheritDoc} */
@Override protected IgniteConfiguration getConfiguration(String igniteInstanceName) throws Exception {
return super.getConfiguration(igniteInstanceName)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@
* Cluster-wide snapshot test check command with indexes.
*/
public class IgniteClusterSnapshotCheckWithIndexesTest extends AbstractSnapshotSelfTest {
/** Timeout in milliseconds to wait for a snapshot operation to complete. */
protected static final long TIMEOUT = 60_000;
Copy link
Copy Markdown
Contributor

@maksaska maksaska May 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question as for IgniteClusterSnapshotHandlerTest.

The ticket description mentions covering only Disk 4 tests, but this PR modifies IgniteClusterSnapshotCheckWithIndexesTest as well as IgniteClusterSnapshotRestoreWithIndexingTest, which belong to the Disk 6 suite.

  • If we are including Disk 6: Please update the ticket description to avoid confusion. Also, can we update the timeouts for IgniteClusterSnapshotMetricsTest while we are at it? And I see that the timeouts in IgniteClusterSnapshotRestoreWithIndexingTest are unchanged. Was that on purpose?
  • If Disk 6 is out of scope: I suggest reverting the changes to keep this PR focused.


/** @throws Exception If fails. */
@Test
public void testClusterSnapshotCheckEmptyCache() throws Exception {
Expand Down Expand Up @@ -87,7 +90,9 @@ public void testClusterSnapshotCheckWithNodeFilter() throws Exception {
cache2.put(i, new Account(i, i));
}

createAndCheckSnapshot(grid(0), SNAPSHOT_NAME, null, TIMEOUT);
runWithLoggedThreadDump(() ->
createAndCheckSnapshot(grid(0), SNAPSHOT_NAME, null, TIMEOUT)
);

IdleVerifyResult res = grid(0).context().cache().context().snapshotMgr()
.checkSnapshot(SNAPSHOT_NAME, null).get().idleVerifyResult();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ public class IgniteClusterSnapshotRestoreWithIndexingTest extends IgniteClusterS
/** Number of cache keys to pre-create at node start. */
private static final int CACHE_KEYS_RANGE = 10_000;

/** Timeout in milliseconds to wait for a snapshot operation to complete. */
protected static final long TIMEOUT = 60_000;

/** {@inheritDoc} */
@Override protected <K, V> CacheConfiguration<K, V> txCacheConfig(CacheConfiguration<K, V> ccfg) {
return super.txCacheConfig(ccfg).setSqlIndexMaxInlineSize(255).setSqlSchema("PUBLIC")
Expand All @@ -74,7 +77,8 @@ public void testBasicClusterSnapshotRestore() throws Exception {

IgniteEx client = startGridsWithSnapshot(2, CACHE_KEYS_RANGE, true);

grid(0).snapshot().restoreSnapshot(SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT);
runWithLoggedThreadDump(() ->
grid(0).snapshot().restoreSnapshot(SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This timeout comes from AbstractSnapshotSelfTest, where it is 15 sec. I see you had only 1 run for Disk Page Compressions 6, which might not be enough. The timeout is not changed, so I anticipate further failures. The ticket is for Disk 4, but the test fails on Disk 6; we should either fix Disk 6 as well and add it to the ticket description, or remove this change.

Copy link
Copy Markdown
Contributor

@maksaska maksaska May 18, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The timeouts are unchanged. The test is failing on TeamCity.


// Only primary mode leads to index rebuild on restore.
// Must wait until index rebuild finish so subsequent checks will pass.
Expand All @@ -101,7 +105,8 @@ public void testBasicClusterSnapshotRestoreWithMetadata() throws Exception {

forceCheckpoint();

ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT);
runWithLoggedThreadDump(() ->
ignite.snapshot().restoreSnapshot(SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT));

// Only primary mode leads to index rebuild on restore.
// Must wait until index rebuild finish so subsequent checks will pass.
Expand All @@ -126,7 +131,8 @@ public void testClusterSnapshotRestoreOnBiggerTopology() throws Exception {

startGridsWithCache(nodesCnt - 2, CACHE_KEYS_RANGE, valueBuilder(), dfltCacheCfg);

grid(0).snapshot().createSnapshot(SNAPSHOT_NAME).get(TIMEOUT);
runWithLoggedThreadDump(() ->
grid(0).snapshot().createSnapshot(SNAPSHOT_NAME).get(TIMEOUT));

startGrid(nodesCnt - 2);

Expand All @@ -152,8 +158,8 @@ public void testClusterSnapshotRestoreOnBiggerTopology() throws Exception {
forceCheckpoint();

// Restore from an empty node.
ignite.snapshot().restoreSnapshot(
SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT);
runWithLoggedThreadDump(() -> ignite.snapshot().restoreSnapshot(
SNAPSHOT_NAME, Collections.singleton(DEFAULT_CACHE_NAME)).get(TIMEOUT));

awaitPartitionMapExchange();

Expand Down
Loading