From a4e22846dba1389a783b8187928427150d131cb9 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Thu, 4 Jun 2026 12:37:29 +0200 Subject: [PATCH] [Test] In test_dynamic_file_system_mounting, check the mounted/unmounted file systems on login nodes. --- .../tests/common/login_nodes_utils.py | 19 ++++++++++++ .../tests/storage/storage_common.py | 31 ++++++++++++++++++- .../tests/update/test_update.py | 7 ++++- 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/common/login_nodes_utils.py b/tests/integration-tests/tests/common/login_nodes_utils.py index 1b32f9b532..ddf07d2adb 100644 --- a/tests/integration-tests/tests/common/login_nodes_utils.py +++ b/tests/integration-tests/tests/common/login_nodes_utils.py @@ -13,6 +13,7 @@ import boto3 from assertpy import assert_that +from remote_command_executor import RemoteCommandExecutor from retrying import retry from time_utils import minutes @@ -58,3 +59,21 @@ def wait_for_login_fleet_stop(cluster, wait_fixed=None, stop_max_delay=None): wait_fixed=wait_fixed if wait_fixed is not None else minutes(1), stop_max_delay=stop_max_delay if stop_max_delay is not None else minutes(10), )(assert_login_nodes_count)(cluster, 0) + + +def get_login_node_executors_by_pool(cluster, login_node_pools): + """ + Return a remote command executor for each login node belonging to the given pools. + :param cluster: the cluster. + :param login_node_pools: the list of login node pool names to target. + :return: a list of remote command executors, empty when no pools are requested. 
+ """ + if not login_node_pools: + return [] + executors = [] + for login_node in cluster.describe_login_nodes(): + if login_node.get("poolName") not in login_node_pools: + continue + login_node_ip = login_node.get("publicIpAddress") or login_node.get("privateIpAddress") + executors.append(RemoteCommandExecutor(cluster, login_node_ip=login_node_ip)) + return executors diff --git a/tests/integration-tests/tests/storage/storage_common.py b/tests/integration-tests/tests/storage/storage_common.py index 79374972b6..a7791e0422 100644 --- a/tests/integration-tests/tests/storage/storage_common.py +++ b/tests/integration-tests/tests/storage/storage_common.py @@ -32,6 +32,7 @@ from troposphere.iam import InstanceProfile, Policy, Role from utils import generate_stack_name, random_alphanumeric, retrieve_cfn_outputs +from tests.common.login_nodes_utils import get_login_node_executors_by_pool from tests.common.schedulers_common import SlurmCommands from tests.common.utils import CLASSIC_AWS_DOMAIN, get_aws_domain, retrieve_latest_ami @@ -102,7 +103,13 @@ def test_directory_correctly_shared_between_ln_and_hn( def verify_directory_correctly_shared( - remote_command_executor, mount_dir, scheduler_commands, partitions=None, run_sudo=False + remote_command_executor, + mount_dir, + scheduler_commands, + partitions=None, + run_sudo=False, + cluster=None, + login_node_pools=None, ): """ Confirm nodes can read and write to the FileSystem @@ -114,6 +121,10 @@ def verify_directory_correctly_shared( While Reading: "A" reads files: ["A-", "B-"] "B" reads files: ["A-", "B-"] + + When `login_node_pools` is provided, the login nodes belonging to those pools are verified the same way + as the head node: each writes its own file (shared with the other nodes) and reads back all the files. + `cluster` is required to resolve the login nodes of the given pools. 
+ """ executor_node_file = random_alphanumeric() logging.info(f"{remote_command_executor.get_target_host_type()}: Writing File: {executor_node_file}") @@ -142,6 +153,19 @@ def verify_directory_correctly_shared( scheduler_commands.assert_job_succeeded(job_id) files_to_read.append(compute_file) + # Each login node in the requested pools writes a new file to the shared folder + login_node_executors = get_login_node_executors_by_pool(cluster, login_node_pools) + for login_node_executor in login_node_executors: + login_node_file = random_alphanumeric() + logging.info(f"{login_node_executor.get_target_host_type()}: Writing File: {login_node_file}") + login_node_executor.run_remote_command( + ("sudo " if run_sudo else "") + + "touch {mount_dir}/{login_node_file} && cat {mount_dir}/{login_node_file}".format( + mount_dir=mount_dir, login_node_file=login_node_file + ) + ) + files_to_read.append(login_node_file) + read_all_files_command = "cat {files_to_read}".format( files_to_read=" ".join([f"{mount_dir}/{target_file}" for target_file in files_to_read]), ) @@ -157,6 +181,11 @@ def verify_directory_correctly_shared( scheduler_commands.wait_job_completed(job_id) scheduler_commands.assert_job_succeeded(job_id) + # Each login node in the requested pools reads all the shared files + for login_node_executor in login_node_executors: + logging.info(f"{login_node_executor.get_target_host_type()}: Reading Files: {files_to_read}") + login_node_executor.run_remote_command(read_all_files_command) + # for EBS diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 6eeab200e4..b40e4d2cbe 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -1205,7 +1205,12 @@ def test_dynamic_file_systems_update( ) for mount_dir in all_mount_dirs_update_1: verify_directory_correctly_shared( - remote_command_executor, mount_dir, scheduler_commands, partitions=["queue1", "queue2"] 
+ remote_command_executor, + mount_dir, + scheduler_commands, + partitions=["queue1", "queue2"], + cluster=cluster, + login_node_pools=["login"], ) # # Update cluster to stop login nodes