Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions tests/integration-tests/tests/common/login_nodes_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import boto3
from assertpy import assert_that
from remote_command_executor import RemoteCommandExecutor
from retrying import retry
from time_utils import minutes

Expand Down Expand Up @@ -58,3 +59,21 @@ def wait_for_login_fleet_stop(cluster, wait_fixed=None, stop_max_delay=None):
wait_fixed=wait_fixed if wait_fixed is not None else minutes(1),
stop_max_delay=stop_max_delay if stop_max_delay is not None else minutes(10),
)(assert_login_nodes_count)(cluster, 0)


def get_login_node_executors_by_pool(cluster, login_node_pools):
    """
    Build one remote command executor per login node that belongs to any of the requested pools.

    Each executor targets the node's public IP address, falling back to the private one when
    the public address is missing or empty.

    :param cluster: the cluster whose login nodes are listed via `describe_login_nodes()`.
    :param login_node_pools: the login node pool names to select; a falsy value (None, empty) selects nothing.
    :return: a list of remote command executors, in the order the login nodes are described;
        empty when no pools are requested or no login node matches.
    """
    # Nothing requested: return early without describing the login nodes at all.
    if not login_node_pools:
        return []

    return [
        RemoteCommandExecutor(
            cluster,
            login_node_ip=node.get("publicIpAddress") or node.get("privateIpAddress"),
        )
        for node in cluster.describe_login_nodes()
        if node.get("poolName") in login_node_pools
    ]
31 changes: 30 additions & 1 deletion tests/integration-tests/tests/storage/storage_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from troposphere.iam import InstanceProfile, Policy, Role
from utils import generate_stack_name, random_alphanumeric, retrieve_cfn_outputs

from tests.common.login_nodes_utils import get_login_node_executors_by_pool
from tests.common.schedulers_common import SlurmCommands
from tests.common.utils import CLASSIC_AWS_DOMAIN, get_aws_domain, retrieve_latest_ami

Expand Down Expand Up @@ -102,7 +103,13 @@ def test_directory_correctly_shared_between_ln_and_hn(


def verify_directory_correctly_shared(
remote_command_executor, mount_dir, scheduler_commands, partitions=None, run_sudo=False
remote_command_executor,
mount_dir,
scheduler_commands,
partitions=None,
run_sudo=False,
cluster=None,
login_node_pools=None,
):
"""
Confirm nodes can read and write to the FileSystem
Expand All @@ -114,6 +121,10 @@ def verify_directory_correctly_shared(
While Reading:
"A" reads files: ["A-<random_alphanumeric_characters>", "B-<random_alphanumeric_characters>"]
"B" reads files: ["A-<random_alphanumeric_characters>", "B-<random_alphanumeric_characters>"]

When `login_node_pools` is provided, the login nodes belonging to those pools are verified the same way
as the head node: each writes its own file (shared with the other nodes) and reads back all the files.
`cluster` is required to resolve the login nodes of the given pools.
"""
executor_node_file = random_alphanumeric()
logging.info(f"{remote_command_executor.get_target_host_type()}: Writing File: {executor_node_file}")
Expand Down Expand Up @@ -142,6 +153,19 @@ def verify_directory_correctly_shared(
scheduler_commands.assert_job_succeeded(job_id)
files_to_read.append(compute_file)

# Every login node belonging to the requested pools writes a new file to the shared folder
login_node_executors = get_login_node_executors_by_pool(cluster, login_node_pools)
for login_node_executor in login_node_executors:
login_node_file = random_alphanumeric()
logging.info(f"{login_node_executor.get_target_host_type()}: Writing File: {login_node_file}")
login_node_executor.run_remote_command(
("sudo " if run_sudo else "")
+ "touch {mount_dir}/{login_node_file} && cat {mount_dir}/{login_node_file}".format(
mount_dir=mount_dir, login_node_file=login_node_file
)
)
files_to_read.append(login_node_file)

read_all_files_command = "cat {files_to_read}".format(
files_to_read=" ".join([f"{mount_dir}/{target_file}" for target_file in files_to_read]),
)
Expand All @@ -157,6 +181,11 @@ def verify_directory_correctly_shared(
scheduler_commands.wait_job_completed(job_id)
scheduler_commands.assert_job_succeeded(job_id)

# Every login node belonging to the requested pools reads all the shared files
for login_node_executor in login_node_executors:
logging.info(f"{login_node_executor.get_target_host_type()}: Reading Files: {files_to_read}")
login_node_executor.run_remote_command(read_all_files_command)


# for EBS

Expand Down
7 changes: 6 additions & 1 deletion tests/integration-tests/tests/update/test_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -1205,7 +1205,12 @@ def test_dynamic_file_systems_update(
)
for mount_dir in all_mount_dirs_update_1:
verify_directory_correctly_shared(
remote_command_executor, mount_dir, scheduler_commands, partitions=["queue1", "queue2"]
remote_command_executor,
mount_dir,
scheduler_commands,
partitions=["queue1", "queue2"],
cluster=cluster,
login_node_pools=["login"],
)

# # Update cluster to stop login nodes
Expand Down
Loading