Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ CHANGELOG
- Amazon Linux 2 is no longer supported.
- AWS Batch as a scheduler is no longer supported.

3.15.1
------

**CHANGES**
- Patch official ParallelCluster AMIs to address [CVE-2026-31431](https://nvd.nist.gov/vuln/detail/CVE-2026-31431).
- Disable `algif_aead` kernel module on Ubuntu to address [CVE-2026-31431](https://nvd.nist.gov/vuln/detail/CVE-2026-31431).
- Upgrade NVIDIA driver to version 580.126.20 (from 580.105.08) for all OSs except Amazon Linux 2 to address CVE-2025-33219.
- Upgrade NVIDIA Fabric manager to 580.126.20 (from 580.105.08) for all OSs except Amazon Linux 2.
- Upgrade NVIDIA IMEX to 580.126.20 (from 580.105.08) for all OSs except Amazon Linux 2.

3.15.0
------

Expand All @@ -37,7 +47,7 @@ CHANGELOG
- Replace cfn-hup in compute nodes with systemd timer to support in place updates in order to improve performance for tightly coupled worloads at scale.
This new mechanism relies on shared storage to sync updates between the head node and compute nodes.
- Disable `dnf-makecache.timer` to improve performance for tightly coupled worloads on RHEL/Rocky at scale.
- Support updates of `Tags` during cluster-updates.
- Support updates of `Tags` during cluster-updates except in AWS Top Secret and AWS Secret Regions.
- Add `LaunchTemplateOverrides` to cluster config to allow network interfaces to be customized by overriding the launch template of a compute resource.
- This overrides the parallelcluster default using a shallow merge.
- Add alarm on missing clustermgtd heartbeat.
Expand Down
26 changes: 26 additions & 0 deletions cli/src/pcluster/config/update_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import re
from enum import Enum

from pcluster.aws.common import get_region
from pcluster.config.cluster_config import QueueUpdateStrategy
from pcluster.config.update_policy_utils import SharedStorageChangeInfo
from pcluster.constants import (
Expand Down Expand Up @@ -147,6 +148,12 @@ def is_slurm_queues_change(change):
return any(path.startswith("SlurmQueues[") for path in change.path)


def _is_adc_region():
Comment thread
himani2411 marked this conversation as resolved.
"""Return True if the currently configured AWS region belongs to an ADC partition."""
region = get_region() or ""
return region.startswith("us-iso")


def extract_type_and_name_from_path(path):
# Example path = 'SlurmQueues[slurm-q-name]'
# This function returns the type and name extracted like this: 'SlurmQueues', 'slurm-q-name'
Expand Down Expand Up @@ -696,6 +703,25 @@ def fail_reason_extra_chef_attributes(change, _) -> str:
+ ". If you need this change, please consider creating a new cluster instead of updating the existing one.",
)

# Update supported everywhere except ADC regions (us-iso*, us-isob*).
#
# In ADC regions there is a CloudFormation behavior where an UpdateStack call that includes
# both a Tags change and a resource whose change is only in Metadata does not update that
# resource. This breaks the head node update flow because cfn-hup on the head node polls
# DescribeStackResource for Metadata changes on HeadNodeLaunchTemplate and never sees them,
# leaving the HeadNodeWaitCondition to time out.
#
# Until the CloudFormation behavior is fixed in ADC, block tag updates in those regions.
UpdatePolicy.SUPPORTED_UNLESS_ADC = UpdatePolicy(
name="SUPPORTED_UNLESS_ADC",
level=1000,
fail_reason=lambda change, patch: (
f"Updating '{change.key}' during a cluster update is not supported in ADC regions."
),
action_needed=lambda change, patch: f"Restore '{change.key}' to its previous value.",
condition_checker=lambda change, patch: not _is_adc_region(),
)

# Block update if cluster has a managed Fsx for Lustre FileSystem, otherwise fallback to QueueUpdateStrategy
UpdatePolicy.MANAGED_FSX = UpdatePolicy(
name="MANAGED_FSX",
Expand Down
6 changes: 5 additions & 1 deletion cli/src/pcluster/schemas/cluster_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -1866,7 +1866,11 @@ class ClusterSchema(BaseSchema):

monitoring = fields.Nested(MonitoringSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
additional_packages = fields.Nested(AdditionalPackagesSchema, metadata={"update_policy": UpdatePolicy.UNSUPPORTED})
tags = fields.Nested(TagSchema, many=True, metadata={"update_policy": UpdatePolicy.SUPPORTED, "update_key": "Key"})
tags = fields.Nested(
TagSchema,
many=True,
metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC, "update_key": "Key"},
) # TODO: Support tags update once ADC region supports it
iam = fields.Nested(ClusterIamSchema, metadata={"update_policy": UpdatePolicy.IGNORED})
directory_service = fields.Nested(
DirectoryServiceSchema, metadata={"update_policy": UpdatePolicy.COMPUTE_AND_LOGIN_NODES_STOP}
Expand Down
2 changes: 1 addition & 1 deletion cli/src/pcluster/schemas/common_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ class TagSchema(BaseSchema):
value = fields.Str(
required=True,
validate=validate.Length(max=256),
metadata={"update_policy": UpdatePolicy.SUPPORTED},
metadata={"update_policy": UpdatePolicy.SUPPORTED_UNLESS_ADC},
)

@post_load
Expand Down
16 changes: 8 additions & 8 deletions cli/tests/pcluster/config/test_config_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -1154,11 +1154,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
None,
{"Key": "test3", "Value": "val3"},
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_addition",
),
pytest.param(
Expand All @@ -1170,11 +1170,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
{"Key": "test2", "Value": "val2"},
None,
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_removal",
),
pytest.param(
Expand All @@ -1186,11 +1186,11 @@ def test_patch_check_cluster_resource_bucket(
"Value",
"old_value",
"new_value",
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=False,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="tag_value_modification",
),
pytest.param(
Expand All @@ -1202,11 +1202,11 @@ def test_patch_check_cluster_resource_bucket(
"Tags",
None,
{"Key": "test3", "Value": "val3"},
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
is_list=True,
)
],
UpdatePolicy.SUPPORTED,
UpdatePolicy.SUPPORTED_UNLESS_ADC,
id="order_change_plus_addition",
),
pytest.param(
Expand Down
41 changes: 41 additions & 0 deletions cli/tests/pcluster/config/test_update_policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -3044,3 +3044,44 @@ def test_head_node_local_storage_update_blocked(
if expected_action_needed:
assert_that(action_needed_messages).is_not_empty()
assert_that(action_needed_messages[0]).is_equal_to(expected_action_needed)


@pytest.mark.parametrize(
"region, expected_result",
[
pytest.param("us-east-1", True, id="commercial region allows tag update"),
pytest.param("eu-west-1", True, id="commercial EU region allows tag update"),
pytest.param("us-gov-west-1", True, id="GovCloud region allows tag update"),
pytest.param("cn-north-1", True, id="China region allows tag update"),
pytest.param("us-iso-east-1", False, id="us-iso region blocks tag update"),
pytest.param("us-iso-west-1", False, id="us-iso-west region blocks tag update"),
pytest.param("us-isob-east-1", False, id="us-isob region blocks tag update"),
pytest.param("", True, id="empty region falls back to allowing update"),
],
)
def test_supported_unless_adc_condition_checker(mocker, region, expected_result):
"""SUPPORTED_UNLESS_ADC blocks updates only in ADC partitions (us-iso*, us-isob*)."""
mocker.patch("pcluster.config.update_policy.get_region", return_value=region)

change_mock = mocker.MagicMock()
patch_mock = mocker.MagicMock()

assert_that(UpdatePolicy.SUPPORTED_UNLESS_ADC.condition_checker(change_mock, patch_mock)).is_equal_to(
expected_result
)


def test_supported_unless_adc_fail_reason_mentions_adc(mocker):
"""The fail reason for SUPPORTED_UNLESS_ADC explains the ADC limitation."""
mocker.patch("pcluster.config.update_policy.get_region", return_value="us-iso-east-1")

change_mock = mocker.MagicMock()
change_mock.key = "Tags"
patch_mock = mocker.MagicMock()

result, fail_reason, action_needed, _ = UpdatePolicy.SUPPORTED_UNLESS_ADC.check(change_mock, patch_mock)

assert_that(result).is_equal_to(UpdatePolicy.CheckResult.ACTION_NEEDED)
assert_that(fail_reason).contains("ADC")
assert_that(fail_reason).contains("Tags")
assert_that(action_needed).contains("Tags")
16 changes: 9 additions & 7 deletions tests/integration-tests/configs/isolated_regions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -599,13 +599,15 @@ test-suites:
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
proxy:
test_proxy.py::test_proxy:
dimensions:
- regions: {{ REGIONS }}
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
# This test has never passed in isolated regions because our proxy instance setup logic in the test needs to be adjusted
# If proxy instance is set up correctly, pcluster should be able to use proxy in isolated regions
# proxy:
# test_proxy.py::test_proxy:
# dimensions:
# - regions: {{ REGIONS }}
# instances: {{ INSTANCES }}
# oss: {{ OSS }}
# schedulers: {{ SCHEDULERS }}
pyxis:
test_pyxis.py::test_pyxis:
dimensions:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,23 @@
)


def _get_global_build_number(config=None, args=None):
"""
Gets the global build number from args or pytest config.
Returns the build number as an int, or 0 if not provided.
"""
global_build_number = 0
if args:
args_dict = vars(args)
global_build_number = args_dict.get("global_build_number", 0)
elif config:
global_build_number = config.getoption("--global-build-number", default=0)
try:
return int(global_build_number)
except (TypeError, ValueError):
return 0


def _get_os_parameters(config=None, args=None):
"""
Gets OS jinja parameters.
Expand All @@ -42,9 +59,17 @@ def _get_os_parameters(config=None, args=None):
available_amis_oss_x86 = _get_available_amis_oss("x86", config=config, args=args)
available_amis_oss_arm = _get_available_amis_oss("arm", config=config, args=args)
result = {"AVAILABLE_AMIS_OSS_X86": available_amis_oss_x86, "AVAILABLE_AMIS_OSS_ARM": available_amis_oss_arm}
today_number = (date.today() - date(2020, 1, 1)).days

_propagate_os_jinja_variables("", result, today_number, SUPPORTED_OSES)
# Use global-build-number as the rotation seed if available and non-zero.
# This allows the OS rotation to advance on every build, enabling full coverage
# when running tests multiple times per day. Falls back to day-based rotation otherwise.
global_build_number = _get_global_build_number(config=config, args=args)
if global_build_number:
rotation_seed = global_build_number
else:
rotation_seed = (date.today() - date(2020, 1, 1)).days

_propagate_os_jinja_variables("", result, rotation_seed, SUPPORTED_OSES)

# DCV doesn't support AL2023. Therefore, the following logic makes sure the DCV jinja parameter is not AL2023
dcv_supported_oses = [
Expand All @@ -55,23 +80,23 @@ def _get_os_parameters(config=None, args=None):
for os in SUPPORTED_OSES
if os not in UNSUPPORTED_OSES_FOR_DCV + UNSUPPORTED_ARM_OSES_FOR_DCV + UNSUPPORTED_OSES_FOR_NON_GPU_DCV
]
_propagate_os_jinja_variables("DCV_", result, today_number, dcv_supported_oses, dcv_supported_arm_oses)
_propagate_os_jinja_variables("DCV_", result, rotation_seed, dcv_supported_oses, dcv_supported_arm_oses)

lustre_supported_oses = [os for os in SUPPORTED_OSES if os not in UNSUPPORTED_OSES_FOR_LUSTRE]
_propagate_os_jinja_variables("LUSTRE_", result, today_number, lustre_supported_oses)
_propagate_os_jinja_variables("LUSTRE_", result, rotation_seed, lustre_supported_oses)

no_rhel_oss = [os for os in SUPPORTED_OSES if "rhel" not in os]
_propagate_os_jinja_variables("NO_RHEL_", result, today_number, no_rhel_oss)
_propagate_os_jinja_variables("NO_RHEL_", result, rotation_seed, no_rhel_oss)

no_rocky_oss = [os for os in SUPPORTED_OSES if "rocky" not in os]
_propagate_os_jinja_variables("NO_ROCKY_", result, today_number, no_rocky_oss)
_propagate_os_jinja_variables("NO_ROCKY_", result, rotation_seed, no_rocky_oss)

rhel_oss = [os for os in SUPPORTED_OSES if "rhel" in os]
_propagate_os_jinja_variables("RHEL_", result, today_number, rhel_oss)
_propagate_os_jinja_variables("RHEL_", result, rotation_seed, rhel_oss)
return result


def _propagate_os_jinja_variables(prefix, result, today_number, supported_x86_oses, supported_arm_oses=None):
def _propagate_os_jinja_variables(prefix, result, rotation_seed, supported_x86_oses, supported_arm_oses=None):
available_amis_oss_x86 = result["AVAILABLE_AMIS_OSS_X86"]
available_amis_oss_arm = result["AVAILABLE_AMIS_OSS_ARM"]
if supported_arm_oses is None:
Expand All @@ -83,8 +108,12 @@ def _propagate_os_jinja_variables(prefix, result, today_number, supported_x86_os
result[f"{prefix}OS_X86"] = available_amis_oss_x86
result[f"{prefix}OS_ARM"] = available_amis_oss_arm
for index in range(len(supported_x86_oses)):
result[f"{prefix}OS_X86_{index}"] = available_amis_oss_x86[(today_number + index) % len(available_amis_oss_x86)]
result[f"{prefix}OS_ARM_{index}"] = available_amis_oss_arm[(today_number + index) % len(available_amis_oss_arm)]
result[f"{prefix}OS_X86_{index}"] = available_amis_oss_x86[
(rotation_seed + index) % len(available_amis_oss_x86)
]
result[f"{prefix}OS_ARM_{index}"] = available_amis_oss_arm[
(rotation_seed + index) % len(available_amis_oss_arm)
]


def _get_instance_type_parameters(): # noqa: C901
Expand Down
30 changes: 24 additions & 6 deletions tests/integration-tests/tests/common/assertions.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,8 @@ def assert_default_user_has_desired_sudo_access(
node_type,
region,
disable_sudo_access_default_user,
retries=6,
retry_delay=15,
):
remote_command_executors = []
logging.info(
Expand All @@ -354,12 +356,28 @@ def assert_default_user_has_desired_sudo_access(

command = "sudo -n cat /etc/sudoers.d/90-cloud-init-users"
for node_index, remote_command_executor in enumerate(remote_command_executors):
result = remote_command_executor.run_remote_command(command, raise_on_error=False, timeout=300)
logging.info(f"Default user in {node_type} number {node_index} and result.failed={result.failed}")
logging.info(f"Default user in {node_type} number {node_index} and result.stdout={result.stdout}")
if disable_sudo_access_default_user:
assert_that(result.stdout).contains("a password is required")
assert_that(result.failed).is_equal_to(disable_sudo_access_default_user)
for attempt in range(retries):
result = remote_command_executor.run_remote_command(command, raise_on_error=False, timeout=300)
logging.info(
f"Default user in {node_type} number {node_index} attempt {attempt} "
f"result.failed={result.failed} result.stdout={result.stdout}"
)
if disable_sudo_access_default_user:
if "a password is required" in result.stdout and result.failed:
logging.info(f"Sudo access correctly disabled on {node_type} number {node_index}")
break
if attempt < retries - 1:
logging.info(
f"Sudo not yet disabled on {node_type} number {node_index}, "
f"retrying in {retry_delay}s (attempt {attempt + 1}/{retries})..."
)
time.sleep(retry_delay)
else:
assert_that(result.stdout).contains("a password is required")
assert_that(result.failed).is_equal_to(True)
else:
assert_that(result.failed).is_equal_to(False)
break


def assert_lambda_vpc_settings_are_correct(stack_name, region, security_group_ids, subnet_ids):
Expand Down
2 changes: 1 addition & 1 deletion tests/integration-tests/tests/common/nccl_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from assertpy import assert_that
from utils import get_instance_info

from tests.performance_tests.common import push_result_to_dynamodb
from tests.common.utils import push_result_to_dynamodb

NCCL_COMMON_DATADIR = pathlib.Path(__file__).parent / "data/nccl/"

Expand Down
Loading
Loading