-
Notifications
You must be signed in to change notification settings - Fork 316
[UpdateWorkflow] Make login node use head-node-driven orchestration for cluster updates #7409
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,6 +48,16 @@ datasource_list: [ Ec2, None ] | |
| output: | ||
| all: "| tee -a /var/log/cloud-init-output.log | logger -t user-data -s 2>/dev/ttyS0" | ||
| write_files: | ||
| - path: /tmp/dna.json | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note for the reviewer: the insertion in this file are all coming from what cfn-=init used to do. Now that we removed cfn-init, we need the user data directive/body to do that. |
||
| permissions: '0644' | ||
| owner: root:root | ||
| content: | | ||
| ${DnaJson} | ||
| - path: /tmp/extra.json | ||
| permissions: '0644' | ||
| owner: root:root | ||
| content: | | ||
| ${ExtraJson} | ||
| - path: /tmp/bootstrap.sh | ||
| permissions: '0744' | ||
| owner: root:root | ||
|
|
@@ -78,8 +88,6 @@ write_files: | |
|
|
||
| [ -f /etc/profile.d/proxy.sh ] && . /etc/profile.d/proxy.sh | ||
|
|
||
| $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -s ${AWS::StackName} -v -c deployFiles -r ${LaunchTemplateResourceId} --region ${AWS::Region} --url ${CloudFormationUrl} --role ${CfnInitRole} || error_exit 'Failed to bootstrap the login node. Please check /var/log/cfn-init.log in the login node or in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.' | ||
|
|
||
| # Configure AWS CLI using the expected overrides, if any. | ||
| [ -f /etc/profile.d/aws-cli-default-config.sh ] && . /etc/profile.d/aws-cli-default-config.sh | ||
|
|
||
|
|
@@ -129,7 +137,10 @@ write_files: | |
| vendor_cookbook | ||
| fi | ||
| cd /tmp | ||
| mkdir -p /etc/chef/ohai/hints | ||
| touch /etc/chef/ohai/hints/ec2.json | ||
|
|
||
| jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json ) | ||
| { | ||
| CINC_CMD="cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json" | ||
| pushd /etc/chef && | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1592,6 +1592,22 @@ def _get_launch_templates_config(self): | |
| } | ||
| } | ||
|
|
||
| # LoginPools record Name (for DescribeLaunchTemplateVersions lookup) and LogicalId | ||
| # (filename suffix for the shared dna.json, must match the login node's own | ||
| # launch_template_id). Id/Version are omitted to avoid a CloudFormation circular dependency: | ||
| # head node LT metadata -> LoginNodes nested stack -> head node. | ||
| if self.config.login_nodes and self.config.login_nodes.pools: | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note for the reviewer: we need this basic info about login nodes because the script
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Non-blocking: Can you update the comment to mention
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand that we do not want to add the deadlock; can you add the code snippet of where this deadlock happens? Why is the deadlock happening for LoginNode Stack and not for ComputeFleet? If I am not wrong its because of https://github.com/aws/aws-parallelcluster/blob/develop/cli/src/pcluster/templates/cluster_stack.py#L520-L521. Do we know why this is needed for LN and not ComputeFleet; from my understanding we need this dependency because of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both compute and login nodes, when started, require the head node to be running.
|
||
| lt_config["LoginPools"] = { | ||
| pool.name: { | ||
| "LaunchTemplate": { | ||
| "Name": f"{self._stack_name}-{pool.name}", | ||
| "Version": "$Latest", | ||
| "LogicalId": f"LoginNodeLaunchTemplate{create_hash_suffix(pool.name)}", | ||
| } | ||
| } | ||
| for pool in self.config.login_nodes.pools | ||
| } | ||
|
|
||
| return lt_config | ||
|
|
||
| # -- Conditions -------------------------------------------------------------------------------------------------- # | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,8 +36,6 @@ | |
| from pcluster.utils import ( | ||
| get_attr, | ||
| get_http_tokens_setting, | ||
| get_resource_name_from_resource_arn, | ||
| get_service_endpoint, | ||
| is_feature_supported, | ||
| ) | ||
|
|
||
|
|
@@ -129,79 +127,7 @@ def _add_login_nodes_pool_launch_template(self): | |
| ds_config = self._config.directory_service | ||
| ds_generate_keys = str(ds_config.generate_ssh_keys_for_users).lower() if ds_config else "false" | ||
|
|
||
| if self._pool.instance_profile: | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note for the reviewer: we are removing this profile because it was declared just to be consumed by cfn-init. Since we removed cfn-init, there is no reason to keep it |
||
| instance_profile_name = get_resource_name_from_resource_arn(self._pool.instance_profile) | ||
| instance_role_name = ( | ||
| AWSApi.instance() | ||
| .iam.get_instance_profile(instance_profile_name) | ||
| .get("InstanceProfile") | ||
| .get("Roles")[0] | ||
| .get("RoleName") | ||
| ) | ||
| elif self._pool.instance_role: | ||
| instance_role_name = get_resource_name_from_resource_arn(self._pool.instance_role) | ||
| else: | ||
| instance_role_name = self._instance_role.ref | ||
|
|
||
| launch_template_id = f"LoginNodeLaunchTemplate{create_hash_suffix(self._pool.name)}" | ||
| launch_template = ec2.CfnLaunchTemplate( | ||
| Stack.of(self), | ||
| launch_template_id, | ||
| launch_template_name=f"{self.stack_name}-{self._pool.name}", | ||
| launch_template_data=ec2.CfnLaunchTemplate.LaunchTemplateDataProperty( | ||
| block_device_mappings=self._launch_template_builder.get_block_device_mappings( | ||
| self._pool.local_storage.root_volume, | ||
| AWSApi.instance().ec2.describe_image(self._config.login_nodes_ami[self._pool.name]).device_name, | ||
| ), | ||
| image_id=self._config.login_nodes_ami[self._pool.name], | ||
| instance_type=self._pool.instance_type, | ||
| metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty( | ||
| http_tokens=get_http_tokens_setting(self._config.imds.imds_support) | ||
| ), | ||
| iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty(name=self._instance_profile), | ||
| user_data=Fn.base64( | ||
| Fn.sub( | ||
| get_user_data_content("../resources/login_node/user_data.sh"), | ||
| { | ||
| **{ | ||
| "Timeout": str( | ||
| get_attr( | ||
| self._config, | ||
| "dev_settings.timeouts.compute_node_bootstrap_timeout", | ||
| NODE_BOOTSTRAP_TIMEOUT, | ||
| ) | ||
| ), | ||
| "AutoScalingGroupName": f"{self._login_nodes_stack_id}-AutoScalingGroup", | ||
| "LaunchingLifecycleHookName": ( | ||
| f"{self._login_nodes_stack_id}-LoginNodesLaunchingLifecycleHook" | ||
| ), | ||
| "LaunchTemplateResourceId": launch_template_id, | ||
| "CloudFormationUrl": get_service_endpoint("cloudformation", self._config.region), | ||
| "CfnInitRole": instance_role_name, | ||
| }, | ||
| **get_common_user_data_env(self._pool, self._config), | ||
| }, | ||
| ) | ||
| ), | ||
| network_interfaces=login_nodes_pool_lt_nw_interface, | ||
| tag_specifications=[ | ||
| ec2.CfnLaunchTemplate.TagSpecificationProperty( | ||
| resource_type="instance", | ||
| tags=get_default_instance_tags( | ||
| self.stack_name, self._config, self._pool, "LoginNode", self._shared_storage_infos | ||
| ) | ||
| + [CfnTag(key=PCLUSTER_LOGIN_NODES_POOL_NAME_TAG, value=self._pool.name)] | ||
| + get_custom_tags(self._config), | ||
| ), | ||
| ec2.CfnLaunchTemplate.TagSpecificationProperty( | ||
| resource_type="volume", | ||
| tags=get_default_volume_tags(self.stack_name, "LoginNode") | ||
| + [CfnTag(key=PCLUSTER_LOGIN_NODES_POOL_NAME_TAG, value=self._pool.name)] | ||
| + get_custom_tags(self._config), | ||
| ), | ||
| ], | ||
| ), | ||
| ) | ||
|
|
||
| dna_json = json.dumps( | ||
| { | ||
|
|
@@ -299,65 +225,66 @@ def _add_login_nodes_pool_launch_template(self): | |
| "launch_template_id": launch_template_id, | ||
| } | ||
| }, | ||
| indent=4, | ||
| indent=None, # Keep indent as None for compact sizing and proper parsing in user_data.sh | ||
| ) | ||
|
|
||
| cfn_init = { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note for the reviewer: we can entirely remove this because we removed dependency to cfn-init. |
||
| "configSets": { | ||
| "deployFiles": ["deployConfigFiles"], | ||
| "update": ["deployConfigFiles", "chefUpdate"], | ||
| }, | ||
| "deployConfigFiles": { | ||
| "files": { | ||
| # A nosec comment is appended to the following line in order to disable the B108 check. | ||
| # The file is needed by the product | ||
| # [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory. | ||
| "/tmp/dna.json": { # nosec B108 | ||
| "content": dna_json, | ||
| "mode": "000644", | ||
| "owner": "root", | ||
| "group": "root", | ||
| "encoding": "plain", | ||
| }, | ||
| # A nosec comment is appended to the following line in order to disable the B108 check. | ||
| # The file is needed by the product | ||
| # [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory. | ||
| "/tmp/extra.json": { # nosec B108 | ||
| "mode": "000644", | ||
| "owner": "root", | ||
| "group": "root", | ||
| "content": self._config.extra_chef_attributes, | ||
| }, | ||
| }, | ||
| "commands": { | ||
| "mkdir": {"command": "mkdir -p /etc/chef/ohai/hints"}, | ||
| "touch": {"command": "touch /etc/chef/ohai/hints/ec2.json"}, | ||
| "jq": { | ||
| "command": ( | ||
| 'jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json ' | ||
| '|| ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )' | ||
| launch_template = ec2.CfnLaunchTemplate( | ||
| Stack.of(self), | ||
| launch_template_id, | ||
| launch_template_name=f"{self.stack_name}-{self._pool.name}", | ||
| launch_template_data=ec2.CfnLaunchTemplate.LaunchTemplateDataProperty( | ||
| block_device_mappings=self._launch_template_builder.get_block_device_mappings( | ||
| self._pool.local_storage.root_volume, | ||
| AWSApi.instance().ec2.describe_image(self._config.login_nodes_ami[self._pool.name]).device_name, | ||
| ), | ||
| image_id=self._config.login_nodes_ami[self._pool.name], | ||
| instance_type=self._pool.instance_type, | ||
| metadata_options=ec2.CfnLaunchTemplate.MetadataOptionsProperty( | ||
| http_tokens=get_http_tokens_setting(self._config.imds.imds_support) | ||
| ), | ||
| iam_instance_profile=ec2.CfnLaunchTemplate.IamInstanceProfileProperty(name=self._instance_profile), | ||
| user_data=Fn.base64( | ||
| Fn.sub( | ||
| get_user_data_content("../resources/login_node/user_data.sh"), | ||
| { | ||
| **{ | ||
| "Timeout": str( | ||
| get_attr( | ||
| self._config, | ||
| "dev_settings.timeouts.compute_node_bootstrap_timeout", | ||
| NODE_BOOTSTRAP_TIMEOUT, | ||
| ) | ||
| ), | ||
| "AutoScalingGroupName": f"{self._login_nodes_stack_id}-AutoScalingGroup", | ||
| "LaunchingLifecycleHookName": ( | ||
| f"{self._login_nodes_stack_id}-LoginNodesLaunchingLifecycleHook" | ||
| ), | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note To Self: we removed below from LT because those were needed for Cfn-init command that we were running from |
||
| "DnaJson": dna_json, | ||
| "ExtraJson": self._config.extra_chef_attributes, | ||
| }, | ||
| **get_common_user_data_env(self._pool, self._config), | ||
| }, | ||
| ) | ||
| ), | ||
| network_interfaces=login_nodes_pool_lt_nw_interface, | ||
| tag_specifications=[ | ||
| ec2.CfnLaunchTemplate.TagSpecificationProperty( | ||
| resource_type="instance", | ||
| tags=get_default_instance_tags( | ||
| self.stack_name, self._config, self._pool, "LoginNode", self._shared_storage_infos | ||
| ) | ||
| }, | ||
| }, | ||
| }, | ||
| "chefUpdate": { | ||
| "commands": { | ||
| "chef": { | ||
| "command": ( | ||
| ". /etc/parallelcluster/pcluster_cookbook_environment.sh; " | ||
| "cinc-client --local-mode --config /etc/chef/client.rb --log_level info" | ||
| " --logfile /var/log/chef-client.log --force-formatter --no-color" | ||
| " --chef-zero-port 8889 --json-attributes /etc/chef/dna.json" | ||
| " --override-runlist aws-parallelcluster-entrypoints::update &&" | ||
| " /opt/parallelcluster/scripts/fetch_and_run -postupdate" | ||
| ), | ||
| "cwd": "/etc/chef", | ||
| } | ||
| } | ||
| }, | ||
| } | ||
|
|
||
| launch_template.add_metadata("AWS::CloudFormation::Init", cfn_init) | ||
| + [CfnTag(key=PCLUSTER_LOGIN_NODES_POOL_NAME_TAG, value=self._pool.name)] | ||
| + get_custom_tags(self._config), | ||
| ), | ||
| ec2.CfnLaunchTemplate.TagSpecificationProperty( | ||
| resource_type="volume", | ||
| tags=get_default_volume_tags(self.stack_name, "LoginNode") | ||
| + [CfnTag(key=PCLUSTER_LOGIN_NODES_POOL_NAME_TAG, value=self._pool.name)] | ||
| + get_custom_tags(self._config), | ||
| ), | ||
| ], | ||
| ), | ||
| ) | ||
|
|
||
| return launch_template | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note for the review: this fix is required by the current change. The fix is shipped in aws/aws-parallelcluster-cookbook#3188