diff --git a/changelog.d/5-bug-fixes/pipeline-fixes b/changelog.d/5-bug-fixes/pipeline-fixes
new file mode 100644
index 000000000..381d78f0d
--- /dev/null
+++ b/changelog.d/5-bug-fixes/pipeline-fixes
@@ -0,0 +1 @@
+Fixed: Update the docker pull logic to retry if initial pull fails and wait when connecting to fresh VMs in hetzner for cd
diff --git a/nix/scripts/create-container-dump.sh b/nix/scripts/create-container-dump.sh
index 84bee63ce..cecf792eb 100644
--- a/nix/scripts/create-container-dump.sh
+++ b/nix/scripts/create-container-dump.sh
@@ -13,34 +13,73 @@ export REGISTRY_TIMEOUT=600 # Registry specific timeout
 
 output_dir=$1
 mkdir -p $1
+
 # Download all the docker images into $1, and append its name to an index.txt
 # If this errors out for you, copy default-policy.json from the skopeo repo to
 # /etc/containers/policy.json
 while IFS= read -r image; do
-    # sanitize the image file name, replace slashes with underscores, suffix with .tar
-    image_filename=$(sed -r "s/[:\/]/_/g" <<< $image)
-    image_path=$(realpath $1)/${image_filename}.tar
-    if [[ -e $image_path ]];then
-      echo "Skipping $image_filename…"
+
+# sanitize the image file name, replace slashes with underscores, suffix with .tar
+  image_filename=$(sed -r "s/[:\/]/_/g" <<< "$image")
+  image_path="$(realpath "$1")/${image_filename}.tar"
+
+  if [[ -s "$image_path" ]]; then
+    echo "Skipping $image_filename…"
+    continue
+  fi
+
+  echo "Fetching $image_filename…"
+
+  # All of these images should be publicly fetchable, especially given we
+  # ship public tarballs containing these images.
+  # ci.sh already honors DOCKER_LOGIN, so do the same here, otherwise
+  # fallback to unauthorized fetching.
+
+  # If an image has both a tag and digest, remove the tag. Return the original if there is no match.
+  image_trimmed=$(echo "$image" | sed -E 's/(.+)(:.+(@.+))/\1\3/')
+
+  tmp_path="${image_path}.tmp"
+  rm -f "$tmp_path"
+
+  success=false
+
+  for attempt in {1..5}; do
+    echo "Attempt $attempt/5 for $image_trimmed"
+
+    # Start each attempt from rc=0: "|| rc=$?" below assigns only when skopeo
+    # fails, and reading $? after the if/fi would always see 0.
+    rc=0
+    if [[ -n "${DOCKER_LOGIN:-}" && "$image" =~ quay.io/wire ]]; then
+      skopeo copy --insecure-policy \
+        --src-creds "$DOCKER_LOGIN" \
+        --retry-times 10 \
+        "docker://$image_trimmed" \
+        "docker-archive:${tmp_path}" \
+        --additional-tag "$image" || rc=$?
     else
-      echo "Fetching $image_filename…"
-
-      # All of these images should be publicly fetchable, especially given we
-      # ship public tarballs containing these images.
-      # ci.sh already honors DOCKER_LOGIN, so do the same here, otherwise
-      # fallback to unauthorized fetching.
-
-      # If an image has both a tag and digest, remove the tag. Return the original if there is no match.
-      image_trimmed=$(echo "$image" | sed -E 's/(.+)(:.+(@.+))/\1\3/')
-      if [[ -n "${DOCKER_LOGIN:-}" && "$image" =~ quay.io/wire ]];then
-        skopeo copy --insecure-policy --src-creds "$DOCKER_LOGIN" --retry-times 10 \
-          docker://$image_trimmed docker-archive:${image_path} --additional-tag $image
-      else
-        skopeo copy --insecure-policy --retry-times 10 \
-          docker://$image_trimmed docker-archive:${image_path} --additional-tag $image
-      fi
-      echo "${image_filename}.tar" >> $(realpath "$1")/index.txt
-      # passing image and $output_dir
-      create-build-entry $image $output_dir
+      skopeo copy --insecure-policy \
+        --retry-times 10 \
+        "docker://$image_trimmed" \
+        "docker-archive:${tmp_path}" \
+        --additional-tag "$image" || rc=$?
+    fi
+
+    if [[ $rc -eq 0 && -s "$tmp_path" ]]; then
+      mv "$tmp_path" "$image_path"
+      success=true
+      break
     fi
+
+    echo "Fetch failed for $image_trimmed with rc=$rc; retrying…"
+    rm -f "$tmp_path"
+    sleep $((attempt * 20))
+  done
+
+  if [[ "$success" != true ]]; then
+    echo "ERROR: failed to fetch $image after retries" >&2
+    exit 1
+  fi
+
+  echo "${image_filename}.tar" >> "$(realpath "$1")/index.txt"
+  create-build-entry "$image" "$output_dir"
 done
diff --git a/terraform/examples/wiab-staging-hetzner/outputs.tf b/terraform/examples/wiab-staging-hetzner/outputs.tf
index 8fc10ec4e..c3fa5037b 100644
--- a/terraform/examples/wiab-staging-hetzner/outputs.tf
+++ b/terraform/examples/wiab-staging-hetzner/outputs.tf
@@ -55,7 +55,7 @@ output "static-inventory" {
         }
       }
       vars = {
-        ansible_ssh_common_args = "-o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null -o ControlMaster=auto -o ControlPersist=60s -o BatchMode=yes -o ConnectionAttempts=10 -o ServerAliveInterval=60 -o ServerAliveCountMax=3"
+        ansible_ssh_common_args = "-o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null -o ControlMaster=auto -o ControlPersist=60s -o BatchMode=yes -o ConnectionAttempts=10 -o ServerAliveInterval=60 -o ServerAliveCountMax=3 -o ConnectTimeout=10"
       }
     }
     private = {
@@ -66,7 +66,7 @@ output "static-inventory" {
         adminhost_local = {}
       }
       vars = {
-        ansible_ssh_common_args = "-o ProxyCommand=\"ssh -i ssh_private_key -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null -W %h:%p -q root@${hcloud_server.adminhost.ipv4_address}\" -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null -o ControlMaster=auto -o ControlPersist=60s -o BatchMode=yes -o ConnectionAttempts=10 -o ServerAliveInterval=60 -o ServerAliveCountMax=3"
+        ansible_ssh_common_args = "-o ProxyCommand=\"ssh -i ssh_private_key -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null -W %h:%p -q root@${hcloud_server.adminhost.ipv4_address}\" -o StrictHostKeyChecking=accept-new -o UserKnownHostsFile=/dev/null -o ControlMaster=auto -o ControlPersist=60s -o BatchMode=yes -o ConnectionAttempts=10 -o ServerAliveInterval=60 -o ServerAliveCountMax=3 -o ConnectTimeout=10"
       }
     }
     adminhost_local = {
diff --git a/terraform/examples/wiab-staging-hetzner/setup_nodes.yml b/terraform/examples/wiab-staging-hetzner/setup_nodes.yml
index 0bb041ea9..b77b9ec3d 100644
--- a/terraform/examples/wiab-staging-hetzner/setup_nodes.yml
+++ b/terraform/examples/wiab-staging-hetzner/setup_nodes.yml
@@ -1,4 +1,20 @@
 ---
+- name: Wait for adminhost private SSH
+  hosts: adminhost
+  gather_facts: no
+  tasks:
+    - name: Wait for SSH on public adminhost
+      wait_for_connection:
+        timeout: 300
+        delay: 5
+
+    - name: Wait until adminhost private IP is reachable from public adminhost
+      wait_for:
+        host: "{{ hostvars['adminhost_local'].ansible_host }}"
+        port: 22
+        timeout: 300
+        delay: 5
+
 - name: Setup adminhost with dnsmasq and Docker
   hosts: adminhost_local
   become: yes