diff --git a/.gitignore b/.gitignore
index 479088e28..39be61286 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,7 @@ CMakeLists.txt.user*
 
 # Mac System File
 .DS_Store
+
+# lychee link-checker local artifacts
+.lycheecache
+.lychee-report.md
diff --git a/utilities/check-links.sh b/utilities/check-links.sh
new file mode 100755
index 000000000..4d219a7ab
--- /dev/null
+++ b/utilities/check-links.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+#==========================================================================
+#
+#   Copyright NumFOCUS
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#          https://www.apache.org/licenses/LICENSE-2.0.txt
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+#
+#==========================================================================*/
+
+
+# Run the lychee link checker against ITK's documentation, examples,
+# release notes, and other text artifacts to surface broken links.
+#
+# This script is intended for periodic manual / cron use, NOT for the
+# per-PR CI pipeline.  Lychee against the full ITK tree triggers
+# rate-limit (HTTP 429) responses on hosts such as github.com,
+# doi.org, and journal mirrors that the CI runner cannot work around
+# (see lycheeverse/lychee#1574).  Running locally with caching across
+# invocations is reliable; running on every PR is not.
+#
+# Usage:
+#   utilities/check-links.sh
+#       Scans the entire repository.  Reads utilities/lychee.toml for
+#       include / exclude configuration.  Cache persists between runs
+#       at .lycheecache.  Exit code 0 means no broken links; non-zero
+#       indicates findings in the report (or that lychee itself failed).
+#
+#   utilities/check-links.sh path1 [path2 ...]
+#       Scans only the listed files / directories (relative to the
+#       repository root).
+#
+# Exit status:
+#   lychee's own exit status, or 127 if lychee is not installed, or 1 if
+#   the repository root or the lychee configuration cannot be found.
+#
+# Output:
+#   Plain-text report on stdout plus a Markdown summary at
+#   .lychee-report.md (gitignored — local artifact).
+#
+# Dependencies:
+#   - lychee (https://lychee.cli.rs/) >= 0.15
+#     Install via:  cargo install lychee
+#                or: brew install lychee
+#                or: pre-built binary from
+#                    https://github.com/lycheeverse/lychee/releases
+#   - bash >= 4
+
+set -uo pipefail
+
+# Resolve the repository root from this script's location (one directory
+# above the script).  CDPATH is cleared and cd's stdout discarded so a
+# user-set CDPATH cannot inject extra text into the captured path.
+REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "$0")/.." >/dev/null && pwd)"
+if [[ -z "${REPO_ROOT}" ]]; then
+  echo "error: cannot determine the repository root from $0" >&2
+  exit 1
+fi
+readonly REPO_ROOT
+readonly LYCHEE_CONFIG="${REPO_ROOT}/utilities/lychee.toml"
+readonly LYCHEE_REPORT="${REPO_ROOT}/.lychee-report.md"
+readonly LYCHEE_CACHE="${REPO_ROOT}/.lycheecache"
+
+if ! command -v lychee >/dev/null 2>&1; then
+  echo "error: 'lychee' not found on PATH" >&2
+  echo " see the install hints in $(basename "$0")" >&2
+  exit 127
+fi
+
+if [[ ! -f "${LYCHEE_CONFIG}" ]]; then
+  echo "error: ${LYCHEE_CONFIG} is missing" >&2
+  exit 1
+fi
+
+# Run from the repository root so relative scan paths resolve against it.
+# set -e is not enabled, so a failed cd must be handled explicitly.
+if ! cd -- "${REPO_ROOT}"; then
+  echo "error: cannot change directory to ${REPO_ROOT}" >&2
+  exit 1
+fi
+
+# Default scan paths if none given.
+if [[ $# -eq 0 ]]; then
+  set -- .
+fi
+
+echo "Running lychee against: $*"
+echo "Config: ${LYCHEE_CONFIG}"
+echo "Cache: ${LYCHEE_CACHE}"
+echo "Report: ${LYCHEE_REPORT}"
+echo
+
+# Drop any report left over from an earlier run so that a stale file is
+# never mistaken for the result of this one.
+rm -f -- "${LYCHEE_REPORT}"
+
+# set -e is deliberately off: lychee's exit status is captured and
+# propagated only after the report location has been printed.
+lychee \
+  --config "${LYCHEE_CONFIG}" \
+  --cache --cache-exclude-status 429 --max-cache-age 1d \
+  --format markdown --output "${LYCHEE_REPORT}" \
+  "$@"
+status=$?
+
+echo
+if [[ -f "${LYCHEE_REPORT}" ]]; then
+  echo "Report written to ${LYCHEE_REPORT}"
+else
+  echo "warning: lychee did not write ${LYCHEE_REPORT}" >&2
+fi
+exit "${status}"
diff --git a/utilities/lychee.toml b/utilities/lychee.toml
new file mode 100644
index 000000000..46dee9358
--- /dev/null
+++ b/utilities/lychee.toml
@@ -0,0 +1,60 @@
+# Configuration for utilities/check-links.sh.
+#
+# See https://lychee.cli.rs/usage/config/ for the full schema.
+
+# Network behaviour ---------------------------------------------------
+max_concurrency = 8
+max_retries = 3
+retry_wait_time = 30  # seconds; matches lychee's default Retry-After honouring
+timeout = 30  # seconds
+user_agent = "ITK-link-checker (https://github.com/InsightSoftwareConsortium/ITK)"
+
+# Treat the following as non-failures.  None of these are "broken
+# link" signals; they mean the host or gateway misbehaved on this
+# run, not that the target is gone.
+#   403 — NOTE(review): accepted below but not explained; confirm intent.
+#   429 — rate limit.
+#   504 — gateway timeout (e.g. the eth.limo gateway for
+#         content-link-upload.itk.eth.limo, Cloudflare for opencollective.org).
+#   522 — Cloudflare "origin unreachable" (transient).
+#   999 — LinkedIn / GitHub bot block.
+accept = [200, 203, 206, 301, 302, 303, 307, 308, 403, 429, 504, 522, 999]
+
+# Connection-reset / DNS / firewall failures that occur from the
+# specific network the maintainer happens to be on (e.g. ipfs.io,
+# monai.io, dicom.nema.org all reset the TCP connection from some
+# residential ISPs).  These are reachability artifacts, not broken
+# links, so accept them rather than surface as errors every run.
+accept_timeouts = true
+
+# Path filters --------------------------------------------------------
+# Limit lychee's recursive scan to documentation-style file extensions;
+# skipping the vendored ThirdParty trees keeps run time tractable and
+# avoids vendored README links that are not maintained by ITK.
+extensions = ["md", "rst", "txt", "html"]
+
+exclude_path = [
+  "Modules/ThirdParty",
+  ".git",
+  ".lycheecache",
+  "build",
+  "build-debug",
+  "build-release",
+  "build-python",
+]
+
+# URL patterns to skip.  GitHub commit URLs return 429 reliably and
+# need no checking; localhost / example.com are placeholders; mailto:
+# and file: are not web links.  NOTE(review): no DOI pattern is listed.
+exclude = [
+  "https?://github\\.com/[^/]+/[^/]+/commit/[0-9a-f]+",
+  "https?://localhost(?::\\d+)?",
+  "https?://127\\.0\\.0\\.1(?::\\d+)?",
+  "https?://example\\.com",
+  "^mailto:",
+  "^file:",
+]
+
+exclude_link_local = true
+exclude_loopback = true
+include_mail = false