diff --git a/AIPromptAddingNewWorkflow.md b/AIPromptAddingNewWorkflow.md new file mode 100644 index 000000000..a9fab3998 --- /dev/null +++ b/AIPromptAddingNewWorkflow.md @@ -0,0 +1,86 @@ +# AI Assistant Context: Adding a New Session Workflow + +This file provides detailed context and requirements for AI code assistants when creating new interactive session workflows. Reference this file along with DeveloperGuide.md when prompted to create a new workflow. + +## Workflow Structure and Requirements + +When creating a new interactive session workflow, the YAML will be named `[deployment]_v4.yaml` (e.g., `general_v4.yaml`, `emed_v4.yaml`) + +## General Requirements + +1. DO NOT add k8s support - create only the standard deployment workflow +2. Use the session_runner subworkflow (github/parallelworks/interactive_session@main with $yaml: workflow/session_runner/v1.4/[deployment].yaml) for deployment +3. Follow the existing v4 session pattern with preprocessing + session_runner jobs +4. Ask the user which deployment target to use (general/emed/hsp/noaa) if not specified + +## Files to Create + +### 1. `[service-name]/controller-v3.sh` + - Bash script that runs on the controller node (has internet access) + - Install/download dependencies needed for the service + - Make it idempotent (check if already installed before installing) + - Use `service_parent_install_dir` variable (default: ${HOME}/pw/software) + - Set appropriate executable permissions where needed + +### 2. `[service-name]/start-template-v3.sh` + - Bash script that starts the web service + - MUST use `service_port` variable provided by session_runner + - Create a `cancel.sh` script with commands to kill the service + - Example structure: + ```bash + #!/bin/bash + # Start service on service_port + echo '#!/bin/bash' > cancel.sh + chmod +x cancel.sh + + # Start your service + /path/to/service --port=${service_port} & + pid=$! + echo "kill ${pid}" >> cancel.sh + + sleep inf + ``` + +### 3. 
`workflow/yamls/[service-name]/[deployment]_v4.yaml` + - Complete workflow YAML with: + - `permissions: ['*']` section + - `sessions.session` with `useTLS: false` and `redirect: true` + - `preprocessing` job that: + - Checks out this repo with sparse_checkout for your service directory + - Creates `inputs.sh` with PW environment variables + form inputs + - Uses `remoteHost: ${{ inputs.cluster.resource.ip }}` + - `session_runner` job that: + - Depends on preprocessing (`needs: [preprocessing]`) + - Uses `github/parallelworks/interactive_session@main` with `$yaml: workflow/session_runner/v1.4/[deployment].yaml` + - Passes session, resource, cluster (slurm/pbs settings), and service configuration + - Service config must include: + - `start_service_script: ${PW_PARENT_JOB_DIR}/[service-name]/start-template-v3.sh` + - `controller_script: ${PW_PARENT_JOB_DIR}/[service-name]/controller-v3.sh` + - `inputs_sh: ${PW_PARENT_JOB_DIR}/inputs.sh` + - `slug: ""` (or appropriate URL path like "lab", "vnc.html") + - `rundir: ${PW_PARENT_JOB_DIR}` + - Input form under `'on'.execute.inputs` with: + - Standard `cluster` group (resource, scheduler, slurm, pbs settings) + - Service-specific `service` group for your configuration options + +### 4. `workflow/yamls/[service-name]/README.md` + - User-facing documentation for the workflow. Structure: + - **Title + one-line description** of what the service provides + - **Features**: bullet list of key capabilities (runtime options, GPU support, scheduler support, etc.) 
+ - **Use Cases**: bullet list of typical scenarios users would launch this for + - **Configuration**: subsection per major input group (e.g., OS, startup options, compute resources) — describe what each does and any valid values + - **Requirements**: any software that must be present on the target node (e.g., module, binary, container runtime) + - **Getting Started**: short numbered steps (select resource → configure → launch → access) + - Keep it factual and concise — no implementation details, no internal paths + +## Reference Implementations +- Look at `webshell/controller-v3.sh` and `webshell/start-template-v3.sh` for the simplest example +- Look at `workflow/yamls/jupyterlab-host/general_v4.yaml` for workflow structure (but don't copy JupyterLab-specific settings) +- Compare deployment variants like `general_v4.yaml` vs `emed_v4.yaml` to understand deployment-specific differences + +## Key Constraints +- Service MUST listen on `service_port` (allocated by session_runner) +- Scripts MUST be idempotent (safe to run multiple times) +- DO NOT create k8s variants (no general_k8s_v4.yaml) +- Follow the exact directory structure: `[service-name]/` for scripts, `workflow/yamls/[service-name]/` for YAML +- Ensure all paths in the YAML use `${PW_PARENT_JOB_DIR}` prefix for scripts and inputs.sh diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..266531e41 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,99 @@ +# CLAUDE.md — Interactive Sessions Repository + +## Project Overview + +This repository contains the **Interactive Sessions** framework for the [Activate platform](https://parallelworks.com). It enables browser-based interactive computing sessions (JupyterLab, VS Code, VNC desktops, web terminals) on remote compute clusters. 
+ +Each session type consists of: +- A **controller script** (`controller-v3.sh`) — runs on the login node, handles installation/setup with internet access +- A **start script** (`start-template-v3.sh`) — runs on the compute node, launches the service +- **Workflow YAML files** (`workflow/yamls/[service-name]/`) — define the UI form and orchestrate execution via the Activate platform + +## Repository Structure + +``` +[service-name]/ +├── controller-v3.sh # Setup/installation (runs on login node) +└── start-template-v3.sh # Service startup (runs on compute/login node) + +workflow/ +├── yamls/[service-name]/ # Per-deployment workflow YAML files +│ ├── general_v4.yaml # Standard SLURM/PBS clusters +│ ├── emed_v4.yaml +│ ├── hsp_v4.yaml +│ └── noaa_v4.yaml +├── readmes/[service-name]/ # User-facing documentation +├── thumbnails/ # UI thumbnails for the Activate platform +├── session_runner/ # Subworkflow for job orchestration +│ ├── general.yaml +│ ├── emed.yaml +│ ├── hsp.yaml +│ └── noaa.yaml +└── k8s/ # Kubernetes-specific configs (separate) + +downloads/ # Binary dependencies (Git LFS) +examples/ # Example configurations +``` + +**Available services**: `jupyterlab-host`, `jupyter-host`, `webshell`, `openvscode`, `vncserver`, `kasmvnc-docker`, `kasmvnc-singularity`, `open-notebook` + +**Deployment variants**: `general`, `emed`, `hsp`, `noaa` (+ kubernetes for some) + +## No Build System + +There is **no build system, test suite, or linter**. This is a configuration/workflow repository. Deployment happens by the Activate platform cloning this repo and executing scripts directly. Validation is manual, on target clusters. + +## Adding a New Session — Required Steps + +See `AIPromptAddingNewWorkflow.md` and `DeveloperGuide.md` for full details. Summary: + +1. Create `[service-name]/controller-v3.sh` (login node, has internet) +2. Create `[service-name]/start-template-v3.sh` (compute node, may not have internet) +3. 
Create `workflow/yamls/[service-name]/general_v4.yaml` (and emed, hsp, noaa variants) +4. Create `workflow/readmes/[service-name]/README.md` +5. Add thumbnails to `workflow/thumbnails/` + +Use these as reference implementations: +- **Simplest**: `webshell/` (minimal, just ttyd terminal) +- **Typical**: `jupyterlab-host/` (conda, nginx proxy, JupyterLab) +- **Complex**: `vncserver/` (containers, multiple desktop environments) + +## Workflow YAML Structure + +Each workflow YAML has three key parts (a top-level `permissions` section plus two jobs): +1. **permissions** — defines which users can run the workflow +2. **preprocessing** — generates `inputs.sh` from form inputs + platform environment variables +3. **session_runner** — the `marketplace/session_runner` subworkflow that orchestrates job submission and SSH tunneling + +Always use `marketplace/session_runner` (current version: `v1.3` or `v1.4`). Do NOT implement job submission logic directly. + +## Critical Rules and Conventions + +### Scripts +- Scripts MUST be **idempotent** — safe to re-run without side effects (check before installing) +- Service MUST listen on `${service_port}` — this port is dynamically allocated by `session_runner` +- Scripts MUST create `cancel.sh` for graceful shutdown +- The start script (compute node) MUST end with `sleep inf` (or equivalent) to keep the job alive +- All variables arrive via sourced `inputs.sh` (do not hardcode paths or values) +- Use `${PW_PARENT_JOB_DIR}` for all job directory references +- Use `service_parent_install_dir` for software install path (default: `${HOME}/pw/software`) + +### Workflow YAMLs +- All paths in YAML MUST use `${PW_PARENT_JOB_DIR}` prefix +- DO NOT add Kubernetes support in standard SLURM/PBS workflows — Kubernetes configs are kept separately under `workflow/k8s/` +- Form inputs are organized into groups: `cluster` (resource selection, scheduler) and `service` (service-specific options) +- Form values are accessible as `inputs.service.*` and `inputs.cluster.*` in YAML + +### Scheduler +- `scheduler: true` → job submitted via sbatch/qsub to a 
compute node +- `scheduler: false` → job runs on the login/controller node +- Support both SLURM and PBS + +## Git and Deployment + +- Push directly to `main` branch (`git@github.com:parallelworks/interactive_session.git`) +- No PRs or branches documented in the developer guide +- Git LFS is configured for large binaries in `downloads/` (juice binary, VNC containers) +- Do not store large binaries outside `downloads/` without Git LFS + + diff --git a/DeveloperGuide.md b/DeveloperGuide.md new file mode 100644 index 000000000..fa6af1b1b --- /dev/null +++ b/DeveloperGuide.md @@ -0,0 +1,275 @@ +# Developer Guide: Creating a New Interactive Session + +A session requires three files: + +| File | Runs on | Purpose | +|------|---------|---------| +| `controller-v3.sh` | Controller (login) node | Install software, download dependencies | +| `start-template-v3.sh` | Controller or compute node | Start the web service | +| Workflow YAML | Platform | Define the UI form, generate `inputs.sh`, call `session_runner` | + +The controller node always has internet access. The compute node may not. + + +## AI Assistant Prompt: Adding a New Session Workflow + +Use this prompt template when instructing an AI code assistant to add a new session: + +``` +Using the DeveloperGuide.md and the AIPromptAddingNewWorkflow.md files, create a new interactive session workflow for [deployment] to [Describe your workflow here]. +``` + +Replace `[deployment]` with your target deployment (general/emed/hsp/noaa) and `[Describe your workflow here]` with your specific service requirements, for example "a hello world nginx service". The AI assistant will use both reference files to understand the architecture and requirements for creating the new workflow. + + +## 1. Create the Controller Script + +File: `my-session/controller-v3.sh` + +This script runs **before** the service starts. Use it to install dependencies that require internet access. All variables from `inputs.sh` are available. 
+ +```bash +#!/usr/bin/env bash +set -o pipefail + +if [ -z ${service_parent_install_dir} ]; then + service_parent_install_dir=${HOME}/pw/software +fi + +# Install your software if not already present +if ! [ -f "${service_parent_install_dir}/my-server" ]; then + echo "Installing my-server..." + mkdir -p ${service_parent_install_dir} + wget https://example.com/my-server.tar.gz -O /tmp/my-server.tar.gz + tar -xzf /tmp/my-server.tar.gz -C ${service_parent_install_dir} +fi +``` + +Keep it idempotent -- check if software exists before installing. + +## 2. Create the Start Service Script + +File: `my-session/start-template-v3.sh` + +This script starts the web service. The `session_runner` subworkflow provides the `service_port` variable -- your service **must** listen on this port. All variables from `inputs.sh` are available. + +```bash +#!/bin/bash +# service_port is provided by the session_runner subworkflow + +if [ -z ${service_parent_install_dir} ]; then + service_parent_install_dir=${HOME}/pw/software +fi + +# Create a cancel.sh script so the platform can stop the service +echo '#!/bin/bash' > cancel.sh +chmod +x cancel.sh + +# Start your service +${service_parent_install_dir}/my-server --port=${service_port} & +pid=$! +echo "kill ${pid}" >> cancel.sh + +sleep inf +``` + +Key requirements: +- **Use `service_port`** as the listening port. It is allocated automatically. +- **Write a `cancel.sh`** script that kills your service process. +- **End with `sleep inf`** to keep the job alive (or run the service in the foreground). + +## 3. Create the Workflow YAML + +File: `workflow/yamls/my-session/general_v4.yaml` + +The YAML has three responsibilities: +1. Define the user input form (under `on.execute.inputs`) +2. Generate `inputs.sh` from the form values (in the `preprocessing` job) +3. 
Call the `session_runner` subworkflow with the paths to your scripts + +### Minimal Example + +```yaml +permissions: + - '*' + +sessions: + session: + useTLS: false + redirect: true + +jobs: + preprocessing: + ssh: + remoteHost: ${{ inputs.cluster.resource.ip }} + steps: + - name: Checkout + uses: parallelworks/checkout + with: + repo: https://github.com/parallelworks/interactive_session.git + branch: main + sparse_checkout: + - my-session + - name: Create Inputs + run: | + set -x + # Capture PW environment variables + env | grep '^PW_' | grep -v 'PW_API_KEY' > inputs.sh + sed -i 's/=\(.*\)/="\1"/' inputs.sh + + # Add your form inputs + cat <<'EOF' >> inputs.sh + basepath=/me/session/${PW_USER}/${{ sessions.session }} + PATH=$HOME/pw:$PATH + service_parent_install_dir="${{ inputs.service.parent_install_dir }}" + EOF + + # Clean up and export + sed -i '/=\s*$\|=undefined\s*$/d' inputs.sh + sed -i '/=""/d' inputs.sh + sed -i 's/^/export /' inputs.sh + + session_runner: + needs: + - preprocessing + ssh: + remoteHost: ${{ inputs.cluster.resource.ip }} + steps: + - uses: github/parallelworks/interactive_session@main + early-cancel: any-job-failed + with: + $yaml: workflow/session_runner/v1.4/general.yaml + session: ${{ sessions.session }} + resource: ${{ inputs.cluster.resource }} + cluster: + scheduler: ${{ inputs.cluster.scheduler }} + slurm: + is_disabled: ${{ inputs.cluster.resource.schedulerType != 'slurm' || inputs.cluster.scheduler == false }} + partition: ${{ inputs.cluster.slurm.partition }} + scheduler_directives: ${{ inputs.cluster.slurm.scheduler_directives }} + time: ${{ inputs.cluster.slurm.time }} + pbs: + is_disabled: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.cluster.scheduler == false }} + scheduler_directives: ${{ inputs.cluster.pbs.scheduler_directives }} + service: + start_service_script: ${PW_PARENT_JOB_DIR}/my-session/start-template-v3.sh + controller_script: ${PW_PARENT_JOB_DIR}/my-session/controller-v3.sh + inputs_sh: 
${PW_PARENT_JOB_DIR}/inputs.sh + slug: "" + rundir: ${PW_PARENT_JOB_DIR} + +'on': + execute: + inputs: + cluster: + type: group + label: Compute Cluster Settings + items: + resource: + type: compute-clusters + label: Service host + include-workspace: false + scheduler: + type: boolean + default: false + label: Schedule Job? + hidden: ${{ inputs.cluster.resource.schedulerType == '' }} + ignore: ${{ .hidden }} + slurm: + type: group + label: SLURM Directives + hidden: ${{ inputs.cluster.resource.schedulerType != 'slurm' || inputs.cluster.scheduler == false }} + items: + is_disabled: + type: boolean + hidden: true + default: ${{ inputs.cluster.resource.schedulerType != 'slurm' || inputs.cluster.scheduler == false }} + partition: + type: slurm-partitions + label: SLURM partition + optional: true + resource: ${{ inputs.cluster.resource }} + time: + label: Walltime + type: string + default: '01:00:00' + scheduler_directives: + type: editor + optional: true + pbs: + type: group + label: PBS Directives + hidden: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.cluster.scheduler == false }} + items: + is_disabled: + type: boolean + hidden: true + default: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.cluster.scheduler == false }} + scheduler_directives: + type: editor + service: + type: group + label: My Session Settings + items: + parent_install_dir: + label: Install Directory + type: string + default: ${HOME}/pw/software +``` + +### Understanding the `session_runner` Interface + +The `session_runner` subworkflow accepts these inputs: + +| Parameter | Description | +|-----------|-------------| +| `session` | Reference to the session object defined in `sessions:` | +| `resource` | The compute cluster resource | +| `cluster.scheduler` | `true` to submit to SLURM/PBS, `false` to run on controller | +| `cluster.slurm` | SLURM settings: `is_disabled`, `partition`, `time`, `scheduler_directives` | +| `cluster.pbs` | PBS settings: `is_disabled`, 
`scheduler_directives` | +| `service.start_service_script` | Path to your start script | +| `service.controller_script` | Path to your controller script | +| `service.inputs_sh` | Path to the generated `inputs.sh` | +| `service.slug` | URL path appended to the session URL (e.g., `lab`, `vnc.html`) | +| `service.rundir` | Working directory for the service | + +### What the `session_runner` Does + +1. **Preprocessing** -- Combines `inputs.sh` + `controller-v3.sh` and runs it on the controller node. Then combines `inputs.sh` + `start-template-v3.sh` into the final start script, injecting port allocation and cleanup traps. +2. **Job submission** -- If `scheduler: true`, submits the start script via `sbatch`/`qsub` to a compute node. If `false`, runs it directly on the controller node. +3. **Wait for start** -- Polls for the `job.started` marker file. +4. **Create session** -- Waits for the service to respond on its port, then registers the session URL with the platform. +5. **Cleanup** -- On workflow cancellation, runs `cancel.sh` to stop the service. + +### Variables Available in Your Scripts + +The `session_runner` injects these into the start script before your code runs: + +| Variable | Description | +|----------|-------------| +| `service_port` | The allocated port. Your service **must** listen on this port. | + +All variables exported in `inputs.sh` are also available in both scripts. + +## 4. Deployment-Specific Variants + +To support multiple deployments, create a separate YAML per platform deployment: + +``` +workflow/yamls/my-session/ +├── general_v4.yaml # Standard SLURM/PBS clusters +├── emed_v4.yaml # EMED clusters +├── noaa_v4.yaml # NOAA clusters +└── hsp_v4.yaml # HSP clusters +``` + +Each YAML uses the corresponding `session_runner` variant (e.g., `github/parallelworks/interactive_session@main` with `$yaml: workflow/session_runner/v1.4/[deployment].yaml` resolves to the appropriate deployment). 
The differences are typically in scheduler directives, partition names, and cluster-specific environment setup in the `inputs.sh` generation. + +## Existing Sessions as Reference + +Look at these for working examples: + +- **Simplest**: `webshell/` -- Starts a single `ttyd` process, minimal controller setup. +- **Typical**: `jupyterlab-host/` -- Conda installation in controller, nginx proxy + JupyterLab in start script. +- **Complex**: `vncserver/` -- Multiple desktop environment options, Singularity/Docker containers. diff --git a/DevelopersGuide.md b/DevelopersGuide.md deleted file mode 100644 index 7ca26db9d..000000000 --- a/DevelopersGuide.md +++ /dev/null @@ -1,58 +0,0 @@ -# Developers Guide -Welcome to the developer's guide for the interactive sessions repository. While this repository offers one method for creating interactive workflows in Parallel Works, please note that it's not the sole approach available. Ensure you review the information provided in these links on [interactive workflows](https://parallelworks.com/docs/workflows/interactive-sessions) and [developing workflows](https://parallelworks.com/docs/workflows/creating-workflows) before proceeding with this guide. - -This repository enables you to establish interactive session workflows for running various services, such as Jupyter or remote desktops, across different types of executors. The supported executors encompass your Parallel Works user workspace as well as remote on-premises or cloud clusters. When running on a cluster, we offer support for execution on the controller node, a SLURM partition, or a PBS queue. The session.sh script is launched on the selected executor to initiate the service.. - -The following files are executed in the specified order when a job is launched: -1. `main.sh`: Executes in the user workspace and contains the top-level logic for loading and preparing inputs, as well as the service.json file. This script also calls the `/session_wrapper.sh` script. -2. 
`/session_wrapper.sh`: Executes in the user workspace, generating the `session.sh` by wrapping the `/start_template.sh` script. Subsequently, it launches the `session.sh` script on the designated executor. The `session.sh` script encapsulates the service code and oversees SSH tunnel management. - -When a job is canceled, the following files are executed in this order: -1. `kill.sh`: Executes in the user workspace and is generated by the `/session_wrapper.sh` script to terminate the session if the job is canceled. -2. `/kill-template.sh`: Executes in the selected executor and is used created by the by the `/session_wrapper.sh` script to create the `kill.sh`. This scripts contains the commands that are specific to terminate the service. - -## Repository Structure -The repository is structured as follows: - -### Execution Mode Directories -The Execution mode directories contain the wrapper script to execute the service in different types of executors: -- `local`: User workspace -- `controller`: Controller node of the cluster -- `partition`: SLURM partition or PBS queue - -### Workflow Directory -The workflow directory contains workflow-specific information and is further categorized into the following subdirectories: -- `readmes`: Every file contains the description of a given workflow. The files are organized by service and each service has its own subdirectory. -- `xmls`: Contains the workflow XML files for each workflow. Remember this file is used by Parallel Works to render the input form and execute the workflow. The files are organized by service and each service has its own subdirectory. See workflow XML section for more information. -- `thumbnails`: Contains the thumbnail icon for each workflow. Typically, all workflows that run the same service use the same thumbnail. - -### Service Directories -The rest of the directories are service directories and contain the specific code to start the service. 
- -## Workflow XML -The workflow XML encompasses both input parameters and execution details for each workflow. For a comprehensive understanding of workflow XML files, please consult the [user guide](https://parallelworks.com/docs/workflows/creating-workflows#form-configuration). In this instance, it adheres to the subsequent structure: - -- Command and cancellation items define the execution and cancellation process for the workflow. -- The `service_name` parameter is set as a fixed value corresponding to the service directory's name. -- The resource section adheres to the resource wrapper tool's format. For further details, refer to [this link](https://github.com/parallelworks/interactive_session/blob/main/utils/input_form_resource_wrapper.py). -- Additional parameters and sections are also included. - -## Adding a New Service -Follow this steps to add a new service using the interactive sessions repository. - -### 1. Create the Service Directory - You can either create a new service directory from scratch or duplicate an existing one that closely matches your requirements. Within the service directory, you'll be working with several scripts. These scripts have access to environment variables with the input parameters defined in the input form as well as any variables introduced by the workflow. Here are the key scripts: - -- start-template.sh: Use this script to write the necessary code to start the service. -- url.sh: This script is responsible for writing the SLUG parameter within the file `service.json`. Visit [this link](https://parallelworks.com/docs/workflows/creating-workflows#servicejson) for more information about the service JSON file. In most cases, this script sets the SLUG parameter to "\"". - -- kill-template.sh: This script serves the purpose of terminating the service initiated by start-template.sh when the workflow is canceled. 
For instance, if you employ `docker run -d` within start-template.sh, this script becomes essential for stopping the Docker container. It is automatically executed when a job is canceled within the PW platform. - -### 2. Create the README and XML Directories -These components serve as the front end of the workflow, encompassing the description and XML files for each workflow that initiates this service. It's important to note that a single service can have various workflows tailored to strike the right balance between customization and generality. For instance, some workflows might require the exposure or hardcoding of specific SLURM directives, such as the account or the number of tasks per node for a particular cluster. Another common scenario involves desktop sessions that launch different software applications like MATLAB or RSTUDIO. This approach alleviates the need for users to complete extensive forms when submitting jobs to a designated cluster. - -When crafting the XML file, we recommend starting with an existing one and making necessary adjustments to the service section to align with your requirements. It's crucial to ensure that the `service_name` parameter in the XML matches the name of the service directory. - -### 3. Add a Thumbnail for the Service -Place a 120x120 PNG image into the thumbnails directory to serve as the workflow's thumbnail. 
- diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..490fb6ddf --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +# MIT License + +Copyright (c) 2026 Parallel Works + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README-v3.md b/README-v3.md deleted file mode 100644 index 948abd2fc..000000000 --- a/README-v3.md +++ /dev/null @@ -1,20 +0,0 @@ -## Interactive Session -Interactive session workflows initiate a server, such as a remote desktop or Jupyter Notebook server, on your chosen resource and establish a connection through an SSH tunnel to link it to the Parallel Works platform. - -You can launch interactive sessions on the controller or login node of a cluster, on a compute node of a SLURM partition or PBS queue, or in your user workspace (user container). - -Here's how to use an interactive session job: - -1. Choose the resource where you want to start the server. -2. Enter or review the input parameters in the provided form. 
You can find detailed descriptions of each parameter by hovering over the question mark icon. -3. Click the "execute" button to launch the job. -4. You will be automatically redirected to the session - -![Run Session](workflow/readmes/screenshots/readme-is-v3-1.png) - -To return to a running session, go to the 'Sessions' page and click the session's link. - -![Connected Desktop](workflow/readmes/screenshots/readme-is-v3-2.png) - - -For more information see [this link](https://parallelworks.com/docs/run/sessions). diff --git a/README.md b/README.md index f58061621..df2769e08 100644 --- a/README.md +++ b/README.md @@ -1,18 +1,63 @@ -## Interactive Session -Interactive session workflows initiate a server, such as a remote desktop or Jupyter Notebook server, on your chosen resource and establish a connection through an SSH tunnel to link it to the Parallel Works platform. +# Interactive Sessions -You can launch interactive sessions on the controller or login node of a cluster, on a compute node of a SLURM partition or PBS queue, or in your user workspace (user container). +This repository contains interactive session workflows for the Activate platform. Each workflow starts a web server on a compute cluster and connects it to the platform UI, giving users browser-based access to tools like JupyterLab, VS Code, VNC desktops, and web shells. -Here's how to use an interactive session job: +## Available Sessions -1. Choose the resource where you want to start the server. -2. Enter or review the input parameters in the provided form. You can find detailed descriptions of each parameter by hovering over the question mark icon. -3. Click the "execute" button to launch the job. -4. Access the server by clicking the "eye" icon in the workflow monitor. Note that the connection is established only after the server is running. This might take some time if the job is in a queue, if compute nodes are starting, or if the job is installing required software. 
For more information on the job's status, check the logs. -5. When you're done, click the red "no" symbol to cancel or stop the job. +| Session | Description | +|---------|-------------| +| `jupyterlab-host` | JupyterLab notebook environment | +| `jupyter-host` | Legacy Jupyter Notebook | +| `openvscode` | VS Code in the browser | +| `vncserver` | Remote desktop (VNC) with various GUI applications | +| `webshell` | Browser-based terminal | -![Input Form](workflow/readmes/screenshots/input-form.png) +## How It Works -![Workflow Monitor](workflow/readmes/screenshots/workflow-monitor.png) +Each session is defined by **two bash scripts** and **one workflow YAML**: -![Connected Desktop](workflow/readmes/screenshots/connected-desktop.png) +1. **`controller-v3.sh`** -- Runs on the controller (login) node. Installs software and downloads dependencies. This node has internet access. +2. **`start-template-v3.sh`** -- Runs on the controller or compute node (depending on user selection). Starts the web service. +3. **Workflow YAML** (`workflow/yamls/[service-name]/[deployment]_v4.yaml`) -- Defines the platform UI form, generates the `inputs.sh` environment file, and calls the `session_runner` subworkflow. + +All sessions use the **`session_runner`** subworkflow (`workflow/session_runner/`) which handles job scheduling, port allocation, SSH tunneling, and session registration with the platform. + +## Deployments + +The `session_runner` subworkflow has deployment-specific variants for different Activate platform installations: + +| Deployment | File | Description | +|------------|------|-------------| +| `general` | `general.yaml` | Standard SLURM/PBS clusters | +| `emed` | `emed.yaml` | Einstein Medical clusters | +| `noaa` | `noaa.yaml` | NOAA clusters | +| `hsp` | `hsp.yaml` | HSP clusters | + +Each session also has per-deployment workflow YAMLs (e.g., `general_v4.yaml`, `emed_v4.yaml`) that configure the UI form and scheduler settings for that deployment. 
+ +> **Note:** Some sessions also support Kubernetes deployments (e.g., `general_k8s_v4.yaml`), but the `session_runner` subworkflow is designed for compute (PBS/SLURM) clusters. Kubernetes sessions use a different orchestration approach with `kubectl` directly. + +## Repository Structure + +``` +. +├── jupyterlab-host/ # JupyterLab scripts +│ ├── controller-v3.sh # Controller node setup +│ └── start-template-v3.sh # Service start script +├── openvscode/ # VS Code scripts +├── vncserver/ # VNC desktop scripts +├── webshell/ # Web shell scripts +├── jupyter-host/ # Legacy Jupyter scripts +├── workflow/ +│ ├── session_runner/ # Session runner subworkflow (per deployment) +│ ├── script_submitter/ # Script submitter subworkflow +│ ├── yamls/ # Workflow YAMLs (per session, per deployment) +│ ├── readmes/ # Per-session documentation shown in the UI +│ └── thumbnails/ # UI thumbnails +├── downloads/ # Binary dependencies (Git LFS) +└── examples/ # Example notebooks +``` + +## Developing a New Session + +See [DeveloperGuide.md](DeveloperGuide.md) for step-by-step instructions on creating your own interactive session. 
diff --git a/airflow-host/controller-v3.sh b/airflow-host/controller-v3.sh deleted file mode 100644 index 35e488efc..000000000 --- a/airflow-host/controller-v3.sh +++ /dev/null @@ -1,89 +0,0 @@ -cd ${resource_jobdir} - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -export AIRFLOW_HOME=${service_airflow_home} -service_conda_install_dir=${service_parent_install_dir}/miniconda3-$(basename ${service_airflow_home}) - - - -displayErrorMessage() { - echo $(date): $1 - exit 1 -} - - - - -if [[ "${service_conda_install}" == "true" ]]; then - - if [[ "${service_install_instructions}" == "install_command" ]]; then - echo "Running install command ${service_install_command}" - eval ${service_install_command} - elif [[ "${service_install_instructions}" == "yaml" ]]; then - echo "Installing custom conda environment" - printf "%b" "${service_yaml}" > conda.yaml - cat conda.yaml - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} conda.yaml - elif [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing latest" - { - source ${service_conda_sh} - } || { - conda_dir=$(echo ${service_conda_sh} | sed "s|etc/profile.d/conda.sh||g" ) - f_install_miniconda ${conda_dir} - source ${service_conda_sh} - } - { - eval "conda activate ${service_conda_env}" - } || { - conda create -n ${service_conda_env} jupyter -y - eval "conda activate ${service_conda_env}" - } - if [ -z $(which jupyter-lab 2> /dev/null) ]; then - conda install -c conda-forge jupyterlab -y - conda install nb_conda_kernels -y - conda install -c anaconda jinja2 -y - pip install ipywidgets - # Check if SLURM is installed - if command -v sinfo &> /dev/null; then - # SLURM extension for Jupyter Lab https://github.com/NERSC/jupyterlab-slurm - pip install jupyterlab_slurm - fi - fi - else - echo "Installing conda environment ${service_install_instructions}.yaml" - f_set_up_conda_from_yaml 
${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} ${service_install_instructions}.yaml - fi - if [ -z ${service_load_env} ]; then - service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" - fi -fi -eval "${service_load_env}" - - -if [ -d "${AIRFLOW_HOME}" ]; then - echo "Airflow home directory ${AIRFLOW_HOME} already exists." - echo "No additional installation is required." - echo "To reinstall Airflow, delete the directory and rerun the job." - exit 0 -fi - -echo; echo "Installing Miniconda under ${service_conda_install_dir}" -mkdir -p ${service_conda_install_dir} -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ${service_conda_install_dir}/miniconda.sh -bash ${service_conda_install_dir}/miniconda.sh -b -u -p ${service_conda_install_dir} -rm ${service_conda_install_dir}/miniconda.sh -source ${service_conda_install_dir}/bin/activate - - -echo; echo "Installing Airflow version ${service_airflow_version}" -AIRFLOW_VERSION=${service_airflow_version} -# Extract the version of Python you have installed. If you're currently using a Python version that is not supported by Airflow, you may want to set this manually. -# See above for supported versions. 
-PYTHON_VERSION="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" -CONSTRAINT_URL="https://raw.githubusercontent.com/apache/airflow/constraints-${AIRFLOW_VERSION}/constraints-${PYTHON_VERSION}.txt" -# For example this would install 2.10.5 with python 3.8: https://raw.githubusercontent.com/apache/airflow/constraints-2.10.5/constraints-3.8.txt -pip install "apache-airflow==${AIRFLOW_VERSION}" --constraint "${CONSTRAINT_URL}" \ No newline at end of file diff --git a/airflow-host/dags/plot_wrf_dag.py b/airflow-host/dags/plot_wrf_dag.py deleted file mode 100644 index 8042aa448..000000000 --- a/airflow-host/dags/plot_wrf_dag.py +++ /dev/null @@ -1,151 +0,0 @@ -from airflow import DAG -from airflow.operators.python import PythonOperator -from airflow.operators.bash import BashOperator -from airflow.sensors.filesystem import FileSensor -from airflow.utils.trigger_rule import TriggerRule - -from datetime import datetime -import time -import os -import subprocess - -AIRFLOW_HOME = os.environ.get('AIRFLOW_HOME') -# Retrieves the installation path of the Conda environment where Airflow is currently running -# Only works if airflow is installed in the base environment! 
-CONDA_PREFIX = os.environ.get('CONDA_PREFIX') -WRF_CONDA_ENVIRONMENT="wrf" -DATA_PARENT_DIRECTORY = os.path.join( - os.path.expanduser("~"), - 'wrf_data' -) - -def get_slurm_status(job_id): - result = subprocess.run(["squeue", "-j", job_id, "-h", "-o", "%T"], capture_output=True, text=True) - status = result.stdout.strip() - - if status == "": - result = subprocess.run(["sacct", "-j", job_id, "--format=state", "-n"], capture_output=True, text=True) - status = result.stdout.split('\n')[0].strip() - - return status - -def print_file_contents(file_path): - if os.path.exists(file_path): - with open(file_path, "r") as file: - print(f"\nContents of {file_path}:") - print(file.read()) - else: - print(f"File {file_path} not found.") - -def monitor_slurm_job(data_dir, **kwargs): - job_id = kwargs['ti'].xcom_pull(task_ids='submit_job', key='return_value') - if not job_id: - raise ValueError("SLURM Job ID not found") - - print(f"Monitoring SLURM Job ID: {job_id}") - slurm_log_file = f"{data_dir}/plot_wrf.{job_id}.out" - print(f"SLURM job log file: {slurm_log_file}") - while True: - status = get_slurm_status(job_id) - - if status in ["", "COMPLETED"]: - print(f"Job {job_id} completed successfully or no longer in queue.") - break - elif status in ["FAILED", "CANCELLED", "TIMEOUT", "NODE_FAIL", "OUT_OF_MEMORY"]: - print_file_contents(slurm_log_file) - raise RuntimeError(f"SLURM job {job_id} failed with status: {status}") - else: - print(f"Job {job_id} is in status: {status}. 
Checking again in 30 seconds...") - time.sleep(30) - - print_file_contents(slurm_log_file) - - -def write_slurm_script(data_dir, **kwargs): - script_path = os.path.join(AIRFLOW_HOME, "dags", "plot_wrf_data.py") - slurm_script_path = os.path.join(data_dir, "slurm_script.sh") - - if not os.path.exists(script_path): - raise FileNotFoundError(f"Script {script_path} not found") - - slurm_script_content = f"""#!/bin/bash -#SBATCH --job-name=plot_wrf.%j -#SBATCH --output={data_dir}/plot_wrf.%j.out -#SBATCH --time=00:30:00 -#SBATCH --ntasks=1 -#SBATCH --chdir={data_dir} -source {CONDA_PREFIX}/etc/profile.d/conda.sh -conda activate {WRF_CONDA_ENVIRONMENT} -python {AIRFLOW_HOME}/dags/plot_wrf_data.py {data_dir}/wrf-output -""" - - with open(slurm_script_path, 'w') as slurm_script: - slurm_script.write(slurm_script_content) - - kwargs['ti'].xcom_push(key='slurm_script_path', value=slurm_script_path) - -with DAG( - dag_id='plot-wrf', - default_args={ - 'owner': 'airflow', - 'start_date': datetime.now(), - 'retries': 1, - }, - schedule_interval='@daily', - catchup=False, - tags=['slurm', 'sbatch', 'python-script', 'wrf', 'plot'] -) as dag: - - # Get current date - today = datetime.now() - - # Format and print the date - formatted_date = today.strftime('%Y-%m-%d') - - data_directory = os.path.join(DATA_PARENT_DIRECTORY, formatted_date) - - verify_or_create_conda_env = BashOperator( - task_id='verify_or_create_conda_env', - bash_command=f""" - source {CONDA_PREFIX}/bin/activate - - if conda env list | grep -q "{WRF_CONDA_ENVIRONMENT}"; then - echo "Conda environment {WRF_CONDA_ENVIRONMENT} exists." - else - echo "Creating Conda environment..." 
- conda create -y -n {WRF_CONDA_ENVIRONMENT} - conda activate {WRF_CONDA_ENVIRONMENT} - conda install -y -c conda-forge xarray matplotlib ffmpeg netCDF4 - fi - """, - ) - - wait_for_directory = FileSensor( - task_id='wait_for_directory', - filepath=data_directory, - poke_interval=30, # Check every 30 seconds - timeout=3600, # Timeout after 1 hour - mode='poke', - ) - - write_script = PythonOperator( - task_id='write_script', - python_callable=write_slurm_script, - op_kwargs={'data_dir': data_directory}, - provide_context=True, - ) - - submit_job = BashOperator( - task_id='submit_job', - bash_command="sbatch --parsable {{ ti.xcom_pull(task_ids='write_script', key='slurm_script_path') }}", - do_xcom_push=True, - ) - - monitor_job = PythonOperator( - task_id='monitor_job', - python_callable=monitor_slurm_job, - op_kwargs={'data_dir': data_directory}, - provide_context=True, - ) - - verify_or_create_conda_env >> wait_for_directory >> write_script >> submit_job >> monitor_job diff --git a/airflow-host/dags/plot_wrf_data.py b/airflow-host/dags/plot_wrf_data.py deleted file mode 100644 index 19648066f..000000000 --- a/airflow-host/dags/plot_wrf_data.py +++ /dev/null @@ -1,86 +0,0 @@ -import xarray as xr -import matplotlib.pyplot as plt -import matplotlib.animation as animation -import glob -import os - -import sys -# User input: path to the wrf-output directory -wrf_output_dir = sys.argv[1] -save_plot_dir = os.path.dirname(wrf_output_dir) - -# Get the first .nc file in the wrf_output_dir -nc_file = glob.glob(os.path.join(wrf_output_dir, "*.nc"))[0] - -ds = xr.open_dataset(nc_file) -# Print dataset information -print(ds) - -# Check for available temperature variables -print([var for var in ds.variables if 'T' in var]) - -# Extract variables for 2-meter temperature plot -T2 = ds["T2"].isel(Time=0) # Select the first time step -lats = ds["XLAT"].isel(Time=0) -lons = ds["XLONG"].isel(Time=0) - -# Plot 2-Meter Temperature and save it as a PNG file -plt.figure(figsize=(10, 6)) 
-plt.contourf(lons, lats, T2, cmap="coolwarm") -plt.colorbar(label="Temperature (K)") -plt.xlabel("Longitude") -plt.ylabel("Latitude") -plt.title("2m Temperature") -plt.savefig(os.path.join(save_plot_dir, "2m_temperature.png")) # Save plot to file -plt.close() - -# List of WRF output files for animation -files = sorted(glob.glob(os.path.join(wrf_output_dir, 'wrfout_d01_*.nc'))) - -# Prepare figure for animation -fig, ax = plt.subplots(figsize=(10, 6)) - -# Open the first WRF file to create the initial colorbar -ds = xr.open_dataset(files[0]) -T2 = ds["T2"].isel(Time=0) # Select first time step -lats = ds["XLAT"].isel(Time=0) -lons = ds["XLONG"].isel(Time=0) - -# Create initial contour plot to set color range -contour = ax.contourf(lons, lats, T2, cmap="coolwarm") - -# Add colorbar once (this prevents the multiple colorbar issue) -cbar = fig.colorbar(contour, ax=ax, label="Temperature (K)") - -def animate(i): - # Open the current WRF file - ds = xr.open_dataset(files[i]) - - # Extract data for temperature, lat, lon, and time - T2 = ds["T2"].isel(Time=0) # Select first time step - lats = ds["XLAT"].isel(Time=0) - lons = ds["XLONG"].isel(Time=0) - - # Clear previous plot (except colorbar) - ax.clear() - - # Recreate the contour plot for the new temperature data - contour = ax.contourf(lons, lats, T2, cmap="coolwarm") - ax.set_xlabel("Longitude") - ax.set_ylabel("Latitude") - ax.set_title(f"2m Temperature - Time: {ds.XTIME.values[0]}") - - # Update the colorbar with the latest data (if needed) - cbar.update_ticks() - - # Return the contour object to update the plot - return contour #.collections - -# Create the animation -ani = animation.FuncAnimation(fig, animate, frames=len(files), interval=1000, blit=False) - -# Save the animation to a file (e.g., an mp4 file) -ani.save(os.path.join(save_plot_dir, "temperature_animation.mp4"), writer="ffmpeg", dpi=300) - -# Close the figure after the animation -plt.close() diff --git a/airflow-host/kill-template.sh 
b/airflow-host/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/airflow-host/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/airflow-host/start-template-v3.sh b/airflow-host/start-template-v3.sh deleted file mode 100755 index 8c94801fd..000000000 --- a/airflow-host/start-template-v3.sh +++ /dev/null @@ -1,54 +0,0 @@ -# Runs via ssh + sbatch -set -x - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -echo "Activating Airflow environment" -export AIRFLOW_HOME=${service_airflow_home} -service_conda_install_dir=${service_parent_install_dir}/miniconda3-$(basename ${service_airflow_home}) -source ${service_conda_install_dir}/bin/activate - -# Initialize cancel script -echo '#!/bin/bash' > cancel.sh -chmod +x cancel.sh - -################# -# START AIRFLOW # -################# -#base_url="https://alvaro-airfloww.activate.pw/" -#sed -i "s|^base_url .*|base_url = ${base_url}|" ${AIRFLOW_HOME}/airflow.cfg -sed -i "s|^enable_proxy_fix .*|enable_proxy_fix = True|" ${AIRFLOW_HOME}/airflow.cfg - -# Do now use "airflow standalone"! It does not allow adding new users -airflow db init -# Run "airflow db reset" in cancel.sh? - -airflow users create \ - --username ${service_username} \ - --firstname ${service_firstname} \ - --lastname ${service_lastname} \ - --role ${service_role} \ - --email ${service_email} \ - --password ${service_password} - - -airflow scheduler 2>&1 | tee scheduler.log & -airflow_scheduler_pid=$! -echo "kill ${airflow_scheduler_pid} # airflow scheduler" >> cancel.sh - -airflow webserver --port ${service_port} 2>&1 | tee webserver.log & -airflow_webserver_pid=$! -echo "kill ${airflow_webserver_pid} # airflow webserver" >> cancel.sh - -# Transfer dags to dags folder -if [ -d "dags" ]; then - dags_folder=$(cat ${AIRFLOW_HOME}/airflow.cfg | grep dags_folder | cut -d'=' -f2) - if ! 
[ -z "${dags_folder}" ]; then - mkdir -p ${dags_folder} - cp -r dags/* ${dags_folder} - fi -fi - -sleep inf diff --git a/airflow-host/transfer_files.sh b/airflow-host/transfer_files.sh deleted file mode 100644 index 977390aa6..000000000 --- a/airflow-host/transfer_files.sh +++ /dev/null @@ -1,3 +0,0 @@ - -rsync -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" -avzq ${pw_job_dir}/${service_name}/dags ${resource_publicIp}:${resource_jobdir} - diff --git a/cesium/start-template-v3.sh b/cesium/start-template-v3.sh deleted file mode 100644 index 135ab4b95..000000000 --- a/cesium/start-template-v3.sh +++ /dev/null @@ -1,27 +0,0 @@ - -check_sudo_access() { - if ! sudo -n true 2>/dev/null; then - echo "$(date) ERROR: Cannot $1 without root access" - exit 1 - fi -} - -# if running on rocky9 update the download url -source /etc/os-release -if [[ "$VERSION_ID" == *"9"* ]]; then - check_sudo_access "nodejs" - sudo dnf install nodejs -y - mkdir cesium-app - cd cesium-app - npm init -y - npm install cesium http-server - echo -e "${service_html}" > index.html - npx http-server -p ${service_port} -elif [[ "$VERSION_ID" == *"8"* ]]; then - echo "$(date) ERROR: This workflow is only supported on Rocky Linux 9" - exit 1 -else - echo "$(date) ERROR: This workflow is only supported on Rocky Linux 9" - exit 1 -fi - diff --git a/docker-service/kill-template.sh b/docker-service/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/docker-service/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/docker-service/url.sh b/docker-service/url.sh deleted file mode 100644 index 29a9f7bae..000000000 --- a/docker-service/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="\"" \ No newline at end of file diff --git a/downloads/juice/juice-gpu-linux.tar.gz b/downloads/juice/juice-gpu-linux.tar.gz deleted file mode 100644 index 4474cf32c..000000000 --- 
a/downloads/juice/juice-gpu-linux.tar.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d57c6b1458511926237c01c30792b2359a228361868d80525c369ad1c1197b5b -size 366513670 diff --git a/downloads/jupyter/nginx-unprivileged.sif b/downloads/jupyter/nginx-unprivileged.sif deleted file mode 100644 index a4fbf46c1..000000000 Binary files a/downloads/jupyter/nginx-unprivileged.sif and /dev/null differ diff --git a/downloads/vnc/noVNC-1.3.0.tgz b/downloads/vnc/noVNC-1.3.0.tgz deleted file mode 100644 index c528d93db..000000000 Binary files a/downloads/vnc/noVNC-1.3.0.tgz and /dev/null differ diff --git a/downloads/vnc/noVNC-1.5.0.tgz b/downloads/vnc/noVNC-1.5.0.tgz deleted file mode 100644 index 7113feb51..000000000 Binary files a/downloads/vnc/noVNC-1.5.0.tgz and /dev/null differ diff --git a/downloads/vnc/vncserver.sif b/downloads/vnc/vncserver.sif deleted file mode 100644 index 396b64f0e..000000000 --- a/downloads/vnc/vncserver.sif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:791c539a5399a2763608ad34f635cebadf26a91bffb7160491ad05a61653133a -size 1276698624 diff --git a/examples/jupyter/GraphCast.ipynb b/examples/jupyter/GraphCast.ipynb deleted file mode 100644 index d2e393a77..000000000 --- a/examples/jupyter/GraphCast.ipynb +++ /dev/null @@ -1,878 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "-jAYlxeKxvAJ" - }, - "source": [ - "# GraphCast\n", - "\n", - "This colab lets you run several versions of GraphCast.\n", - "\n", - "The model weights, normalization statistics, and example inputs are available on [Google Cloud Bucket](https://console.cloud.google.com/storage/browser/dm_graphcast).\n", - "\n", - "A Colab runtime with TPU/GPU acceleration will substantially speed up generating predictions and computing the loss/gradients. If you're using a CPU-only runtime, you can switch using the menu \"Runtime > Change runtime type\"." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IIWlNRupdI2i" - }, - "source": [ - ">

Copyright 2023 DeepMind Technologies Limited.

\n", - ">

Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.

\n", - ">

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yMbbXFl4msJw" - }, - "source": [ - "# Installation and Initialization\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "-W4K9skv9vh-" - }, - "outputs": [], - "source": [ - "# @title Pip install graphcast and dependencies\n", - "\n", - "%pip install --upgrade https://github.com/deepmind/graphcast/archive/master.zip" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "MA5087Vb29z2" - }, - "outputs": [], - "source": [ - "# @title Workaround for cartopy crashes\n", - "\n", - "# Workaround for cartopy crashes due to the shapely installed by default in\n", - "# google colab kernel (https://github.com/anitagraser/movingpandas/issues/81):\n", - "#!pip uninstall -y shapely\n", - "#!pip install shapely --no-binary shapely\n", - "!pip install google-cloud-storage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "Z_j8ej4Pyg1L" - }, - "outputs": [], - "source": [ - "# @title Imports\n", - "\n", - "import dataclasses\n", - "import datetime\n", - "import functools\n", - "import math\n", - "import re\n", - "from typing import Optional\n", - "\n", - "import cartopy.crs as ccrs\n", - "from google.cloud import storage\n", - "from graphcast import autoregressive\n", - "from graphcast import casting\n", - "from graphcast import checkpoint\n", - "from graphcast import data_utils\n", - "from graphcast import graphcast\n", - "from graphcast import normalization\n", - "from graphcast import rollout\n", - "from graphcast import xarray_jax\n", - "from graphcast import xarray_tree\n", - "from IPython.display import HTML\n", - "import ipywidgets as widgets\n", - "import haiku as hk\n", - "import jax\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "from matplotlib import animation\n", - "import numpy as np\n", - "import 
xarray\n", - "\n", - "\n", - "def parse_file_parts(file_name):\n", - " return dict(part.split(\"-\", 1) for part in file_name.split(\"_\"))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "4wagX1TL_f15" - }, - "outputs": [], - "source": [ - "# @title Authenticate with Google Cloud Storage\n", - "\n", - "gcs_client = storage.Client.create_anonymous_client()\n", - "gcs_bucket = gcs_client.get_bucket(\"dm_graphcast\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "5JUymx84dI2m" - }, - "outputs": [], - "source": [ - "# @title Plotting functions\n", - "\n", - "def select(\n", - " data: xarray.Dataset,\n", - " variable: str,\n", - " level: Optional[int] = None,\n", - " max_steps: Optional[int] = None\n", - " ) -> xarray.Dataset:\n", - " data = data[variable]\n", - " if \"batch\" in data.dims:\n", - " data = data.isel(batch=0)\n", - " if max_steps is not None and \"time\" in data.sizes and max_steps < data.sizes[\"time\"]:\n", - " data = data.isel(time=range(0, max_steps))\n", - " if level is not None and \"level\" in data.coords:\n", - " data = data.sel(level=level)\n", - " return data\n", - "\n", - "def scale(\n", - " data: xarray.Dataset,\n", - " center: Optional[float] = None,\n", - " robust: bool = False,\n", - " ) -> tuple[xarray.Dataset, matplotlib.colors.Normalize, str]:\n", - " vmin = np.nanpercentile(data, (2 if robust else 0))\n", - " vmax = np.nanpercentile(data, (98 if robust else 100))\n", - " if center is not None:\n", - " diff = max(vmax - center, center - vmin)\n", - " vmin = center - diff\n", - " vmax = center + diff\n", - " return (data, matplotlib.colors.Normalize(vmin, vmax),\n", - " (\"RdBu_r\" if center is not None else \"viridis\"))\n", - "\n", - "def plot_data(\n", - " data: dict[str, xarray.Dataset],\n", - " fig_title: str,\n", - " plot_size: float = 5,\n", - " robust: bool = False,\n", - " cols: int = 4\n", - " ) -> 
tuple[xarray.Dataset, matplotlib.colors.Normalize, str]:\n", - "\n", - " first_data = next(iter(data.values()))[0]\n", - " max_steps = first_data.sizes.get(\"time\", 1)\n", - " assert all(max_steps == d.sizes.get(\"time\", 1) for d, _, _ in data.values())\n", - "\n", - " cols = min(cols, len(data))\n", - " rows = math.ceil(len(data) / cols)\n", - " figure = plt.figure(figsize=(plot_size * 2 * cols,\n", - " plot_size * rows))\n", - " figure.suptitle(fig_title, fontsize=16)\n", - " figure.subplots_adjust(wspace=0, hspace=0)\n", - " figure.tight_layout()\n", - "\n", - " images = []\n", - " for i, (title, (plot_data, norm, cmap)) in enumerate(data.items()):\n", - " ax = figure.add_subplot(rows, cols, i+1)\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])\n", - " ax.set_title(title)\n", - " im = ax.imshow(\n", - " plot_data.isel(time=0, missing_dims=\"ignore\"), norm=norm,\n", - " origin=\"lower\", cmap=cmap)\n", - " plt.colorbar(\n", - " mappable=im,\n", - " ax=ax,\n", - " orientation=\"vertical\",\n", - " pad=0.02,\n", - " aspect=16,\n", - " shrink=0.75,\n", - " cmap=cmap,\n", - " extend=(\"both\" if robust else \"neither\"))\n", - " images.append(im)\n", - "\n", - " def update(frame):\n", - " if \"time\" in first_data.dims:\n", - " td = datetime.timedelta(microseconds=first_data[\"time\"][frame].item() / 1000)\n", - " figure.suptitle(f\"{fig_title}, {td}\", fontsize=16)\n", - " else:\n", - " figure.suptitle(fig_title, fontsize=16)\n", - " for im, (plot_data, norm, cmap) in zip(images, data.values()):\n", - " im.set_data(plot_data.isel(time=frame, missing_dims=\"ignore\"))\n", - "\n", - " ani = animation.FuncAnimation(\n", - " fig=figure, func=update, frames=max_steps, interval=250)\n", - " plt.close(figure.number)\n", - " return HTML(ani.to_jshtml())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WEtSV8HEkHtf" - }, - "source": [ - "# Load the Data and initialize the model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": 
"G50ORsY_dI2n" - }, - "source": [ - "## Load the model params\n", - "\n", - "Choose one of the two ways of getting model params:\n", - "- **random**: You'll get random predictions, but you can change the model architecture, which may run faster or fit on your device.\n", - "- **checkpoint**: You'll get sensible predictions, but are limited to the model architecture that it was trained with, which may not fit on your device. In particular generating gradients uses a lot of memory, so you'll need at least 25GB of ram (TPUv4 or A100).\n", - "\n", - "Checkpoints vary across a few axes:\n", - "- The mesh size specifies the internal graph representation of the earth. Smaller meshes will run faster but will have worse outputs. The mesh size does not affect the number of parameters of the model.\n", - "- The resolution and number of pressure levels must match the data. Lower resolution and fewer levels will run a bit faster. Data resolution only affects the encoder/decoder.\n", - "- All our models predict precipitation. However, ERA5 includes precipitation, while HRES does not. Our models marked as \"ERA5\" take precipitation as input and expect ERA5 data as input, while model marked \"ERA5-HRES\" do not take precipitation as input and are specifically trained to take HRES-fc0 as input (see the data section below).\n", - "\n", - "We provide three pre-trained models.\n", - "1. `GraphCast`, the high-resolution model used in the GraphCast paper (0.25 degree resolution, 37 pressure levels), trained on ERA5 data from 1979 to 2017,\n", - "\n", - "2. `GraphCast_small`, a smaller, low-resolution version of GraphCast (1 degree resolution, 13 pressure levels, and a smaller mesh), trained on ERA5 data from 1979 to 2015, useful to run a model with lower memory and compute constraints,\n", - "\n", - "3. `GraphCast_operational`, a high-resolution model (0.25 degree resolution, 13 pressure levels) pre-trained on ERA5 data from 1979 to 2017 and fine-tuned on HRES data from 2016 to 2021. 
This model can be initialized from HRES data (does not require precipitation inputs).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "KGaJ6V9MdI2n" - }, - "outputs": [], - "source": [ - "# @title Choose the model\n", - "\n", - "params_file_options = [\n", - " name for blob in gcs_bucket.list_blobs(prefix=\"params/\")\n", - " if (name := blob.name.removeprefix(\"params/\"))] # Drop empty string.\n", - "\n", - "random_mesh_size = widgets.IntSlider(\n", - " value=4, min=4, max=6, description=\"Mesh size:\")\n", - "random_gnn_msg_steps = widgets.IntSlider(\n", - " value=4, min=1, max=32, description=\"GNN message steps:\")\n", - "random_latent_size = widgets.Dropdown(\n", - " options=[int(2**i) for i in range(4, 10)], value=32,description=\"Latent size:\")\n", - "random_levels = widgets.Dropdown(\n", - " options=[13, 37], value=13, description=\"Pressure levels:\")\n", - "\n", - "\n", - "params_file = widgets.Dropdown(\n", - " options=params_file_options,\n", - " description=\"Params file:\",\n", - " layout={\"width\": \"max-content\"})\n", - "\n", - "source_tab = widgets.Tab([\n", - " widgets.VBox([\n", - " random_mesh_size,\n", - " random_gnn_msg_steps,\n", - " random_latent_size,\n", - " random_levels,\n", - " ]),\n", - " params_file,\n", - "])\n", - "source_tab.set_title(0, \"Random\")\n", - "source_tab.set_title(1, \"Checkpoint\")\n", - "widgets.VBox([\n", - " source_tab,\n", - " widgets.Label(value=\"Run the next cell to load the model. 
Rerunning this cell clears your selection.\")\n", - "])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "lYQgrPgPdI2n" - }, - "outputs": [], - "source": [ - "# @title Load the model\n", - "\n", - "source = source_tab.get_title(source_tab.selected_index)\n", - "\n", - "if source == \"Random\":\n", - " params = None # Filled in below\n", - " state = {}\n", - " model_config = graphcast.ModelConfig(\n", - " resolution=0,\n", - " mesh_size=random_mesh_size.value,\n", - " latent_size=random_latent_size.value,\n", - " gnn_msg_steps=random_gnn_msg_steps.value,\n", - " hidden_layers=1,\n", - " radius_query_fraction_edge_length=0.6)\n", - " task_config = graphcast.TaskConfig(\n", - " input_variables=graphcast.TASK.input_variables,\n", - " target_variables=graphcast.TASK.target_variables,\n", - " forcing_variables=graphcast.TASK.forcing_variables,\n", - " pressure_levels=graphcast.PRESSURE_LEVELS[random_levels.value],\n", - " input_duration=graphcast.TASK.input_duration,\n", - " )\n", - "else:\n", - " assert source == \"Checkpoint\"\n", - " with gcs_bucket.blob(f\"params/{params_file.value}\").open(\"rb\") as f:\n", - " ckpt = checkpoint.load(f, graphcast.CheckPoint)\n", - " params = ckpt.params\n", - " state = {}\n", - "\n", - " model_config = ckpt.model_config\n", - " task_config = ckpt.task_config\n", - " print(\"Model description:\\n\", ckpt.description, \"\\n\")\n", - " print(\"Model license:\\n\", ckpt.license, \"\\n\")\n", - "\n", - "model_config" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rQWk0RRuCjDN" - }, - "source": [ - "## Load the example data\n", - "\n", - "Several example datasets are available, varying across a few axes:\n", - "- **Source**: fake, era5, hres\n", - "- **Resolution**: 0.25deg, 1deg, 6deg\n", - "- **Levels**: 13, 37\n", - "- **Steps**: How many timesteps are included\n", - "\n", - "Not all combinations are available.\n", - "- Higher resolution is only available 
for fewer steps due to the memory requirements of loading them.\n", - "- HRES is only available in 0.25 deg, with 13 pressure levels.\n", - "\n", - "The data resolution must match the model that is loaded.\n", - "\n", - "Some transformations were done from the base datasets:\n", - "- We accumulated precipitation over 6 hours instead of the default 1 hour.\n", - "- For HRES data, each time step corresponds to the HRES forecast at leadtime 0, essentially providing an \"initialisation\" from HRES. See HRES-fc0 in the GraphCast paper for further description. Note that a 6h accumulation of precipitation is not available from HRES, so our model taking HRES inputs does not depend on precipitation. However, because our models predict precipitation, we include the ERA5 precipitation in the example data so it can serve as an illustrative example of ground truth.\n", - "- We include ERA5 `toa_incident_solar_radiation` in the data. Our model uses the radiation at -6h, 0h and +6h as a forcing term for each 1-step prediction. If the radiation is missing from the data (e.g. in an operational setting), it will be computed using a custom implementation that produces values similar to those in ERA5." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "-DJzie5me2-H" - }, - "outputs": [], - "source": [ - "# @title Get and filter the list of available example datasets\n", - "\n", - "dataset_file_options = [\n", - " name for blob in gcs_bucket.list_blobs(prefix=\"dataset/\")\n", - " if (name := blob.name.removeprefix(\"dataset/\"))] # Drop empty string.\n", - "\n", - "def data_valid_for_model(\n", - " file_name: str, model_config: graphcast.ModelConfig, task_config: graphcast.TaskConfig):\n", - " file_parts = parse_file_parts(file_name.removesuffix(\".nc\"))\n", - " return (\n", - " model_config.resolution in (0, float(file_parts[\"res\"])) and\n", - " len(task_config.pressure_levels) == int(file_parts[\"levels\"]) and\n", - " (\n", - " (\"total_precipitation_6hr\" in task_config.input_variables and\n", - " file_parts[\"source\"] in (\"era5\", \"fake\")) or\n", - " (\"total_precipitation_6hr\" not in task_config.input_variables and\n", - " file_parts[\"source\"] in (\"hres\", \"fake\"))\n", - " )\n", - " )\n", - "\n", - "\n", - "dataset_file = widgets.Dropdown(\n", - " options=[\n", - " (\", \".join([f\"{k}: {v}\" for k, v in parse_file_parts(option.removesuffix(\".nc\")).items()]), option)\n", - " for option in dataset_file_options\n", - " if data_valid_for_model(option, model_config, task_config)\n", - " ],\n", - " description=\"Dataset file:\",\n", - " layout={\"width\": \"max-content\"})\n", - "widgets.VBox([\n", - " dataset_file,\n", - " widgets.Label(value=\"Run the next cell to load the dataset. 
Rerunning this cell clears your selection and refilters the datasets that match your model.\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "Yz-ekISoJxeZ" - }, - "outputs": [], - "source": [ - "# @title Load weather data\n", - "\n", - "if not data_valid_for_model(dataset_file.value, model_config, task_config):\n", - " raise ValueError(\n", - " \"Invalid dataset file, rerun the cell above and choose a valid dataset file.\")\n", - "\n", - "with gcs_bucket.blob(f\"dataset/{dataset_file.value}\").open(\"rb\") as f:\n", - " example_batch = xarray.load_dataset(f).compute()\n", - "\n", - "assert example_batch.dims[\"time\"] >= 3 # 2 for input, >=1 for targets\n", - "\n", - "print(\", \".join([f\"{k}: {v}\" for k, v in parse_file_parts(dataset_file.value.removesuffix(\".nc\")).items()]))\n", - "\n", - "example_batch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "lXjFvdE6qStr" - }, - "outputs": [], - "source": [ - "# @title Choose data to plot\n", - "\n", - "plot_example_variable = widgets.Dropdown(\n", - " options=example_batch.data_vars.keys(),\n", - " value=\"2m_temperature\",\n", - " description=\"Variable\")\n", - "plot_example_level = widgets.Dropdown(\n", - " options=example_batch.coords[\"level\"].values,\n", - " value=500,\n", - " description=\"Level\")\n", - "plot_example_robust = widgets.Checkbox(value=True, description=\"Robust\")\n", - "plot_example_max_steps = widgets.IntSlider(\n", - " min=1, max=example_batch.dims[\"time\"], value=example_batch.dims[\"time\"],\n", - " description=\"Max steps\")\n", - "\n", - "widgets.VBox([\n", - " plot_example_variable,\n", - " plot_example_level,\n", - " plot_example_robust,\n", - " plot_example_max_steps,\n", - " widgets.Label(value=\"Run the next cell to plot the data. 
Rerunning this cell clears your selection.\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "kIK-EgMdkHtk" - }, - "outputs": [], - "source": [ - "# @title Plot example data\n", - "\n", - "plot_size = 7\n", - "\n", - "data = {\n", - " \" \": scale(select(example_batch, plot_example_variable.value, plot_example_level.value, plot_example_max_steps.value),\n", - " robust=plot_example_robust.value),\n", - "}\n", - "fig_title = plot_example_variable.value\n", - "if \"level\" in example_batch[plot_example_variable.value].coords:\n", - " fig_title += f\" at {plot_example_level.value} hPa\"\n", - "\n", - "plot_data(data, fig_title, plot_size, plot_example_robust.value)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "tPVy1GHokHtk" - }, - "outputs": [], - "source": [ - "# @title Choose training and eval data to extract\n", - "train_steps = widgets.IntSlider(\n", - " value=1, min=1, max=example_batch.sizes[\"time\"]-2, description=\"Train steps\")\n", - "eval_steps = widgets.IntSlider(\n", - " value=example_batch.sizes[\"time\"]-2, min=1, max=example_batch.sizes[\"time\"]-2, description=\"Eval steps\")\n", - "\n", - "widgets.VBox([\n", - " train_steps,\n", - " eval_steps,\n", - " widgets.Label(value=\"Run the next cell to extract the data. 
Rerunning this cell clears your selection.\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "Ogp4vTBvsgSt" - }, - "outputs": [], - "source": [ - "# @title Extract training and eval data\n", - "\n", - "train_inputs, train_targets, train_forcings = data_utils.extract_inputs_targets_forcings(\n", - " example_batch, target_lead_times=slice(\"6h\", f\"{train_steps.value*6}h\"),\n", - " **dataclasses.asdict(task_config))\n", - "\n", - "eval_inputs, eval_targets, eval_forcings = data_utils.extract_inputs_targets_forcings(\n", - " example_batch, target_lead_times=slice(\"6h\", f\"{eval_steps.value*6}h\"),\n", - " **dataclasses.asdict(task_config))\n", - "\n", - "print(\"All Examples: \", example_batch.dims.mapping)\n", - "print(\"Train Inputs: \", train_inputs.dims.mapping)\n", - "print(\"Train Targets: \", train_targets.dims.mapping)\n", - "print(\"Train Forcings:\", train_forcings.dims.mapping)\n", - "print(\"Eval Inputs: \", eval_inputs.dims.mapping)\n", - "print(\"Eval Targets: \", eval_targets.dims.mapping)\n", - "print(\"Eval Forcings: \", eval_forcings.dims.mapping)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "Q--ZRhpTdI2o" - }, - "outputs": [], - "source": [ - "# @title Load normalization data\n", - "\n", - "with gcs_bucket.blob(\"stats/diffs_stddev_by_level.nc\").open(\"rb\") as f:\n", - " diffs_stddev_by_level = xarray.load_dataset(f).compute()\n", - "with gcs_bucket.blob(\"stats/mean_by_level.nc\").open(\"rb\") as f:\n", - " mean_by_level = xarray.load_dataset(f).compute()\n", - "with gcs_bucket.blob(\"stats/stddev_by_level.nc\").open(\"rb\") as f:\n", - " stddev_by_level = xarray.load_dataset(f).compute()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ke2zQyuT_sMA" - }, - "outputs": [], - "source": [ - "# @title Build jitted functions, and possibly initialize 
random weights\n", - "\n", - "def construct_wrapped_graphcast(\n", - " model_config: graphcast.ModelConfig,\n", - " task_config: graphcast.TaskConfig):\n", - " \"\"\"Constructs and wraps the GraphCast Predictor.\"\"\"\n", - " # Deeper one-step predictor.\n", - " predictor = graphcast.GraphCast(model_config, task_config)\n", - "\n", - " # Modify inputs/outputs to `graphcast.GraphCast` to handle conversion to\n", - " # from/to float32 to/from BFloat16.\n", - " predictor = casting.Bfloat16Cast(predictor)\n", - "\n", - " # Modify inputs/outputs to `casting.Bfloat16Cast` so the casting to/from\n", - " # BFloat16 happens after applying normalization to the inputs/targets.\n", - " predictor = normalization.InputsAndResiduals(\n", - " predictor,\n", - " diffs_stddev_by_level=diffs_stddev_by_level,\n", - " mean_by_level=mean_by_level,\n", - " stddev_by_level=stddev_by_level)\n", - "\n", - " # Wraps everything so the one-step model can produce trajectories.\n", - " predictor = autoregressive.Predictor(predictor, gradient_checkpointing=True)\n", - " return predictor\n", - "\n", - "\n", - "@hk.transform_with_state\n", - "def run_forward(model_config, task_config, inputs, targets_template, forcings):\n", - " predictor = construct_wrapped_graphcast(model_config, task_config)\n", - " return predictor(inputs, targets_template=targets_template, forcings=forcings)\n", - "\n", - "\n", - "@hk.transform_with_state\n", - "def loss_fn(model_config, task_config, inputs, targets, forcings):\n", - " predictor = construct_wrapped_graphcast(model_config, task_config)\n", - " loss, diagnostics = predictor.loss(inputs, targets, forcings)\n", - " return xarray_tree.map_structure(\n", - " lambda x: xarray_jax.unwrap_data(x.mean(), require_jax=True),\n", - " (loss, diagnostics))\n", - "\n", - "def grads_fn(params, state, model_config, task_config, inputs, targets, forcings):\n", - " def _aux(params, state, i, t, f):\n", - " (loss, diagnostics), next_state = loss_fn.apply(\n", - " params, state, 
jax.random.PRNGKey(0), model_config, task_config,\n", - " i, t, f)\n", - " return loss, (diagnostics, next_state)\n", - " (loss, (diagnostics, next_state)), grads = jax.value_and_grad(\n", - " _aux, has_aux=True)(params, state, inputs, targets, forcings)\n", - " return loss, diagnostics, next_state, grads\n", - "\n", - "# Jax doesn't seem to like passing configs as args through the jit. Passing it\n", - "# in via partial (instead of capture by closure) forces jax to invalidate the\n", - "# jit cache if you change configs.\n", - "def with_configs(fn):\n", - " return functools.partial(\n", - " fn, model_config=model_config, task_config=task_config)\n", - "\n", - "# Always pass params and state, so the usage below are simpler\n", - "def with_params(fn):\n", - " return functools.partial(fn, params=params, state=state)\n", - "\n", - "# Our models aren't stateful, so the state is always empty, so just return the\n", - "# predictions. This is requiredy by our rollout code, and generally simpler.\n", - "def drop_state(fn):\n", - " return lambda **kw: fn(**kw)[0]\n", - "\n", - "init_jitted = jax.jit(with_configs(run_forward.init))\n", - "\n", - "if params is None:\n", - " params, state = init_jitted(\n", - " rng=jax.random.PRNGKey(0),\n", - " inputs=train_inputs,\n", - " targets_template=train_targets,\n", - " forcings=train_forcings)\n", - "\n", - "loss_fn_jitted = drop_state(with_params(jax.jit(with_configs(loss_fn.apply))))\n", - "grads_fn_jitted = with_params(jax.jit(with_configs(grads_fn)))\n", - "run_forward_jitted = drop_state(with_params(jax.jit(with_configs(\n", - " run_forward.apply))))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VBNutliiCyqA" - }, - "source": [ - "# Run the model\n", - "\n", - "Note that the cell below may take a while (possibly minutes) to run the first time you execute them, because this will include the time it takes for the code to compile. 
The second time running will be significantly faster.\n", - "\n", - "This use the python loop to iterate over prediction steps, where the 1-step prediction is jitted. This has lower memory requirements than the training steps below, and should enable making prediction with the small GraphCast model on 1 deg resolution data for 4 steps." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "7obeY9i9oTtD" - }, - "outputs": [], - "source": [ - "# @title Autoregressive rollout (loop in python)\n", - "\n", - "assert model_config.resolution in (0, 360. / eval_inputs.sizes[\"lon\"]), (\n", - " \"Model resolution doesn't match the data resolution. You likely want to \"\n", - " \"re-filter the dataset list, and download the correct data.\")\n", - "\n", - "print(\"Inputs: \", eval_inputs.dims.mapping)\n", - "print(\"Targets: \", eval_targets.dims.mapping)\n", - "print(\"Forcings:\", eval_forcings.dims.mapping)\n", - "\n", - "predictions = rollout.chunked_prediction(\n", - " run_forward_jitted,\n", - " rng=jax.random.PRNGKey(0),\n", - " inputs=eval_inputs,\n", - " targets_template=eval_targets * np.nan,\n", - " forcings=eval_forcings)\n", - "predictions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ft298eZskHtn" - }, - "outputs": [], - "source": [ - "# @title Choose predictions to plot\n", - "\n", - "plot_pred_variable = widgets.Dropdown(\n", - " options=predictions.data_vars.keys(),\n", - " value=\"2m_temperature\",\n", - " description=\"Variable\")\n", - "plot_pred_level = widgets.Dropdown(\n", - " options=predictions.coords[\"level\"].values,\n", - " value=500,\n", - " description=\"Level\")\n", - "plot_pred_robust = widgets.Checkbox(value=True, description=\"Robust\")\n", - "plot_pred_max_steps = widgets.IntSlider(\n", - " min=1,\n", - " max=predictions.dims[\"time\"],\n", - " value=predictions.dims[\"time\"],\n", - " description=\"Max steps\")\n", - 
"\n", - "widgets.VBox([\n", - " plot_pred_variable,\n", - " plot_pred_level,\n", - " plot_pred_robust,\n", - " plot_pred_max_steps,\n", - " widgets.Label(value=\"Run the next cell to plot the predictions. Rerunning this cell clears your selection.\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "_tTdx6fmmj1I" - }, - "outputs": [], - "source": [ - "# @title Plot predictions\n", - "\n", - "plot_size = 5\n", - "plot_max_steps = min(predictions.dims[\"time\"], plot_pred_max_steps.value)\n", - "\n", - "data = {\n", - " \"Targets\": scale(select(eval_targets, plot_pred_variable.value, plot_pred_level.value, plot_max_steps), robust=plot_pred_robust.value),\n", - " \"Predictions\": scale(select(predictions, plot_pred_variable.value, plot_pred_level.value, plot_max_steps), robust=plot_pred_robust.value),\n", - " \"Diff\": scale((select(eval_targets, plot_pred_variable.value, plot_pred_level.value, plot_max_steps) -\n", - " select(predictions, plot_pred_variable.value, plot_pred_level.value, plot_max_steps)),\n", - " robust=plot_pred_robust.value, center=0),\n", - "}\n", - "fig_title = plot_pred_variable.value\n", - "if \"level\" in predictions[plot_pred_variable.value].coords:\n", - " fig_title += f\" at {plot_pred_level.value} hPa\"\n", - "\n", - "plot_data(data, fig_title, plot_size, plot_pred_robust.value)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Pa78b64bLYe1" - }, - "source": [ - "# Train the model\n", - "\n", - "The following operations require a large amount of memory and, depending on the accelerator being used, will only fit the very small \"random\" model on low resolution data. It uses the number of training steps selected above.\n", - "\n", - "The first time executing the cell takes more time, as it include the time to jit the function." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "Nv-u3dAP7IRZ" - }, - "outputs": [], - "source": [ - "# @title Loss computation (autoregressive loss over multiple steps)\n", - "loss, diagnostics = loss_fn_jitted(\n", - " rng=jax.random.PRNGKey(0),\n", - " inputs=train_inputs,\n", - " targets=train_targets,\n", - " forcings=train_forcings)\n", - "print(\"Loss:\", float(loss))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "mBNFq1IGZNLz" - }, - "outputs": [], - "source": [ - "# @title Gradient computation (backprop through time)\n", - "loss, diagnostics, next_state, grads = grads_fn_jitted(\n", - " inputs=train_inputs,\n", - " targets=train_targets,\n", - " forcings=train_forcings)\n", - "mean_grad = np.mean(jax.tree_util.tree_flatten(jax.tree_util.tree_map(lambda x: np.abs(x).mean(), grads))[0])\n", - "print(f\"Loss: {loss:.4f}, Mean |grad|: {mean_grad:.6f}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "J4FJFKWD8Loz" - }, - "outputs": [], - "source": [ - "# @title Autoregressive rollout (keep the loop in JAX)\n", - "print(\"Inputs: \", train_inputs.dims.mapping)\n", - "print(\"Targets: \", train_targets.dims.mapping)\n", - "print(\"Forcings:\", train_forcings.dims.mapping)\n", - "\n", - "predictions = run_forward_jitted(\n", - " rng=jax.random.PRNGKey(0),\n", - " inputs=train_inputs,\n", - " targets_template=train_targets * np.nan,\n", - " forcings=train_forcings)\n", - "predictions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "colab": { - "name": "GraphCast", - "private_outputs": true, - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3 
(ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.2" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/examples/jupyter/TorchGeo.ipynb b/examples/jupyter/TorchGeo.ipynb deleted file mode 100644 index 5f3225172..000000000 --- a/examples/jupyter/TorchGeo.ipynb +++ /dev/null @@ -1,443 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "6f1f8ef7-33c9-4530-a964-1dabbe709377", - "metadata": { - "tags": [] - }, - "source": [ - "# TorchGeo: An Introduction to Object Detection Example\n", - "[https://medium.com/@byeonghyeokyu/torchgeo-an-introduction-to-object-detection-example-b0fd43e89649](https://medium.com/@byeonghyeokyu/torchgeo-an-introduction-to-object-detection-example-b0fd43e89649)\n", - "- https://doi.org/10.1016/j.isprsjprs.2014.10.002\n", - "- https://doi.org/10.1109/IGARSS.2019.8898573\n", - "- https://doi.org/10.3390/rs12060989" - ] - }, - { - "cell_type": "markdown", - "id": "7643618c-e4fb-4ff0-bd70-ec5f7d212019", - "metadata": {}, - "source": [ - "## Installing Dependencies" - ] - }, - { - "cell_type": "markdown", - "id": "8b518779-456f-4943-b1f1-843d78442a4a", - "metadata": {}, - "source": [ - "Install rar and unrar" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64abc62c-b107-4109-af2f-aef58e80e7f8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!wget https://www.rarlab.com/rar/rarlinux-x64-5.5.0.tar.gz\n", - "!tar xzvf rarlinux-x64-5.5.0.tar.gz \n", - "!sudo cp rar/rar rar/unrar /usr/local/bin/" - ] - }, - { - "cell_type": "markdown", - "id": "01e3f468-971b-40bc-a8b4-885031cc3e08", - "metadata": {}, - "source": [ - "Install Python Dependencies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"d34243d7-4666-484f-9235-e4c15bd36472", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install torchgeo[all]\n", - "!pip install gdown\n", - "!pip install -q -U pytorch-lightning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79a0303b-6e16-4206-8255-c9b52c61d7d6", - "metadata": {}, - "outputs": [], - "source": [ - "import torchgeo\n", - "from torchgeo.datasets import VHR10\n", - "from torchgeo.trainers import ObjectDetectionTask\n", - "\n", - "import torch\n", - "from torch.utils.data import DataLoader\n", - "import lightning.pytorch as pl\n", - "\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d93f8c94-1207-4893-94af-e783eb36f8e5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "torchgeo.__version__\n", - "# Need version 0.5.2" - ] - }, - { - "cell_type": "markdown", - "id": "a73a10d1-3bdd-4a77-bc27-3468afa38000", - "metadata": {}, - "source": [ - "## Downloading the VHR-10 Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee97aa21-f800-4d34-a6fc-40efa4f7869a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import os, gdown\n", - "\n", - "os.makedirs('data/VHR10/', exist_ok=True)\n", - "\n", - "url = 'https://drive.google.com/uc?id=1--foZ3dV5OCsqXQXT84UeKtrAqc5CkAE'\n", - "output_path = 'data/VHR10/NWPU VHR-10 dataset.rar'\n", - "gdown.download(url, output_path, quiet=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed147886-523b-448e-b568-95544bc7c154", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def preprocess(sample):\n", - " sample[\"image\"] = sample[\"image\"].float() / 255.0\n", - " return sample\n", - "\n", - "ds = VHR10(\n", - " root=\"data/VHR10/\",\n", - " split=\"positive\",\n", - " transforms=preprocess,\n", - " download=True,\n", - " checksum=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": 
"bbfd65bc-697c-4818-977b-74499af1a62e", - "metadata": {}, - "source": [ - "## Exploring the VHR-10 Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7f934897-27e0-41cd-b2cb-829d0a7a8419", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(f\"VHR-10 dataset: {len(ds)}\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9f5b1844-5c96-49ac-a04c-f84946b2dc50", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ds[0][\"image\"].shape\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54c226b8-3788-4ffb-a3a5-f0f5ab55dbe1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "torch.Size([3, 808, 958])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ca18037-925b-49f8-b8a9-34fa23ab8e2b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "image = ds[5][\"image\"].permute(1, 2, 0)\n", - "plt.imshow(image)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aca0c0b9-efe6-4add-9d40-a252584b619c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ds.plot(ds[5])\n", - "plt.savefig('ground_truth.png', bbox_inches='tight')\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "55612e0b-2c64-4e1f-9c1c-f77fff7b8d5c", - "metadata": {}, - "source": [ - "## Model Training" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fab7d1e-dce5-41b7-a75b-d7cbe177ec53", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def collate_fn(batch):\n", - " new_batch = {\n", - " \"image\": [item[\"image\"] for item in batch], # Images\n", - " \"boxes\": [item[\"boxes\"] for item in batch], # Bounding boxes\n", - " \"labels\": [item[\"labels\"] for item in batch], # Labels\n", - " \"masks\": [item[\"masks\"] for item in batch], # Masks\n", - " }\n", - " return new_batch # Return the new batch\n", - "\n", - "# Data 
Loader\n", - "\n", - "dl = DataLoader(\n", - " ds, # Dataset\n", - " batch_size=32, # Number of data to load at one time\n", - " num_workers=2, # Number of processes to use for data loading\n", - " shuffle=True, # Whether to shuffle the dataset before loading\n", - " collate_fn=collate_fn, # collate_fn function for batch processing\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "535d50ab-dfc1-4742-8b20-eb1445d5ac94", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "class VariableSizeInputObjectDetectionTask(ObjectDetectionTask):\n", - " # Define the training step\n", - " def training_step(self, batch, batch_idx, dataloader_idx=0):\n", - " x = batch[\"image\"] # Image\n", - " batch_size = len(x) # Set batch size (number of images)\n", - " y = [\n", - " {\"boxes\": batch[\"boxes\"][i], \"labels\": batch[\"labels\"][i]}\n", - " for i in range(batch_size)\n", - " ] # Extract bounding box and label information for each image\n", - " loss_dict = self(x, y) # Loss\n", - " train_loss: Tensor = sum(loss_dict.values()) # Training loss (sum of loss values)\n", - " self.log_dict(loss_dict) # Record loss values\n", - " return train_loss # Return training loss\n", - "\n", - "task = VariableSizeInputObjectDetectionTask(\n", - " model=\"faster-rcnn\", # Faster R-CNN model\n", - " backbone=\"resnet18\", # ResNet18 neural network architecture\n", - " weights=True, # Use pretrained weights\n", - " in_channels=3, # Number of channels in the input image (RGB images)\n", - " num_classes=11, # Number of classes to classify (10 + background)\n", - " trainable_layers=3, # Number of trainable layers\n", - " lr=1e-3, # Learning rate\n", - " patience=10, # Set the number of patience iterations for early stopping\n", - " freeze_backbone=False, # Whether to train with the backbone network weights unfrozen\n", - ")\n", - "task.monitor = \"loss_classifier\" # Set the metric to monitor (here, the classifier's loss)" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "ce136d68-0257-4337-8486-9bf52e11e818", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(\n", - " default_root_dir=\"logs/\", # Set the default directory\n", - " accelerator=\"gpu\", # Set the type of hardware accelerator for training (using GPU)\n", - " devices=[0], # List of device IDs to use ([0] means the first GPU)\n", - " min_epochs=6, # Set the minimum number of training epochs\n", - " max_epochs=100, # Set the maximum number of training epochs\n", - " log_every_n_steps=20, # Set how often to log after a number of steps\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b0e0a2e-af82-4ad7-95f3-e83c52b3e027", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%time\n", - "# Model training\n", - "trainer.fit(task, train_dataloaders=dl)" - ] - }, - { - "cell_type": "markdown", - "id": "43bc214b-8e67-49ef-bb8b-20d20d941db1", - "metadata": {}, - "source": [ - "## Model Inference Example" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "737df6b6-7926-4b7a-b769-26b4beb8792b", - "metadata": {}, - "outputs": [], - "source": [ - "batch = next(iter(dl))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74914145-55a5-43e0-8bb5-ace7ae407987", - "metadata": {}, - "outputs": [], - "source": [ - "model = task.model\n", - "model.eval()\n", - "\n", - "with torch.no_grad():\n", - " out = model(batch[\"image\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "202b6df5-9c08-4744-bbcd-e0491914a710", - "metadata": {}, - "outputs": [], - "source": [ - "def create_sample(batch, out, batch_idx):\n", - " return {\n", - " \"image\": batch[\"image\"][batch_idx], # Image\n", - " \"boxes\": batch[\"boxes\"][batch_idx], # Actual bounding boxes\n", - " \"labels\": batch[\"labels\"][batch_idx], # Actual labels\n", - " \"masks\": batch[\"masks\"][batch_idx], # Actual masks\n", - " 
\"prediction_labels\": out[batch_idx][\"labels\"], # Labels predicted by the model\n", - " \"prediction_boxes\": out[batch_idx][\"boxes\"], # Bounding boxes predicted by the model\n", - " \"prediction_scores\": out[batch_idx][\"scores\"], # Confidence scores for each prediction\n", - " }\n", - "\n", - "batch_idx = 0\n", - "sample = create_sample(batch, out, batch_idx)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31842d2d-325b-49ac-8a72-968e9226d023", - "metadata": {}, - "outputs": [], - "source": [ - "ds.plot(sample)\n", - "plt.savefig('inference.png', bbox_inches='tight')\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b2f01cc-dede-42db-8635-7d08d0611c72", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualizing Sample for Batch Index 3\n", - "batch_idx = 3\n", - "sample = create_sample(batch, out, batch_idx)\n", - "\n", - "ds.plot(sample)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66573204-b27f-40d0-86b2-d9f5c94295b5", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualizing Sample for Batch Index 5\n", - "batch_idx = 5\n", - "sample = create_sample(batch, out, batch_idx)\n", - "\n", - "ds.plot(sample)\n", - "plt.show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/jupyter/dask/dask.ipynb b/examples/jupyter/dask/dask.ipynb deleted file mode 100644 index 34875533f..000000000 --- a/examples/jupyter/dask/dask.ipynb +++ /dev/null @@ -1,121 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 6, - "id": 
"631dafcc", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/contrib/Alvaro.Vidal/miniconda3/envs/dask/lib/python3.11/site-packages/dask_jobqueue/core.py:293: FutureWarning: header_skip has been renamed to job_directives_skip. You are still using it (even if only set to []; please also check config files). If you did not set job_directives_skip yet, header_skip will be respected for now, but it will be removed in a future release. If you already set job_directives_skip, header_skip is ignored and you can remove it.\n", - " warnings.warn(warn, FutureWarning)\n", - "/contrib/Alvaro.Vidal/miniconda3/envs/dask/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n", - "Perhaps you already have a cluster running?\n", - "Hosting the HTTP server on port 37905 instead\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "import dask.array as da # Import Dask\n", - "from dask.distributed import Client\n", - "from dask_jobqueue import SLURMCluster\n", - " \n", - "# Define SLURM cluster configuration\n", - "cluster = SLURMCluster(\n", - " queue=\"compute\",\n", - " cores=2, # Number of CPU cores per worker\n", - " memory=\"8GB\", # Memory per worker\n", - " header_skip=['--mem'], # Adding this argument allows Dask to ignore the memory parameter\n", - "\n", - ")\n", - "\n", - "# Scale the cluster to a desired number of workers\n", - "cluster.adapt(minimum = 0, maximum = 4) # Scale to 4 workers\n", - "\n", - "# Connect a Dask client to the cluster\n", - "client = Client(cluster)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d3b15330", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/contrib/Alvaro.Vidal/miniconda3/envs/dask/lib/python3.11/site-packages/dask_jobqueue/core.py:293: FutureWarning: header_skip has been renamed to job_directives_skip. 
You are still using it (even if only set to []; please also check config files). If you did not set job_directives_skip yet, header_skip will be respected for now, but it will be removed in a future release. If you already set job_directives_skip, header_skip is ignored and you can remove it.\n", - " warnings.warn(warn, FutureWarning)\n", - "/contrib/Alvaro.Vidal/miniconda3/envs/dask/lib/python3.11/site-packages/dask_jobqueue/core.py:293: FutureWarning: header_skip has been renamed to job_directives_skip. You are still using it (even if only set to []; please also check config files). If you did not set job_directives_skip yet, header_skip will be respected for now, but it will be removed in a future release. If you already set job_directives_skip, header_skip is ignored and you can remove it.\n", - " warnings.warn(warn, FutureWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Result: 2000000.0\n" - ] - } - ], - "source": [ - "\n", - "# Define a simple Dask computation (e.g., parallelized addition)\n", - "def add(a, b):\n", - " return a + b\n", - "\n", - "# Create Dask arrays for your computation\n", - "x = da.ones(1000000, chunks=10000)\n", - "y = da.ones(1000000, chunks=10000)\n", - "\n", - "# Perform the computation using Dask\n", - "result = add(x, y).sum()\n", - "\n", - "# Compute the result and retrieve the value\n", - "result_value = result.compute()\n", - "\n", - "# Print the result\n", - "print(\"Result:\", result_value)\n", - "\n", - "# Close the Dask client and cluster when done\n", - "client.close()\n", - "cluster.close()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a34c5dba", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:dask] *", - "language": "python", - "name": "conda-env-dask-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - 
"mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/jupyter/dask/dask.yaml b/examples/jupyter/dask/dask.yaml deleted file mode 100644 index 69bee063c..000000000 --- a/examples/jupyter/dask/dask.yaml +++ /dev/null @@ -1,275 +0,0 @@ -name: dask -channels: - - conda-forge - - anaconda - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=5.1=1_gnu - - abseil-cpp=20211102.0=hd4dd3e8_0 - - argon2-cffi=21.3.0=pyhd3eb1b0_0 - - arrow-cpp=11.0.0=h374c478_2 - - asttokens=2.0.5=pyhd3eb1b0_0 - - aws-c-common=0.6.8=h5eee18b_1 - - aws-c-event-stream=0.1.6=h6a678d5_6 - - aws-checksums=0.1.11=h5eee18b_2 - - aws-sdk-cpp=1.8.185=h721c034_1 - - backcall=0.2.0=pyhd3eb1b0_0 - - blas=1.0=mkl - - bleach=4.1.0=pyhd3eb1b0_0 - - boost-cpp=1.73.0=h7f8727e_12 - - brotli-python=1.0.9=py311h6a678d5_7 - - bzip2=1.0.8=h7b6447c_0 - - c-ares=1.19.1=h5eee18b_0 - - ca-certificates=2023.7.22=hbcca054_0 - - certifi=2023.7.22=pyhd8ed1ab_0 - - charset-normalizer=2.0.4=pyhd3eb1b0_0 - - cyrus-sasl=2.1.28=h52b45da_1 - - dask-core=2023.6.0=py311h06a4308_0 - - dask-glm=0.2.0=py_1 - - dask-jobqueue=0.8.2=pyhd8ed1ab_0 - - dask-ml=2023.3.24=pyhd8ed1ab_1 - - dbus=1.13.18=hb2f20db_0 - - decorator=5.1.1=pyhd3eb1b0_0 - - defusedxml=0.7.1=pyhd3eb1b0_0 - - executing=0.8.3=pyhd3eb1b0_0 - - expat=2.5.0=h6a678d5_0 - - fontconfig=2.14.1=h4c34cd2_2 - - freetype=2.12.1=h4a9f257_0 - - gflags=2.2.2=he6710b0_0 - - giflib=5.2.1=h5eee18b_3 - - glib=2.69.1=he621ea3_2 - - glog=0.5.0=h2531618_0 - - grpc-cpp=1.48.2=he1ff14a_1 - - gst-plugins-base=1.14.1=h6a678d5_1 - - gstreamer=1.14.1=h5eee18b_1 - - heapdict=1.0.1=pyhd3eb1b0_0 - - icu=58.2=he6710b0_3 - - intel-openmp=2023.1.0=hdb19cb5_46305 - - ipython_genutils=0.2.0=pyhd3eb1b0_1 - - joblib=1.3.2=pyhd8ed1ab_0 - - jpeg=9e=h5eee18b_1 - - json5=0.9.6=pyhd3eb1b0_0 - - jupyter_client=8.1.0=py311h06a4308_0 - - 
jupyter_console=6.6.3=py311h06a4308_0 - - jupyter_core=5.3.0=py311h06a4308_0 - - jupyter_events=0.6.3=py311h06a4308_0 - - jupyter_server=2.5.0=py311h06a4308_0 - - jupyter_server_fileid=0.9.0=py311h06a4308_0 - - jupyter_server_terminals=0.4.4=py311h06a4308_1 - - jupyter_server_ydoc=0.8.0=py311h06a4308_1 - - jupyter_ydoc=0.2.4=py311h06a4308_0 - - jupyterlab_pygments=0.1.2=py_0 - - jupyterlab_server=2.22.0=py311h06a4308_0 - - jupyterlab_widgets=3.0.5=py311h06a4308_0 - - krb5=1.20.1=h143b758_1 - - lcms2=2.12=h3be6417_0 - - ld_impl_linux-64=2.38=h1181459_1 - - lerc=3.0=h295c915_0 - - libboost=1.73.0=h28710b8_12 - - libbrotlicommon=1.0.9=h5eee18b_7 - - libbrotlidec=1.0.9=h5eee18b_7 - - libbrotlienc=1.0.9=h5eee18b_7 - - libclang=14.0.6=default_hc6dbbc7_1 - - libclang13=14.0.6=default_he11475f_1 - - libcups=2.4.2=h2d74bed_1 - - libcurl=8.4.0=h251f7ec_0 - - libdeflate=1.17=h5eee18b_1 - - libedit=3.1.20221030=h5eee18b_0 - - libev=4.33=h7f8727e_1 - - libevent=2.1.12=hdbd6064_1 - - libffi=3.4.4=h6a678d5_0 - - libgcc-ng=11.2.0=h1234567_1 - - libgfortran-ng=13.2.0=h69a702a_0 - - libgfortran5=13.2.0=ha4646dd_0 - - libgomp=11.2.0=h1234567_1 - - libllvm14=14.0.6=hdb19cb5_3 - - libnghttp2=1.57.0=h2d74bed_0 - - libpng=1.6.39=h5eee18b_0 - - libpq=12.15=hdbd6064_1 - - libprotobuf=3.20.3=he621ea3_0 - - libsodium=1.0.18=h7b6447c_0 - - libssh2=1.10.0=hdbd6064_2 - - libstdcxx-ng=11.2.0=h1234567_1 - - libthrift=0.15.0=h1795dd8_2 - - libtiff=4.5.1=h6a678d5_0 - - libuuid=1.41.5=h5eee18b_0 - - libwebp=1.3.2=h11a3e52_0 - - libwebp-base=1.3.2=h5eee18b_0 - - libxcb=1.15=h7f8727e_0 - - libxkbcommon=1.0.1=h5eee18b_1 - - libxml2=2.10.4=hcbfbd50_0 - - libxslt=1.1.37=h2085143_0 - - lz4-c=1.9.4=h6a678d5_0 - - mkl=2023.1.0=h213fc3f_46343 - - mkl_fft=1.3.8=py311h5eee18b_0 - - mkl_random=1.2.4=py311hdb19cb5_0 - - msgpack-python=1.0.3=py311hdb19cb5_0 - - multipledispatch=0.6.0=py_0 - - mysql=5.7.24=h721c034_2 - - nb_conda_kernels=2.3.1=py311h06a4308_0 - - ncurses=6.4=h6a678d5_0 - - nspr=4.35=h6a678d5_0 - - 
nss=3.89.1=h6a678d5_0 - - numpy-base=1.24.3=py311hf175353_1 - - openjpeg=2.4.0=h3ad879b_0 - - openssl=3.0.11=h7f8727e_2 - - orc=1.7.4=hb3bc3d3_1 - - pandocfilters=1.5.0=pyhd3eb1b0_0 - - parso=0.8.3=pyhd3eb1b0_0 - - pcre=8.45=h295c915_0 - - pexpect=4.8.0=pyhd3eb1b0_3 - - pickleshare=0.7.5=pyhd3eb1b0_1003 - - prometheus_client=0.14.1=py311h06a4308_0 - - prompt_toolkit=3.0.36=hd3eb1b0_0 - - ptyprocess=0.7.0=pyhd3eb1b0_2 - - pure_eval=0.2.2=pyhd3eb1b0_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pyqt=5.15.7=py311h6a678d5_0 - - python=3.11.5=h955ad1f_0 - - python-dateutil=2.8.2=pyhd3eb1b0_0 - - python-fastjsonschema=2.16.2=py311h06a4308_0 - - python-lmdb=1.4.1=py311h6a678d5_0 - - python-tzdata=2023.3=pyhd3eb1b0_0 - - qt-main=5.15.2=h7358343_9 - - qt-webengine=5.15.9=h9ab4d14_7 - - qtwebkit=5.212=h3fafdc1_5 - - re2=2022.04.01=h295c915_0 - - readline=8.2=h5eee18b_0 - - send2trash=1.8.0=pyhd3eb1b0_1 - - six=1.16.0=pyhd3eb1b0_1 - - snappy=1.1.9=h295c915_0 - - sortedcontainers=2.4.0=pyhd3eb1b0_0 - - sqlite=3.41.2=h5eee18b_0 - - stack_data=0.2.0=pyhd3eb1b0_0 - - tbb=2021.8.0=hdb19cb5_0 - - tblib=1.7.0=pyhd3eb1b0_0 - - threadpoolctl=3.2.0=pyha21a80b_0 - - tk=8.6.12=h1ccaba5_0 - - toml=0.10.2=pyhd3eb1b0_0 - - typing_extensions=4.7.1=py311h06a4308_0 - - tzdata=2023c=h04d1e81_0 - - utf8proc=2.6.1=h27cfd23_0 - - wcwidth=0.2.5=pyhd3eb1b0_0 - - xz=5.4.2=h5eee18b_0 - - yaml=0.2.5=h7b6447c_0 - - zeromq=4.3.4=h2531618_0 - - zlib=1.2.13=h5eee18b_0 - - zstd=1.5.5=hc292b87_0 - - pip: - - aiofiles==22.1.0 - - aiosqlite==0.18.0 - - anyio==3.5.0 - - argon2-cffi-bindings==21.2.0 - - attrs==23.1.0 - - babel==2.11.0 - - beautifulsoup4==4.12.2 - - bokeh==3.3.0 - - bottleneck==1.3.5 - - brotli==1.0.9 - - brotlipy==0.7.0 - - cffi==1.15.1 - - click==8.1.7 - - cloudpickle==2.2.1 - - comm==0.1.2 - - contourpy==1.0.5 - - cryptography==41.0.3 - - cytoolz==0.12.0 - - dask==2023.6.0 - - debugpy==1.6.7 - - distributed==2023.6.0 - - entrypoints==0.4 - - fastjsonschema==2.16.2 - - fsspec==2023.9.2 - - idna==3.4 - 
- importlib-metadata==6.0.0 - - ipykernel==6.25.0 - - ipython==8.15.0 - - ipywidgets==8.0.4 - - jedi==0.18.1 - - jinja2==3.1.2 - - jsonschema==4.17.3 - - jupyter==1.0.0 - - jupyter-client==8.1.0 - - jupyter-console==6.6.3 - - jupyter-core==5.3.0 - - jupyter-events==0.6.3 - - jupyter-server==2.5.0 - - jupyter-server-fileid==0.9.0 - - jupyter-server-terminals==0.4.4 - - jupyter-server-ydoc==0.8.0 - - jupyter-ydoc==0.2.4 - - jupyterlab==3.6.3 - - jupyterlab-server==2.22.0 - - jupyterlab-widgets==3.0.5 - - llvmlite==0.41.0 - - lmdb==1.4.1 - - locket==1.0.0 - - lxml==4.9.3 - - lz4==4.3.2 - - markupsafe==2.1.1 - - matplotlib-inline==0.1.6 - - mistune==0.8.4 - - mkl-fft==1.3.8 - - mkl-random==1.2.4 - - mkl-service==2.4.0 - - msgpack==1.0.3 - - nb-conda-kernels==2.3.1 - - nbclassic==0.5.5 - - nbclient==0.5.13 - - nbconvert==6.5.4 - - nbformat==5.9.2 - - nest-asyncio==1.5.6 - - notebook==6.5.4 - - notebook-shim==0.2.2 - - numba==0.58.0 - - numexpr==2.8.7 - - numpy==1.24.3 - - packaging==23.1 - - pandas==2.1.1 - - partd==1.4.0 - - pillow==10.0.1 - - pip==23.3 - - platformdirs==3.10.0 - - ply==3.11 - - prometheus-client==0.14.1 - - prompt-toolkit==3.0.36 - - psutil==5.9.0 - - pyarrow==11.0.0 - - pygments==2.15.1 - - pyopenssl==23.2.0 - - pyqt5-sip==12.11.0 - - pyrsistent==0.18.0 - - pysocks==1.7.1 - - python-json-logger==2.0.7 - - pytz==2023.3.post1 - - pyyaml==6.0.1 - - pyzmq==25.1.0 - - qtconsole==5.4.2 - - qtpy==2.2.0 - - requests==2.31.0 - - rfc3339-validator==0.1.4 - - rfc3986-validator==0.1.1 - - scikit-learn==1.3.0 - - scipy==1.11.3 - - setuptools==68.0.0 - - sip==6.6.2 - - sniffio==1.2.0 - - soupsieve==2.5 - - terminado==0.17.1 - - tinycss2==1.2.1 - - toolz==0.12.0 - - tornado==6.3.3 - - traitlets==5.7.1 - - typing-extensions==4.7.1 - - urllib3==1.26.16 - - webencodings==0.5.1 - - websocket-client==0.58.0 - - wheel==0.41.2 - - widgetsnbextension==4.0.5 - - xyzservices==2022.9.0 - - y-py==0.5.9 - - ypy-websocket==0.8.2 - - zict==3.0.0 - - zipp==3.11.0 -prefix: 
/contrib/Alvaro.Vidal/miniconda3/envs/dask diff --git a/examples/jupyter/hello-tensorflow-gpu.ipynb b/examples/jupyter/hello-tensorflow-gpu.ipynb deleted file mode 100644 index 74d79be88..000000000 --- a/examples/jupyter/hello-tensorflow-gpu.ipynb +++ /dev/null @@ -1,70 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "gpu-101.cluster.local\n" - ] - } - ], - "source": [ - "import socket\n", - "print(socket.gethostname())" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Num GPUs Available: 4\n", - "[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]\n" - ] - } - ], - "source": [ - "import tensorflow as tf\n", - "print(\"Num GPUs Available: \", len(tf.config.list_physical_devices('GPU')))\n", - "print(tf.config.list_physical_devices('GPU'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/h2o-3/controller-v3.sh b/h2o-3/controller-v3.sh deleted file mode 100755 index 746f91579..000000000 --- a/h2o-3/controller-v3.sh +++ /dev/null @@ -1,31 +0,0 @@ -cd ${resource_jobdir} - -displayErrorMessage() { - echo $(date): $1 - exit 1 -} 
- - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - - -# https://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/6/h2o-3.46.0.6.zip -service_rel_install_dir="$(basename ${service_download_url} .zip)" -service_bin="${service_parent_install_dir}/${service_rel_install_dir}/h2o.jar" - -if [ -f "${service_bin}" ]; then - echo "Service already installed under ${service_rel_install_dir}/h2o.jar" - exit 0 -fi - -mkdir -p ${service_parent_install_dir} -cd ${service_parent_install_dir} -rm -rf ${service_rel_install_dir} -wget ${service_download_url} -unzip $(basename ${service_download_url}) - -if ! [ -f "${service_bin}" ]; then - displayErrorMessage "Failed to install ${service_download_url}" -fi \ No newline at end of file diff --git a/h2o-3/kill-template.sh b/h2o-3/kill-template.sh deleted file mode 100755 index 616a00b9e..000000000 --- a/h2o-3/kill-template.sh +++ /dev/null @@ -1,4 +0,0 @@ - -# Runs in the controller node: -bash ${resource_jobdir}/service-kill-${job_number}.sh - diff --git a/h2o-3/start-template-v3.sh b/h2o-3/start-template-v3.sh deleted file mode 100755 index 02b52d3c8..000000000 --- a/h2o-3/start-template-v3.sh +++ /dev/null @@ -1,12 +0,0 @@ -# Runs via ssh + sbatch -set -x - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -# https://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/6/h2o-3.46.0.6.zip -service_rel_install_dir="$(basename ${service_download_url} .zip)" -service_bin="${service_parent_install_dir}/${service_rel_install_dir}/h2o.jar" - -java -jar ${service_bin} -port ${service_port} \ No newline at end of file diff --git a/hammerspace/start-template-v3.sh b/hammerspace/start-template-v3.sh deleted file mode 100755 index f7d925364..000000000 --- a/hammerspace/start-template-v3.sh +++ /dev/null @@ -1,3 +0,0 @@ -# WRITE CODE TO START HAMMESPACE SERVICE HERE - -sleep inf diff --git a/hammerspace/url.sh b/hammerspace/url.sh deleted file mode 100755 
index 29ce812a7..000000000 --- a/hammerspace/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="#/auth/login\"" \ No newline at end of file diff --git a/juice-server/controller-v3.sh b/juice-server/controller-v3.sh deleted file mode 100644 index f8848b024..000000000 --- a/juice-server/controller-v3.sh +++ /dev/null @@ -1,15 +0,0 @@ -cd ${resource_jobdir} - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -service_install_dir=${service_parent_install_dir}/JuiceServer - -if ! [ -f ${service_install_dir}/agent ]; then - rm -rf ${service_install_dir} - wget ${service_download_url} - mkdir -p ${service_install_dir} - tar -xf JuiceServer-linux.tar.gz -C ${service_install_dir} - ${service_install_dir}/agent --help -fi diff --git a/juice-server/pytorch-hello-gpu.py b/juice-server/pytorch-hello-gpu.py deleted file mode 100644 index 35d27a11e..000000000 --- a/juice-server/pytorch-hello-gpu.py +++ /dev/null @@ -1,23 +0,0 @@ -import torch -import socket - -# Get the hostname -hostname = socket.gethostname() -print(f"Running on host: {hostname}") - - -# Check if CUDA is available -if torch.cuda.is_available(): - device = torch.device("cuda") # Use the first GPU - print(f"Running on GPU: {torch.cuda.get_device_name(0)}") - - # Create a tensor on GPU - tensor = torch.tensor([1, 2, 3, 4, 5], device=device) - - # Perform a simple operation - result = tensor * 2 - - print(f"Tensor on GPU: {tensor}") - print(f"Result after computation: {result}") -else: - print("CUDA is not available. 
Please check your PyTorch installation.") \ No newline at end of file diff --git a/juice-server/start-template-v3.sh b/juice-server/start-template-v3.sh deleted file mode 100755 index 376b6af8d..000000000 --- a/juice-server/start-template-v3.sh +++ /dev/null @@ -1,54 +0,0 @@ -# Runs via ssh + sbatch -set -x - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -${service_parent_install_dir}/JuiceServer/agent -address 0.0.0.0:${service_port} - - -""" -$ ./agent --help -Usage of ./agent: - -access-token string - The access token to use when connecting to the controller - -address string - The IP address and port to use for listening for client connections (default "0.0.0.0:43210") - -auth-audience string - The audience used for validating jwt tokens - -auth-domain string - The domain used for validating jwt tokens - -cert-file string - - -controller string - The IP address and port of the controller - -disable-gpu-metrics - - -enable-token-validation - Enable token validation - -expose string - The IP address and port to expose through the controller for clients to see. The value is not checked for correctness. 
- -generate-cert - Generates a certificate for https - -gpu-metrics-interval-ms uint - (default 1000) - -juice-path string - - -key-file string - - -labels string - Comma separated list of key=value pairs - -log-file string - - -log-level string - Sets the maximum level of output [Fatal, Error, Warning, Info (Default), Debug, Trace] (default "info") - -quiet - Disables all logging output - -taints string - Comma separated list of key=value pairs - -version - Prints the version and exits - -""" - diff --git a/jupyter-docker/kill-template.sh b/jupyter-docker/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/jupyter-docker/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/jupyter-docker/start-template-v3.sh b/jupyter-docker/start-template-v3.sh deleted file mode 100755 index 79c85a27d..000000000 --- a/jupyter-docker/start-template-v3.sh +++ /dev/null @@ -1,224 +0,0 @@ -# Runs via ssh + sbatch -set -x - -# Initialize cancel script -echo '#!/bin/bash' > cancel.sh -chmod +x cancel.sh - -jupyter_container_name="jupyter-${service_port}" -echo "sudo docker stop ${jupyter_container_name}" >> cancel.sh -echo "sudo docker rm ${jupyter_container_name}" >> cancel.sh - -# Set the launch directory for JupyterHub -# If notebook_dir is not set or set to a templated value, -# use the default value of "/". 
-if [ -z ${service_notebook_dir} ]; then - service_notebook_dir="/" -fi - -if [[ ${service_use_gpus} == "true" ]]; then - gpu_flag="--gpus all" - # FIXME: This should go to the image creation - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo - sudo yum-config-manager --enable nvidia-container-toolkit-experimental - sudo yum install -y nvidia-container-toolkit - sudo nvidia-ctk runtime configure --runtime=docker - sudo systemctl restart docker -else - gpu_flag="" -fi - -sudo service docker start -sudo docker pull ${service_docker_repo} - -# Obtain Jupyter version without breaking ssh connection -sudo docker run -i --rm ${service_docker_repo} jupyter-notebook --version > jupyter.version & -sleep 5 -while [ ! -f "jupyter.version" ]; do - sleep 2 -done -jupyter_major_version=$(cat jupyter.version | tail -n1 | cut -d'.' -f1) - -echo "Jupyter version is ${jupyter_major_version}" - - -####################### -# OLD JUPYTER VERSION # -####################### -if [ "${jupyter_major_version}" -eq 6 ]; then - -# Custom PW plugin: -mkdir -p pw_jupyter_proxy -cat >> pw_jupyter_proxy/__init__.py <> config.conf <> nginx.conf <> cancel.sh -echo "sudo docker rm ${container_name}" >> cancel.sh -touch empty -touch nginx.logs -# change ownership to nginx user -sudo chown 101:101 nginx.logs # change ownership to nginx user -sudo docker run -d --name ${container_name} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v ${PWD}/empty:/etc/nginx/conf.d/default.conf \ - -v $PWD/nginx.logs:/var/log/nginx/access.log \ - -v $PWD/nginx.logs:/var/log/nginx/error.log \ - --network=host nginxinc/nginx-unprivileged:1.25.3 -# Print logs -sudo docker logs ${container_name} - -######################### -# START JUPYTER WRAPPER # -######################### - -sudo -n docker run ${gpu_flag} -i --rm --name ${jupyter_container_name} \ - 
${service_mount_directories} \ - -v ${HOME}:${HOME} \ - -p ${jupyterserver_port}:${jupyterserver_port} \ - ${service_docker_repo} \ - jupyter-notebook \ - --port=${jupyterserver_port} \ - --ip=0.0.0.0 \ - --no-browser \ - --allow-root \ - --ServerApp.trust_xheaders=True \ - --ServerApp.allow_origin='*' \ - --ServerApp.allow_remote_access=True \ - --ServerApp.token="" \ - --ServerApp.base_url=${basepath}/ \ - --ServerApp.root_dir=${service_notebook_dir} - -else - displayErrorMessage "ERROR: Jupyter Notebook version ${jupyter_major_version} is not supported. Use version 6 or 7" -fi - -sleep inf diff --git a/jupyter-host/controller-v3.sh b/jupyter-host/controller-v3.sh deleted file mode 100644 index a706bf34d..000000000 --- a/jupyter-host/controller-v3.sh +++ /dev/null @@ -1,176 +0,0 @@ -set -x - -cd ${resource_jobdir} - - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -service_conda_sh=${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - - -displayErrorMessage() { - echo $(date): $1 - exit 1 -} - -f_install_miniconda() { - install_dir=$1 - if [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing Miniconda3-latest-Linux-x86_64.sh" - conda_repo="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" - else - conda_repo="https://repo.anaconda.com/miniconda/Miniconda3-py312_24.9.2-0-Linux-x86_64.sh" - fi - ID=$(date +%s)-${RANDOM} # This script may run at the same time! 
- nohup wget --no-check-certificate ${conda_repo} -O /tmp/miniconda-${ID}.sh 2>&1 > /tmp/miniconda_wget-${ID}.out - rm -rf ${install_dir} - mkdir -p $(dirname ${install_dir}) - nohup bash /tmp/miniconda-${ID}.sh -b -p ${install_dir} 2>&1 > /tmp/miniconda_sh-${ID}.out -} - -f_set_up_conda_from_yaml() { - CONDA_DIR=$1 - CONDA_ENV=$2 - CONDA_YAML=$3 - CONDA_SH="${CONDA_DIR}/etc/profile.d/conda.sh" - # conda env export - # Remove line starting with name, prefix and remove empty lines - sed -i -e '/^name:/d' -e '/^prefix:/d' -e '/^$/d' ${CONDA_YAML} - - if [ ! -d "${CONDA_DIR}" ]; then - echo "Conda directory <${CONDA_DIR}> not found. Installing conda..." - f_install_miniconda ${CONDA_DIR} - fi - - echo "Sourcing Conda SH <${CONDA_SH}>" - source ${CONDA_SH} - - # Check if Conda environment exists - if ! conda env list | grep -q "${CONDA_ENV}"; then - echo "Creating Conda Environment <${CONDA_ENV}>" - conda create --name ${CONDA_ENV} - fi - - echo "Activating Conda Environment <${CONDA_ENV}>" - conda activate ${CONDA_ENV} - - echo "Installing condda environment from YAML" - conda env update -n ${CONDA_ENV} -f ${CONDA_YAML} -} - - -download_singularity_container() { - # 1. Clone the repository with --no-checkout - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' - # Needed for emed - git config --global --unset http.sslbackend - git clone --no-checkout https://github.com/parallelworks/interactive_session.git - - # 2. Navigate into the repository directory - cd interactive_session - #git checkout download-dependencies - - # 3. Initialize sparse-checkout - git sparse-checkout init - - # 4. Configure sparse-checkout to include only the desired file - echo downloads/jupyter/nginx-unprivileged.sif > .git/info/sparse-checkout - - # 5. Perform the checkout - git checkout - - # 6. Extract tgz - cp downloads/jupyter/nginx-unprivileged.sif ${service_nginx_sif} - - # 7. 
Clean - cd ../ - rm -rf interactive_session - -} - - - -if [[ "${service_conda_install}" == "true" ]]; then - - ${sshusercontainer} "${pw_job_dir}/utils/notify.sh Installing" - - if [[ "${service_install_instructions}" == "install_command" ]]; then - echo "Running install command ${service_install_command}" - eval ${service_install_command} - elif [[ "${service_install_instructions}" == "yaml" ]]; then - echo "Installing custom conda environment" - printf "%b" "${service_yaml}" > conda.yaml - cat conda.yaml - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} conda.yaml - elif [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing latest" - { - source ${service_conda_sh} - } || { - conda_dir=$(echo ${service_conda_sh} | sed "s|etc/profile.d/conda.sh||g" ) - f_install_miniconda ${conda_dir} - source ${service_conda_sh} - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r - } - { - eval "conda activate ${service_conda_env}" - } || { - conda create -n ${service_conda_env} jupyter -y - eval "conda activate ${service_conda_env}" - } - if [ -z $(which jupyter-notebook 2> /dev/null) ]; then - conda install conda-forge::jupyter-book -y - conda install conda-forge::nb_conda_kernels -y - conda install conda-forge::jinja2 -y - fi - else - echo "Installing conda environment ${service_install_instructions}.yaml" - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} ${service_install_instructions}.yaml - fi - if [ -z ${service_load_env} ]; then - service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" - fi -fi -eval "${service_load_env}" - -if [ -z $(which jupyter-notebook 2> /dev/null) ]; then - displayErrorMessage "jupyter-notebook command not found" -fi - -# Download singularity container if required 
-jupyter_major_version=$(jupyter notebook --version | cut -d'.' -f1) -echo "Jupyter version is" -jupyter notebook --version - -if [ "${jupyter_major_version}" -ge 7 ]; then - if ! [ -f "${service_nginx_sif}" ]; then - echo; echo "Downloading nginx singularity from Github" - download_singularity_container - fi -fi - - -if [[ "${service_conda_install}" != "true" ]]; then - exit 0 -fi - - -if [[ $service_install_kernels == *"julia-kernel"* ]]; then - if [ -z $(which julia 2> /dev/null) ]; then - curl -fsSL https://install.julialang.org | sh -s -- -y - source ~/.bashrc - source ~/.bash_profile - julia -e 'using Pkg; Pkg.add("IJulia")' - fi -fi - -if [[ $service_install_kernels == *"R-kernel"* ]]; then - conda install r-recommended r-irkernel -y - R -e 'IRkernel::installspec()' -fi diff --git a/jupyter-host/kill-template.sh b/jupyter-host/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/jupyter-host/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/jupyter-host/nginx-unprivileged.def b/jupyter-host/nginx-unprivileged.def deleted file mode 100644 index 08203b21c..000000000 --- a/jupyter-host/nginx-unprivileged.def +++ /dev/null @@ -1,5 +0,0 @@ -BootStrap: docker -From: nginxinc/nginx-unprivileged:1.25.3 - -%help - This Singularity container of the nginxinc/nginx-unprivileged Docker repository diff --git a/jupyter-host/notebook6.5.4-python3.9.18.yaml b/jupyter-host/notebook6.5.4-python3.9.18.yaml deleted file mode 100644 index d7dacf348..000000000 --- a/jupyter-host/notebook6.5.4-python3.9.18.yaml +++ /dev/null @@ -1,208 +0,0 @@ -name: base -channels: - - anaconda - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - aiofiles=22.1.0=py39h06a4308_0 - - aiosqlite=0.18.0=py39h06a4308_0 - - anyio=3.5.0=py39h06a4308_0 - - archspec=0.2.3=pyhd3eb1b0_0 - - argon2-cffi=21.3.0=pyhd3eb1b0_0 - - 
argon2-cffi-bindings=21.2.0=py39h7f8727e_0 - - asttokens=2.0.5=pyhd3eb1b0_0 - - attrs=23.1.0=py39h06a4308_0 - - babel=2.11.0=py39h06a4308_0 - - backcall=0.2.0=pyhd3eb1b0_0 - - beautifulsoup4=4.12.2=py39h06a4308_0 - - bleach=4.1.0=pyhd3eb1b0_0 - - boltons=23.0.0=py39h06a4308_0 - - brotli-python=1.0.9=py39h6a678d5_7 - - bzip2=1.0.8=h7b6447c_0 - - c-ares=1.19.1=h5eee18b_0 - - ca-certificates=2024.3.11=h06a4308_0 - - certifi=2024.2.2=pyhd8ed1ab_0 - - cffi=1.16.0=py39h5eee18b_0 - - charset-normalizer=2.0.4=pyhd3eb1b0_0 - - comm=0.1.2=py39h06a4308_0 - - conda=24.3.0=py39hf3d152e_0 - - conda-content-trust=0.2.0=py39h06a4308_0 - - conda-libmamba-solver=23.12.0=pyhd3eb1b0_1 - - conda-package-handling=2.2.0=py39h06a4308_0 - - conda-package-streaming=0.9.0=py39h06a4308_0 - - cryptography=41.0.7=py39hdda0065_0 - - cyrus-sasl=2.1.28=h52b45da_1 - - dbus=1.13.18=hb2f20db_0 - - debugpy=1.6.7=py39h6a678d5_0 - - decorator=5.1.1=pyhd3eb1b0_0 - - defusedxml=0.7.1=pyhd3eb1b0_0 - - distro=1.8.0=py39h06a4308_0 - - entrypoints=0.4=py39h06a4308_0 - - exceptiongroup=1.0.4=py39h06a4308_0 - - executing=0.8.3=pyhd3eb1b0_0 - - expat=2.5.0=h6a678d5_0 - - fmt=9.1.0=hdb19cb5_0 - - fontconfig=2.14.1=h4c34cd2_2 - - freetype=2.12.1=h4a9f257_0 - - glib=2.69.1=he621ea3_2 - - gst-plugins-base=1.14.1=h6a678d5_1 - - gstreamer=1.14.1=h5eee18b_1 - - icu=73.1=h6a678d5_0 - - idna=3.4=py39h06a4308_0 - - importlib-metadata=6.0.0=py39h06a4308_0 - - importlib_metadata=6.0.0=hd3eb1b0_0 - - ipykernel=6.25.0=py39h2f386ee_0 - - ipython=8.15.0=py39h06a4308_0 - - ipython_genutils=0.2.0=pyhd3eb1b0_1 - - ipywidgets=8.0.4=py39h06a4308_0 - - jedi=0.18.1=py39h06a4308_1 - - jinja2=3.1.2=py39h06a4308_0 - - jpeg=9e=h5eee18b_1 - - json5=0.9.6=pyhd3eb1b0_0 - - jsonpatch=1.32=pyhd3eb1b0_0 - - jsonpointer=2.1=pyhd3eb1b0_0 - - jsonschema=4.19.2=py39h06a4308_0 - - jsonschema-specifications=2023.7.1=py39h06a4308_0 - - jupyter=1.0.0=py39h06a4308_8 - - jupyter-resource-usage=1.0.2=pyhd8ed1ab_0 - - jupyter_client=7.4.9=py39h06a4308_0 - 
- jupyter_console=6.6.3=py39h06a4308_0 - - jupyter_core=5.5.0=py39h06a4308_0 - - jupyter_events=0.8.0=py39h06a4308_0 - - jupyter_server=2.10.0=pyhd8ed1ab_0 - - jupyter_server_fileid=0.9.0=py39h06a4308_0 - - jupyter_server_terminals=0.5.3=pyhd8ed1ab_0 - - jupyter_server_ydoc=0.8.0=py39h06a4308_1 - - jupyter_ydoc=0.2.4=py39h06a4308_0 - - jupyterlab=3.6.3=py39h06a4308_0 - - jupyterlab_pygments=0.2.2=py39h06a4308_0 - - jupyterlab_server=2.25.1=py39h06a4308_0 - - jupyterlab_widgets=3.0.9=py39h06a4308_0 - - krb5=1.20.1=h143b758_1 - - ld_impl_linux-64=2.38=h1181459_1 - - libarchive=3.6.2=h6ac8c49_2 - - libclang=14.0.6=default_hc6dbbc7_1 - - libclang13=14.0.6=default_he11475f_1 - - libcups=2.4.2=h2d74bed_1 - - libcurl=8.4.0=h251f7ec_1 - - libedit=3.1.20230828=h5eee18b_0 - - libev=4.33=h7f8727e_1 - - libffi=3.4.4=h6a678d5_0 - - libgcc-ng=13.2.0=h807b86a_5 - - libgomp=13.2.0=h807b86a_5 - - libllvm14=14.0.6=hdb19cb5_3 - - libmamba=1.5.3=haf1ee3a_0 - - libmambapy=1.5.3=py39h2dafd23_0 - - libnghttp2=1.57.0=h2d74bed_0 - - libpng=1.6.39=h5eee18b_0 - - libpq=12.15=hdbd6064_1 - - libsodium=1.0.18=h7b6447c_0 - - libsolv=0.7.24=he621ea3_0 - - libssh2=1.10.0=hdbd6064_2 - - libstdcxx-ng=11.2.0=h1234567_1 - - libuuid=1.41.5=h5eee18b_0 - - libxcb=1.15=h7f8727e_0 - - libxkbcommon=1.0.1=h5eee18b_1 - - libxml2=2.10.4=hf1b16e4_1 - - lz4-c=1.9.4=h6a678d5_0 - - markupsafe=2.1.1=py39h7f8727e_0 - - matplotlib-inline=0.1.6=py39h06a4308_0 - - menuinst=2.0.1=py39h06a4308_1 - - mistune=2.0.4=py39h06a4308_0 - - mysql=5.7.24=h721c034_2 - - nb_conda_kernels=2.3.1=py39h06a4308_0 - - nbclassic=1.0.0=py39h06a4308_0 - - nbclient=0.8.0=py39h06a4308_0 - - nbconvert=7.10.0=py39h06a4308_0 - - nbconvert-core=7.10.0=pyhd8ed1ab_0 - - nbformat=5.9.2=py39h06a4308_0 - - ncurses=6.4=h6a678d5_0 - - nest-asyncio=1.5.6=py39h06a4308_0 - - notebook=6.5.4=pyha770c72_0 - - notebook-shim=0.2.3=py39h06a4308_0 - - openssl=3.2.1=hd590300_1 - - overrides=7.7.0=pyhd8ed1ab_0 - - packaging=23.1=py39h06a4308_0 - - 
pandocfilters=1.5.0=pyhd3eb1b0_0 - - parso=0.8.3=pyhd3eb1b0_0 - - pcre=8.45=h295c915_0 - - pcre2=10.42=hebb0a14_0 - - pexpect=4.8.0=pyhd3eb1b0_3 - - pickleshare=0.7.5=pyhd3eb1b0_1003 - - pip=23.3.1=py39h06a4308_0 - - platformdirs=3.10.0=py39h06a4308_0 - - pluggy=1.0.0=py39h06a4308_1 - - ply=3.11=py39h06a4308_0 - - prometheus_client=0.14.1=py39h06a4308_0 - - prompt-toolkit=3.0.36=py39h06a4308_0 - - prompt_toolkit=3.0.36=hd3eb1b0_0 - - psutil=5.9.0=py39h5eee18b_0 - - ptyprocess=0.7.0=pyhd3eb1b0_2 - - pure_eval=0.2.2=pyhd3eb1b0_0 - - pybind11-abi=4=hd3eb1b0_1 - - pycosat=0.6.6=py39h5eee18b_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pygments=2.15.1=py39h06a4308_1 - - pyopenssl=23.2.0=py39h06a4308_0 - - pyqt=5.15.10=py39h6a678d5_0 - - pyqt5-sip=12.13.0=py39h5eee18b_0 - - pysocks=1.7.1=py39h06a4308_0 - - python=3.9.18=h955ad1f_0 - - python-dateutil=2.8.2=pyhd3eb1b0_0 - - python-fastjsonschema=2.16.2=py39h06a4308_0 - - python-json-logger=2.0.7=py39h06a4308_0 - - python_abi=3.9=2_cp39 - - pytz=2023.3.post1=py39h06a4308_0 - - pyyaml=6.0.1=py39h5eee18b_0 - - pyzmq=25.1.2=py39h6a678d5_0 - - qt-main=5.15.2=h53bd1ea_10 - - qtconsole=5.5.0=py39h06a4308_0 - - qtpy=2.4.1=py39h06a4308_0 - - readline=8.2=h5eee18b_0 - - referencing=0.30.2=py39h06a4308_0 - - reproc=14.2.4=h295c915_1 - - reproc-cpp=14.2.4=h295c915_1 - - requests=2.31.0=py39h06a4308_0 - - rfc3339-validator=0.1.4=py39h06a4308_0 - - rfc3986-validator=0.1.1=py39h06a4308_0 - - rpds-py=0.10.6=py39hb02cf49_0 - - ruamel.yaml=0.17.21=py39h5eee18b_0 - - ruamel.yaml.clib=0.2.6=py39h5eee18b_1 - - send2trash=1.8.2=py39h06a4308_0 - - setuptools=68.2.2=py39h06a4308_0 - - sip=6.7.12=py39h6a678d5_0 - - six=1.16.0=pyhd3eb1b0_1 - - sniffio=1.2.0=py39h06a4308_1 - - soupsieve=2.5=py39h06a4308_0 - - sqlite=3.41.2=h5eee18b_0 - - stack_data=0.2.0=pyhd3eb1b0_0 - - terminado=0.17.1=py39h06a4308_0 - - tinycss2=1.2.1=py39h06a4308_0 - - tk=8.6.12=h1ccaba5_0 - - tomli=2.0.1=py39h06a4308_0 - - tornado=6.3.3=py39h5eee18b_0 - - tqdm=4.65.0=py39hb070fc8_0 - 
- traitlets=5.7.1=py39h06a4308_0 - - typing-extensions=4.7.1=py39h06a4308_0 - - typing_extensions=4.7.1=py39h06a4308_0 - - typing_utils=0.1.0=pyhd8ed1ab_0 - - tzdata=2023c=h04d1e81_0 - - urllib3=1.26.18=py39h06a4308_0 - - wcwidth=0.2.5=pyhd3eb1b0_0 - - webencodings=0.5.1=py39h06a4308_1 - - websocket-client=0.58.0=py39h06a4308_4 - - wheel=0.41.2=py39h06a4308_0 - - widgetsnbextension=4.0.5=py39h06a4308_0 - - xz=5.4.5=h5eee18b_0 - - y-py=0.5.9=py39h52d8a92_0 - - yaml=0.2.5=h7b6447c_0 - - yaml-cpp=0.8.0=h6a678d5_0 - - ypy-websocket=0.8.2=py39h06a4308_0 - - zeromq=4.3.5=h6a678d5_0 - - zipp=3.11.0=py39h06a4308_0 - - zlib=1.2.13=h5eee18b_0 - - zstandard=0.19.0=py39h5eee18b_0 - - zstd=1.5.5=hc292b87_0 -prefix: /home/alvaro/miniconda3 diff --git a/jupyter-host/notebook7.2.2-python3.12.2.yaml b/jupyter-host/notebook7.2.2-python3.12.2.yaml deleted file mode 100644 index 020378488..000000000 --- a/jupyter-host/notebook7.2.2-python3.12.2.yaml +++ /dev/null @@ -1,187 +0,0 @@ -channels: - - anaconda - - defaults -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=5.1=1_gnu - - anyio=4.2.0=py312h06a4308_0 - - archspec=0.2.3=pyhd3eb1b0_0 - - argon2-cffi=21.3.0=pyhd3eb1b0_0 - - argon2-cffi-bindings=21.2.0=py312h5eee18b_0 - - asttokens=2.0.5=pyhd3eb1b0_0 - - async-lru=2.0.4=py312h06a4308_0 - - attrs=23.1.0=py312h06a4308_0 - - babel=2.11.0=py312h06a4308_0 - - backcall=0.2.0=pyhd3eb1b0_0 - - beautifulsoup4=4.12.2=py312h06a4308_0 - - bleach=4.1.0=pyhd3eb1b0_0 - - boltons=23.0.0=py312h06a4308_0 - - brotli-python=1.0.9=py312h6a678d5_7 - - bzip2=1.0.8=h7b6447c_0 - - c-ares=1.19.1=h5eee18b_0 - - ca-certificates=2023.12.12=h06a4308_0 - - certifi=2024.2.2=py312h06a4308_0 - - cffi=1.16.0=py312h5eee18b_0 - - charset-normalizer=2.0.4=pyhd3eb1b0_0 - - comm=0.2.1=py312h06a4308_0 - - conda=24.3.0=py312h06a4308_0 - - conda-content-trust=0.2.0=py312h06a4308_0 - - conda-libmamba-solver=23.12.0=pyhd3eb1b0_1 - - conda-package-handling=2.2.0=py312h06a4308_0 - - 
conda-package-streaming=0.9.0=py312h06a4308_0 - - cryptography=41.0.7=py312hdda0065_0 - - cyrus-sasl=2.1.28=h52b45da_1 - - dbus=1.13.18=hb2f20db_0 - - debugpy=1.6.7=py312h6a678d5_0 - - decorator=5.1.1=pyhd3eb1b0_0 - - defusedxml=0.7.1=pyhd3eb1b0_0 - - distro=1.8.0=py312h06a4308_0 - - executing=0.8.3=pyhd3eb1b0_0 - - expat=2.5.0=h6a678d5_0 - - fmt=9.1.0=hdb19cb5_0 - - fontconfig=2.14.1=h4c34cd2_2 - - freetype=2.12.1=h4a9f257_0 - - glib=2.69.1=he621ea3_2 - - gst-plugins-base=1.14.1=h6a678d5_1 - - gstreamer=1.14.1=h5eee18b_1 - - icu=73.1=h6a678d5_0 - - idna=3.4=py312h06a4308_0 - - ipykernel=6.28.0=py312h06a4308_0 - - ipython=8.15.0=py312h06a4308_0 - - ipython_genutils=0.2.0=pyhd3eb1b0_1 - - ipywidgets=8.1.2=py312h06a4308_0 - - jedi=0.18.1=py312h06a4308_1 - - jinja2=3.1.2=py312h06a4308_0 - - jpeg=9e=h5eee18b_1 - - json5=0.9.6=pyhd3eb1b0_0 - - jsonpatch=1.32=pyhd3eb1b0_0 - - jsonpointer=2.1=pyhd3eb1b0_0 - - jsonschema=4.19.2=py312h06a4308_0 - - jsonschema-specifications=2023.7.1=py312h06a4308_0 - - jupyter=1.0.0=py312h06a4308_9 - - jupyter-lsp=2.2.0=py312h06a4308_0 - - jupyter_client=8.6.0=py312h06a4308_0 - - jupyter_console=6.4.3=pyhd3eb1b0_0 - - jupyter_core=5.5.0=py312h06a4308_0 - - jupyter_events=0.8.0=py312h06a4308_0 - - jupyter_server=2.10.0=py312h06a4308_0 - - jupyter_server_terminals=0.4.4=py312h06a4308_1 - - jupyterlab=4.0.11=py312h06a4308_0 - - jupyterlab_pygments=0.1.2=py_0 - - jupyterlab_server=2.25.1=py312h06a4308_0 - - jupyterlab_widgets=3.0.10=py312h06a4308_0 - - krb5=1.20.1=h143b758_1 - - ld_impl_linux-64=2.38=h1181459_1 - - libarchive=3.6.2=h6ac8c49_2 - - libclang=14.0.6=default_hc6dbbc7_1 - - libclang13=14.0.6=default_he11475f_1 - - libcups=2.4.2=h2d74bed_1 - - libcurl=8.5.0=h251f7ec_0 - - libedit=3.1.20230828=h5eee18b_0 - - libev=4.33=h7f8727e_1 - - libffi=3.4.4=h6a678d5_0 - - libgcc-ng=11.2.0=h1234567_1 - - libgomp=11.2.0=h1234567_1 - - libllvm14=14.0.6=hdb19cb5_3 - - libmamba=1.5.3=haf1ee3a_0 - - libmambapy=1.5.3=py312h2dafd23_0 - - 
libnghttp2=1.57.0=h2d74bed_0 - - libpng=1.6.39=h5eee18b_0 - - libpq=12.15=hdbd6064_1 - - libsodium=1.0.18=h7b6447c_0 - - libsolv=0.7.24=he621ea3_0 - - libssh2=1.10.0=hdbd6064_2 - - libstdcxx-ng=11.2.0=h1234567_1 - - libuuid=1.41.5=h5eee18b_0 - - libxcb=1.15=h7f8727e_0 - - libxkbcommon=1.0.1=h5eee18b_1 - - libxml2=2.10.4=hf1b16e4_1 - - lz4-c=1.9.4=h6a678d5_0 - - markupsafe=2.1.1=py312h5eee18b_0 - - matplotlib-inline=0.1.6=py312h06a4308_0 - - menuinst=2.0.2=py312h06a4308_0 - - mistune=2.0.4=py312h06a4308_0 - - mysql=5.7.24=h721c034_2 - - nbclient=0.5.11=pyhd3eb1b0_0 - - nbconvert=7.10.0=py312h06a4308_0 - - nbformat=5.9.2=py312h06a4308_0 - - ncurses=6.4=h6a678d5_0 - - nest-asyncio=1.5.6=py312h06a4308_0 - - notebook=7.0.8=py312h06a4308_0 - - notebook-shim=0.2.3=py312h06a4308_0 - - openssl=3.0.13=h7f8727e_0 - - overrides=7.4.0=py312h06a4308_0 - - packaging=23.1=py312h06a4308_0 - - pandocfilters=1.5.0=pyhd3eb1b0_0 - - parso=0.8.3=pyhd3eb1b0_0 - - pcre=8.45=h295c915_0 - - pcre2=10.42=hebb0a14_0 - - pexpect=4.8.0=pyhd3eb1b0_3 - - pickleshare=0.7.5=pyhd3eb1b0_1003 - - pip=23.3.1=py312h06a4308_0 - - platformdirs=3.10.0=py312h06a4308_0 - - pluggy=1.0.0=py312h06a4308_1 - - ply=3.11=py312h06a4308_1 - - prometheus_client=0.14.1=py312h06a4308_0 - - prompt-toolkit=3.0.36=py312h06a4308_0 - - prompt_toolkit=3.0.36=hd3eb1b0_0 - - psutil=5.9.0=py312h5eee18b_0 - - ptyprocess=0.7.0=pyhd3eb1b0_2 - - pure_eval=0.2.2=pyhd3eb1b0_0 - - pybind11-abi=4=hd3eb1b0_1 - - pycosat=0.6.6=py312h5eee18b_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pygments=2.15.1=py312h06a4308_1 - - pyqt=5.15.10=py312h6a678d5_0 - - pyqt5-sip=12.13.0=py312h5eee18b_0 - - pysocks=1.7.1=py312h06a4308_0 - - python=3.12.1=h996f2a0_0 - - python-dateutil=2.8.2=pyhd3eb1b0_0 - - python-fastjsonschema=2.16.2=py312h06a4308_0 - - python-json-logger=2.0.7=py312h06a4308_0 - - pytz=2023.3.post1=py312h06a4308_0 - - pyyaml=6.0.1=py312h5eee18b_0 - - pyzmq=25.1.0=py312h6a678d5_0 - - qt-main=5.15.2=h53bd1ea_10 - - qtconsole=5.3.0=pyhd3eb1b0_0 - - 
qtpy=2.2.0=py312h06a4308_0 - - readline=8.2=h5eee18b_0 - - referencing=0.30.2=py312h06a4308_0 - - reproc=14.2.4=h295c915_1 - - reproc-cpp=14.2.4=h295c915_1 - - requests=2.31.0=py312h06a4308_1 - - rfc3339-validator=0.1.4=py312h06a4308_0 - - rfc3986-validator=0.1.1=py312h06a4308_0 - - rpds-py=0.10.6=py312hb02cf49_0 - - ruamel.yaml=0.17.21=py312h5eee18b_0 - - send2trash=1.8.2=py312h06a4308_0 - - setuptools=68.2.2=py312h06a4308_0 - - sip=6.7.12=py312h6a678d5_0 - - six=1.16.0=pyhd3eb1b0_1 - - sniffio=1.2.0=py312h06a4308_1 - - soupsieve=2.5=py312h06a4308_0 - - sqlite=3.41.2=h5eee18b_0 - - stack_data=0.2.0=pyhd3eb1b0_0 - - terminado=0.17.1=py312h06a4308_0 - - tinycss2=1.2.1=py312h06a4308_0 - - tk=8.6.12=h1ccaba5_0 - - tornado=6.3.3=py312h5eee18b_0 - - tqdm=4.65.0=py312he106c6f_0 - - traitlets=5.7.1=py312h06a4308_0 - - truststore=0.8.0=py312h06a4308_0 - - typing-extensions=4.7.1=py312h06a4308_0 - - typing_extensions=4.7.1=py312h06a4308_0 - - tzdata=2023d=h04d1e81_0 - - urllib3=2.1.0=py312h06a4308_1 - - wcwidth=0.2.5=pyhd3eb1b0_0 - - webencodings=0.5.1=py312h06a4308_2 - - websocket-client=0.58.0=py312h06a4308_4 - - wheel=0.41.2=py312h06a4308_0 - - widgetsnbextension=4.0.10=py312h06a4308_0 - - xz=5.4.5=h5eee18b_0 - - yaml=0.2.5=h7b6447c_0 - - yaml-cpp=0.8.0=h6a678d5_0 - - zeromq=4.3.4=h2531618_0 - - zlib=1.2.13=h5eee18b_0 - - zstandard=0.19.0=py312h5eee18b_0 - - zstd=1.5.5=hc292b87_0 diff --git a/jupyter-host/start-template-v3.sh b/jupyter-host/start-template-v3.sh deleted file mode 100755 index f966d3626..000000000 --- a/jupyter-host/start-template-v3.sh +++ /dev/null @@ -1,289 +0,0 @@ -# Runs via ssh + sbatch - - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - -if [ -z "${service_load_env}" ]; then - service_conda_sh=${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh - 
service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" -fi - -eval "${service_load_env}" - -if [ -z $(which jupyter-notebook 2> /dev/null) ]; then - displayErrorMessage "jupyter-notebook command not found" -fi - -echo "starting notebook on $service_port..." - -export XDG_RUNTIME_DIR="" - -# Generate sha: -if [ -z "${service_password}" ]; then - echo "No password was specified" - sha="" -else - echo "Generating sha" - sha=$(python3 -c "from notebook.auth.security import passwd; print(passwd('${service_password}', algorithm = 'sha1'))") -fi -# Set the launch directory for JupyterHub -# If notebook_dir is not set or set to a templated value, -# use the default value of "/". -if [ -z ${service_notebook_dir} ]; then - service_notebook_dir="/" -fi - -jupyter_major_version=$(jupyter notebook --version | cut -d'.' -f1) - -echo "Jupyter version is" -jupyter notebook --version - -if [ "${jupyter_major_version}" -lt 7 ]; then - -# Custom PW plugin: -mkdir -p pw_jupyter_proxy -cat >> pw_jupyter_proxy/__init__.py < cancel.sh -chmod +x cancel.sh -jupyterserver_port=$(findAvailablePort) - -####################### -# START NGINX WRAPPER # -####################### - -echo "Starting nginx wrapper on service port ${service_port}" - -# Write config file -cat >> config.conf <> nginx.conf </dev/null && which docker >/dev/null 2>&1; then - container_name="nginx-${service_port}" - # Remove container when job is canceled - echo "sudo docker stop ${container_name}" >> cancel.sh - echo "sudo docker rm ${container_name}" >> cancel.sh - # Start container - sudo service docker start - touch empty - touch nginx.logs - # change ownership to nginx user - sudo chown 101:101 nginx.conf config.conf empty nginx.logs - sudo chmod 644 *.conf - sudo docker run -d --name ${container_name} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v $PWD/empty:/etc/nginx/conf.d/default.conf \ - -v 
$PWD/nginx.logs:/var/log/nginx/access.log \ - -v $PWD/nginx.logs:/var/log/nginx/error.log \ - --network=host nginxinc/nginx-unprivileged:1.25.3 - # Print logs - sudo docker logs ${container_name} -elif which singularity >/dev/null 2>&1; then - echo "Running singularity container ${service_nginx_sif}" - # We need to mount $PWD/tmp:/tmp because otherwise nginx writes the file /tmp/nginx.pid - # and other users cannot use the node. Was not able to change this in the config.conf. - mkdir -p ./tmp - # Need to overwrite default configuration! - touch empty - singularity run -B $PWD/tmp:/tmp -B $PWD/config.conf:/etc/nginx/conf.d/config.conf -B $PWD/nginx.conf:/etc/nginx/nginx.conf -B empty:/etc/nginx/conf.d/default.conf ${service_nginx_sif} >> nginx.logs 2>&1 & - pid=$! - echo "kill ${pid}" >> cancel.sh -else - displayErrorMessage "Need Docker or Singularity to start NGINX proxy" -fi - - - -export JUPYTER_CONFIG_DIR=${PWD} -jupyter notebook --generate-config - -########################## -# Do not change anything # -########################## -# Default: '' -#sed -i "s|^.*c\.ExtensionApp\.default_url.*|c.ExtensionApp.default_url = '${basepath}'|" jupyter_notebook_config.py - -# Default: '/lab' -#sed -i "s|^.*c\.JupyterNotebookApp\.app_url.*|c.JupyterNotebookApp.app_url = '${basepath}/tree'|" jupyter_notebook_config.py - -# Default: '/tree' -#sed -i "s|^.*c\.JupyterNotebookApp\.default_url.*|c.JupyterNotebookApp.default_url = '${basepath}/tree'|" jupyter_notebook_config.py - -## Url where the static assets for the extension are served. 
-# See also: ExtensionApp.static_url_prefix -#sed -i "s|^.*c\.JupyterNotebookApp\.static_url_prefix.*|c.JupyterNotebookApp.static_url_prefix = '${basepath}/static'|" jupyter_notebook_config.py - -## The default URL to redirect to from \`/\` -# Default: '/' -#sed -i "s|^.*c\.ServerApp\.default_url.*|c.ServerApp.default_url = '${basepath}/'|" jupyter_notebook_config.py - -## Supply overrides for the tornado.web.Application that the Jupyter server uses. -# Default: {} -#c.ServerApp.tornado_settings = {c_ServerApp_tornado_settings} -#sed -i "s|^.*c\.ServerApp\.tornado_settings .*|c.ServerApp.tornado_settings = {\"static_url_prefix\":\"${basepath}/static/\"}|" jupyter_notebook_config.py - -## Whether to trust or not X-Scheme/X-Forwarded-Proto and X-Real-Ip/X-Forwarded- -# For headerssent by the upstream reverse proxy. Necessary if the proxy handles -# SSL -# Default: False -sed -i "s|^.*c\.ServerApp\.trust_xheaders.*|c.ServerApp.trust_xheaders = True|" jupyter_notebook_config.py - -## Set the Access-Control-Allow-Origin header -# -# Use '*' to allow any origin to access your server. -# -# Takes precedence over allow_origin_pat. -# Default: '' -sed -i "s|^.*c\.ServerApp\.allow_origin\ =.*|c.ServerApp.allow_origin = '\*'|" jupyter_notebook_config.py - -## Allow requests where the Host header doesn't point to a local server -# -# By default, requests get a 403 forbidden response if the 'Host' header -# shows that the browser thinks it's on a non-local domain. -# Setting this option to True disables this check. -# -# This protects against 'DNS rebinding' attacks, where a remote web server -# serves you a page and then changes its DNS to send later requests to a -# local IP, bypassing same-origin checks. -# -# Local IP addresses (such as 127.0.0.1 and ::1) are allowed as local, -# along with hostnames configured in local_hostnames. 
-# Default: False -sed -i "s|^.*c\.ServerApp\.allow_remote_access.*|c.ServerApp.allow_remote_access = True|" jupyter_notebook_config.py - -############################ -############################ - -sed -i "s|^.*c\.ServerApp\.token.*|c.ServerApp.token = ''|" jupyter_notebook_config.py - -sed -i "s|^.*c\.ServerApp\.root_dir.*|c.ServerApp.root_dir = '${service_notebook_dir}'|" jupyter_notebook_config.py - -## The base URL for the Jupyter server. -# -# Leading and trailing slashes can be omitted, -# and will automatically be added. -# Default: '/' -# Breaks in combination with the commented ones above -# This one is the only one that sets the base_url when you tunnel to laptop -sed -i "s|^.*c\.ServerApp\.base_url.*|c.ServerApp.base_url = '${basepath}/'|" jupyter_notebook_config.py -#sed -i "s|^.*c\.ServerApp\.base_url.*|c.ServerApp.base_url = '${basepath}/'|" jupyter_notebook_config.py -date -jupyter-notebook --port=${jupyterserver_port} --no-browser --config=${PWD}/jupyter_notebook_config.py - -fi - - -sleep 999999999 diff --git a/jupyter-host/transfer_files.sh b/jupyter-host/transfer_files.sh deleted file mode 100644 index d9468c2dd..000000000 --- a/jupyter-host/transfer_files.sh +++ /dev/null @@ -1,7 +0,0 @@ - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${resource_workdir}/pw/software -fi - -rsync -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" -avzq --rsync-path="mkdir -p ${service_parent_install_dir} && rsync" ${pw_job_dir}/${service_name}/*.yaml ${resource_publicIp}:${resource_jobdir} - diff --git a/jupyter-host/url.sh b/jupyter-host/url.sh deleted file mode 100755 index 39545b11f..000000000 --- a/jupyter-host/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="tree?dt=\"+(new Date()).getTime()" \ No newline at end of file diff --git a/jupyter-singularity/tensorflow_latest-gpu-jupyter-extra.def b/jupyter-singularity/tensorflow_latest-gpu-jupyter-extra.def deleted file mode 100644 index b10979c1f..000000000 
--- a/jupyter-singularity/tensorflow_latest-gpu-jupyter-extra.def +++ /dev/null @@ -1,11 +0,0 @@ -BootStrap: docker -From: tensorflow/tensorflow:latest-gpu-jupyter -%help - This Singularity definition contains a TensorFlow-gpu installation -%post - pip install scipy six numpy pandas matplotlib scikit-learn netCDF4 zarr jupyterlab -%environment - export LC_ALL=C -%runscript -%labels - Author Alvaro.Vidal \ No newline at end of file diff --git a/jupyter-singularity/url.sh b/jupyter-singularity/url.sh deleted file mode 100644 index 39545b11f..000000000 --- a/jupyter-singularity/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="tree?dt=\"+(new Date()).getTime()" \ No newline at end of file diff --git a/jupyterhub-host/controller-v3.sh b/jupyterhub-host/controller-v3.sh deleted file mode 100644 index 3c0ede095..000000000 --- a/jupyterhub-host/controller-v3.sh +++ /dev/null @@ -1,164 +0,0 @@ -cd ${resource_jobdir} - - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -service_conda_sh=${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - - -displayErrorMessage() { - echo $(date): $1 - exit 1 -} - -f_install_miniconda() { - install_dir=$1 - if [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing Miniconda3-latest-Linux-x86_64.sh" - conda_repo="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" - else - conda_repo="https://repo.anaconda.com/miniconda/Miniconda3-py312_24.9.2-0-Linux-x86_64.sh" - fi - ID=$(date +%s)-${RANDOM} # This script may run at the same time! 
- nohup wget --no-check-certificate ${conda_repo} -O /tmp/miniconda-${ID}.sh 2>&1 > /tmp/miniconda_wget-${ID}.out - rm -rf ${install_dir} - mkdir -p $(dirname ${install_dir}) - nohup bash /tmp/miniconda-${ID}.sh -b -p ${install_dir} 2>&1 > /tmp/miniconda_sh-${ID}.out -} - -f_set_up_conda_from_yaml() { - CONDA_DIR=$1 - CONDA_ENV=$2 - CONDA_YAML=$3 - CONDA_SH="${CONDA_DIR}/etc/profile.d/conda.sh" - # conda env export - # Remove line starting with name, prefix and remove empty lines - sed -i -e '/^name:/d' -e '/^prefix:/d' -e '/^$/d' ${CONDA_YAML} - - if [ ! -d "${CONDA_DIR}" ]; then - echo "Conda directory <${CONDA_DIR}> not found. Installing conda..." - f_install_miniconda ${CONDA_DIR} - fi - - echo "Sourcing Conda SH <${CONDA_SH}>" - source ${CONDA_SH} - - # Check if Conda environment exists - if ! conda env list | grep -q "${CONDA_ENV}"; then - echo "Creating Conda Environment <${CONDA_ENV}>" - conda create --name ${CONDA_ENV} - fi - - echo "Activating Conda Environment <${CONDA_ENV}>" - conda activate ${CONDA_ENV} - - echo "Installing condda environment from YAML" - conda env update -n ${CONDA_ENV} -f ${CONDA_YAML} -} - - -download_singularity_container() { - # 1. Clone the repository with --no-checkout - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' - # Needed for emed - git config --global --unset http.sslbackend - git clone --no-checkout https://github.com/parallelworks/interactive_session.git - - # 2. Navigate into the repository directory - cd interactive_session - #git checkout download-dependencies - - # 3. Initialize sparse-checkout - git sparse-checkout init - - # 4. Configure sparse-checkout to include only the desired file - echo downloads/jupyter/nginx-unprivileged.sif > .git/info/sparse-checkout - - # 5. Perform the checkout - git checkout - - # 6. Extract tgz - cp downloads/jupyter/nginx-unprivileged.sif ${service_nginx_sif} - - # 7. 
Clean - cd ../ - rm -rf interactive_session - -} - - - -if [[ "${service_conda_install}" == "true" ]]; then - - if [[ "${service_install_instructions}" == "install_command" ]]; then - echo "Running install command ${service_install_command}" - eval ${service_install_command} - elif [[ "${service_install_instructions}" == "yaml" ]]; then - echo "Installing custom conda environment" - printf "%b" "${service_yaml}" > conda.yaml - cat conda.yaml - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} conda.yaml - elif [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing latest" - { - source ${service_conda_sh} - } || { - conda_dir=$(echo ${service_conda_sh} | sed "s|etc/profile.d/conda.sh||g" ) - f_install_miniconda ${conda_dir} - source ${service_conda_sh} - } - { - eval "conda activate ${service_conda_env}" - } || { - conda create -n ${service_conda_env} jupyter -y - eval "conda activate ${service_conda_env}" - } - if [ -z $(which jupyterhub 2> /dev/null) ]; then - conda install -c conda-forge jupyterhub notebook -y - conda install -c conda-forge jupyterhub-systemd-spawner -y - pip install jupyterhub-nativeauthenticator - fi - else - echo "Installing conda environment ${service_install_instructions}.yaml" - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} ${service_install_instructions}.yaml - fi - if [ -z ${service_load_env} ]; then - service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" - fi -fi -eval "${service_load_env}" - -if [ -z $(which jupyterhub 2> /dev/null) ]; then - displayErrorMessage "jupyterhub command not found" -fi - -if [[ "${service_conda_install}" != "true" ]]; then - exit 0 -fi - - -if [[ $service_install_kernels == *"julia-kernel"* ]]; then - if [ -z $(which julia 2> /dev/null) ]; then - curl -fsSL https://install.julialang.org | sh -s -- -y - source ~/.bashrc - source ~/.bash_profile - 
julia -e 'using Pkg; Pkg.add("IJulia")' - fi -fi - -if [[ $service_install_kernels == *"R-kernel"* ]]; then - conda install r-recommended r-irkernel -y - R -e 'IRkernel::installspec()' -fi - - -# Download singularity container if required -if ! [ -f "${service_nginx_sif}" ]; then - echo; echo "Downloading nginx singularity from Github" - download_singularity_container -fi diff --git a/jupyterhub-host/jupyterhub5.2.1-python3.13.1.yaml b/jupyterhub-host/jupyterhub5.2.1-python3.13.1.yaml deleted file mode 100644 index 6e5ac198c..000000000 --- a/jupyterhub-host/jupyterhub5.2.1-python3.13.1.yaml +++ /dev/null @@ -1,215 +0,0 @@ -name: base -channels: - - defaults - - conda-forge - - https://repo.anaconda.com/pkgs/main - - https://repo.anaconda.com/pkgs/r -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - alembic=1.14.1=pyhd8ed1ab_0 - - anaconda-anon-usage=0.5.0=py312hfc0e8ea_100 - - annotated-types=0.7.0=pyhd8ed1ab_1 - - anyio=4.8.0=pyhd8ed1ab_0 - - archspec=0.2.3=pyhd3eb1b0_0 - - argon2-cffi=23.1.0=pyhd8ed1ab_1 - - argon2-cffi-bindings=21.2.0=py312h66e93f0_5 - - arrow=1.3.0=pyhd8ed1ab_1 - - asttokens=3.0.0=pyhd8ed1ab_1 - - async-lru=2.0.4=pyhd8ed1ab_1 - - async_generator=1.10=pyhd8ed1ab_2 - - attrs=25.1.0=pyh71513ae_0 - - babel=2.17.0=pyhd8ed1ab_0 - - beautifulsoup4=4.13.3=pyha770c72_0 - - bleach=6.2.0=pyh29332c3_4 - - bleach-with-css=6.2.0=h82add2a_4 - - blinker=1.9.0=pyhff2d567_0 - - boltons=23.0.0=py312h06a4308_0 - - brotli-python=1.0.9=py312h6a678d5_8 - - bzip2=1.0.8=h5eee18b_6 - - c-ares=1.19.1=h5eee18b_0 - - ca-certificates=2025.1.31=hbcca054_0 - - cached-property=1.5.2=hd8ed1ab_1 - - cached_property=1.5.2=pyha770c72_1 - - certifi=2024.12.14=pyhd8ed1ab_0 - - certipy=0.2.1=pyhd8ed1ab_1 - - cffi=1.17.1=py312h1fdaa30_0 - - charset-normalizer=3.3.2=pyhd3eb1b0_0 - - comm=0.2.2=pyhd8ed1ab_1 - - conda=25.1.1=py312h7900ff3_0 - - conda-anaconda-telemetry=0.1.1=py312h06a4308_0 - - conda-content-trust=0.2.0=py312h06a4308_1 - - 
conda-libmamba-solver=24.9.0=pyhd3eb1b0_0 - - conda-package-handling=2.4.0=py312h06a4308_0 - - conda-package-streaming=0.11.0=py312h06a4308_0 - - configurable-http-proxy=4.6.2=h92b4e83_1 - - cryptography=43.0.3=py312h7825ff9_1 - - debugpy=1.8.12=py312h2ec8cdc_0 - - decorator=5.1.1=pyhd8ed1ab_1 - - defusedxml=0.7.1=pyhd8ed1ab_0 - - distro=1.9.0=py312h06a4308_0 - - exceptiongroup=1.2.2=pyhd8ed1ab_1 - - executing=2.1.0=pyhd8ed1ab_1 - - expat=2.6.4=h6a678d5_0 - - fmt=9.1.0=hdb19cb5_1 - - fqdn=1.5.1=pyhd8ed1ab_1 - - frozendict=2.4.2=py312h06a4308_0 - - greenlet=3.1.1=py312h2ec8cdc_1 - - h11=0.14.0=pyhd8ed1ab_1 - - h2=4.2.0=pyhd8ed1ab_0 - - hpack=4.1.0=pyhd8ed1ab_0 - - httpcore=1.0.7=pyh29332c3_1 - - httpx=0.28.1=pyhd8ed1ab_0 - - hyperframe=6.1.0=pyhd8ed1ab_0 - - icu=73.1=h6a678d5_0 - - idna=3.7=py312h06a4308_0 - - importlib-metadata=8.6.1=pyha770c72_0 - - importlib_resources=6.5.2=pyhd8ed1ab_0 - - ipykernel=6.29.5=pyh3099207_0 - - ipython=8.32.0=pyh907856f_0 - - isoduration=20.11.0=pyhd8ed1ab_1 - - jedi=0.19.2=pyhd8ed1ab_1 - - jinja2=3.1.5=pyhd8ed1ab_0 - - json5=0.10.0=pyhd8ed1ab_1 - - jsonpatch=1.33=py312h06a4308_1 - - jsonpointer=2.1=pyhd3eb1b0_0 - - jsonschema=4.23.0=pyhd8ed1ab_1 - - jsonschema-specifications=2024.10.1=pyhd8ed1ab_1 - - jsonschema-with-format-nongpl=4.23.0=hd8ed1ab_1 - - jupyter-lsp=2.2.5=pyhd8ed1ab_1 - - jupyter_client=8.6.3=pyhd8ed1ab_1 - - jupyter_core=5.7.2=pyh31011fe_1 - - jupyter_events=0.12.0=pyh29332c3_0 - - jupyter_server=2.15.0=pyhd8ed1ab_0 - - jupyter_server_terminals=0.5.3=pyhd8ed1ab_1 - - jupyterhub=5.2.1=pyh31011fe_0 - - jupyterhub-base=5.2.1=pyh31011fe_0 - - jupyterlab=4.3.5=pyhd8ed1ab_0 - - jupyterlab_pygments=0.3.0=pyhd8ed1ab_2 - - jupyterlab_server=2.27.3=pyhd8ed1ab_1 - - krb5=1.20.1=h143b758_1 - - ld_impl_linux-64=2.40=h12ee557_0 - - libarchive=3.7.4=hfab0078_0 - - libcurl=8.9.1=h251f7ec_0 - - libedit=3.1.20230828=h5eee18b_0 - - libev=4.33=h7f8727e_1 - - libexpat=2.6.4=h5888daf_0 - - libffi=3.4.4=h6a678d5_1 - - 
libgcc=14.2.0=h77fa898_1 - - libgcc-ng=14.2.0=h69a702a_1 - - libgomp=14.2.0=h77fa898_1 - - libmamba=1.5.11=hfe524e5_0 - - libmambapy=1.5.11=py312haf1ee3a_0 - - libnghttp2=1.57.0=h2d74bed_0 - - libnsl=2.0.1=hd590300_0 - - libsodium=1.0.18=h36c2ea0_1 - - libsolv=0.7.24=he621ea3_1 - - libsqlite=3.46.0=hde9e2c9_0 - - libssh2=1.11.1=h251f7ec_0 - - libstdcxx=14.2.0=hc0a3c3a_1 - - libstdcxx-ng=11.2.0=h1234567_1 - - libuuid=2.38.1=h0b41bf4_0 - - libuv=1.48.0=hd590300_0 - - libxcrypt=4.4.36=hd590300_1 - - libxml2=2.13.5=hfdd30dd_0 - - libzlib=1.2.13=h4ab18f5_6 - - lz4-c=1.9.4=h6a678d5_1 - - mako=1.3.9=pyhd8ed1ab_0 - - markupsafe=3.0.2=py312h178313f_1 - - matplotlib-inline=0.1.7=pyhd8ed1ab_1 - - menuinst=2.2.0=py312h06a4308_0 - - mistune=3.1.1=pyhd8ed1ab_0 - - nbclient=0.10.2=pyhd8ed1ab_0 - - nbconvert-core=7.16.6=pyh29332c3_0 - - nbformat=5.10.4=pyhd8ed1ab_1 - - ncurses=6.4=h6a678d5_0 - - nest-asyncio=1.6.0=pyhd8ed1ab_1 - - nodejs=20.17.0=hb8e3597_0 - - notebook=7.3.2=pyhd8ed1ab_0 - - notebook-shim=0.2.4=pyhd8ed1ab_1 - - oauthlib=3.2.2=pyhd8ed1ab_1 - - openssl=3.4.0=h7b32b05_1 - - overrides=7.7.0=pyhd8ed1ab_1 - - packaging=24.1=py312h06a4308_0 - - pamela=1.2.0=pyhd8ed1ab_1 - - pandocfilters=1.5.0=pyhd8ed1ab_0 - - parso=0.8.4=pyhd8ed1ab_1 - - pcre2=10.42=hebb0a14_1 - - pexpect=4.9.0=pyhd8ed1ab_1 - - pickleshare=0.7.5=pyhd8ed1ab_1004 - - pip=24.2=py312h06a4308_0 - - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_2 - - platformdirs=3.10.0=py312h06a4308_0 - - pluggy=1.5.0=py312h06a4308_0 - - prometheus_client=0.21.1=pyhd8ed1ab_0 - - prompt-toolkit=3.0.50=pyha770c72_0 - - psutil=6.1.1=py312h66e93f0_0 - - ptyprocess=0.7.0=pyhd8ed1ab_1 - - pure_eval=0.2.3=pyhd8ed1ab_1 - - pybind11-abi=5=hd3eb1b0_0 - - pycosat=0.6.6=py312h5eee18b_1 - - pycparser=2.21=pyhd3eb1b0_0 - - pycurl=7.45.3=py312h6ac1089_2 - - pydantic=2.10.6=pyh3cfb1c2_0 - - pydantic-core=2.27.2=py312h12e396e_0 - - pygments=2.19.1=pyhd8ed1ab_0 - - pyjwt=2.10.1=pyhd8ed1ab_0 - - pysocks=1.7.1=py312h06a4308_0 - - 
python=3.12.2=hab00c5b_0_cpython - - python-dateutil=2.9.0.post0=pyhff2d567_1 - - python-fastjsonschema=2.21.1=pyhd8ed1ab_0 - - python-json-logger=2.0.7=pyhd8ed1ab_0 - - python_abi=3.12=5_cp312 - - pytz=2025.1=pyhd8ed1ab_0 - - pyyaml=6.0.2=py312h178313f_2 - - pyzmq=26.2.0=py312h6a678d5_0 - - readline=8.2=h5eee18b_0 - - referencing=0.36.2=pyh29332c3_0 - - reproc=14.2.4=h6a678d5_2 - - reproc-cpp=14.2.4=h6a678d5_2 - - requests=2.32.3=py312h06a4308_1 - - rfc3339-validator=0.1.4=pyhd8ed1ab_1 - - rfc3986-validator=0.1.1=pyh9f0ad1d_0 - - rpds-py=0.22.3=py312h12e396e_0 - - ruamel.yaml=0.18.6=py312h5eee18b_0 - - ruamel.yaml.clib=0.2.8=py312h5eee18b_0 - - send2trash=1.8.3=pyh0d859eb_1 - - setuptools=75.1.0=py312h06a4308_0 - - six=1.17.0=pyhd8ed1ab_0 - - sniffio=1.3.1=pyhd8ed1ab_1 - - soupsieve=2.5=pyhd8ed1ab_1 - - sqlalchemy=2.0.37=py312h66e93f0_0 - - sqlite=3.45.3=h5eee18b_0 - - stack_data=0.6.3=pyhd8ed1ab_1 - - terminado=0.18.1=pyh0d859eb_0 - - tinycss2=1.4.0=pyhd8ed1ab_0 - - tk=8.6.14=h39e8969_0 - - tomli=2.2.1=pyhd8ed1ab_1 - - tornado=6.4.2=py312h66e93f0_0 - - tqdm=4.66.5=py312he106c6f_0 - - traitlets=5.14.3=pyhd8ed1ab_1 - - truststore=0.8.0=py312h06a4308_0 - - types-python-dateutil=2.9.0.20241206=pyhd8ed1ab_0 - - typing-extensions=4.12.2=hd8ed1ab_1 - - typing_extensions=4.12.2=pyha770c72_1 - - typing_utils=0.1.0=pyhd8ed1ab_1 - - tzdata=2024b=h04d1e81_0 - - uri-template=1.3.0=pyhd8ed1ab_1 - - urllib3=2.2.3=py312h06a4308_0 - - wcwidth=0.2.13=pyhd8ed1ab_1 - - webcolors=24.11.1=pyhd8ed1ab_0 - - webencodings=0.5.1=pyhd8ed1ab_3 - - websocket-client=1.8.0=pyhd8ed1ab_1 - - wheel=0.44.0=py312h06a4308_0 - - xz=5.4.6=h5eee18b_1 - - yaml=0.2.5=h7f98852_2 - - yaml-cpp=0.8.0=h6a678d5_1 - - zeromq=4.3.5=h6a678d5_0 - - zipp=3.21.0=pyhd8ed1ab_1 - - zlib=1.2.13=h4ab18f5_6 - - zstandard=0.23.0=py312h2c38b39_1 - - zstd=1.5.6=hc292b87_0 - - pip: - - bcrypt==4.2.1 - - jupyterhub-nativeauthenticator==1.3.0 - - onetimepass==1.0.1 diff --git a/jupyterhub-host/kill-template.sh 
b/jupyterhub-host/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/jupyterhub-host/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/jupyterhub-host/start-template-v3.sh b/jupyterhub-host/start-template-v3.sh deleted file mode 100755 index 777f876f4..000000000 --- a/jupyterhub-host/start-template-v3.sh +++ /dev/null @@ -1,162 +0,0 @@ -# Runs via ssh + sbatch -set -x - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - -if [ -z "${service_load_env}" ]; then - service_conda_sh=${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh - service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" -fi - -eval "${service_load_env}" - -# Initialize cancel script -echo '#!/bin/bash' > cancel.sh -chmod +x cancel.sh -jupyterhub_port=$(findAvailablePort) - -if [[ "${service_conda_install}" == "true" ]]; then - source ${service_conda_sh} - eval "conda activate ${service_conda_env}" -else - eval "${service_load_env}" -fi - -if [ -z $(which jupyterhub 2> /dev/null) ]; then - displayErrorMessage "jupyterhub command not found" -fi - -####################### -# START NGINX WRAPPER # -####################### - -echo "Starting nginx wrapper on service port ${service_port}" - -# Write config file -cat >> config.conf <> nginx.conf </dev/null && which docker >/dev/null 2>&1; then - container_name="nginx-${service_port}" - # Remove container when job is canceled - echo "sudo docker stop ${container_name}" >> cancel.sh - echo "sudo docker rm ${container_name}" >> cancel.sh - # Start container - sudo service docker start - touch empty - touch nginx.logs - # change ownership to nginx user - sudo chown 101:101 nginx.conf config.conf empty nginx.logs - sudo chmod 644 *.conf 
- sudo docker run -d --name ${container_name} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v $PWD/empty:/etc/nginx/conf.d/default.conf \ - -v $PWD/nginx.logs:/var/log/nginx/access.log \ - -v $PWD/nginx.logs:/var/log/nginx/error.log \ - --network=host nginxinc/nginx-unprivileged:1.25.3 - # Print logs - sudo docker logs ${container_name} -elif which singularity >/dev/null 2>&1; then - echo "Running singularity container ${service_nginx_sif}" - # We need to mount $PWD/tmp:/tmp because otherwise nginx writes the file /tmp/nginx.pid - # and other users cannot use the node. Was not able to change this in the config.conf. - mkdir -p ./tmp - # Need to overwrite default configuration! - touch empty - singularity run -B $PWD/tmp:/tmp -B $PWD/config.conf:/etc/nginx/conf.d/config.conf -B $PWD/nginx.conf:/etc/nginx/nginx.conf -B empty:/etc/nginx/conf.d/default.conf ${service_nginx_sif} >> nginx.logs 2>&1 & - pid=$! - echo "kill ${pid}" >> cancel.sh -else - displayErrorMessage "Need Docker or Singularity to start NGINX proxy" -fi - -#################### -# START JUPYTERHUB # -#################### -jupyterhub_hubport=$(findAvailablePort) - - -export JUPYTER_CONFIG_DIR=${PWD} -jupyterhub --generate-config - -sed -i "s|^.*c\.Authenticator\.whitelist.*|c.Authenticator.whitelist = set()|" jupyterhub_config.py -sed -i "s|^.*c\.Authenticator\.allow_all.*|c.Authenticator.allow_all = True|" jupyterhub_config.py -sed -i "s|^.*c\.Authenticator\.admin_users.*|c.Authenticator.admin_users = {'${USER}'}|" jupyterhub_config.py -#sed -i "s|^.*c\.Authenticator\.allowed_users.*|c.Authenticator.allowed_users = set()|" jupyterhub_config.py -sed -i "s|^.*c\.JupyterHub\.authenticator_class.*|c.JupyterHub.authenticator_class = 'native'|" jupyterhub_config.py -sed -i "s|^.*c\.JupyterHub\.port.*|c.JupyterHub.port = ${jupyterhub_port}|" jupyterhub_config.py -sed -i "s|^.*c\.JupyterHub\.hub_port.*|c.JupyterHub.hub_port = ${jupyterhub_hubport}|" 
jupyterhub_config.py -sed -i "s|^.*c\.JupyterHub\.base_url.*|c.JupyterHub.base_url = \'${basepath}/\'|" jupyterhub_config.py -# This link only partially works to embed JupyterHub in an Iframe -# https://discourse.jupyter.org/t/open-jupyterhub-application-in-iframe/10430 -#sed -i "s|^.*c\.JupyterHub\.tornado_settings.*|c.JupyterHub.tornado_settings = {\"static_url_prefix\":\"${basepath}/static/\"}|" jupyterhub_config.py - - -sudo bash -c "source ${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh; conda activate ${service_conda_env}; jupyterhub -f jupyterhub_config.py" - -sleep inf diff --git a/jupyterhub-host/transfer_files.sh b/jupyterhub-host/transfer_files.sh deleted file mode 100644 index 3d27d77d3..000000000 --- a/jupyterhub-host/transfer_files.sh +++ /dev/null @@ -1,6 +0,0 @@ -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${resource_workdir}/pw/software -fi - -rsync -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" -avzq --rsync-path="mkdir -p ${service_parent_install_dir} && rsync" ${pw_job_dir}/${service_name}/*.yaml ${resource_publicIp}:${resource_jobdir} - diff --git a/jupyterhub-host/url.sh b/jupyterhub-host/url.sh deleted file mode 100755 index d8d45d08a..000000000 --- a/jupyterhub-host/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="lab\"" \ No newline at end of file diff --git a/jupyterlab-host/controller-v3.sh b/jupyterlab-host/controller-v3.sh deleted file mode 100644 index 9428cd6e3..000000000 --- a/jupyterlab-host/controller-v3.sh +++ /dev/null @@ -1,232 +0,0 @@ -cd ${resource_jobdir} - - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -service_conda_sh=${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - - -displayErrorMessage() { - echo $(date): $1 - exit 1 -} - 
-f_install_miniconda() { - install_dir=$1 - if [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing Miniconda3-latest-Linux-x86_64.sh" - conda_repo="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" - else - conda_repo="https://repo.anaconda.com/miniconda/Miniconda3-py312_24.9.2-0-Linux-x86_64.sh" - fi - ID=$(date +%s)-${RANDOM} # This script may run at the same time! - nohup wget --no-check-certificate ${conda_repo} -O /tmp/miniconda-${ID}.sh 2>&1 > /tmp/miniconda_wget-${ID}.out - rm -rf ${install_dir} - mkdir -p $(dirname ${install_dir}) - nohup bash /tmp/miniconda-${ID}.sh -b -p ${install_dir} 2>&1 > /tmp/miniconda_sh-${ID}.out - source ${install_dir}/etc/profile.d/conda.sh - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r -} - -f_set_up_conda_from_yaml() { - CONDA_DIR=$1 - CONDA_ENV=$2 - CONDA_YAML=$3 - CONDA_SH="${CONDA_DIR}/etc/profile.d/conda.sh" - # conda env export - # Remove line starting with name, prefix and remove empty lines - sed -i -e '/^name:/d' -e '/^prefix:/d' -e '/^$/d' ${CONDA_YAML} - - if [ ! -d "${CONDA_DIR}" ]; then - echo "Conda directory <${CONDA_DIR}> not found. Installing conda..." - f_install_miniconda ${CONDA_DIR} - fi - - echo "Sourcing Conda SH <${CONDA_SH}>" - source ${CONDA_SH} - - # Check if Conda environment exists - if ! conda env list | grep -q "${CONDA_ENV}"; then - echo "Creating Conda Environment <${CONDA_ENV}>" - conda create --name ${CONDA_ENV} - fi - - echo "Activating Conda Environment <${CONDA_ENV}>" - conda activate ${CONDA_ENV} - - echo "Installing condda environment from YAML" - conda env update -n ${CONDA_ENV} -f ${CONDA_YAML} -} - - -download_singularity_container() { - # 1. 
Clone the repository with --no-checkout - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' - # Needed for emed - git config --global --unset http.sslbackend - git clone --no-checkout https://github.com/parallelworks/interactive_session.git - - # 2. Navigate into the repository directory - cd interactive_session - #git checkout download-dependencies - - # 3. Initialize sparse-checkout - git sparse-checkout init - - # 4. Configure sparse-checkout to include only the desired file - echo downloads/jupyter/nginx-unprivileged.sif > .git/info/sparse-checkout - - # 5. Perform the checkout - git checkout - - # 6. Move - mv downloads/jupyter/nginx-unprivileged.sif ${service_nginx_sif} - - # 7. Clean - cd ../ - rm -rf interactive_session - -} - -download_and_install_juice() { - # Configuration - local OUTPUT_FILE="juice.tgz" - - # Step 1: Get download URL from JuiceLabs API - echo "Fetching JuiceLabs download URL..." - download=$(curl -s 'https://electra.juicelabs.co/v2/public/download/linux' | python3 -c "import sys, json; print(json.load(sys.stdin)['url'])") - - - if [ -z "$download" ]; then - echo "ERROR: Download URL is empty" - exit 1 - fi - echo "Found download URL: $download" - - # Step 2: Prepare install directory - mkdir -p "${juice_install_dir}" - cd "${juice_install_dir}" || exit 1 - - # Step 3: Install prerequisites - sudo dnf install -y wget libatomic numactl-libs || { - echo "ERROR: Failed to install dependencies" - exit 1 - } - - # Step 4: Download Juice agent - echo "Downloading Juice agent..." - wget -O "$OUTPUT_FILE" "$download" || { - echo "ERROR: Failed to download file" - exit 1 - } - - # Step 5: Extract archive - echo "Extracting Juice agent..." 
- tar -xzvf "$OUTPUT_FILE" || { - echo "ERROR: Failed to extract $OUTPUT_FILE" - exit 1 - } - - echo "Juice agent successfully installed in ${juice_install_dir}" -} - - -if [[ "${service_conda_install}" == "true" ]]; then - - if [[ "${service_install_instructions}" == "install_command" ]]; then - echo "Running install command ${service_install_command}" - eval ${service_install_command} - elif [[ "${service_install_instructions}" == "yaml" ]]; then - echo "Installing custom conda environment" - printf "%b" "${service_yaml}" > conda.yaml - cat conda.yaml - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} conda.yaml - elif [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing latest" - { - source ${service_conda_sh} - } || { - conda_dir=$(echo ${service_conda_sh} | sed "s|etc/profile.d/conda.sh||g" ) - f_install_miniconda ${conda_dir} - source ${service_conda_sh} - } - { - eval "conda activate ${service_conda_env}" - } || { - conda create -n ${service_conda_env} jupyter -y - eval "conda activate ${service_conda_env}" - } - if [ -z $(which jupyter-lab 2> /dev/null) ]; then - conda install -c conda-forge jupyterlab -y - conda install nb_conda_kernels -y - conda install -c anaconda jinja2 -y - pip install ipywidgets - # Check if SLURM is installed - if command -v sinfo &> /dev/null; then - # SLURM extension for Jupyter Lab https://github.com/NERSC/jupyterlab-slurm - pip install jupyterlab_slurm - fi - fi - else - echo "Installing conda environment ${service_install_instructions}.yaml" - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} ${service_install_instructions}.yaml - fi - if [ -z "${service_load_env}" ]; then - service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" - fi -fi -eval "${service_load_env}" - -if [ -z $(which jupyter-lab 2> /dev/null) ]; then - displayErrorMessage "jupyter-lab command not 
found" -fi - - -# Download singularity container if required -if ! [ -f "${service_nginx_sif}" ]; then - echo; echo "Downloading nginx singularity from Github" - download_singularity_container -fi - -# Juice -if [[ "${juice_use_juice}" == "true" ]]; then - if [ -z "${juice_exec}" ]; then - juice_install_dir=${service_parent_install_dir}/juice - juice_exec=${service_parent_install_dir}/juice/juice - if ! [ -f ${juice_exec} ]; then - echo "INFO: Installing Juice" - mkdir -p ${juice_install_dir} - download_and_install_juice - fi - if ! [ -f ${juice_exec} ]; then - echo "ERROR: Juice installation failed" - exit 1 - fi - fi -fi - - -if [[ "${service_conda_install}" != "true" ]]; then - exit 0 -fi - - -if [[ $service_install_kernels == *"julia-kernel"* ]]; then - if [ -z $(which julia 2> /dev/null) ]; then - curl -fsSL https://install.julialang.org | sh -s -- -y - source ~/.bashrc - source ~/.bash_profile - julia -e 'using Pkg; Pkg.add("IJulia")' - fi -fi - -if [[ $service_install_kernels == *"R-kernel"* ]]; then - conda install r-recommended r-irkernel -y - R -e 'IRkernel::installspec()' -fi diff --git a/jupyterlab-host/dask-extension-jupyterlab-demo.ipynb b/jupyterlab-host/dask-extension-jupyterlab-demo.ipynb deleted file mode 100644 index 35bdec963..000000000 --- a/jupyterlab-host/dask-extension-jupyterlab-demo.ipynb +++ /dev/null @@ -1,656 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "01478796-80e0-49bb-b87c-c5b2e8dff08e", - "metadata": {}, - "source": [ - "# Interactive Dask on SLURMCluster: JupyterLab Tutorial for Distributed Data Processing and AWS Integration" - ] - }, - { - "cell_type": "markdown", - "id": "e96bacab-7327-452c-ae5f-2a9436688890", - "metadata": {}, - "source": [ - "This notebook provides a tutorial on running Dask in a SLURMCluster using a JupyterLab interactive session. The steps include:\n", - "\n", - "1. 
SLURM Cluster Configuration: Define a SLURM cluster configuration using the SLURMCluster object, specifying parameters like the compute queue, number of CPU cores per job, and memory allocation. The cluster is then scaled to a desired number of workers using the adapt method.\n", - "\n", - "2. Connect to Dask Cluster: A Dask client is connected to the SLURMCluster, enabling interaction with the Dask computation.\n", - "\n", - "3. Display Dask Dashboard in Jupyter Lab: Utilize the Dask extension for JupyterLab to integrate the Dask Dashboard directly. Instructions are provided to establish the connection to the dashboard.\n", - "\n", - "4. Set AWS Credentials: Set temporary AWS credentials to access an AWS bucket resource defined in the Parallel Works platform, facilitating data transfer.\n", - "\n", - "5. Generate Random Data: Create a Dask DataFrame with randomly generated data, and adjust the number of rows as needed.\n", - "\n", - "6. Write and Read Data to/from AWS Bucket: Write the generated data to the specified AWS bucket and read it back into a Dask DataFrame.\n", - "\n", - "7. Process Data: Perform data processing on the Dask DataFrame, filtering rows and grouping by specific columns.\n", - "\n", - "7. Write Processed Data Back to AWS Bucket: Write the processed data back to the AWS bucket using the to_csv method.\n", - "\n", - "The notebook also provides additional details on connecting to the Dask Dashboard, setting AWS credentials, generating and processing random data, and writing and reading data to and from AWS. The provided code snippets and explanations guide users through each step of the process." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "da54acc5-8487-4d9f-b27c-cde85cefec3c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import dask\n", - "import dask.dataframe as dd\n", - "import pandas as pd\n", - "from faker import Faker\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "f7296057-c6f6-4226-8b1c-82566b704eab", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from dask.distributed import Client\n", - "from dask_jobqueue import SLURMCluster" - ] - }, - { - "cell_type": "markdown", - "id": "70f53bff-e9b4-47e9-93dc-9829de59d8c6", - "metadata": {}, - "source": [ - "### 1. Define SLURM cluster configuration\n", - "In this section, we utilize the [SLURMCluster](https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html) object to deploy Dask within a SLURM cluster. The SLURMCluster is configured with specific parameters, including the compute queue, the number of CPU cores per job, and the memory allocated per job. To facilitate this configuration, the `job_directives_skip` argument is employed, allowing Dask to bypass specific SLURM directives related to memory. It is worth noting that the `--mem` directive needs to be skipped because it is not explicitly defined for the nodes in the SLURM configuration file (`/mnt/shared/etc/slurm/slurm.conf`) of the clusters in the Parallel Works platform." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1351d1ec-9ddb-40fc-bbff-9738ea56b71f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "cluster = SLURMCluster(\n", - " queue = 'compute',\n", - " cores = 2, # Number of CPU cores per job\n", - " memory = '8GB', # Memory per job\n", - " job_directives_skip = ['--mem'], # Adding this argument allows Dask to ignore the memory parameter\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "6533d76e-ebb3-4186-86e4-2b507a6da986", - "metadata": {}, - "source": [ - "Next, the cluster is scaled to a desired number of workers using the adapt method." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b3982ef9-c06c-4ccb-8dbb-808ca6c25e03", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cluster.adapt(\n", - " minimum = 0, \n", - " maximum = 2\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "id": "553fe74c-24da-42a9-ad3a-5487695c3f6e", - "metadata": {}, - "source": [ - "Lastly, a Dask client is connected to the cluster." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "adca036c-6780-4c3d-bdfb-af9ace226fbc", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "
\n", - "
\n", - "

Client

\n", - "

Client-98809071-2988-11ef-85b9-42010a80000e

\n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "
Connection method: Cluster objectCluster type: dask_jobqueue.SLURMCluster
\n", - " Dashboard: http://10.128.0.14:8787/status\n", - "
\n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "

Cluster Info

\n", - "
\n", - "
\n", - "
\n", - "
\n", - "

SLURMCluster

\n", - "

f61c1408

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " Dashboard: http://10.128.0.14:8787/status\n", - " \n", - " Workers: 0\n", - "
\n", - " Total threads: 0\n", - " \n", - " Total memory: 0 B\n", - "
\n", - "\n", - "
\n", - " \n", - "

Scheduler Info

\n", - "
\n", - "\n", - "
\n", - "
\n", - "
\n", - "
\n", - "

Scheduler

\n", - "

Scheduler-9daaa989-21fd-4809-be2f-ae99061f1e60

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " Comm: tcp://10.128.0.14:34977\n", - " \n", - " Workers: 0\n", - "
\n", - " Dashboard: http://10.128.0.14:8787/status\n", - " \n", - " Total threads: 0\n", - "
\n", - " Started: Just now\n", - " \n", - " Total memory: 0 B\n", - "
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "

Workers

\n", - "
\n", - "\n", - " \n", - "\n", - "
\n", - "
\n", - "\n", - "
\n", - "
\n", - "
\n", - "
\n", - " \n", - "\n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "client = Client(cluster)\n", - "client" - ] - }, - { - "cell_type": "markdown", - "id": "0ebfb13e-0d27-4ba8-a412-6be3aba6bf1d", - "metadata": {}, - "source": [ - "### 2. Display the Dask Dashboard in Jupyter Lab\n", - "The [Dask extension for JupyterLab](https://github.com/dask/dask-labextension) comes pre-installed in the Jupyter Lab interactive session. This extension facilitates the integration of the Dask Dashboard directly into JupyterLab, as demonstrated in this accompanying [video](https://www.youtube.com/watch?v=EX_voquHdk0). To establish the connection to the Dashboard, we employ a proxy, and you can find detailed instructions on this setup in the provided [link](https://jobqueue.dask.org/en/stable/interactive.html). \n", - "\n", - "In this case, simply paste the link that is generated below in the DASK DASHBOARD URL search bar." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "c8088b27-5b9d-42c8-b7a8-c47f74f76f9c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from urllib.parse import urlsplit\n", - "port = urlsplit(client.dashboard_link).port\n", - "os.environ['DASHBOARD_PORT'] = str(port)" - ] - }, - { - "cell_type": "markdown", - "id": "d0b0d55a-dec2-491a-8ea5-a7006c46310a", - "metadata": {}, - "source": [ - "**To connect to the Dashboard copy the link below in the DASK DASHBOARD URL search bar and press enter**" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "d72c322f-6197-4943-8bb2-7a9326223bff", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "https://cloud.parallel.works/me/51941/proxy/8787/status\n" - ] - } - ], - "source": [ - "!echo https://cloud.parallel.works/me/$openPort/proxy/$DASHBOARD_PORT/status" - ] - }, - { - "cell_type": "markdown", - "id": 
"6f7fd416-5a28-469f-8b06-a3d81fbbf140", - "metadata": {}, - "source": [ - "### 3. Set AWS credentials\n", - "Storage credentials can by obtained manually or using the PW API Client" - ] - }, - { - "cell_type": "markdown", - "id": "a2ce3a44-9ca9-41ad-841e-60c8e7414d4d", - "metadata": {}, - "source": [ - "#### 3.1 Manually\n", - "Follow the instructions in [this link](https://docs-staging.parallel.works/docs/storage/transferring-data/obtaining-credentials) to obtain the **temporary credentials** for an AWS bucket resource defined in the Parallel Works platform. AWS credentials are set as environment variables." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf9ceddb-fae5-49c1-a1e5-fa4cb685a87f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Set AWS credentials. Can obtain the credentials in this link. \n", - "# https://docs-staging.parallel.works/docs/storage/transferring-data/obtaining-credentials\n", - "\n", - "bucket_name='abc'\n", - "os.environ['AWS_ACCESS_KEY_ID']='123'\n", - "os.environ['AWS_SECRET_ACCESS_KEY']='xyz'\n", - "os.environ['AWS_SESSION_TOKEN']='456'\n", - "\n", - "# If accessing an S3 bucket on a cluster from another \n", - "# cloud service provider, currently you need to specify \n", - "# the URL (note the bucket region in the URL has to match\n", - "# your bucket!) as well as the secrets for the \n", - "# underlying s3fs library. This is not necessary if your\n", - "# cluster happens to be in the same CSP and region as \n", - "# your bucket. 
These additional storage options\n", - "# need to be included in any bucket write commands below.\n", - "storage_options={\"client_kwargs\": {\"endpoint_url\": \"https://s3-us-east-2.amazonaws.com\"}}" - ] - }, - { - "cell_type": "markdown", - "id": "e7e32fbb-a217-48bd-aa8d-83eba6c6e2b0", - "metadata": {}, - "source": [ - "#### 3.2 Using the PW API Client\n", - "Replace the storage_namespace variable below with your bucket's namespace in the format `/`" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "b1d1455b-aafc-47c7-a358-061f002a818f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning: Permanently added '[localhost]:2222' (RSA) to the list of known hosts.\n" - ] - } - ], - "source": [ - "import subprocess\n", - "import os\n", - "import json\n", - "\n", - "def load_bucket_credentials(bucket_id):\n", - " cmd = [\n", - " \"ssh\", \"usercontainer\",\n", - " os.path.join(os.environ['resource_jobdir'], \"utils/bucket_token_generator.py\"),\n", - " \"--bucket_id\", bucket_id,\n", - " \"--token_format\", \"json\"\n", - " ]\n", - " \n", - " output = subprocess.check_output(cmd, universal_newlines = True)\n", - " env_vars = json.loads(output)\n", - " os.environ.update(env_vars)\n", - " \n", - " # Return the bucket name so it can be used by the\n", - " # Dask commands later.\n", - " return env_vars[\"BUCKET_NAME\"] \n", - "\n", - "\n", - "# REPLACE WITH YOUR BUCKET NAMESPACE\n", - "bucket_namespace = '/' #'alvaro/awsbucket'\n", - "\n", - "bucket_name = load_bucket_credentials(bucket_namespace)\n", - "\n", - "# If accessing an S3 bucket on a cluster from another \n", - "# cloud service provider, currently you need to specify \n", - "# the URL (note the bucket region in the URL has to match\n", - "# your bucket!) as well as the secrets for the \n", - "# underlying s3fs library. 
This is not necessary if your\n", - "# cluster happens to be in the same CSP and region as \n", - "# your bucket. These additional storage options\n", - "# need to be included in any bucket write commands below.\n", - "storage_options={\"client_kwargs\": {\"endpoint_url\": \"https://s3-us-east-2.amazonaws.com\"}}" - ] - }, - { - "cell_type": "markdown", - "id": "b35b3211-3eb8-4af5-a795-b38b0e121f14", - "metadata": {}, - "source": [ - "### 4. Generate random data\n", - "In this section, a function generate_random_data is defined to create a Dask DataFrame with randomly generated data. The number of rows in the generated data can be adjusted." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0e9b66e1-6279-4643-89e6-f85e80d7b387", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Function to generate random data\n", - "def generate_random_data(num_rows):\n", - " fake = Faker()\n", - " data = {\n", - " 'Name': [fake.name() for _ in range(num_rows)],\n", - " 'Age': [fake.random_int(min=18, max=99) for _ in range(num_rows)],\n", - " 'City': [fake.city() for _ in range(num_rows)]\n", - " }\n", - " return dd.from_pandas(pd.DataFrame(data), npartitions=2) # Create Dask DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "3e05b735-a7c3-4407-8582-3f10763e1fb9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "num_rows = 1000 # Adjust the number of rows as needed\n", - "random_data = generate_random_data(num_rows)\n" - ] - }, - { - "cell_type": "markdown", - "id": "807e54ea-7f62-4ad1-a56a-58ffc7b5a0ce", - "metadata": {}, - "source": [ - "### 5. Write data to the AWS bucket\n", - "Data generated in the previous step is written to the specified AWS bucket using the to_csv method. It's important to note that Dask employs the SLURM queue to submit jobs, acquiring workers responsible for the data transfer process. 
To monitor the status of this job, you can execute watch squeue in a terminal within the cluster. This command provides real-time updates on the job's progress and status." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d0a98fbe-9009-42c0-af8b-035ea896127a", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['666ae55ae7faf7886dd2e1fd/random_data.csv']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "csv_filename = 'random_data.csv'\n", - "random_data.to_csv(f's3://{bucket_name}/{csv_filename}', index=False, single_file=True, storage_options=storage_options)" - ] - }, - { - "cell_type": "markdown", - "id": "18cb6825-edcb-459e-befd-b42e6810b40e", - "metadata": {}, - "source": [ - "### 6. Read data from the AWS bucket\n", - "In this section, data is read from the AWS bucket into a Dask DataFrame." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "4dd83f4d-ead6-4414-89bf-c54df165a6f1", - "metadata": {}, - "outputs": [], - "source": [ - "dask_df = dd.read_csv(f's3://{bucket_name}/{csv_filename}', storage_options=storage_options)" - ] - }, - { - "cell_type": "markdown", - "id": "cbab9b52-8017-4543-9bed-11843923603f", - "metadata": {}, - "source": [ - "### 7. Process data\n", - "The Dask DataFrame is processed by filtering rows where the 'Age' column is greater than 21 and then grouping by the 'City' column." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "5e0d6ce0-851a-44c1-b2a1-ed6683a07837", - "metadata": {}, - "outputs": [], - "source": [ - "processed_dask_df = dask_df[dask_df['Age'] > 21].groupby('City').size()" - ] - }, - { - "cell_type": "markdown", - "id": "6c56c55f-22f3-47d5-b74b-6188ffb91fd9", - "metadata": {}, - "source": [ - "### 8. Write the processed data back to the AWS bucket\n", - "The processed data is written back to the AWS bucket using the to_csv method." 
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "4e6f94f9-62a7-466c-b059-31a7af2d3df1", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['666ae55ae7faf7886dd2e1fd/processed_data.csv']" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "processed_csv_filename = 'processed_data.csv'\n", - "processed_dask_df.to_csv(f's3://{bucket_name}/{processed_csv_filename}', single_file=True, storage_options=storage_options)" - ] - }, - { - "cell_type": "markdown", - "id": "c2290b91-6013-4bb5-9a3c-792e5d942bc5", - "metadata": {}, - "source": [ - "Additionally, a sample computation is triggered using compute() to showcase the processed data." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "1e13f851-77cb-4be7-ae12-7c26d28422f3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "City\n", - "Adamchester 1\n", - "Adamshaven 1\n", - "Adamsmouth 1\n", - "Albertstad 1\n", - "Alexanderchester 1\n", - " ..\n", - "Williamsmouth 2\n", - "Williamsonhaven 1\n", - "Williamsonshire 1\n", - "Williamsside 1\n", - "Williamsstad 1\n", - "Length: 906, dtype: int64" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Trigger computation if needed\n", - "processed_dask_df.compute()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "544efdc5-848f-4fba-bab1-1c5bb262015b", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:root] *", - "language": "python", - "name": "conda-root-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git 
a/jupyterlab-host/dask-extension-jupyterlab.yaml b/jupyterlab-host/dask-extension-jupyterlab.yaml deleted file mode 100644 index 5684c66fa..000000000 --- a/jupyterlab-host/dask-extension-jupyterlab.yaml +++ /dev/null @@ -1,298 +0,0 @@ -name: base -channels: - - anaconda - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - aiobotocore=2.9.0=pyhd8ed1ab_0 - - aiohttp=3.9.1=py311h459d7ec_0 - - aioitertools=0.11.0=pyhd8ed1ab_0 - - aiosignal=1.3.1=pyhd8ed1ab_0 - - anyio=4.2.0=pyhd8ed1ab_0 - - archspec=0.2.1=pyhd3eb1b0_0 - - argon2-cffi=23.1.0=pyhd8ed1ab_0 - - argon2-cffi-bindings=21.2.0=py311h459d7ec_4 - - arrow=1.3.0=pyhd8ed1ab_0 - - asttokens=2.4.1=pyhd8ed1ab_0 - - async-lru=2.0.4=pyhd8ed1ab_0 - - attrs=23.2.0=pyh71513ae_0 - - aws-c-auth=0.7.8=hcf8cf63_3 - - aws-c-cal=0.6.9=h5d48c4d_2 - - aws-c-common=0.9.10=hd590300_0 - - aws-c-compression=0.2.17=h7f92143_7 - - aws-c-event-stream=0.3.2=h0bcb0bb_8 - - aws-c-http=0.7.15=hd268abd_0 - - aws-c-io=0.13.36=hb3b01f7_3 - - aws-c-mqtt=0.10.0=hbafccad_1 - - aws-c-s3=0.4.6=h47b1690_0 - - aws-c-sdkutils=0.1.13=h7f92143_0 - - aws-checksums=0.1.17=h7f92143_6 - - aws-crt-cpp=0.25.0=hfa7cc67_4 - - aws-sdk-cpp=1.11.210=h0853bfa_5 - - babel=2.14.0=pyhd8ed1ab_0 - - beautifulsoup4=4.12.2=pyha770c72_0 - - blas=1.0=openblas - - bleach=6.1.0=pyhd8ed1ab_0 - - blinker=1.7.0=pyhd8ed1ab_0 - - boltons=23.0.0=py311h06a4308_0 - - botocore=1.33.13=pyhd8ed1ab_0 - - brotli-python=1.0.9=py311h6a678d5_7 - - bzip2=1.0.8=h7b6447c_0 - - c-ares=1.24.0=hd590300_0 - - ca-certificates=2023.12.12=h06a4308_0 - - cached-property=1.5.2=hd8ed1ab_1 - - cached_property=1.5.2=pyha770c72_1 - - cachetools=5.3.2=pyhd8ed1ab_0 - - certifi=2023.11.17=py311h06a4308_0 - - cffi=1.16.0=py311h5eee18b_0 - - charset-normalizer=2.0.4=pyhd3eb1b0_0 - - click=8.1.7=unix_pyh707e725_0 - - cloudpickle=3.0.0=pyhd8ed1ab_0 - - comm=0.2.1=pyhd8ed1ab_0 - - conda=23.11.0=py311h38be061_1 - - conda-content-trust=0.2.0=py311h06a4308_0 - - 
conda-libmamba-solver=23.12.0=pyhd3eb1b0_1 - - conda-package-handling=2.2.0=py311h06a4308_0 - - conda-package-streaming=0.9.0=py311h06a4308_0 - - cryptography=41.0.7=py311hdda0065_0 - - cytoolz=0.12.2=py311h459d7ec_1 - - dask-core=2023.12.1=pyhd8ed1ab_0 - - dask-jobqueue=0.8.2=pyhd8ed1ab_0 - - debugpy=1.6.7=py311h6a678d5_0 - - decorator=5.1.1=pyhd8ed1ab_0 - - defusedxml=0.7.1=pyhd8ed1ab_0 - - distributed=2023.12.1=pyhd8ed1ab_0 - - distro=1.8.0=py311h06a4308_0 - - entrypoints=0.4=pyhd8ed1ab_0 - - exceptiongroup=1.2.0=pyhd8ed1ab_0 - - executing=2.0.1=pyhd8ed1ab_0 - - faker=22.0.0=pyhd8ed1ab_0 - - fmt=9.1.0=hdb19cb5_0 - - fqdn=1.5.1=pyhd8ed1ab_0 - - frozenlist=1.4.1=py311h459d7ec_0 - - fsspec=2023.12.2=pyhca7485f_0 - - gcsfs=2023.12.2.post1=pyhd8ed1ab_0 - - gflags=2.2.2=he6710b0_0 - - glog=0.6.0=h6f12383_0 - - google-api-core=2.15.0=pyhd8ed1ab_0 - - google-auth=2.26.1=pyhca7485f_0 - - google-auth-oauthlib=1.2.0=pyhd8ed1ab_0 - - google-cloud-core=2.4.1=pyhd8ed1ab_0 - - google-cloud-storage=2.14.0=pyhca7485f_0 - - google-crc32c=1.1.2=py311h9b08b9c_5 - - google-resumable-media=2.7.0=pyhd8ed1ab_0 - - googleapis-common-protos=1.62.0=pyhd8ed1ab_0 - - grpcio=1.59.3=py311ha6695c7_0 - - gtest=1.14.0=hdb19cb5_0 - - icu=73.1=h6a678d5_0 - - idna=3.4=py311h06a4308_0 - - importlib-metadata=7.0.1=pyha770c72_0 - - importlib_metadata=7.0.1=hd8ed1ab_0 - - importlib_resources=6.1.1=pyhd8ed1ab_0 - - ipykernel=6.28.0=pyhd33586a_0 - - ipython=8.19.0=pyh707e725_0 - - ipython_genutils=0.2.0=pyhd3eb1b0_1 - - isoduration=20.11.0=pyhd8ed1ab_0 - - jedi=0.19.1=pyhd8ed1ab_0 - - jinja2=3.1.2=py311h06a4308_0 - - jmespath=1.0.1=pyhd8ed1ab_0 - - json5=0.9.14=pyhd8ed1ab_0 - - jsonpatch=1.32=pyhd3eb1b0_0 - - jsonpointer=2.1=pyhd3eb1b0_0 - - jsonschema=4.20.0=pyhd8ed1ab_0 - - jsonschema-specifications=2023.12.1=pyhd8ed1ab_0 - - jsonschema-with-format-nongpl=4.20.0=pyhd8ed1ab_0 - - jupyter-lsp=2.2.1=pyhd8ed1ab_0 - - jupyter_client=8.6.0=pyhd8ed1ab_0 - - jupyter_core=5.7.0=py311h38be061_0 - - 
jupyter_events=0.9.0=pyhd8ed1ab_0 - - jupyter_server=2.12.2=pyhd8ed1ab_0 - - jupyter_server_terminals=0.5.1=pyhd8ed1ab_0 - - jupyterlab_pygments=0.3.0=pyhd8ed1ab_0 - - jupyterlab_server=2.25.2=pyhd8ed1ab_0 - - krb5=1.20.1=h143b758_1 - - ld_impl_linux-64=2.38=h1181459_1 - - libabseil=20230802.1=cxx17_h59595ed_0 - - libarchive=3.6.2=h6ac8c49_2 - - libarrow=14.0.1=hfb4d3a9_10_cpu - - libarrow-acero=14.0.1=h59595ed_10_cpu - - libarrow-dataset=14.0.1=h59595ed_10_cpu - - libarrow-flight=14.0.1=h120cb0d_10_cpu - - libarrow-flight-sql=14.0.1=h61ff412_10_cpu - - libarrow-gandiva=14.0.1=hacb8726_10_cpu - - libarrow-substrait=14.0.1=h61ff412_10_cpu - - libbrotlicommon=1.1.0=hd590300_1 - - libbrotlidec=1.1.0=hd590300_1 - - libbrotlienc=1.1.0=hd590300_1 - - libcrc32c=1.1.2=h9c3ff4c_0 - - libcurl=8.5.0=h251f7ec_0 - - libedit=3.1.20230828=h5eee18b_0 - - libev=4.33=h7f8727e_1 - - libevent=2.1.12=hdbd6064_1 - - libffi=3.4.4=h6a678d5_0 - - libgcc-ng=13.2.0=h807b86a_3 - - libgfortran-ng=11.2.0=h00389a5_1 - - libgfortran5=11.2.0=h1234567_1 - - libgomp=13.2.0=h807b86a_3 - - libgoogle-cloud=2.12.0=h5206363_4 - - libgrpc=1.59.3=hd6c4280_0 - - libllvm15=15.0.7=hadd5161_1 - - libmamba=1.5.3=haf1ee3a_0 - - libmambapy=1.5.3=py311h2dafd23_0 - - libnghttp2=1.57.0=h2d74bed_0 - - libnl=3.9.0=hd590300_0 - - libnuma=2.0.16=h0b41bf4_1 - - libopenblas=0.3.21=h043d6bf_0 - - libparquet=14.0.1=h352af49_10_cpu - - libprotobuf=4.24.4=hf27288f_0 - - libre2-11=2023.06.02=h7a70373_0 - - libsodium=1.0.18=h36c2ea0_1 - - libsolv=0.7.24=he621ea3_0 - - libssh2=1.10.0=hdbd6064_2 - - libstdcxx-ng=13.2.0=h7e041cc_3 - - libthrift=0.19.0=hb90f79a_1 - - libutf8proc=2.8.0=h166bdaf_0 - - libuuid=1.41.5=h5eee18b_0 - - libxml2=2.10.4=hf1b16e4_1 - - libzlib=1.2.13=hd590300_5 - - locket=1.0.0=pyhd8ed1ab_0 - - lz4-c=1.9.4=h6a678d5_0 - - markupsafe=2.1.3=py311h459d7ec_1 - - matplotlib-inline=0.1.6=pyhd8ed1ab_0 - - menuinst=2.0.1=py311h06a4308_1 - - mistune=3.0.2=pyhd8ed1ab_0 - - msgpack-python=1.0.5=py311h9547e67_1 - - 
multidict=6.0.4=py311h459d7ec_1 - - nb_conda_kernels=2.3.1=py311h06a4308_0 - - nbclassic=1.0.0=py311h06a4308_0 - - nbclient=0.8.0=pyhd8ed1ab_0 - - nbconvert=7.14.0=pyhd8ed1ab_0 - - nbconvert-core=7.14.0=pyhd8ed1ab_0 - - nbconvert-pandoc=7.14.0=pyhd8ed1ab_0 - - nbformat=5.9.2=pyhd8ed1ab_0 - - ncurses=6.4=h6a678d5_0 - - nest-asyncio=1.5.8=pyhd8ed1ab_0 - - notebook=6.5.4=py311h06a4308_0 - - notebook-shim=0.2.3=pyhd8ed1ab_0 - - numpy=1.26.0=py311h24aa872_0 - - numpy-base=1.26.0=py311hbfb1bba_0 - - oauthlib=3.2.2=pyhd8ed1ab_0 - - openssl=3.2.0=hd590300_1 - - orc=1.9.2=h4b38347_0 - - overrides=7.4.0=pyhd8ed1ab_0 - - packaging=23.1=py311h06a4308_0 - - pandoc=2.19.2=ha770c72_0 - - pandocfilters=1.5.0=pyhd8ed1ab_0 - - parso=0.8.3=pyhd8ed1ab_0 - - partd=1.4.1=pyhd8ed1ab_0 - - pcre2=10.42=hebb0a14_0 - - pexpect=4.8.0=pyh1a96a4e_2 - - pickleshare=0.7.5=py_1003 - - pip=23.3.1=py311h06a4308_0 - - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 - - platformdirs=3.10.0=py311h06a4308_0 - - pluggy=1.0.0=py311h06a4308_1 - - prometheus_client=0.19.0=pyhd8ed1ab_0 - - prompt-toolkit=3.0.42=pyha770c72_0 - - protobuf=4.24.4=py311h46cbc50_0 - - psutil=5.9.7=py311h459d7ec_0 - - ptyprocess=0.7.0=pyhd3deb0d_0 - - pure_eval=0.2.2=pyhd8ed1ab_0 - - pyarrow=14.0.1=py311h39c9aba_10_cpu - - pyasn1=0.5.1=pyhd8ed1ab_0 - - pyasn1-modules=0.3.0=pyhd8ed1ab_0 - - pybind11-abi=4=hd3eb1b0_1 - - pycosat=0.6.6=py311h5eee18b_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pygments=2.17.2=pyhd8ed1ab_0 - - pyjwt=2.8.0=pyhd8ed1ab_0 - - pyopenssl=23.2.0=py311h06a4308_0 - - pysocks=1.7.1=py311h06a4308_0 - - python=3.11.5=h955ad1f_0 - - python-dateutil=2.8.2=pyhd8ed1ab_0 - - python-fastjsonschema=2.19.1=pyhd8ed1ab_0 - - python-json-logger=2.0.7=pyhd8ed1ab_0 - - python_abi=3.11=2_cp311 - - pytz=2023.3.post1=pyhd8ed1ab_0 - - pyu2f=0.1.5=pyhd8ed1ab_0 - - pyyaml=6.0.1=py311h459d7ec_1 - - pyzmq=25.1.0=py311h6a678d5_0 - - rdma-core=49.0=hd3aeb46_2 - - re2=2023.06.02=h2873b5e_0 - - readline=8.2=h5eee18b_0 - - 
referencing=0.32.1=pyhd8ed1ab_0 - - reproc=14.2.4=h295c915_1 - - reproc-cpp=14.2.4=h295c915_1 - - requests=2.31.0=py311h06a4308_0 - - requests-oauthlib=1.3.1=pyhd8ed1ab_0 - - rfc3339-validator=0.1.4=pyhd8ed1ab_0 - - rfc3986-validator=0.1.1=pyh9f0ad1d_0 - - rpds-py=0.16.2=py311h46250e7_0 - - rsa=4.9=pyhd8ed1ab_0 - - ruamel.yaml=0.17.21=py311h5eee18b_0 - - s2n=1.4.1=h06160fa_0 - - s3fs=2023.12.2=pyhd8ed1ab_0 - - send2trash=1.8.2=pyh41d4057_0 - - setuptools=68.2.2=py311h06a4308_0 - - six=1.16.0=pyh6c4a22f_0 - - snappy=1.1.10=h9fff704_0 - - sniffio=1.3.0=pyhd8ed1ab_0 - - sortedcontainers=2.4.0=pyhd8ed1ab_0 - - soupsieve=2.5=pyhd8ed1ab_1 - - sqlite=3.41.2=h5eee18b_0 - - stack_data=0.6.2=pyhd8ed1ab_0 - - tblib=3.0.0=pyhd8ed1ab_0 - - terminado=0.18.0=pyh0d859eb_0 - - tinycss2=1.2.1=pyhd8ed1ab_0 - - tk=8.6.12=h1ccaba5_0 - - tomli=2.0.1=pyhd8ed1ab_0 - - toolz=0.12.0=pyhd8ed1ab_0 - - tornado=6.3.3=py311h459d7ec_1 - - tqdm=4.65.0=py311h92b7b1e_0 - - traitlets=5.14.1=pyhd8ed1ab_0 - - truststore=0.8.0=py311h06a4308_0 - - types-python-dateutil=2.8.19.20240106=pyhd8ed1ab_0 - - typing-extensions=4.9.0=hd8ed1ab_0 - - typing_extensions=4.9.0=pyha770c72_0 - - typing_utils=0.1.0=pyhd8ed1ab_0 - - ucx=1.15.0=h75e419f_2 - - uri-template=1.3.0=pyhd8ed1ab_0 - - urllib3=1.26.18=py311h06a4308_0 - - wcwidth=0.2.12=pyhd8ed1ab_0 - - webcolors=1.13=pyhd8ed1ab_0 - - webencodings=0.5.1=pyhd8ed1ab_2 - - websocket-client=1.7.0=pyhd8ed1ab_0 - - wheel=0.41.2=py311h06a4308_0 - - wrapt=1.16.0=py311h459d7ec_0 - - xz=5.4.5=h5eee18b_0 - - yaml=0.2.5=h7f98852_2 - - yaml-cpp=0.8.0=h6a678d5_0 - - yarl=1.9.3=py311h459d7ec_0 - - zeromq=4.3.4=h9c3ff4c_1 - - zict=3.0.0=pyhd8ed1ab_0 - - zipp=3.17.0=pyhd8ed1ab_0 - - zlib=1.2.13=hd590300_5 - - zstandard=0.19.0=py311h5eee18b_0 - - zstd=1.5.5=hc292b87_0 - - pip: - - aiofiles==22.1.0 - - aiosqlite==0.19.0 - - bokeh==2.4.2 - - contourpy==1.2.0 - - dask-labextension==6.2.0 - - jupyter-server-fileid==0.9.1 - - jupyter-server-proxy==4.1.0 - - jupyter-server-ydoc==0.8.0 - - 
jupyter-ydoc==0.2.5 - - jupyterlab==3.6.6 - - jupyterlab-slurm==3.0.1 - - pandas==2.1.4 - - pillow==10.2.0 - - simpervisor==1.0.0 - - tzdata==2023.4 - - xyzservices==2023.10.1 - - y-py==0.6.2 - - ypy-websocket==0.8.4 -prefix: /home/alvaro/.miniconda3c \ No newline at end of file diff --git a/jupyterlab-host/jupyterlab4.1.5-python3.11.5.yaml b/jupyterlab-host/jupyterlab4.1.5-python3.11.5.yaml deleted file mode 100644 index a870e308e..000000000 --- a/jupyterlab-host/jupyterlab4.1.5-python3.11.5.yaml +++ /dev/null @@ -1,199 +0,0 @@ -name: base -channels: - - conda-forge - - defaults -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - anyio=4.3.0=pyhd8ed1ab_0 - - archspec=0.2.3=pyhd8ed1ab_0 - - argon2-cffi=23.1.0=pyhd8ed1ab_0 - - argon2-cffi-bindings=21.2.0=py311h459d7ec_4 - - arrow=1.3.0=pyhd8ed1ab_0 - - asttokens=2.4.1=pyhd8ed1ab_0 - - async-lru=2.0.4=pyhd8ed1ab_0 - - attrs=23.2.0=pyh71513ae_0 - - babel=2.14.0=pyhd8ed1ab_0 - - beautifulsoup4=4.12.3=pyha770c72_0 - - bleach=6.1.0=pyhd8ed1ab_0 - - boltons=23.0.0=py311h06a4308_0 - - brotli-python=1.0.9=py311h6a678d5_7 - - bzip2=1.0.8=h7b6447c_0 - - c-ares=1.19.1=h5eee18b_0 - - ca-certificates=2024.3.11=h06a4308_0 - - cached-property=1.5.2=hd8ed1ab_1 - - cached_property=1.5.2=pyha770c72_1 - - certifi=2024.2.2=py311h06a4308_0 - - cffi=1.16.0=py311h5eee18b_0 - - charset-normalizer=2.0.4=pyhd3eb1b0_0 - - comm=0.2.2=pyhd8ed1ab_0 - - conda=24.3.0=py311h06a4308_0 - - conda-content-trust=0.2.0=py311h06a4308_0 - - conda-libmamba-solver=23.12.0=pyhd3eb1b0_1 - - conda-package-handling=2.2.0=py311h06a4308_0 - - conda-package-streaming=0.9.0=py311h06a4308_0 - - cryptography=41.0.7=py311hdda0065_0 - - debugpy=1.6.7=py311h6a678d5_0 - - decorator=5.1.1=pyhd8ed1ab_0 - - defusedxml=0.7.1=pyhd8ed1ab_0 - - distro=1.8.0=py311h06a4308_0 - - entrypoints=0.4=pyhd8ed1ab_0 - - exceptiongroup=1.2.0=pyhd8ed1ab_2 - - executing=2.0.1=pyhd8ed1ab_0 - - fmt=9.1.0=hdb19cb5_0 - - fqdn=1.5.1=pyhd8ed1ab_0 - - 
h11=0.14.0=pyhd8ed1ab_0 - - h2=4.1.0=pyhd8ed1ab_0 - - hpack=4.0.0=pyh9f0ad1d_0 - - httpcore=1.0.4=pyhd8ed1ab_0 - - httpx=0.27.0=pyhd8ed1ab_0 - - hyperframe=6.0.1=pyhd8ed1ab_0 - - icu=73.1=h6a678d5_0 - - idna=3.4=py311h06a4308_0 - - importlib-metadata=7.1.0=pyha770c72_0 - - importlib_metadata=7.1.0=hd8ed1ab_0 - - importlib_resources=6.4.0=pyhd8ed1ab_0 - - ipykernel=6.29.3=pyhd33586a_0 - - ipython=8.22.2=pyh707e725_0 - - ipython_genutils=0.2.0=pyhd3eb1b0_1 - - isoduration=20.11.0=pyhd8ed1ab_0 - - jedi=0.19.1=pyhd8ed1ab_0 - - jinja2=3.1.3=pyhd8ed1ab_0 - - json5=0.9.24=pyhd8ed1ab_0 - - jsonpatch=1.32=pyhd3eb1b0_0 - - jsonpointer=2.1=pyhd3eb1b0_0 - - jsonschema=4.21.1=pyhd8ed1ab_0 - - jsonschema-specifications=2023.12.1=pyhd8ed1ab_0 - - jsonschema-with-format-nongpl=4.21.1=pyhd8ed1ab_0 - - jupyter-lsp=2.2.4=pyhd8ed1ab_0 - - jupyter_client=7.4.9=py311h06a4308_0 - - jupyter_core=5.7.2=py311h38be061_0 - - jupyter_events=0.10.0=pyhd8ed1ab_0 - - jupyter_server=2.13.0=pyhd8ed1ab_0 - - jupyter_server_terminals=0.5.3=pyhd8ed1ab_0 - - jupyterlab_pygments=0.3.0=pyhd8ed1ab_1 - - jupyterlab_server=2.25.4=pyhd8ed1ab_0 - - krb5=1.20.1=h143b758_1 - - ld_impl_linux-64=2.38=h1181459_1 - - libarchive=3.6.2=h6ac8c49_2 - - libcurl=8.4.0=h251f7ec_1 - - libedit=3.1.20230828=h5eee18b_0 - - libev=4.33=h7f8727e_1 - - libffi=3.4.4=h6a678d5_0 - - libgcc-ng=13.2.0=h807b86a_5 - - libgomp=13.2.0=h807b86a_5 - - libmamba=1.5.3=haf1ee3a_0 - - libmambapy=1.5.3=py311h2dafd23_0 - - libnghttp2=1.57.0=h2d74bed_0 - - libsodium=1.0.18=h36c2ea0_1 - - libsolv=0.7.24=he621ea3_0 - - libssh2=1.10.0=hdbd6064_2 - - libstdcxx-ng=11.2.0=h1234567_1 - - libuuid=1.41.5=h5eee18b_0 - - libxml2=2.10.4=hf1b16e4_1 - - lz4-c=1.9.4=h6a678d5_0 - - markupsafe=2.1.5=py311h459d7ec_0 - - matplotlib-inline=0.1.6=pyhd8ed1ab_0 - - menuinst=2.0.1=py311h06a4308_1 - - mistune=3.0.2=pyhd8ed1ab_0 - - nb_conda_kernels=2.3.1=py311h06a4308_0 - - nbclassic=1.0.0=py311h06a4308_0 - - nbclient=0.10.0=pyhd8ed1ab_0 - - nbconvert=7.16.3=hd8ed1ab_0 - 
- nbconvert-core=7.16.3=pyhd8ed1ab_0 - - nbconvert-pandoc=7.16.3=hd8ed1ab_0 - - nbformat=5.10.3=pyhd8ed1ab_0 - - ncurses=6.4=h6a678d5_0 - - nest-asyncio=1.6.0=pyhd8ed1ab_0 - - notebook=6.5.4=py311h06a4308_1 - - notebook-shim=0.2.4=pyhd8ed1ab_0 - - openssl=3.2.1=hd590300_1 - - overrides=7.7.0=pyhd8ed1ab_0 - - packaging=23.1=py311h06a4308_0 - - pandoc=2.12=h06a4308_3 - - pandocfilters=1.5.0=pyhd8ed1ab_0 - - parso=0.8.3=pyhd8ed1ab_0 - - pcre2=10.42=hebb0a14_0 - - pexpect=4.9.0=pyhd8ed1ab_0 - - pickleshare=0.7.5=py_1003 - - pip=23.3.1=py311h06a4308_0 - - pkgutil-resolve-name=1.3.10=pyhd8ed1ab_1 - - platformdirs=3.10.0=py311h06a4308_0 - - pluggy=1.0.0=py311h06a4308_1 - - prometheus_client=0.20.0=pyhd8ed1ab_0 - - prompt-toolkit=3.0.42=pyha770c72_0 - - psutil=5.9.8=py311h459d7ec_0 - - ptyprocess=0.7.0=pyhd3deb0d_0 - - pure_eval=0.2.2=pyhd8ed1ab_0 - - pybind11-abi=4=hd3eb1b0_1 - - pycosat=0.6.6=py311h5eee18b_0 - - pycparser=2.21=pyhd3eb1b0_0 - - pygments=2.17.2=pyhd8ed1ab_0 - - pyopenssl=23.2.0=py311h06a4308_0 - - pysocks=1.7.1=py311h06a4308_0 - - python=3.11.5=h955ad1f_0 - - python-dateutil=2.9.0=pyhd8ed1ab_0 - - python-fastjsonschema=2.19.1=pyhd8ed1ab_0 - - python-json-logger=2.0.7=pyhd8ed1ab_0 - - python_abi=3.11=2_cp311 - - pytz=2024.1=pyhd8ed1ab_0 - - pyyaml=6.0.1=py311h459d7ec_1 - - pyzmq=24.0.1=py311h5eee18b_0 - - readline=8.2=h5eee18b_0 - - referencing=0.34.0=pyhd8ed1ab_0 - - reproc=14.2.4=h295c915_1 - - reproc-cpp=14.2.4=h295c915_1 - - requests=2.31.0=py311h06a4308_0 - - rfc3339-validator=0.1.4=pyhd8ed1ab_0 - - rfc3986-validator=0.1.1=pyh9f0ad1d_0 - - rpds-py=0.18.0=py311h46250e7_0 - - ruamel.yaml=0.17.21=py311h5eee18b_0 - - send2trash=1.8.2=pyh41d4057_0 - - setuptools=68.2.2=py311h06a4308_0 - - six=1.16.0=pyh6c4a22f_0 - - sniffio=1.3.1=pyhd8ed1ab_0 - - soupsieve=2.5=pyhd8ed1ab_1 - - sqlite=3.41.2=h5eee18b_0 - - stack_data=0.6.2=pyhd8ed1ab_0 - - terminado=0.18.1=pyh0d859eb_0 - - tinycss2=1.2.1=pyhd8ed1ab_0 - - tk=8.6.12=h1ccaba5_0 - - tomli=2.0.1=pyhd8ed1ab_0 - - 
tornado=6.4=py311h459d7ec_0 - - tqdm=4.65.0=py311h92b7b1e_0 - - traitlets=5.14.2=pyhd8ed1ab_0 - - truststore=0.8.0=py311h06a4308_0 - - types-python-dateutil=2.9.0.20240316=pyhd8ed1ab_0 - - typing-extensions=4.10.0=hd8ed1ab_0 - - typing_extensions=4.10.0=pyha770c72_0 - - typing_utils=0.1.0=pyhd8ed1ab_0 - - tzdata=2023c=h04d1e81_0 - - uri-template=1.3.0=pyhd8ed1ab_0 - - urllib3=1.26.18=py311h06a4308_0 - - wcwidth=0.2.13=pyhd8ed1ab_0 - - webcolors=1.13=pyhd8ed1ab_0 - - webencodings=0.5.1=pyhd8ed1ab_2 - - websocket-client=1.7.0=pyhd8ed1ab_0 - - wheel=0.41.2=py311h06a4308_0 - - xz=5.4.5=h5eee18b_0 - - yaml=0.2.5=h7f98852_2 - - yaml-cpp=0.8.0=h6a678d5_0 - - zeromq=4.3.5=h6a678d5_0 - - zipp=3.17.0=pyhd8ed1ab_0 - - zlib=1.2.13=h5eee18b_0 - - zstandard=0.19.0=py311h5eee18b_0 - - zstd=1.5.5=hc292b87_0 - - pip: - - aiofiles==22.1.0 - - aiosqlite==0.20.0 - - ipywidgets==8.1.2 - - jupyter-server-fileid==0.9.1 - - jupyter-server-ydoc==0.8.0 - - jupyter-ydoc==0.2.5 - - jupyterlab==3.6.7 - - jupyterlab-slurm==3.0.1 - - jupyterlab-widgets==3.0.10 - - widgetsnbextension==4.0.10 - - y-py==0.6.2 - - ypy-websocket==0.8.4 -prefix: /home/alvaro/miniconda-old diff --git a/jupyterlab-host/kill-template.sh b/jupyterlab-host/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/jupyterlab-host/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/jupyterlab-host/start-template-v3.sh b/jupyterlab-host/start-template-v3.sh deleted file mode 100755 index 15d0324ca..000000000 --- a/jupyterlab-host/start-template-v3.sh +++ /dev/null @@ -1,276 +0,0 @@ -# Runs via ssh + sbatch -set -x - -start_rootless_docker() { - local MAX_RETRIES=20 - local RETRY_INTERVAL=2 - local ATTEMPT=1 - - export XDG_RUNTIME_DIR=/run/user/$(id -u) - dockerd-rootless-setuptool.sh install - PATH=/usr/bin:/sbin:/usr/sbin:$PATH dockerd-rootless.sh --exec-opt native.cgroupdriver=cgroupfs > docker-rootless.log 2>&1 & #--data-root 
/docker-rootless/docker-rootless/ - - # Wait for Docker daemon to be ready - until docker info > /dev/null 2>&1; do - if [ $ATTEMPT -le $MAX_RETRIES ]; then - echo "$(date) Attempt $ATTEMPT of $MAX_RETRIES: Waiting for Docker daemon to start..." - sleep $RETRY_INTERVAL - ((ATTEMPT++)) - else - echo "$(date) ERROR: Docker daemon failed to start after $MAX_RETRIES attempts." - return 1 - fi - done - - echo "$(date): Docker daemon is ready!" - return 0 -} - - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - -if [ -z "${service_load_env}" ]; then - service_conda_sh=${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh - service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" -fi - -eval "${service_load_env}" - -# Initialize cancel script -echo '#!/bin/bash' > cancel.sh -chmod +x cancel.sh -jupyterlab_port=$(findAvailablePort) - -if [[ "${service_conda_install}" == "true" ]]; then - source ${service_conda_sh} - eval "conda activate ${service_conda_env}" -else - eval "${service_load_env}" -fi - -if [ -z $(which jupyter-lab 2> /dev/null) ]; then - displayErrorMessage "jupyter-lab command not found" -fi - -export XDG_RUNTIME_DIR="" - -# Generate sha: -if [ -z "${service_password}" ]; then - echo "No password was specified" - sha="" -else - echo "Generating sha" - sha=$(python3 -c "from notebook.auth.security import passwd; print(passwd('${service_password}', algorithm = 'sha1'))") -fi -# Set the launch directory for JupyterHub -# If notebook_dir is not set or set to a templated value, -# use the default value of "/". 
-if [ -z ${service_notebook_dir} ]; then - service_notebook_dir="/" -fi - -####################### -# START NGINX WRAPPER # -####################### - -proxy_port=${jupyterlab_port} -proxy_host="127.0.0.1" -if which docker >/dev/null 2>&1 && [[ "${service_rootless_docker}" == "true" ]]; then - if ! dockerd-rootless-setuptool.sh check; then - echo "$(date) ERROR: Rootless docker is NOT support on this system" - exit 1 - fi - if ! which socat >/dev/null 2>&1; then - echo "$(date) ERROR: socat is not installed" - exit 1 - fi - start_rootless_docker - # Need to run this for the container to be able to access the port on the host's network - proxy_port=$(findAvailablePort) - proxy_host=$(hostname -I | xargs) - socat TCP-LISTEN:${proxy_port},fork,reuseaddr TCP:127.0.0.1:${jupyterlab_port} >> socat.logs 2>&1 & - pid=$! - echo "kill ${pid} #socat" >> cancel.sh -fi - -echo "Starting nginx wrapper on service port ${service_port}" - -# Write config file -cat >> config.conf <> nginx.conf </dev/null 2>&1 && [[ "${service_rootless_docker}" == "true" ]]; then - container_name="nginx-${service_port}" - touch empty - touch nginx.logs - echo "docker volume rm ${container_name}" >> cancel.sh - docker volume create ${container_name} - echo "docker stop ${container_name}" >> cancel.sh - echo "docker rm ${container_name}" >> cancel.sh - chmod 644 ${PWD}/{nginx.conf,config.conf,empty} - docker run -d --name ${container_name} \ - --add-host=host.docker.internal:host-gateway \ - -p ${service_port}:${service_port} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v $PWD/empty:/etc/nginx/conf.d/default.conf \ - nginxinc/nginx-unprivileged:1.25.3 - # Print logs - docker logs ${container_name} -elif sudo -n true 2>/dev/null && which docker >/dev/null 2>&1; then - container_name="nginx-${service_port}" - # Remove container when job is canceled - echo "sudo docker stop ${container_name}" >> cancel.sh - echo "sudo docker rm ${container_name}" 
>> cancel.sh - # Start container - sudo service docker start - touch empty - touch nginx.logs - # change ownership to nginx user - sudo chown 101:101 nginx.conf config.conf empty nginx.logs - sudo chmod 644 *.conf - sudo docker run -d --name ${container_name} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v $PWD/empty:/etc/nginx/conf.d/default.conf \ - -v $PWD/nginx.logs:/var/log/nginx/access.log \ - -v $PWD/nginx.logs:/var/log/nginx/error.log \ - --network=host nginxinc/nginx-unprivileged:1.25.3 - # Print logs - sudo docker logs ${container_name} -elif which singularity >/dev/null 2>&1; then - echo "Running singularity container ${service_nginx_sif}" - # We need to mount $PWD/tmp:/tmp because otherwise nginx writes the file /tmp/nginx.pid - # and other users cannot use the node. Was not able to change this in the config.conf. - mkdir -p ./tmp - # Need to overwrite default configuration! - touch empty - singularity run -B $PWD/tmp:/tmp -B $PWD/config.conf:/etc/nginx/conf.d/config.conf -B $PWD/nginx.conf:/etc/nginx/nginx.conf -B empty:/etc/nginx/conf.d/default.conf ${service_nginx_sif} >> nginx.logs 2>&1 & - pid=$! 
- echo "kill ${pid}" >> cancel.sh -else - displayErrorMessage "Need Docker or Singularity to start NGINX proxy" -fi - - -#################### -# START JUPYTERLAB # -#################### - -if [ -z ${service_notebook_dir} ]; then - service_notebook_dir="/" -fi - -export JUPYTER_CONFIG_DIR=${PWD} -rm -f jupyter_lab_config.py -jupyter-lab --generate-config - -sed -i "s|^.*c\.ExtensionApp\.default_url.*|c.ExtensionApp.default_url = '${basepath}'|" jupyter_lab_config.py -sed -i "s|^.*c\.LabServerApp\.app_url.*|c.LabServerApp.app_url = '${basepath}/lab'|" jupyter_lab_config.py -sed -i "s|^.*c\.LabApp\.app_url.*|c.LabApp.app_url = '/lab'|" jupyter_lab_config.py -sed -i "s|^.*c\.LabApp\.default_url.*|c.LabApp.default_url = '${basepath}/lab'|" jupyter_lab_config.py -sed -i "s|^.*c\.LabApp\.static_url_prefix.*|c.LabApp.static_url_prefix = '${basepath}/static'|" jupyter_lab_config.py -sed -i "s|^.*c\.ServerApp\.allow_origin.*|c.ServerApp.allow_origin = '*'|" jupyter_lab_config.py -sed -i "s|^.*c\.ServerApp\.allow_remote_access.*|c.ServerApp.allow_remote_access = True|" jupyter_lab_config.py -sed -i "s|^.*c\.ServerApp\.base_url.*|c.ServerApp.base_url = '${basepath}'|" jupyter_lab_config.py -sed -i "s|^.*c\.ServerApp\.default_url.*|c.ServerApp.default_url = '${basepath}/'|" jupyter_lab_config.py -sed -i "s|^.*c\.ServerApp\.port.*|c.ServerApp.port = ${jupyterlab_port}|" jupyter_lab_config.py -sed -i "s|^.*c\.ServerApp\.token.*|c.ServerApp.token = ''|" jupyter_lab_config.py -sed -i "s|^.*c\.ServerApp\.tornado_settings.*|c.ServerApp.tornado_settings = {\"static_url_prefix\":\"${basepath}/static/\"}|" jupyter_lab_config.py -sed -i "s|^.*c\.ServerApp\.root_dir.*|c.ServerApp.root_dir = '${service_notebook_dir}'|" jupyter_lab_config.py - -cd ${service_notebook_dir} - -# JUICE https://docs.juicelabs.co/docs/juice/intro -if [[ "${juice_use_juice}" == "true" ]]; then - echo "INFO: Enabling Juice for remote GPU access" - if [ -z "${juice_exec}" ]; then - 
juice_exec=${service_parent_install_dir}/juice/juice - echo "INFO: Set Juice executable path to ${juice_exec}" - fi - - if ! [ -z "${juice_vram}" ]; then - vram_arg="--vram ${juice_vram}" - fi - if ! [ -z "${juice_pool_ids}" ]; then - pool_ids_arg="--pool-ids ${juice_pool_ids}" - fi - juice_cmd="${juice_exec} run ${juice_cmd_args} ${vram_arg} ${pool_ids_arg}" - echo "INFO: Prepared Juice command: ${juice_cmd}" - echo "INFO: Logging into Juice with provided token" - ${juice_exec} login -t "${JUICE_TOKEN}" || { - echo "ERROR: Failed to log into Juice" - exit 1 - } -fi - -date - -${juice_cmd} jupyter-lab --port=${jupyterlab_port} --no-browser --config=${resource_jobdir}/jupyter_lab_config.py --allow-root -#jupyter-lab --port=${jupyterlab_port} --ip ${HOSTNAME} --no-browser --config=${PWD}/jupyter_lab_config.py - -sleep inf diff --git a/jupyterlab-host/transfer_files.sh b/jupyterlab-host/transfer_files.sh deleted file mode 100644 index 3d27d77d3..000000000 --- a/jupyterlab-host/transfer_files.sh +++ /dev/null @@ -1,6 +0,0 @@ -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${resource_workdir}/pw/software -fi - -rsync -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" -avzq --rsync-path="mkdir -p ${service_parent_install_dir} && rsync" ${pw_job_dir}/${service_name}/*.yaml ${resource_publicIp}:${resource_jobdir} - diff --git a/jupyterlab-host/url.sh b/jupyterlab-host/url.sh deleted file mode 100755 index d8d45d08a..000000000 --- a/jupyterlab-host/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="lab\"" \ No newline at end of file diff --git a/kasmvnc-proxy/kill-template.sh b/kasmvnc-proxy/kill-template.sh deleted file mode 100644 index 5805056ac..000000000 --- a/kasmvnc-proxy/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/kasmvnc-proxy/start-template-v3.sh b/kasmvnc-proxy/start-template-v3.sh deleted file mode 100644 index a95fd5eeb..000000000 
--- a/kasmvnc-proxy/start-template-v3.sh +++ /dev/null @@ -1,276 +0,0 @@ - -check_sudo_access() { - if ! sudo -n true 2>/dev/null; then - echo "$(date): ERROR: Cannot $1 without root access" - exit 1 - fi -} - -# Check if kasmvnc-server is installed (using rpm -qa and grep) -is_kasmvnc_installed() { - rpm -qa | grep -q kasmvncserver -} - -# Initialize cancel script -echo '#!/bin/bash' > cancel.sh -chmod +x cancel.sh - -# Runs via ssh + sbatch -set -x - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - -kasmvnc_port=$(findAvailablePort) -export kasmvnc_port=${kasmvnc_port} - -export XDG_RUNTIME_DIR="" - -#################### -# START KASMVNC # -#################### - -# if running on rocky9 update the download url -source /etc/os-release -if [[ "$VERSION_ID" == *"9"* ]]; then - service_download_url="https://github.com/kasmtech/KasmVNC/releases/download/v1.3.4/kasmvncserver_oracle_9_1.3.4_x86_64.rpm" -elif [[ "$VERSION_ID" == *"8"* ]]; then - service_download_url="https://github.com/kasmtech/KasmVNC/releases/download/v1.3.2/kasmvncserver_oracle_8_1.3.2_x86_64.rpm" -fi - -MAX_RETRIES=5 -RETRY_INTERVAL=5 -attempt=0 -while ! is_kasmvnc_installed && [ $attempt -lt $MAX_RETRIES ]; do - check_sudo_access "Install kasmvnc-server" - echo "Attempt $((attempt+1)) to install kasmvnc..." - wget ${service_download_url} - sudo dnf localinstall ./kasmvncserver_*.rpm --allowerasing -y - rm ./kasmvncserver_*.rpm - sleep $RETRY_INTERVAL - attempt=$((attempt+1)) - # Disable ssl - #sudo sed -i 's/require_ssl: true/require_ssl: false/g' /usr/share/kasmvnc/kasmvnc_defaults.yaml - sudo chgrp pwuser /etc/pki/tls/private/kasmvnc.pem -done - -if ! is_kasmvnc_installed; then - displayErrorMessage "ERROR: KasmVNC installation failed." 
-fi - - -kernel_version=$(uname -r | tr '[:upper:]' '[:lower:]') -# Find an available display port -if [[ $kernel_version == *microsoft* ]]; then - # In windows only this port works - displayPort=5900 -else - minPort=5901 - maxPort=5999 - for port in $(seq ${minPort} ${maxPort} | shuf); do - out=$(netstat -aln | grep LISTEN | grep ${port}) - displayNumber=${port: -2} - XdisplayNumber=$(echo ${displayNumber} | sed 's/^0*//') - if [ -z "${out}" ] && ! [ -e /tmp/.X11-unix/X${XdisplayNumber} ]; then - # To prevent multiple users from using the same available port --> Write file to reserve it - portFile=/tmp/${port}.port.used - if ! [ -f "${portFile}" ]; then - touch ${portFile} - export displayPort=${port} - export DISPLAY=:${displayNumber#0} - break - fi - fi - done -fi - - -# YOU NEED TO SET A PASSWORD! -# The password can be ignoted later using vncserver ${DISPLAY} -disableBasicAuth - -if [ "${service_set_password}" != true ]; then - service_password=password - disableBasicAuth="-disableBasicAuth" -fi - -if ! which expect >/dev/null 2>&1; then - echo "$(date) expect is not installed. Attempting to install..." - sudo dnf install -y expect -fi - -# Verify installation -if ! which expect >/dev/null 2>&1; then - echo "$(date) ERROR: expect installation failed or expect is not in PATH" >&2 - exit 1 -fi - - -expect -c 'spawn vncpasswd -u '"${USER}"' -w -r; expect "Password:"; send "'"${service_password}"'\r"; expect "Verify:"; send "'"${service_password}"'\r"; expect eof' - - -vncserver -kill ${DISPLAY} -echo "vncserver -kill ${DISPLAY}" >> cancel.sh.sh - -MAX_RETRIES=5 -RETRY_DELAY=5 -RETRY_COUNT=0 - -vncserver_cmd="vncserver ${DISPLAY} ${disableBasicAuth} -select-de gnome -websocketPort ${kasmvnc_port} -rfbport ${displayPort}" -echo Running: -echo ${vncserver_cmd} -while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do - ${vncserver_cmd} - if [ $? -eq 0 ]; then - echo "KasmVNC server started successfully." - break - else - echo "KasmVNC server failed to start. 
Retrying in $RETRY_DELAY seconds..." - ls -l /etc/pki/tls/private/kasmvnc.pem - sleep $RETRY_DELAY - fi - - RETRY_COUNT=$((RETRY_COUNT + 1)) -done - -rm -rf ${portFile} - -if ! [ -f "${HOME}/.vnc/${HOSTNAME}${DISPLAY}.pid" ]; then - echo $(date): "KasmVNC server failed to start. Exiting workflow." - exit 1 -fi - -vncserver_pid=$(cat "${HOME}/.vnc/${HOSTNAME}${DISPLAY}.pid") -echo "kill ${vncserver_pid}" >> cancel.sh -cat "${HOME}/.vnc/${HOSTNAME}${DISPLAY}.log" -echo "rm \"${HOME}/.vnc/${HOSTNAME}${DISPLAY}*\"" >> cancel.sh - - -####################### -# START NGINX WRAPPER # -####################### - -echo "Starting nginx wrapper on service port ${service_port}" - -# Write config file -cat >> config.conf <> nginx.conf </dev/null && which docker >/dev/null 2>&1; then - container_name="nginx-${service_port}" - # Remove container when job is canceled - echo "sudo docker stop ${container_name}" >> cancel.sh - echo "sudo docker rm ${container_name}" >> cancel.sh - # Start container - sudo service docker start - touch empty - touch nginx.logs - # change ownership to nginx user - sudo chown 101:101 nginx.conf config.conf empty nginx.logs - sudo chmod 644 *.conf - sudo docker run -d --name ${container_name} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v $PWD/empty:/etc/nginx/conf.d/default.conf \ - -v $PWD/nginx.logs:/var/log/nginx/access.log \ - -v $PWD/nginx.logs:/var/log/nginx/error.log \ - --network=host nginxinc/nginx-unprivileged:1.25.3 - # Print logs - sudo docker logs ${container_name} -elif which singularity >/dev/null 2>&1; then - echo "Running singularity container ${service_nginx_sif}" - # We need to mount $PWD/tmp:/tmp because otherwise nginx writes the file /tmp/nginx.pid - # and other users cannot use the node. Was not able to change this in the config.conf. - mkdir -p ./tmp - # Need to overwrite default configuration! 
- touch empty - singularity run -B $PWD/tmp:/tmp -B $PWD/config.conf:/etc/nginx/conf.d/config.conf -B $PWD/nginx.conf:/etc/nginx/nginx.conf -B empty:/etc/nginx/conf.d/default.conf ${service_nginx_sif} >> nginx.logs 2>&1 & - pid=$! - echo "kill ${pid}" >> cancel.sh -else - displayErrorMessage "Need Docker or Singularity to start NGINX proxy" -fi - -################## -# LAUNCH SERVICE # -################## - -# Reload env in case it was deactivated in the step above (e.g.: conda activate) -eval "${service_load_env}" - -# Launch service -cd -if ! [ -z "${service_bin}" ]; then - if [[ ${service_background} == "False" ]]; then - echo "Running ${service_bin}" - eval ${service_bin} - else - echo "Running ${service_bin} in the background" - eval ${service_bin} & - echo $! >> ${resource_jobdir}/service.pid - fi -fi - -sleep inf diff --git a/kasmvnc/kill-template.sh b/kasmvnc/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/kasmvnc/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/kasmvnc/start-template-v3.sh b/kasmvnc/start-template-v3.sh deleted file mode 100644 index b784aee8b..000000000 --- a/kasmvnc/start-template-v3.sh +++ /dev/null @@ -1,163 +0,0 @@ - -check_sudo_access() { - if ! 
sudo -n true 2>/dev/null; then - echo "$(date): ERROR: Cannot $1 without root access" - exit 1 - fi -} - -# Check if kasmvnc-server is installed (using rpm -qa and grep) -is_kasmvnc_installed() { - rpm -qa | grep -q kasmvncserver -} - -# Initialize cancel script -echo '#!/bin/bash' > cancel.sh -chmod +x cancel.sh - -set -x - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - -if [ -z "${service_port}" ]; then - displayErrorMessage "ERROR: No service port found in the range \${minPort}-\${maxPort} -- exiting session" -fi - -export service_port=${service_port} - - - -MAX_RETRIES=5 -RETRY_INTERVAL=5 -attempt=0 -while ! is_kasmvnc_installed && [ $attempt -lt $MAX_RETRIES ]; do - check_sudo_access "Install kasmvnc-server" - echo "Attempt $((attempt+1)) to install kasmvnc..." - wget ${service_download_url} - sudo dnf localinstall ./kasmvncserver_*.rpm --allowerasing -y - rm ./kasmvncserver_*.rpm - sleep $RETRY_INTERVAL - attempt=$((attempt+1)) - # Disable ssl - #sudo sed -i 's/require_ssl: true/require_ssl: false/g' /usr/share/kasmvnc/kasmvnc_defaults.yaml -done - -if ! is_kasmvnc_installed; then - displayErrorMessage "ERROR: KasmVNC installation failed." -fi - -# Check if user is already in the group -if ! groups $USER | grep -q "\bkasmvnc-cert\b"; then - check_sudo_access "Add user to kasmvnc-cert group" - echo "User is not in kasmvnc-cert group. Adding..." - sudo usermod -a -G kasmvnc-cert $USER - echo "Running newgrp to apply group changes..." - env > env.sh - newgrp kasmvnc-cert - source env.sh -else - echo "User is already in kasmvnc-cert group." - needs_newgrp=false -fi - -if ! groups | grep -q "\bkasmvnc-cert\b"; then - echo $(date): "ERROR: User is not in kasmvnc-cert group." 
- exit 1 -fi - -kernel_version=$(uname -r | tr '[:upper:]' '[:lower:]') -# Find an available display port -if [[ $kernel_version == *microsoft* ]]; then - # In windows only this port works - displayPort=5900 -else - minPort=5901 - maxPort=5999 - for port in $(seq ${minPort} ${maxPort} | shuf); do - out=$(netstat -aln | grep LISTEN | grep ${port}) - displayNumber=${port: -2} - XdisplayNumber=$(echo ${displayNumber} | sed 's/^0*//') - if [ -z "${out}" ] && ! [ -e /tmp/.X11-unix/X${XdisplayNumber} ]; then - # To prevent multiple users from using the same available port --> Write file to reserve it - portFile=/tmp/${port}.port.used - if ! [ -f "${portFile}" ]; then - touch ${portFile} - export displayPort=${port} - export DISPLAY=:${displayNumber#0} - break - fi - fi - done -fi - - -# YOU NEED TO SET A PASSWORD! -# The password can be ignoted later using vncserver ${DISPLAY} -disableBasicAuth - -if [ "${service_set_password}" != true ]; then - service_password=password - disableBasicAuth="-disableBasicAuth" -fi -expect -c 'spawn vncpasswd -u '"${USER}"' -w -r; expect "Password:"; send "'"${service_password}"'\r"; expect "Verify:"; send "'"${service_password}"'\r"; expect eof' - - -vncserver -kill ${DISPLAY} -echo "vncserver -kill ${DISPLAY}" >> cancel.sh.sh - -MAX_RETRIES=5 -RETRY_DELAY=5 -RETRY_COUNT=0 - -vncserver_cmd="vncserver ${DISPLAY} ${disableBasicAuth} -select-de gnome -websocketPort ${service_port} -rfbport ${displayPort}" -echo Running: -echo ${vncserver_cmd} -while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do - ${vncserver_cmd} - if [ $? -eq 0 ]; then - echo "KasmVNC server started successfully." - break - else - echo "KasmVNC server failed to start. Retrying in $RETRY_DELAY seconds..." - ls -l /etc/pki/tls/private/kasmvnc.pem - sleep $RETRY_DELAY - fi - - RETRY_COUNT=$((RETRY_COUNT + 1)) -done - -rm -rf ${portFile} - -if ! [ -f "${HOME}/.vnc/${HOSTNAME}${DISPLAY}.pid" ]; then - echo $(date): "KasmVNC server failed to start. Exiting workflow." 
- exit 1 -fi - -vncserver_pid=$(cat "${HOME}/.vnc/${HOSTNAME}${DISPLAY}.pid") -echo "kill ${vncserver_pid}" >> cancel.sh -cat "${HOME}/.vnc/${HOSTNAME}${DISPLAY}.log" -echo "rm \"${HOME}/.vnc/${HOSTNAME}${DISPLAY}*\"" >> cancel.sh - - -# Reload env in case it was deactivated in the step above (e.g.: conda activate) -eval "${service_load_env}" - -# Launch service -cd -if ! [ -z "${service_bin}" ]; then - if [[ ${service_background} == "False" ]]; then - echo "Running ${service_bin}" - eval ${service_bin} - else - echo "Running ${service_bin} in the background" - eval ${service_bin} & - echo $! >> ${resource_jobdir}/service.pid - fi -fi - -sleep inf diff --git a/lib.sh b/lib.sh deleted file mode 100755 index 965cb8531..000000000 --- a/lib.sh +++ /dev/null @@ -1,12 +0,0 @@ -echod() { - echo $(date): $@ -} - - -displayErrorMessage() { - echo $(date): $1 - sed -i "s/.*ERROR_MESSAGE.*/ \"ERROR_MESSAGE\": \"$1\"/" service.json - sed -i "s/.*JOB_STATUS.*/ \"JOB_STATUS\": \"FAILED\",/" service.json - exit 1 -} - diff --git a/main.sh b/main.sh deleted file mode 100755 index 49e280bd5..000000000 --- a/main.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# Define a cleanup function -cleanup() { - ./utils/steps/clean_and_exit.sh 2>&1 | tee clean_and_exit.out -} - -# Set the trap to call cleanup on script exit -trap cleanup EXIT - - -./utils/steps/preprocessing.sh 2>&1 | tee preprocessing.out || exit 1 -./utils/steps/input_form_resource_wrapper.sh 2>&1 | tee input_form_resource_wrapper.out || exit 1 -./utils/steps/process_inputs_sh.sh 2>&1 | tee process_inputs_sh.out || exit 1 -./utils/steps/controller_preprocessing.sh 2>&1 | tee controller_preprocessing.out || exit 1 -./utils/steps/prepare_service_json.sh 2>&1 | tee prepare_service_json.out || exit 1 -./utils/steps/initialize_cancel_script.sh 2>&1 | tee initialize_cancel_script.out || exit 1 -./utils/steps/create_session_script.sh 2>&1 | tee create_session_script.out || exit 1 -./utils/steps/launch_job_and_wait.sh \ No 
newline at end of file diff --git a/marimo-host/controller-v3.sh b/marimo-host/controller-v3.sh deleted file mode 100644 index c14648de4..000000000 --- a/marimo-host/controller-v3.sh +++ /dev/null @@ -1,175 +0,0 @@ -cd ${resource_jobdir} - - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -service_conda_sh=${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - - -displayErrorMessage() { - echo $(date): $1 - exit 1 -} - -f_install_miniconda() { - install_dir=$1 - if [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing Miniconda3-latest-Linux-x86_64.sh" - conda_repo="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" - else - conda_repo="https://repo.anaconda.com/miniconda/Miniconda3-py312_24.9.2-0-Linux-x86_64.sh" - fi - ID=$(date +%s)-${RANDOM} # This script may run at the same time! - nohup wget --no-check-certificate ${conda_repo} -O /tmp/miniconda-${ID}.sh 2>&1 > /tmp/miniconda_wget-${ID}.out - rm -rf ${install_dir} - mkdir -p $(dirname ${install_dir}) - nohup bash /tmp/miniconda-${ID}.sh -b -p ${install_dir} 2>&1 > /tmp/miniconda_sh-${ID}.out - source ${install_dir}/etc/profile.d/conda.sh - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main - conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r -} - -f_set_up_conda_from_yaml() { - CONDA_DIR=$1 - CONDA_ENV=$2 - CONDA_YAML=$3 - CONDA_SH="${CONDA_DIR}/etc/profile.d/conda.sh" - # conda env export - # Remove line starting with name, prefix and remove empty lines - sed -i -e '/^name:/d' -e '/^prefix:/d' -e '/^$/d' ${CONDA_YAML} - - if [ ! -d "${CONDA_DIR}" ]; then - echo "Conda directory <${CONDA_DIR}> not found. Installing conda..." 
- f_install_miniconda ${CONDA_DIR} - fi - - echo "Sourcing Conda SH <${CONDA_SH}>" - source ${CONDA_SH} - - # Check if Conda environment exists - if ! conda env list | grep -q "${CONDA_ENV}"; then - echo "Creating Conda Environment <${CONDA_ENV}>" - conda create --name ${CONDA_ENV} - fi - - echo "Activating Conda Environment <${CONDA_ENV}>" - conda activate ${CONDA_ENV} - - echo "Installing condda environment from YAML" - conda env update -n ${CONDA_ENV} -f ${CONDA_YAML} -} - - -download_and_install_juice() { - # Configuration - local OUTPUT_FILE="juice.tgz" - - # Step 1: Get download URL from JuiceLabs API - echo "Fetching JuiceLabs download URL..." - download=$(curl -s 'https://electra.juicelabs.co/v2/public/download/linux' | python3 -c "import sys, json; print(json.load(sys.stdin)['url'])") - - - if [ -z "$download" ]; then - echo "ERROR: Download URL is empty" - exit 1 - fi - echo "Found download URL: $download" - - # Step 2: Prepare install directory - mkdir -p "${juice_install_dir}" - cd "${juice_install_dir}" || exit 1 - - # Step 3: Install prerequisites - sudo dnf install -y wget libatomic numactl-libs || { - echo "ERROR: Failed to install dependencies" - exit 1 - } - - # Step 4: Download Juice agent - echo "Downloading Juice agent..." - wget -O "$OUTPUT_FILE" "$download" || { - echo "ERROR: Failed to download file" - exit 1 - } - - # Step 5: Extract archive - echo "Extracting Juice agent..." 
- tar -xzvf "$OUTPUT_FILE" || { - echo "ERROR: Failed to extract $OUTPUT_FILE" - exit 1 - } - - echo "Juice agent successfully installed in ${juice_install_dir}" -} - - -if [[ "${service_conda_install}" == "true" ]]; then - - if [[ "${service_install_instructions}" == "install_command" ]]; then - echo "Running install command ${service_install_command}" - eval ${service_install_command} - elif [[ "${service_install_instructions}" == "yaml" ]]; then - echo "Installing custom conda environment" - printf "%b" "${service_yaml}" > conda.yaml - cat conda.yaml - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} conda.yaml - elif [[ "${service_install_instructions}" == "latest" ]]; then - echo "Installing latest" - { - source ${service_conda_sh} - } || { - conda_dir=$(echo ${service_conda_sh} | sed "s|etc/profile.d/conda.sh||g" ) - f_install_miniconda ${conda_dir} - source ${service_conda_sh} - } - { - eval "conda activate ${service_conda_env}" - } || { - conda create -n ${service_conda_env} marimo -y - eval "conda activate ${service_conda_env}" - } - if [ -z $(which marimo 2> /dev/null) ]; then - conda install -y -c conda-forge marimo - fi - else - echo "Installing conda environment ${service_install_instructions}.yaml" - f_set_up_conda_from_yaml ${service_parent_install_dir}/${service_conda_install_dir} ${service_conda_env} ${service_install_instructions}.yaml - fi - if [ -z "${service_load_env}" ]; then - service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" - fi -fi -eval "${service_load_env}" - -if [ -z $(which marimo 2> /dev/null) ]; then - echo "$(date) ERROR: marimo command not found" - exit 1 -fi - - -# Juice -if [[ "${juice_use_juice}" == "true" ]]; then - if [ -z "${juice_exec}" ]; then - juice_install_dir=${service_parent_install_dir}/juice - juice_exec=${service_parent_install_dir}/juice/juice - if ! 
[ -f ${juice_exec} ]; then - echo "INFO: Installing Juice" - mkdir -p ${juice_install_dir} - download_and_install_juice - fi - if ! [ -f ${juice_exec} ]; then - echo "ERROR: Juice installation failed" - exit 1 - fi - fi -fi - - -if [[ "${service_conda_install}" != "true" ]]; then - exit 0 -fi diff --git a/marimo-host/kill-template.sh b/marimo-host/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/marimo-host/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/marimo-host/nov2025.yaml b/marimo-host/nov2025.yaml deleted file mode 100644 index 94bd5a412..000000000 --- a/marimo-host/nov2025.yaml +++ /dev/null @@ -1,146 +0,0 @@ -name: base -channels: - - defaults - - conda-forge -dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=5.1=1_gnu - - anaconda-anon-usage=0.7.3=py313hfc0e8ea_100 - - anaconda-auth=0.10.0=py313h06a4308_1 - - anaconda-cli-base=0.5.4=py313h06a4308_0 - - annotated-types=0.6.0=py313h06a4308_0 - - anyio=4.11.0=pyhcf101f3_0 - - archspec=0.2.5=pyhd3eb1b0_0 - - boltons=25.0.0=py313h06a4308_0 - - brotlicffi=1.0.9.2=py313h6a678d5_1 - - bzip2=1.0.8=h5eee18b_6 - - c-ares=1.34.5=hef5626c_0 - - ca-certificates=2025.10.5=hbd8a1cb_0 - - certifi=2025.10.5=py313h06a4308_0 - - cffi=2.0.0=py313h4eded50_0 - - charset-normalizer=3.3.2=pyhd3eb1b0_0 - - click=8.1.8=py313h06a4308_0 - - conda=25.9.1=py313h06a4308_0 - - conda-anaconda-telemetry=0.3.0=pyhd3eb1b0_1 - - conda-anaconda-tos=0.2.2=py313h06a4308_1 - - conda-content-trust=0.2.0=py313h06a4308_1 - - conda-libmamba-solver=25.4.0=pyhd3eb1b0_0 - - conda-package-handling=2.4.0=py313h06a4308_0 - - conda-package-streaming=0.12.0=py313h06a4308_0 - - cpp-expected=1.1.0=hdb19cb5_0 - - cryptography=46.0.2=py313h498d7c9_0 - - dbus=1.16.2=h5bd4931_0 - - distro=1.9.0=py313h06a4308_0 - - docutils=0.22.3=pyhd8ed1ab_0 - - exceptiongroup=1.3.0=pyhd8ed1ab_0 - - expat=2.7.1=h6a678d5_0 - - fmt=11.2.0=hca5f364_0 - - 
frozendict=2.4.2=py313h06a4308_0 - - h11=0.16.0=pyhd8ed1ab_0 - - icu=73.1=h6a678d5_0 - - idna=3.7=py313h06a4308_0 - - importlib-metadata=8.7.0=pyhe01879c_1 - - itsdangerous=2.2.0=pyhd8ed1ab_1 - - jaraco.classes=3.4.0=py313h06a4308_0 - - jaraco.context=6.0.0=py313h06a4308_0 - - jaraco.functools=4.1.0=py313h06a4308_0 - - jedi=0.19.2=pyhd8ed1ab_1 - - jeepney=0.7.1=pyhd3eb1b0_0 - - jsonpatch=1.33=py313h06a4308_1 - - jsonpointer=3.0.0=py313h06a4308_0 - - keyring=25.6.0=py313h06a4308_0 - - ld_impl_linux-64=2.44=h153f514_2 - - libarchive=3.8.2=h3ec8f01_0 - - libcurl=8.15.0=hc1efc7f_0 - - libev=4.33=h7f8727e_1 - - libffi=3.4.4=h6a678d5_1 - - libgcc-ng=11.2.0=h1234567_1 - - libgomp=11.2.0=h1234567_1 - - libmamba=2.3.2=h4368100_0 - - libmambapy=2.3.2=py313hea2f153_0 - - libmpdec=4.0.0=h5eee18b_0 - - libnghttp2=1.57.0=h2d74bed_0 - - libsolv=0.7.30=h6f1ccf3_2 - - libssh2=1.11.1=h251f7ec_0 - - libstdcxx-ng=11.2.0=h1234567_1 - - libuuid=1.41.5=h5eee18b_0 - - libxcb=1.17.0=h9b100fa_0 - - libxml2=2.13.8=hfdd30dd_0 - - libzlib=1.3.1=hb25bd0a_0 - - loro=1.8.2=py313hdeb11d6_1 - - lz4-c=1.9.4=h6a678d5_1 - - marimo=0.15.2=py313hd5f5364_0 - - markdown=3.4.4=pyhd8ed1ab_0 - - markdown-it-py=4.0.0=py313h06a4308_0 - - mdurl=0.1.2=py313h06a4308_0 - - menuinst=2.3.1=py313h06a4308_0 - - more-itertools=10.8.0=py313h06a4308_0 - - narwhals=2.10.2=pyhcf101f3_0 - - ncurses=6.5=h7934f7d_0 - - nlohmann_json=3.11.2=h6a678d5_0 - - openssl=3.0.18=hd6dcaed_0 - - packaging=25.0=py313h06a4308_1 - - parso=0.8.5=pyhcf101f3_0 - - pcre2=10.46=hf426167_0 - - pip=25.2=pyhc872135_1 - - pkce=1.0.3=py313h06a4308_0 - - platformdirs=4.3.7=py313h06a4308_0 - - pluggy=1.5.0=py313h06a4308_0 - - psutil=7.0.0=py313hee96239_1 - - pthread-stubs=0.3=h0ce48e5_1 - - pybind11-abi=5=hd3eb1b0_0 - - pycosat=0.6.6=py313h5eee18b_2 - - pycparser=2.23=py313h06a4308_0 - - pydantic=2.12.2=py313h06a4308_0 - - pydantic-core=2.41.4=py313h498d7c9_0 - - pydantic-settings=2.10.1=py313h06a4308_0 - - pygments=2.19.1=py313h06a4308_0 - - 
pyjwt=2.10.1=py313h06a4308_0 - - pymdown-extensions=10.4=pyhd8ed1ab_0 - - pysocks=1.7.1=py313h06a4308_0 - - python=3.13.9=h7e8bc2b_100_cp313 - - python-dotenv=1.1.0=py313h06a4308_0 - - python_abi=3.13=1_cp313 - - pyyaml=6.0.3=pyh7db6752_0 - - readchar=4.2.1=py313h06a4308_0 - - readline=8.3=hc2a1206_0 - - reproc=14.2.4=h6a678d5_2 - - reproc-cpp=14.2.4=h6a678d5_2 - - requests=2.32.5=py313h06a4308_0 - - rich=14.2.0=py313h06a4308_0 - - ruamel.yaml=0.18.10=py313h5eee18b_0 - - ruamel.yaml.clib=0.2.12=py313h5eee18b_0 - - ruff=0.12.0=py313hc6f7160_0 - - secretstorage=3.4.0=py313h3e8c6aa_0 - - semver=3.0.2=py313h06a4308_1 - - setuptools=80.9.0=py313h06a4308_0 - - shellingham=1.5.0=py313h06a4308_0 - - simdjson=3.10.1=hdb19cb5_0 - - sniffio=1.3.1=pyhd8ed1ab_1 - - sqlite=3.50.2=hb25bd0a_1 - - starlette=0.50.0=pyhfdc7a7d_0 - - tk=8.6.15=h54e0aa7_0 - - tomli=2.2.1=py313h06a4308_0 - - tomlkit=0.13.3=pyha770c72_0 - - tqdm=4.67.1=py313h7040dfc_0 - - truststore=0.10.1=py313h06a4308_0 - - typer=0.17.4=py313h06a4308_0 - - typing-extensions=4.15.0=py313h06a4308_0 - - typing-inspection=0.4.2=py313h06a4308_0 - - typing_extensions=4.15.0=py313h06a4308_0 - - tzdata=2025b=h04d1e81_0 - - urllib3=2.5.0=py313h06a4308_0 - - uvicorn=0.38.0=pyh31011fe_0 - - websockets=15.0.1=py313h5eee18b_0 - - wheel=0.45.1=py313h06a4308_0 - - xorg-libx11=1.8.12=h9b100fa_1 - - xorg-libxau=1.0.12=h9b100fa_0 - - xorg-libxdmcp=1.1.5=h9b100fa_0 - - xorg-xorgproto=2024.1=h5eee18b_1 - - xz=5.6.4=h5eee18b_1 - - yaml=0.2.5=h7f98852_2 - - yaml-cpp=0.8.0=h6a678d5_1 - - zipp=3.23.0=pyhd8ed1ab_0 - - zlib=1.3.1=hb25bd0a_0 - - zstandard=0.24.0=py313h3d778a8_0 - - zstd=1.5.7=h11fc155_0 \ No newline at end of file diff --git a/marimo-host/start-template-v3.sh b/marimo-host/start-template-v3.sh deleted file mode 100755 index c5a2da00b..000000000 --- a/marimo-host/start-template-v3.sh +++ /dev/null @@ -1,49 +0,0 @@ -# Runs via ssh + sbatch -set -x - - -if [ -z "${service_load_env}" ]; then - 
service_conda_sh=${service_parent_install_dir}/${service_conda_install_dir}/etc/profile.d/conda.sh - service_load_env="source ${service_conda_sh}; conda activate ${service_conda_env}" -fi - -if [[ "${service_conda_install}" == "true" ]]; then - source ${service_conda_sh} - eval "conda activate ${service_conda_env}" -else - eval "${service_load_env}" -fi - - -# JUICE https://docs.juicelabs.co/docs/juice/intro -if [[ "${juice_use_juice}" == "true" ]]; then - echo "INFO: Enabling Juice for remote GPU access" - if [ -z "${juice_exec}" ]; then - juice_exec=${service_parent_install_dir}/juice/juice - echo "INFO: Set Juice executable path to ${juice_exec}" - fi - - if ! [ -z "${juice_vram}" ]; then - vram_arg="--vram ${juice_vram}" - fi - if ! [ -z "${juice_pool_ids}" ]; then - pool_ids_arg="--pool-ids ${juice_pool_ids}" - fi - juice_cmd="${juice_exec} run ${juice_cmd_args} ${vram_arg} ${pool_ids_arg}" - echo "INFO: Prepared Juice command: ${juice_cmd}" - echo "INFO: Logging into Juice with provided token" - ${juice_exec} login -t "${JUICE_TOKEN}" || { - echo "ERROR: Failed to log into Juice" - exit 1 - } -fi - -date - -if ! 
[ -z "${service_script}" ]; then - ${juice_cmd} marimo ${service_mode} ${service_script} --port ${service_port} --host ${HOSTNAME} --no-token -else - ${juice_cmd} marimo tutorial intro --port ${service_port} --no-token --host ${HOSTNAME} -fi - -sleep inf diff --git a/marimo-host/transfer_files.sh b/marimo-host/transfer_files.sh deleted file mode 100644 index d9468c2dd..000000000 --- a/marimo-host/transfer_files.sh +++ /dev/null @@ -1,7 +0,0 @@ - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${resource_workdir}/pw/software -fi - -rsync -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" -avzq --rsync-path="mkdir -p ${service_parent_install_dir} && rsync" ${pw_job_dir}/${service_name}/*.yaml ${resource_publicIp}:${resource_jobdir} - diff --git a/matlab-docker/kill-template.sh b/matlab-docker/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/matlab-docker/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/matlab-docker/start-template-v3.sh b/matlab-docker/start-template-v3.sh deleted file mode 100755 index 631d52e7a..000000000 --- a/matlab-docker/start-template-v3.sh +++ /dev/null @@ -1,140 +0,0 @@ - - -# Initialize cancel script -echo '#!/bin/bash' > cancel.sh -chmod +x cancel.sh -matlab_port=$(findAvailablePort) - -####################### -# START NGINX WRAPPER # -####################### - -echo "Starting nginx wrapper on service port ${service_port}" - -# Write config file -cat >> config.conf <> nginx.conf <> cancel.sh -echo "sudo docker rm ${container_name}" >> cancel.sh -# Start container -sudo service docker start -touch -sudo chown 101:101 nginx.conf config.conf empty nginx.logs -sudo chmod 644 *.conf -sudo docker run -d --name ${container_name} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v ${PWD}/empty:/etc/nginx/conf.d/default.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - --network=host 
nginxinc/nginx-unprivileged:1.25.3 -# Print logs -sudo docker logs ${container_name} - -####################### -# START MATLAB DOCKER # -####################### -sudo docker pull ${container_name} -container_name="matlab-${service_port}" -echo "sudo docker stop ${container_name}" >> cancel.sh -echo "sudo docker rm ${container_name}" >> cancel.sh - -if [[ ${service_use_gpus} == "true" ]]; then - gpu_flag="--gpus all" - # FIXME: This should go to the image creation - curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo - sudo yum-config-manager --enable nvidia-container-toolkit-experimental - sudo yum install -y nvidia-container-toolkit - sudo nvidia-ctk runtime configure --runtime=docker - sudo systemctl restart docker -else - gpu_flag="" -fi - -MWI_BASE_URL="${basepath}/" - -# Docker supports mounting directories that do not exist (singularity does not) -set -x - -sudo docker pull ${service_docker_repo} - -# https://docs.docker.com/config/containers/container-networking/ -# sudo docker run -it --rm -p 8888:8888 --shm-size=512M mathworks/matlab:r2022a -browser -# cant run "-it" in the workflow! 
Fails with message: the input device is not a TTY -sudo -n docker run ${gpu_flag} -i --rm \ - --name ${container_name} \ - ${service_mount_directories} -v ${HOME}:${HOME} \ - -p ${matlab_port}:${matlab_port} \ - --shm-size=512M \ - --env MWI_LOG_LEVEL=DEBUG \ - --env MWI_ENABLE_WEB_LOGGING=True \ - --env MWI_APP_HOST=0.0.0.0 \ - --env MWI_APP_PORT=${matlab_port} \ - --env MWI_ENABLE_TOKEN_AUTH=False \ - --env MWI_BASE_URL=${MWI_BASE_URL} \ - ${service_docker_repo} \ - -browser - -# --env MWI_CUSTOM_HTTP_HEADERS='{"Content-Security-Policy": "frame-ancestors *cloud.parallel.works:* https://cloud.parallel.works:*;"}' \ - -sleep 999999999 diff --git a/metabase/kill-template.sh b/metabase/kill-template.sh deleted file mode 100644 index 4bcb1d009..000000000 --- a/metabase/kill-template.sh +++ /dev/null @@ -1,7 +0,0 @@ - -{ - bash ${resource_jobdir}/docker-kill-${job_number}.sh - rm ${resource_jobdir}/docker-kill-${job_number}.sh -} || { - echo "ERROR: Could not run bash ${resource_jobdir}/docker-kill-${job_number}.sh. Please run it manually!" -} diff --git a/metabase/start-template-v3.sh b/metabase/start-template-v3.sh deleted file mode 100644 index 45f3c3119..000000000 --- a/metabase/start-template-v3.sh +++ /dev/null @@ -1,162 +0,0 @@ -# This script runs in an environment with the following variables: - -# Defined in the input form: -# - jobschedulertype -# - service_mount_directories -# - service_docker_repo - -# Added by the workflow -# - job_number: PW job number, e.g.: 00001 - - -# service_port: This value can be specified in the input form. Otherwise, the workflow -# selects any available port - -# Check if the user can execute commands with sudo -if ! sudo -v >/dev/null 2>&1; then - displayErrorMessage "You do not have sudo access. Exiting." 
-fi - -set -x -sudo systemctl start docker - -####################### -# START NGINX WRAPPER # -####################### - -echo "Starting nginx wrapper on service port ${service_port}" - -# Write config file -cat >> config.conf <> nginx.conf < docker-kill-${job_number}.sh -chmod +x docker-kill-${job_number}.sh - -if sudo -n true 2>/dev/null && which docker >/dev/null 2>&1; then - container_name="nginx-${service_port}" - # Remove container when job is canceled - # CREATE CANCEL SCRIPT TO REMOVE DOCKER CONTAINER WHEN THE PW JOB IS CANCELED - if [[ ${jobschedulertype} == "CONTROLLER" ]]; then - echo sudo "sudo docker stop ${container_name}" >> docker-kill-${job_number}.sh - echo sudo "sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh - else - # Create kill script. Needs to be here because we need the hostname of the compute node. - echo ssh "'$(hostname)' sudo docker stop ${container_name}" >> docker-kill-${job_number}.sh - echo ssh "'$(hostname)' sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh - fi - # Start container - touch empty - touch nginx.logs - # change ownership to nginx user - sudo chown 101:101 nginx.conf config.conf empty nginx.logs - sudo chmod 644 *.conf - sudo docker run -d --name ${container_name} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v $PWD/empty:/etc/nginx/conf.d/default.conf \ - -v $PWD/nginx.logs:/var/log/nginx/access.log \ - -v $PWD/nginx.logs:/var/log/nginx/error.log \ - --network=host nginxinc/nginx-unprivileged:1.25.3 - # Print logs - sudo docker logs ${container_name} -elif which singularity >/dev/null 2>&1; then - echo "Running singularity container ${service_nginx_sif}" - # We need to mount $PWD/tmp:/tmp because otherwise nginx writes the file /tmp/nginx.pid - # and other users cannot use the node. Was not able to change this in the config.conf. - mkdir -p ./tmp - # Need to overwrite default configuration! 
- touch empty - singularity run -B $PWD/tmp:/tmp -B $PWD/config.conf:/etc/nginx/conf.d/config.conf -B $PWD/nginx.conf:/etc/nginx/nginx.conf -B empty:/etc/nginx/conf.d/default.conf ${service_nginx_sif} >> nginx.logs 2>&1 & - pid=$! - echo "kill ${pid}" >> docker-kill-${job_number}.sh -else - displayErrorMessage "Need Docker or Singularity to start NGINX proxy" -fi - -# Run docker container -container_name="metabase" - -# CREATE CANCEL SCRIPT TO REMOVE DOCKER CONTAINER WHEN THE PW JOB IS CANCELED -if [[ ${jobschedulertype} == "CONTROLLER" ]]; then - echo sudo "sudo docker stop ${container_name}" >> docker-kill-${job_number}.sh - echo sudo "sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh -else - # Create kill script. Needs to be here because we need the hostname of the compute node. - echo ssh "'$(hostname)' sudo docker stop ${container_name}" >> docker-kill-${job_number}.sh - echo ssh "'$(hostname)' sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh -fi - -chmod 777 docker-kill-${job_number}.sh - -# Start container -sudo -n docker run -d --network=host \ - -v ~/metabase-data:/metabase-data \ - -e "MB_DB_FILE=/metabase-data/metabase.db" \ - --name ${container_name} ${service_image} - -sudo docker logs -f ${container_name} - -# If running docker with the -d option sleep here! -# Do not exit this script until the job is canceled! -# Exiting this script before the job is canceled triggers the cancel script! 
-sleep inf diff --git a/mlflow/start-template-v3.sh b/mlflow/start-template-v3.sh deleted file mode 100644 index 80b5453f8..000000000 --- a/mlflow/start-template-v3.sh +++ /dev/null @@ -1,10 +0,0 @@ - -# Load or install mlflow -if [ -z "${service_load_env}" ]; then - export PATH=${PATH}:~/.local/bin - eval ${service_mlflow_install_cmd} -else - eval ${service_mlflow_load_cmd} -fi - -mlflow server --port ${service_port} --host ${HOSTNAME} ${additional_flags} diff --git a/n8n/kill-template.sh b/n8n/kill-template.sh deleted file mode 100755 index 8c8de028a..000000000 --- a/n8n/kill-template.sh +++ /dev/null @@ -1,4 +0,0 @@ - -# Runs in the controller node: -bash ${resource_jobdir}/cancel.sh - diff --git a/n8n/start-template-v3.sh b/n8n/start-template-v3.sh deleted file mode 100755 index 0e131195b..000000000 --- a/n8n/start-template-v3.sh +++ /dev/null @@ -1,55 +0,0 @@ -# Runs via ssh + sbatch -set -x - -mkdir -p n8n_data -chmod 777 n8n_data -Rf - -cat >> docker-compose.yml < service-kill-${job_number}.sh --> service-kill-${job_number}-main.sh -echo "Creating file ${resource_jobdir}/service-kill-${job_number}-main.sh from directory ${PWD}" -if [[ ${jobschedulertype} != "CONTROLLER" ]]; then - # Remove .cluster.local for einteinmed! - hname=$(hostname | sed "s/.cluster.local//g") - echo "ssh ${hname} 'bash -s' < ${resource_jobdir}/service-kill-${job_number}-main.sh" > ${resource_jobdir}/service-kill-${job_number}.sh -else - echo "bash ${resource_jobdir}/service-kill-${job_number}-main.sh" > ${resource_jobdir}/service-kill-${job_number}.sh -fi - -cat >> ${resource_jobdir}/service-kill-${job_number}-main.sh <= 3.7, while +# the system python3 is 3.6 on some cluster nodes. +SLURM_APP_VENV=${service_parent_install_dir}/ngencerf-venv +PYTHON_BIN=python3.8 + +if ! command -v ${PYTHON_BIN} >/dev/null 2>&1; then + echo "::error::${PYTHON_BIN} is required for the SLURM wrapper app but was not found." 
+ exit 1 +fi + +# Rebuild the venv if it is missing gunicorn or was created with an unsupported +# Python (e.g. an older venv built with python3 == 3.6). +if [ -f "${SLURM_APP_VENV}/bin/gunicorn" ] && \ + "${SLURM_APP_VENV}/bin/python" -c 'import sys; sys.exit(0 if sys.version_info >= (3, 7) else 1)' >/dev/null 2>&1; then + echo "::notice::Python dependencies already installed at ${SLURM_APP_VENV}" +else + echo "::group::Python Dependencies" + echo "::notice::Creating virtual environment at ${SLURM_APP_VENV} using ${PYTHON_BIN}" + rm -rf "${SLURM_APP_VENV}" + mkdir -p ${service_parent_install_dir} + ${PYTHON_BIN} -m venv ${SLURM_APP_VENV} + ${SLURM_APP_VENV}/bin/pip install Flask gunicorn + echo "::endgroup::" +fi diff --git a/ngencerf/start-template-v3.sh b/ngencerf/start-template-v3.sh index d3baf0f00..9f7b14ddb 100644 --- a/ngencerf/start-template-v3.sh +++ b/ngencerf/start-template-v3.sh @@ -3,6 +3,10 @@ set -x echo whoami ": $(whoami)" +if [ -z ${service_parent_install_dir} ]; then + service_parent_install_dir=${HOME}/pw/software +fi + PORT=5000 if lsof -i :$PORT >/dev/null 2>&1; then echo @@ -43,6 +47,13 @@ if [[ "${service_only_connect}" == "true" ]]; then sleep infinity fi +# Previously provided by the session wrapper; the v1.4 session_runner injects no +# helper functions, so define it here. +displayErrorMessage() { + echo $(date): $1 + exit 1 +} + if ! [ -f "${service_nginx_sif}" ]; then displayErrorMessage "NGINX proxy singularity container was not found ${service_nginx_sif}" fi @@ -175,23 +186,17 @@ echo "kill $!" >> cancel.sh # Launch SLURM Wrapper Flask App # ################################## # Transfer Python script -if ! [ -f slurm-wrapper-app-v3.py ]; then - displayErrorMessage "SLURM wrapper slurm-wrapper-app-v3.py app not found " -fi - - -# Install Flask -sudo -n pip3.8 install Flask -sudo -n pip3.8 install gunicorn +cp ${PW_PARENT_JOB_DIR}/ngencerf/slurm-wrapper-app-v3.py . 
# Start Flask app using gunicorn export PARTITIONS=$(scontrol show partition | awk -F '=' '/^PartitionName=/ {printf "%s,", $2}' | sed 's/,$//') # This script is required to run the callback with retries +cp ${PW_PARENT_JOB_DIR}/ngencerf/run_callback.sh . sed -i "s|__LOCAL_DATA_DIR__|${local_data_dir}|g" run_callback.sh chmod +x run_callback.sh -/usr/local/bin/gunicorn -w ${service_slurm_app_workers} -b 0.0.0.0:5000 slurm-wrapper-app-v3:app \ +${service_parent_install_dir}/ngencerf-venv/bin/gunicorn -w ${service_slurm_app_workers} -b 0.0.0.0:5000 slurm-wrapper-app-v3:app \ --access-logfile slurm-wrapper-app-v3.log \ --error-logfile slurm-wrapper-app-v3.log \ --capture-output \ @@ -201,6 +206,7 @@ slurm_wrapper_pid=$! echo "kill ${slurm_wrapper_pid}" >> cancel.sh # Rerun previous callbacks +cp ${PW_PARENT_JOB_DIR}/ngencerf/run_pending_callbacks.sh . sed -i "s|__LOCAL_DATA_DIR__|${local_data_dir}|g" run_pending_callbacks.sh bash run_pending_callbacks.sh >> run_pending_callback.log 2>&1 & run_pending_callbacks_pid=$! diff --git a/nginx-docker/kill-template.sh b/nginx-docker/kill-template.sh deleted file mode 100755 index df5cf1fc3..000000000 --- a/nginx-docker/kill-template.sh +++ /dev/null @@ -1,8 +0,0 @@ - -{ - bash ${resource_jobdir}/docker-kill-${job_number}.sh - mv ${resource_jobdir}/docker-kill-${job_number}.sh ${resource_jobdir}/docker-kill-${job_number}.sh.completed -} || { - echo "ERROR: Could not run bash ${resource_jobdir}/docker-kill-${job_number}.sh. Please run it manually!" 
- exit 1 -} diff --git a/nginx-docker/start-template-v3.sh b/nginx-docker/start-template-v3.sh deleted file mode 100755 index 0e07dc703..000000000 --- a/nginx-docker/start-template-v3.sh +++ /dev/null @@ -1,48 +0,0 @@ -# This script runs in an environment with the following variables: - -# Defined in the input form: -# - jobschedulertype -# - service_mount_directories -# - service_image - -# Added by the workflow -# - job_number: PW job number, e.g.: 00001 - - -# service_port: This value can be specified in the input form. Otherwise, the workflow -# selects any available port in the range 6000-9000 - -# Check if the user can execute commands with sudo -if ! sudo -v >/dev/null 2>&1; then - displayErrorMessage "You do not have sudo access. Exiting." -fi - -# Run docker container -container_name="nginx-${service_port}" - -# CREATE CANCEL SCRIPT TO REMOVE DOCKER CONTAINER WHEN THE PW JOB IS CANCELED -if [[ ${jobschedulertype} == "CONTROLLER" ]]; then - echo sudo "sudo docker stop ${container_name}" > docker-kill-${job_number}.sh - echo sudo "sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh -else - # Create kill script. Needs to be here because we need the hostname of the compute node. - echo ssh "'$(hostname)' sudo docker stop ${container_name}" > docker-kill-${job_number}.sh - echo ssh "'$(hostname)' sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh -fi - -chmod 777 docker-kill-${job_number}.sh - -# Start container -sudo service docker start - -sudo -n docker run -d --name ${container_name} \ - ${service_mount_directories} -v ${HOME}:${HOME} \ - -p $service_port:80 \ - ${service_image} - -sudo docker logs ${container_name} - -# If running docker with the -d option sleep here! -# Do not exit this script until the job is canceled! -# Exiting this script before the job is canceled triggers the cancel script! 
-sleep inf diff --git a/nicedcv/kill-template.sh b/nicedcv/kill-template.sh deleted file mode 100755 index 187ca25fa..000000000 --- a/nicedcv/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash ${resource_jobdir}/service-kill-${job_number}.sh - diff --git a/nicedcv/url.sh b/nicedcv/url.sh deleted file mode 100755 index 29a9f7bae..000000000 --- a/nicedcv/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="\"" \ No newline at end of file diff --git a/novnc-docker/kill-template.sh b/novnc-docker/kill-template.sh deleted file mode 100755 index 616a00b9e..000000000 --- a/novnc-docker/kill-template.sh +++ /dev/null @@ -1,4 +0,0 @@ - -# Runs in the controller node: -bash ${resource_jobdir}/service-kill-${job_number}.sh - diff --git a/novnc-docker/url.sh b/novnc-docker/url.sh deleted file mode 100644 index 620132989..000000000 --- a/novnc-docker/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="vnc.html?resize=remote\&autoconnect=true\&show_dot=true\&path=websockify\&password=headless\&host=\"+window.location.host+\"/me/${openPort}\"+\"\/\&dt=\"+(new Date()).getTime()" \ No newline at end of file diff --git a/ollama-openwebui/controller-v3.sh b/ollama-openwebui/controller-v3.sh deleted file mode 100644 index 61588b032..000000000 --- a/ollama-openwebui/controller-v3.sh +++ /dev/null @@ -1,18 +0,0 @@ -cd ${resource_jobdir} - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/open-webui.sif -fi - -if ! 
[ -f ${service_nginx_sif} ]; then - echo; echo - echo "Singularity container ${service_nginx_sif} not found" - echo "Creating container" - module load singularity - singularity pull ${service_nginx_sif} docker://ghcr.io/open-webui/open-webui:main -fi - diff --git a/ollama-openwebui/kill-template.sh b/ollama-openwebui/kill-template.sh deleted file mode 100755 index 5805056ac..000000000 --- a/ollama-openwebui/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash "${resource_jobdir}/cancel.sh" - diff --git a/ollama-openwebui/start-template-v3.sh b/ollama-openwebui/start-template-v3.sh deleted file mode 100755 index 690bdc78c..000000000 --- a/ollama-openwebui/start-template-v3.sh +++ /dev/null @@ -1,43 +0,0 @@ -# Runs via ssh + sbatch -set -x - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/open-webui.sif -fi - -# Initialize cancel script -echo '#!/bin/bash' > cancel.sh -chmod +x cancel.sh - - -ollama_port=$(findAvailablePort) - -module load ollama -module load singularity - -export OLLAMA_MODELS=${service_models} -export OLLAMA_HOST=0.0.0.0:${ollama_port} -export OLLAMA_NUM_PARALLEL=${service_num_parallel} -export OLLAMA_MAX_LOADED_MODELS=${service_max_loaded_models} -export OLLAMA_DEFAULT_KEEP_ALIVE=${service_default_keep_alive} - -echo; echo -echo "STARTING OLLAMA SERVER" -ollama serve & -ollama_pid=$! 
-echo "kill ${ollama_pid} # ollama pid" >> cancel.sh -sleep 5 - -echo; echo -echo "STARTING OPEN-WEBUI" -mkdir open-webui -echo "{\"version\": 0, \"ui\": {}, \"ollama\": {\"base_urls\": [\"http://0.0.0.0:$ollama_port\"]}}" > open-webui/config.json -singularity exec --bind open-webui:/app/backend/data \ - --env WEBUI_AUTH=False \ - --env OLLAMA_API_BASE_URL=http://0.0.0.0:${ollama_port} \ - --env WEBUI_PORT=${service_port} \ - ${service_nginx_sif} /app/backend/start.sh diff --git a/openvscode/controller-v3.sh b/openvscode/controller-v3.sh deleted file mode 100755 index 8773eb448..000000000 --- a/openvscode/controller-v3.sh +++ /dev/null @@ -1,159 +0,0 @@ - -displayErrorMessage() { - echo $(date): $1 -} - - -init_code_server_settings() { - local settings_dir=${HOME}/.local/share/code-server/User - local settings_json=${settings_dir}/settings.json - mkdir -p ${settings_dir} - -cat > "${settings_json}" < service-kill-${job_number}.sh --> service-kill-${job_number}-main.sh -echo "Creating file ${resource_jobdir}/service-kill-${job_number}-main.sh from directory ${PWD}" -if [[ ${jobschedulertype} != "CONTROLLER" ]]; then - # Remove .cluster.local for einteinmed! - hname=$(hostname | sed "s/.cluster.local//g") - echo "ssh ${hname} 'bash -s' < ${resource_jobdir}/service-kill-${job_number}-main.sh" > ${resource_jobdir}/service-kill-${job_number}.sh -else - echo "bash ${resource_jobdir}/service-kill-${job_number}-main.sh" > ${resource_jobdir}/service-kill-${job_number}.sh -fi - -cat >> ${resource_jobdir}/service-kill-${job_number}-main.sh </dev/null 2>&1; then - displayErrorMessage "You do not have sudo access. Exiting." 
-fi - -set -x - -# Run docker container -container_name="pgadmin4-${service_port}" - -# CREATE CANCEL SCRIPT TO REMOVE DOCKER CONTAINER WHEN THE PW JOB IS CANCELED -if [[ ${jobschedulertype} == "CONTROLLER" ]]; then - echo sudo "sudo docker stop ${container_name}" > docker-kill-${job_number}.sh - echo sudo "sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh -else - # Create kill script. Needs to be here because we need the hostname of the compute node. - echo ssh "'$(hostname)' sudo docker stop ${container_name}" > docker-kill-${job_number}.sh - echo ssh "'$(hostname)' sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh -fi - -chmod 777 docker-kill-${job_number}.sh - -# Start container -sudo systemctl start docker -sudo -n docker run -d --name ${container_name} \ - ${service_mount_directories} \ - -p $service_port:80 \ - -e PGADMIN_DEFAULT_EMAIL=${service_email} \ - -e PGADMIN_DEFAULT_PASSWORD=${service_password} \ - ${service_image} - -sudo docker logs ${container_name} - -# If running docker with the -d option sleep here! -# Do not exit this script until the job is canceled! -# Exiting this script before the job is canceled triggers the cancel script! -sleep inf diff --git a/postgres/kill-template.sh b/postgres/kill-template.sh deleted file mode 100755 index 2d456c5bd..000000000 --- a/postgres/kill-template.sh +++ /dev/null @@ -1,7 +0,0 @@ - -# { -# bash ${resource_jobdir}/docker-kill-${job_number}.sh -# rm ${resource_jobdir}/docker-kill-${job_number}.sh -# } || { -# echo "ERROR: Could not run bash ${resource_jobdir}/docker-kill-${job_number}.sh. Please run it manually!" 
-# } diff --git a/postgres/start-template-v3.sh b/postgres/start-template-v3.sh deleted file mode 100755 index cd472df54..000000000 --- a/postgres/start-template-v3.sh +++ /dev/null @@ -1,60 +0,0 @@ -# This script runs in an environment with the following variables: - -# Defined in the input form: -# - jobschedulertype -# - service_mount_directories -# - service_docker_repo - -# Added by the workflow -# - job_number: PW job number, e.g.: 00001 - - -# service_port: This value can be specified in the input form. Otherwise, the workflow -# selects any available port - -# Check if the user can execute commands with sudo -if ! sudo -v >/dev/null 2>&1; then - displayErrorMessage "You do not have sudo access. Exiting." -fi - -set -x - -# Run docker container -container_name="postgres-${service_port}" - -# CREATE CANCEL SCRIPT TO REMOVE DOCKER CONTAINER WHEN THE PW JOB IS CANCELED -if [[ ${jobschedulertype} == "CONTROLLER" ]]; then - echo sudo "sudo docker stop ${container_name}" > docker-kill-${job_number}.sh - echo sudo "sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh -else - # Create kill script. Needs to be here because we need the hostname of the compute node. - echo ssh "'$(hostname)' sudo docker stop ${container_name}" > docker-kill-${job_number}.sh - echo ssh "'$(hostname)' sudo docker rm ${container_name}" >> docker-kill-${job_number}.sh -fi - -chmod 777 docker-kill-${job_number}.sh - -if ! 
[ -z "${service_db}" ]; then - POSTGRES_DB_ENV=" -e POSTGRES_DB=${service_db}" -fi - -# Start container -sudo mkdir -p /postgres-data - -sudo systemctl start docker -sudo -n docker run -d --rm --name ${container_name} \ - ${service_mount_directories} \ - -p $service_port:5432 \ - -e PGDATA=/var/lib/postgresql/data/pgdata \ - -v /postgres-data:/var/lib/postgresql/data \ - -e POSTGRES_USER=${service_user} -e POSTGRES_PASSWORD=${service_password} ${POSTGRES_DB_ENV} \ - ${service_image} - -sleep 5 - -sudo docker logs ${container_name} - -# If running docker with the -d option sleep here! -# Do not exit this script until the job is canceled! -# Exiting this script before the job is canceled triggers the cancel script! -#sleep inf diff --git a/pvweb-singularity/paraviewweb-pv-osmesa-v5.6.0.def b/pvweb-singularity/paraviewweb-pv-osmesa-v5.6.0.def deleted file mode 100644 index 232bba6f7..000000000 --- a/pvweb-singularity/paraviewweb-pv-osmesa-v5.6.0.def +++ /dev/null @@ -1,33 +0,0 @@ -Bootstrap: docker -From: kitware/paraviewweb:pv-osmesa-v5.6.0 - -# This container is intended for those *without* nvidia graphics cards! 
-# If you have an nvidia graphics card, see the Singularity.nv recipe -# Usage: - -%help - Run singularity run -B data_dir:/data paraviewweb.sif bash run-paraviewweb.sh - Where run-paraviewweb.sh: - /opt/paraview/install/bin/pvpython '"${EXTRA_PVPYTHON_ARGS}"' \ - /opt/paraview/install/share/paraview-5.6/web/visualizer/server/pvw-visualizer.py \ - --content /opt/paraview/install/share/paraview-5.6/web/visualizer/www \ - --port 8081 \ - --data /data \ - --viewport-max-width 1920 \ - --viewport-max-height 1080 \ - --timeout 30 - - Based on https://github.com/singularityhub/paraview-visualizer/blob/master/Singularity - -%setup - mkdir -p $SINGULARITY_ROOTFS/data - mkdir -p $SINGULARITY_ROOTFS/usr/local/bin - exit 0 - -%environment - export EXTRA_PVPYTHON_ARGS="-dr --mesa-swr" - export LANG=C - ALLOW_HTTP=true - URL=localhost - export ALLOW_HTTP URL - diff --git a/pvweb-singularity/url.sh b/pvweb-singularity/url.sh deleted file mode 100644 index 29a9f7bae..000000000 --- a/pvweb-singularity/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="\"" \ No newline at end of file diff --git a/r-singularity/notes.txt b/r-singularity/notes.txt deleted file mode 100755 index 530dfe56f..000000000 --- a/r-singularity/notes.txt +++ /dev/null @@ -1,2 +0,0 @@ -Build singularity container with: -sudo singularity build rserver.sif rserver.def \ No newline at end of file diff --git a/r-singularity/rserver.def b/r-singularity/rserver.def deleted file mode 100755 index e1ee11cf3..000000000 --- a/r-singularity/rserver.def +++ /dev/null @@ -1,19 +0,0 @@ -BootStrap: docker -From: centos:centos7 - -%post - yum install epel-release -y - yum install wget -y - yum install R -y - wget https://download2.rstudio.org/server/centos7/x86_64/rstudio-server-rhel-1.4.1717-x86_64.rpm - yum install rstudio-server-rhel-1.4.1717-x86_64.rpm -y - -%startscript - /usr/lib/rstudio-server/bin/rserver - -%labels - Author Alvaro.Vidal - Version v0.0.1 - -%help - This is a container with centos7 and R server \ No 
newline at end of file diff --git a/r-singularity/rserver.help b/r-singularity/rserver.help deleted file mode 100755 index b69841bec..000000000 --- a/r-singularity/rserver.help +++ /dev/null @@ -1,225 +0,0 @@ -command-line options: - -verify: - --verify-installation arg (=0) Runs verification mode to verify the - current installation. - -server: - --server-working-dir arg (=/) The default working directory of the - rserver process. - --server-user arg (=rstudio-server) The user account of the rserver - process. - --server-daemonize arg (=0) Indicates whether or not the rserver - process should run as a daemon. - --server-pid-file arg (=/var/run/rstudio-server.pid) - The path to a file where the rserver - daemon's pid is written. - --server-app-armor-enabled arg (=0) Indicates whether or not to enable - AppArmor profiles for the rserver - process. - --server-set-umask arg (=1) If enabled, sets the rserver process - umask to 022 on startup, which causes - new files to have rw-r-r permissions. - --secure-cookie-key-file arg If set, overrides the default path of - the secure-cookie-key file used for - encrypting cookies. - --server-data-dir arg (=/var/run/rstudio-server) - Path to the data directory where - RStudio Server will write run-time - state. - --server-add-header arg Adds a header to all responses from - RStudio Server. This option can be - specified multiple times to add - multiple headers. - -www: - --www-address arg (=0.0.0.0) The network address that RStudio Server - will listen on for incoming - connections. - --www-port arg The port that RStudio Server will bind - to while listening for incoming - connections. If left empty, the port - will be automatically determined based - on your SSL settings (443 for SSL, 80 - for no SSL). - --www-root-path arg (=/) The path prefix added by a proxy to the - incoming RStudio URL. This setting is - used so RStudio Server knows what path - it is being served from. 
If running - RStudio Server behind a path-modifying - proxy, this should be changed to match - the base RStudio Server URL. - --www-local-path arg (=www) The relative path from the RStudio - installation directory, or absolute - path where web assets are stored. - --www-symbol-maps-path arg (=www-symbolmaps) - The relative path from the RStudio - installation directory, or absolute - path, where symbol maps are stored. - --www-use-emulated-stack arg (=0) Indicates whether or not to use GWT's - emulated stack. - --www-thread-pool-size arg (=2) The size of the threadpool from which - requests will be serviced. This may be - increased to enable more concurrency, - but should only be done if the - underlying hardware has more than 2 - cores. It is recommended to use a value - that is <= to the number of hardware - cores, or <= to two times the number of - hardware cores if the hardware utilizes - hyperthreading. - --www-proxy-localhost arg (=1) Indicates whether or not to proxy - requests to localhost ports over the - main server port. This should generally - be enabled, and is used to proxy HTTP - traffic within a session that belongs - to code running within the session - (e.g. Shiny or Plumber APIs) - --www-verify-user-agent arg (=1) Indicates whether or not to verify - connecting browser user agents to - ensure they are compatible with RStudio - Server. - --www-same-site arg The value of the 'SameSite' attribute - on the cookies issued by RStudio - Server. Accepted values are 'none' or - 'lax'. The value 'none' should be used - only when RStudio is hosted into an - iFrame. For compatibility with some - browsers (i.e. Safari 12), duplicate - cookies will be issued by RStudio - Server when 'none' is used. - --www-frame-origin arg (=none) Specifies the allowed origin for the - iFrame hosting RStudio if iFrame - embedding is enabled. - --www-enable-origin-check arg (=0) If enabled, cause RStudio to enforce - that incoming request origins are from - the host domain. 
This can be added for - additional security. See - https://cheatsheetseries.owasp.org/chea - tsheets/Cross-Site_Request_Forgery_Prev - ention_Cheat_Sheet.html#verifying-origi - n-with-standard-headers - --www-allow-origin arg Specifies an additional origin that - requests are allowed from, even if it - does not match the host domain. Used if - origin checking is enabled. May be - specified multiple times for multiple - origins. - -rsession: - --rsession-which-r arg The path to the main R program (e.g. - /usr/bin/R). This should be set if no - versions are specified in - /etc/rstudio/r-versions and the default - R installation is not available on the - system path. - --rsession-path arg (=rsession) The relative path from the RStudio - installation directory, or absolute - path to the rsession executable. - --rldpath-path arg (=r-ldpath) The path to the r-ldpath script which - specifies extra library paths for R - versions. - --rsession-ld-library-path arg Specifies additional LD_LIBRARY_PATHs - to use for R sessions. - --rsession-config-file arg If set, overrides the path to the - /etc/rstudio/rsession.conf - configuration file. The specified path - may be a relative path from the RStudio - installation directory, or an absolute - path. - --rsession-proxy-max-wait-secs arg (=10) - The maximum time to wait in seconds for - a successful response when proxying - requests to rsession. - --rsession-memory-limit-mb arg (=0) The limit in MB that an rsession - process may consume. - --rsession-stack-limit-mb arg (=0) The limit in MB that an rsession - process may consume for its stack. - --rsession-process-limit arg (=0) The maximum number of allowable - rsession processes. - -database: - --database-config-file arg If set, overrides the path to the - /etc/rstudio/database.conf - configuration file. - --db-command arg Executes the shell command specified - injecting the current database - configuration in the command. 
- -auth: - --auth-none arg (=1) If set, disables multi-user - authentication. Workbench/Pro features - may not work in this mode. - --auth-validate-users arg (=0) Indicates whether or not to validate - that authenticated users exist on the - target system. Disabling this option - may cause issues to start or to run a - session. - --auth-stay-signed-in-days arg (=30) The number of days to keep a user - signed in when using the "Stay Signed - In" option. Will only take affect when - auth-timeout-minutes is 0 (disabled). - --auth-timeout-minutes arg (=60) The number of minutes a user will stay - logged in while idle before required to - sign in again. Set this to 0 (disabled) - to enable legacy timeout - auth-stay-signed-in-days. - --auth-encrypt-password arg (=1) Indicates whether or not to encrypt the - password sent from the login form. For - security purposes, we strongly - recommend you leave this enabled. - --auth-login-page-html arg (=/etc/rstudio/login.html) - The path to a file containing - additional HTML customization for the - login page. - --auth-rdp-login-page-html arg (=/etc/rstudio/rdplogin.html) - The path to a file containing - additional HTML customization for the - login page, as seen by RDP users. - --auth-required-user-group arg Specifies a group that users must be in - to be able to use RStudio. - --auth-minimum-user-id arg (=auto) Specifies a minimum user id value. - Users with a uid lower than this value - may not use RStudio. - --auth-pam-helper-path arg (=rserver-pam) - The relative path from the RStudio - installation directory, or absolute - path where the PAM helper binary - resides. - --auth-pam-require-password-prompt arg (=1) - Indicates whether or not to require the - "Password: " prompt before sending the - password via PAM. In most cases, this - should be enabled. If using a custom - PAM password prompt, you may need to - disable this setting if PAM logins do - not work correctly. 
- --auth-pam-requires-priv arg (=1) Deprecated - will always be true. - --auth-sign-in-throttle-seconds arg (=5) - The minimum amount of time a user must - wait before attempting to sign in again - after signing out. - --auth-revocation-list-dir arg If set, overrides the path to the - directory which contains the revocation - list to be used for storing expired - tokens. As of RStudio Server 1.4, this - has been moved to database storage, and - so this setting is deprecated, but will - be used to port over any existing - file-based expired tokens. - --auth-cookies-force-secure arg (=0) Indicates whether or not auth cookies - should be forcefully marked as secure. - This should be enabled if running an - SSL terminator infront of RStudio - Server. Otherwise, cookies will be - marked secure if SSL is configured. - -monitor: - --monitor-interval-seconds arg (=60) The interval in seconds at which the - monitor is probed for new data. - -general: - --help print help message - --test-config test to ensure the config file is valid - --config-file arg (=/etc/rstudio/rserver.conf) - configuration file diff --git a/r-singularity/tried.txt b/r-singularity/tried.txt deleted file mode 100755 index 69f680beb..000000000 --- a/r-singularity/tried.txt +++ /dev/null @@ -1,41 +0,0 @@ -# Build command: - -# Building with fake root: -# sudo singularity config fakeroot --add User.Demo -# [User.Demo@gcpslurmv2-27 ~]$ sudo singularity build --fakeroot rserver-root.sif rserver.def -# FATAL: could not use fakeroot: no mapping entry found in /etc/subuid for root - - -sudo singularity shell -s /bin/bash --net --network-args "portmap=8787:8787/tcp" -B `pwd`:`pwd` rserver.sif -# https://docs.sylabs.io/guides/3.0/user-guide/networking.html -# https://support.rstudio.com/hc/en-us/articles/200532327-Managing-RStudio-Workbench-RStudio-Server - - - - -# As User: -# [User.Demo@gcpslurmv2-27 ~]$ singularity shell -s /bin/bash -B `pwd`:`pwd` rserver.sif -# Singularity> rstudio-server start -# Failed to 
get D-Bus connection: Operation not permitted - -# As root: -# [User.Demo@gcpslurmv2-27 ~]$ sudo singularity shell -s /bin/bash -B `pwd`:`pwd` rserver.sif -# Singularity> rstudio-server start -# Running in chroot, ignoring request. - - -# PORTS: -# When you start the server in a VM you can see port 8787 listening but cant find the listening process: -# [root@gcpslurmv2rserver-1 User.Demo]$ netstat -tulpn | grep LISTEN | grep 8787 -# tcp 0 0 0.0.0.0:8787 0.0.0.0:* LISTEN 6439/rserver - - -# THIS WORKS!! -# https://rocker-project.org/use/singularity.html -mkdir -p run var-lib-rstudio-server -printf 'provider=sqlite\ndirectory=/var/lib/rstudio-server\n' > database.conf -singularity exec --bind run:/run,var-lib-rstudio-server:/var/lib/rstudio-server,database.conf:/etc/rstudio/database.conf rserver.sif /usr/lib/rstudio-server/bin/rserver --www-address=127.0.0.1 - -# singularity instance start --bind run:/run,var-lib-rstudio-server:/var/lib/rstudio-server,database.conf:/etc/rstudio/database.conf rserver.sif rserver - -# Tunnel command needs to run on remote node! 
\ No newline at end of file diff --git a/r-singularity/url.sh b/r-singularity/url.sh deleted file mode 100644 index 29a9f7bae..000000000 --- a/r-singularity/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="\"" \ No newline at end of file diff --git a/scw/kill-template.sh b/scw/kill-template.sh deleted file mode 100644 index 187ca25fa..000000000 --- a/scw/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash ${resource_jobdir}/service-kill-${job_number}.sh - diff --git a/scw/url.sh b/scw/url.sh deleted file mode 100644 index 4b902b72c..000000000 --- a/scw/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="\"" diff --git a/stream.sh b/stream.sh deleted file mode 100755 index f20c0d44d..000000000 --- a/stream.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -# Runs on the remote host - -# Inputs: -# --host localhost -# --pushpath pw/path/to/filename -# --pushfile filename -# --delay 30 -# --port ${PARSL_CLIENT_SSH_PORT} -# --masterIP internal-IP-of-controller-node - -# Exports inputs in the formart -# --a 1 --b 2 --c 3 -# to: -# export a=1 b=2 c=3 -parseArgs() { - index=1 - args="" - for arg in $@; do - prefix=$(echo "${arg}" | cut -c1-2) - if [[ ${prefix} == '--' ]]; then - pname=$(echo $@ | cut -d ' ' -f${index} | sed 's/--//g') - pval=$(echo $@ | cut -d ' ' -f$((index + 1))) - # To support empty inputs (--a 1 --b --c 3) - if [ ${pval:0:2} != "--" ]; then - echo "export ${pname}=${pval}" >> $(dirname $0)/stream-env.sh - export "${pname}=${pval}" - fi - fi - index=$((index+1)) - done -} - -parseArgs $@ - -if [ -z "${port}" ]; then - port_flag="" -else - port_flag=" -p ${port} " -fi - -sshcmd="ssh ${resource_ssh_usercontainer_options} ${port_flag} $host" - -#pushpath=$(ls ${pushpath}*) - -${sshcmd} 'cat >>"'$pushpath'"' >> logstream.out 2>&1 - -while true; do - if [ -f "$pushfile" ]; then - echo "Running" >> logstream.out 2>&1 - tail -c +1 -f "$pushfile" | ${sshcmd} 'cat >>"'$pushpath'"' >> logstream.out 2>&1 - echo CLOSING PID: $? 
>> logstream.out 2>&1 - exit 0 - else - echo "Preparing" >> logstream.out 2>&1 - echo "preparing inputs" | ${sshcmd} 'cat >>"'$pushpath'"' >> logstream.out 2>&1 - sleep $delay - fi -done \ No newline at end of file diff --git a/streamlit-host/url.sh b/streamlit-host/url.sh deleted file mode 100755 index 29a9f7bae..000000000 --- a/streamlit-host/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="\"" \ No newline at end of file diff --git a/turbovnc/controller-v3.sh b/turbovnc/controller-v3.sh deleted file mode 100644 index 6a5314f73..000000000 --- a/turbovnc/controller-v3.sh +++ /dev/null @@ -1,55 +0,0 @@ -if [ -z ${service_novnc_parent_install_dir} ]; then - service_novnc_parent_install_dir=${HOME}/pw/software -fi - -download_and_install() { - # 1. Clone the repository with --no-checkout - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' - # Needed for emed - git config --global --unset http.sslbackend - git clone --no-checkout https://github.com/parallelworks/interactive_session.git - - # 2. Navigate into the repository directory - cd interactive_session - #git checkout download-dependencies - - # 3. Initialize sparse-checkout - git sparse-checkout init - - # 4. Configure sparse-checkout to include only the desired directory - service_novnc_tgz_repo_path="downloads/vnc/${service_novnc_tgz_basename}" - echo "${service_novnc_tgz_repo_path}" > .git/info/sparse-checkout - - # 5. Perform the checkout - git checkout - - # 6. Extract tgz - tar -zxf ${service_novnc_tgz_repo_path} -C ${service_novnc_parent_install_dir} - - # 7. Clean - cd ../ - rm -rf interactive_session - -} - -displayErrorMessage() { - echo $(date): $1 -} - -echo; echo - -mkdir -p ${service_novnc_parent_install_dir} - -service_novnc_tgz_stem=$(echo ${service_novnc_tgz_basename} | sed "s|.tar.gz||g" | sed "s|.tgz||g") -service_novnc_install_dir=${service_novnc_parent_install_dir}/${service_novnc_tgz_stem} - -if ! 
[ -d "${service_novnc_install_dir}" ]; then - echo "Downloading and installing ${service_novnc_install_dir}" - download_and_install -fi - -if ! [ -d "${service_novnc_install_dir}" ]; then - echo - displayErrorMessage "Failed to install ${service_novnc_install_dir}" - exit 1 -fi \ No newline at end of file diff --git a/turbovnc/kill-template.sh b/turbovnc/kill-template.sh deleted file mode 100755 index 187ca25fa..000000000 --- a/turbovnc/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash ${resource_jobdir}/service-kill-${job_number}.sh - diff --git a/turbovnc/start-template-v3.sh b/turbovnc/start-template-v3.sh deleted file mode 100755 index d730a2786..000000000 --- a/turbovnc/start-template-v3.sh +++ /dev/null @@ -1,295 +0,0 @@ -# Make sure no conda environment is activated! -# https://github.com/parallelworks/issues/issues/1081 - - -start_gnome_session_with_retries() { - k=1 - while true; do - gnome-session - sleep $((k*60)) - k=$((k+1)) - done -} - - - -if [ -z ${service_novnc_parent_install_dir} ]; then - service_novnc_parent_install_dir=${HOME}/pw/software -fi - -service_novnc_tgz_stem=$(echo ${service_novnc_tgz_basename} | sed "s|.tar.gz||g" | sed "s|.tgz||g") -service_novnc_install_dir=${service_novnc_parent_install_dir}/${service_novnc_tgz_stem} - -# Determine if the service is running in windows using WSL -kernel_version=$(uname -r | tr '[:upper:]' '[:lower:]') - -# Deactive default conda environments (required for emed) -export $(env | grep CONDA_PREFIX) -echo ${CONDA_PREFIX} - -if ! [ -z "${CONDA_PREFIX}" ]; then - echo "Deactivating conda environment" - source ${CONDA_PREFIX}/etc/profile.d/conda.sh - conda deactivate -fi - -set -x - -# Runs via ssh + sbatch -vnc_bin=vncserver - -if [[ $kernel_version == *microsoft* ]]; then - service_vnc_exec=NA -fi - -if [[ "${HOSTNAME}" == gaea* && -f /usr/lib/vncserver ]]; then - export service_vnc_exec=/usr/lib/vncserver - mkdir -p ${HOME}/.vnc/ - if [ ! 
-f "${HOME}/.vnc/config" ]; then - echo "securitytypes=None" > "${HOME}/.vnc/config" - else - # Check if the line is already in the file - if ! grep -Fxq "securitytypes=None" "${HOME}/.vnc/config"; then - echo "securitytypes=None" >> "${HOME}/.vnc/config" - fi - fi -fi - -# Find an available display port -if [[ $kernel_version == *microsoft* ]]; then - # In windows only this port works - displayPort=5900 -else - minPort=5901 - maxPort=5999 - for port in $(seq ${minPort} ${maxPort} | shuf); do - out=$(netstat -aln | grep LISTEN | grep ${port}) - displayNumber=${port: -2} - XdisplayNumber=$(echo ${displayNumber} | sed 's/^0*//') - if [ -z "${out}" ] && ! [ -e /tmp/.X11-unix/X${XdisplayNumber} ]; then - # To prevent multiple users from using the same available port --> Write file to reserve it - portFile=/tmp/${port}.port.used - if ! [ -f "${portFile}" ]; then - touch ${portFile} - echo "rm ${portFile}" >> ${resource_jobdir}/service-kill-${job_number}.sh - export displayPort=${port} - export DISPLAY=:${displayNumber#0} - break - fi - fi - done -fi - -if [ -z "${service_port}" ]; then - displayErrorMessage "ERROR: No service port found in the range \${minPort}-\${maxPort} -- exiting session" -fi - -# Prepare kill service script -# - Needs to be here because we need the hostname of the compute node. -# - kill-template.sh --> service-kill-${job_number}.sh --> service-kill-${job_number}-main.sh -echo "Creating file ${resource_jobdir}/service-kill-${job_number}-main.sh from directory ${PWD}" -if [[ ${jobschedulertype} == "CONTROLLER" ]]; then - echo "bash ${resource_jobdir}/service-kill-${job_number}-main.sh" >> ${resource_jobdir}/service-kill-${job_number}.sh -else - # Remove .cluster.local for einteinmed! 
- hname=$(hostname | sed "s/.cluster.local//g") - echo "ssh ${hname} 'bash -s' < ${resource_jobdir}/service-kill-${job_number}-main.sh" >> ${resource_jobdir}/service-kill-${job_number}.sh -fi - -if [[ "${HOSTNAME}" == gaea* && -f /usr/lib/vncserver ]]; then -cat >> ${resource_jobdir}/service-kill-${job_number}-main.sh <> ${resource_jobdir}/service-kill-${job_number}-main.sh < ~/.vnc/xstartup - echo 'unset SESSION_MANAGER' >> ~/.vnc/xstartup - echo 'unset DBUS_SESSION_BUS_ADDRESS' >> ~/.vnc/xstartup - if grep -q 'ID="rocky"' /etc/os-release && grep -q 'VERSION_ID="9\.' /etc/os-release; then - # Rocky Linux 9. Prevent "Something has gone wrong" message - echo 'export XDG_SESSION_TYPE=x11' >> ~/.vnc/xstartup - echo 'export GDK_BACKEND=x11' >> ~/.vnc/xstartup - echo 'export LIBGL_ALWAYS_SOFTWARE=1' >> ~/.vnc/xstartup - else - echo '/etc/X11/xinit/xinitrc' >> ~/.vnc/xstartup - fi - chmod +x ~/.vnc/xstartup - fi - - # service_vnc_type needs to be an input to the workflow in the XML - # if vncserver is not tigervnc - if [[ "${HOSTNAME}" == gaea* && -f /usr/lib/vncserver ]]; then - # FIXME: Change ~/.vnc/config - ${service_vnc_exec} ${DISPLAY} &> ${resource_jobdir}/vncserver.log & - echo $! > ${resource_jobdir}/vncserver.pid - elif [[ ${service_vnc_type} == "turbovnc" ]]; then - ${service_vnc_exec} ${DISPLAY} -SecurityTypes None - else - # tigervnc - ${service_vnc_exec} ${DISPLAY} -SecurityTypes=None - fi - - rm -f ${resource_jobdir}/service.pid - touch ${resource_jobdir}/service.pid - - # Need this to activate pam_systemd when running under SLURM - # Otherwise we get permission denied messages when starting the - # desktop environment - if [[ ${jobschedulertype} == "SLURM" ]]; then - ssh -N -f localhost & - echo $! 
> ${resource_jobdir}/service.pid - fi - - mkdir -p /run/user/$(id -u)/dconf - chmod og+rx /run/user/$(id -u) - chmod 0700 /run/user/$(id -u)/dconf - - # Start desktop here too just in case - if [[ ${service_desktop} == "gnome-session" ]]; then - start_gnome_session_with_retries &> start_gnome_session_with_retries.out & - service_desktop_pid=$! - else - eval ${service_desktop} & - service_desktop_pid=$! - fi - echo "${service_desktop_pid}" >> ${resource_jobdir}/service.pid -fi - -cd ${service_novnc_install_dir} - -echo "Running ./utils/novnc_proxy --vnc ${HOSTNAME}:${displayPort} --listen ${HOSTNAME}:${service_port}" -#./utils/novnc_proxy --vnc localhost:${displayPort} --listen localhost:${service_port} /dev/null & -./utils/novnc_proxy --vnc ${HOSTNAME}:${displayPort} --listen ${HOSTNAME}:${service_port} > ${resource_jobdir}/service.pid -pid=$(ps -x | grep vnc | grep ${displayPort} | awk '{print $1}') -echo ${pid} >> ${resource_jobdir}/service.pid -rm -f ${portFile} -sleep 6 # Need this specially in controller node or second software won't show up! - -# Reload env in case it was deactivated in the step above (e.g.: conda activate) -eval "${service_load_env}" - -# Launch service -cd -if ! [ -z "${service_bin}" ]; then - if [[ ${service_background} == "False" ]]; then - echo "Running ${service_bin}" - eval ${service_bin} - else - echo "Running ${service_bin} in the background" - eval ${service_bin} & - echo $! 
>> ${resource_jobdir}/service.pid - fi -fi - -sleep 999999999 diff --git a/turbovnc/url.sh b/turbovnc/url.sh deleted file mode 100755 index 620132989..000000000 --- a/turbovnc/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="vnc.html?resize=remote\&autoconnect=true\&show_dot=true\&path=websockify\&password=headless\&host=\"+window.location.host+\"/me/${openPort}\"+\"\/\&dt=\"+(new Date()).getTime()" \ No newline at end of file diff --git a/utils/input_form_resource_wrapper.py b/utils/input_form_resource_wrapper.py deleted file mode 100755 index 061e2617e..000000000 --- a/utils/input_form_resource_wrapper.py +++ /dev/null @@ -1,535 +0,0 @@ -#!/usr/bin/env python3 -import json -import os -import sys -import logging -import subprocess -from base64 import b64encode -from copy import deepcopy - -""" -# Form Resource Wrapper -The code in this workflow is a wrapper to run before any other workflow in order to process and organize -the resource information. The wrapper performs the following actions: -1. Creates a directory for each resource under the job directory. -2. Creates `input.json` and `inputs.sh` files for each resource under the resource's directory. Note - that this is helpful to create code that runs on each of the resources without having to parse the - workflow arguments every time (see link below). For more information see resource inputs section below. - https://github.com/parallelworks/workflow_tutorial/blob/main/011_script_submitter_timeout_failover/main.sh -3. Creates a batch header with the PBS or SLURM directives under the resource's directory. Note that this - header can be used as the header of any script that the workflow submits to the resource. -4. Replaces the values of _replace_with_. with the corresponding value -5. Sets the variable submit_cmd to sbatch or qsub if jobscheduler type is SLURM or PBS, respectively. If - qos is present in the inputs dict it sets submit_cmd to sbatch --qos -6. 
Some parameters have different items (like default value, help, type) depending on other parameters. For, - example, parameter p1 may have a different default value if the resource is onprem or cloud. The form does - not support this type of logic so instead we define a parameter p1_tag_onprem and p1_tag_cloud. The resource - wrapper removes everything after _tag_ and renames the parameter to p1. -7. Calculates the --ntasks-per-node SLURM parameter required to fit a maximum number of workers per node - specified in the max_workers_per_node input parameter - - -### Workflow XML -The wrapper only works if the resources are defined using a specific format in the workflow.xml file. -1. Every resource is defined in a separate section. -2. The section name is "pwrl_", where the prefix "pwrl_" (PW resource label) is used to - indicate that the section corresponds to a resource definition section. -3. Every section may contain the following special parameters: "jobschedulertype", "scheduler_directives", - "_sch_ parameters" and "nports". -4. jobschedulertype: Select SLURM, PBS or CONTROLLER if the workflow uses this resource to run jobs on a - SLURM partition, a PBS queue or the controller node, respectively. -5. scheduler_directives: Use to type SLURM or PBS scheduler directives for the resource. Use the semicolon - character ";" to separate parameters and do not include the "#SLURM" or "#PBS" keywords. For example, - "--mem=1000;--gpus-per-node=1" or "-l mem=1000;-l nodes=1:ppn=4". -6. _sch_ parameters: These parameters are used to directly expose SLURM and PBS scheduler directives on - the input form in a way that does not require the end user to know the directives or type them using - the "scheduler_directives" parameter. A special format must be used to name these parameters. The - parameter name is directly converted to the corresponding scheduler directive. Therefore, new directives - can be added to the XML without having to modify the workflow code. 
- -### Resource Inputs -The wrapper uses the inputs.sh and inputs.json files to write the resources//inputs.json and -resources//inputs.sh files. These files contain the following information: -2. The resource section of the inputs.json is collapsed and any other resource section is removed, see example below. - Original inputs.json: - { - "novnc_dir": "__WORKDIR__/pw/bootstrap/noVNC-1.3.0", - "novnc_tgz": "/swift-pw-bin/apps/noVNC-1.3.0.tgz", - "pwrl_host": { - "resource": { - "id": "6419f5bd7d72b40e5b9a2af7", - "name": "gcpv2", - "status": "on", - "namespace": "alvaro", - "type": "gclusterv2", - "workdir": "/home/alvaro", - "publicIp": "35.222.63.173", - "privateIp": "10.128.0.66", - "username": "alvaro" - }, - "nports": "1", - "jobschedulertype": "CONTROLLER" - }, - "advanced_options": { - "service_name": "turbovnc", - "stream": true - } -} -resources/host/inputs.json: -{ - "resource": { - "id": "6419f5bd7d72b40e5b9a2af7", - "name": "gcpv2", - "status": "on", - "namespace": "alvaro", - "type": "gclusterv2", - "workdir": "/home/alvaro", - "publicIp": "alvaro@35.222.63.173", - "privateIp": "10.128.0.66", - "username": "alvaro", - "ports": [ - 55238 - ], - "jobdir": "/home/alvaro/pw/jobs/desktop/00023" - }, - "nports": "1", - "jobschedulertype": "CONTROLLER", - "novnc_dir": "/home/alvaro/pw/bootstrap/noVNC-1.3.0", - "novnc_tgz": "/swift-pw-bin/apps/noVNC-1.3.0.tgz", - "advanced_options": { - "service_name": "turbovnc", - "stream": true - } -} -""" - -# FIXME: There many ssh connections in this script. 
Reduce the number of ssh connections - -def encode_string_to_base64(text): - # Convert the string to bytes - text_bytes = text.encode('utf-8') - # Encode the bytes to base64 - encoded_bytes = b64encode(text_bytes) - # Convert the encoded bytes back to a string - encoded_string = encoded_bytes.decode('utf-8') - return encoded_string - -RESOURCES_DIR: str = 'resources' -SUPPORTED_RESOURCE_TYPES: list = ['gclusterv2', 'pclusterv2', 'azclusterv2', 'slurmshv2', 'existing', 'aws-slurm', 'google-slurm', 'azure-slurm'] -ONPREM_RESOURCE_TYPES: list = ['slurmshv2', 'existing'] -SSH_CMD: str = 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' - - -def get_logger(log_file, name, level=logging.INFO): - formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s') - - # Create directory for the log file if it doesn't exist - os.makedirs(os.path.dirname(log_file), exist_ok=True) - - # Create a file handler for writing to the log file - file_handler = logging.FileHandler(log_file) - file_handler.setFormatter(formatter) - - # Create a stream handler for printing to stdout - stream_handler = logging.StreamHandler(sys.stdout) - stream_handler.setFormatter(formatter) - - # Get the logger - logger = logging.getLogger(name) - logger.setLevel(level) - - # Add both handlers to the logger - logger.addHandler(file_handler) - logger.addHandler(stream_handler) - - return logger - -os.makedirs(RESOURCES_DIR, exist_ok = True) -log_file = os.path.join(RESOURCES_DIR, os.path.basename(__file__).replace('py', 'log')) -logger = get_logger(log_file, 'resource_wrapper') - -# Given /home/alvaro/pw/jobs/vscodecodeassist/00001 -# Returns pw/jobs/vscodecodeassist/00001 -def get_pw_path(path): - marker = "pw/" - if marker in path: - return marker + path.split(marker, 1)[1] - raise ValueError("The string does not contain 'pw/'") - -def get_command_output(command): - logger.info(f'Running command <{command}>') - try: - result = subprocess.check_output(command, shell=True, 
universal_newlines=True) - output = result.strip() - return output - except subprocess.CalledProcessError as e: - raise(Exception(f"An error occurred while executing the command: {e}")) - - -def replace_placeholders(inputs_dict, placeholder_dict): - for ik,iv in inputs_dict.items(): - if type(iv) == str: - for pk, pv in placeholder_dict.items(): - if pk in iv: - inputs_dict[ik] =iv.replace(pk, pv) - elif type(iv) == dict: - inputs_dict[ik] = replace_placeholders(iv, placeholder_dict) - - return inputs_dict - - - -def extract_value_from_dict(string, my_dict): - """ - Extracts a value from a nested dictionary based on a hierarchical key specified in dot notation. - - Args: - string (str): A string representing a hierarchical key in dot notation. - my_dict (dict): The dictionary from which to extract the value. - - Returns: - The value located at the hierarchical key specified by the input string. - """ - keys = string.split('.') - result = my_dict - for key in keys: - result = result[key] - return result - - -def replace_assigned_values(inputs_dict, inputs_dict_orig): - keys = list(inputs_dict.keys()) - for ik in keys: #,iv in inputs_dict.items(): - iv = inputs_dict[ik] - if type(iv) == str: - if iv.startswith('_replace_with_'): - pkey = iv.replace('_replace_with_', '') - inputs_dict[ik] = extract_value_from_dict(pkey, inputs_dict_orig) - - elif type(iv) == dict: - inputs_dict[ik] = replace_assigned_values(iv, inputs_dict_orig) - - return inputs_dict - - -def workers_per_node_to_tasks_per_node(max_workers_per_node, cpus_per_node): - truncated = cpus_per_node // max_workers_per_node - remainder = cpus_per_node % max_workers_per_node - if remainder < truncated: - return truncated - else: - return truncated + 1 - -def complete_resource_information(inputs_dict): - - if not inputs_dict['resource']['publicIp']: - if not inputs_dict['resource']['privateIp']: - msg = f'No public or private IP found' - logger.error(msg) - print(f'ERROR: {msg}', flush = True) - 
raise(Exception(msg)) - else: - inputs_dict['resource']['publicIp'] = inputs_dict['resource']['privateIp'] - - inputs_dict['resource']['publicIp'] = inputs_dict['resource']['username'] + '@' + inputs_dict['resource']['publicIp'] - - command_to_get_home_directory = f"{SSH_CMD} {inputs_dict['resource']['publicIp']} pwd" - inputs_dict['resource']['home'] = get_command_output(command_to_get_home_directory) - - if 'workdir' in inputs_dict: - inputs_dict['resource']['workdir'] = inputs_dict['workdir'] - - if 'jobschedulertype' not in inputs_dict: - inputs_dict['jobschedulertype'] = 'CONTROLLER' - - if inputs_dict['resource']['name'] == 'user_workspace': - inputs_dict['jobschedulertype'] = 'LOCAL' - inputs_dict['resource']['workdir'] = os.path.expanduser("~") - else: - workdir = inputs_dict['resource'].get('workdir') - if not workdir or workdir == '${HOME}': - inputs_dict['resource']['workdir'] = inputs_dict['resource']['home'] - - if inputs_dict['jobschedulertype'] == 'SLURM': - if '_sch__dd_partition_e_' in inputs_dict: - partition = inputs_dict['_sch__dd_partition_e_'] - command_to_obtain_cpus_per_node=f"{SSH_CMD} {inputs_dict['resource']['publicIp']} sinfo -Nel | awk '/{partition}/ " + "{print $5}' | tail -n1" - cpus_per_node = get_command_output(command_to_obtain_cpus_per_node) - if cpus_per_node: - cpus_per_node = int(cpus_per_node) - inputs_dict['cpus_per_node'] = cpus_per_node - - - if 'cpus_per_node' in inputs_dict and 'max_workers_per_node' in inputs_dict: - max_workers_per_node = int(inputs_dict['max_workers_per_node']) - inputs_dict['_sch__dd_ntasks_d_per_d_node_e_'] = workers_per_node_to_tasks_per_node(max_workers_per_node, cpus_per_node) - - inputs_dict['submit_cmd'] = "sbatch" - inputs_dict['cancel_cmd'] = "scancel" - inputs_dict['status_cmd'] = "squeue" - if 'slurm_options' in inputs_dict: - inputs_dict['submit_cmd'] = inputs_dict['submit_cmd'] + ' ' + inputs_dict['slurm_options'] - inputs_dict['cancel_cmd'] = inputs_dict['cancel_cmd'] + ' ' + 
inputs_dict['slurm_options'] - inputs_dict['status_cmd'] = inputs_dict['status_cmd'] + ' ' + inputs_dict['slurm_options'] - if 'qos' in inputs_dict: - inputs_dict['submit_cmd'] = inputs_dict['submit_cmd'] + ' --qos ' + inputs_dict['qos'] - - - elif inputs_dict['jobschedulertype'] == 'PBS': - inputs_dict['submit_cmd'] = "qsub" - inputs_dict['cancel_cmd'] = "qdel" - inputs_dict['status_cmd'] = "qstat" - - - inputs_dict['resource']['jobdir'] = os.path.join( - inputs_dict['resource']['workdir'], - get_pw_path(os.getcwd()) - ) - - inputs_dict = replace_placeholders( - inputs_dict, - { - '__home__': inputs_dict['resource']['home'], - '__HOME__': inputs_dict['resource']['home'], - '__workdir__': inputs_dict['resource']['workdir'], - '__WORKDIR__': inputs_dict['resource']['workdir'], - '__user__': inputs_dict['resource']['username'], - '__USER__': inputs_dict['resource']['username'], - '__user__': os.environ['PW_USER'], - '__USER__': os.environ['PW_USER'], - '__pw_user__': os.environ['PW_USER'], - '__PW_USER__': os.environ['PW_USER'] - } - ) - - inputs_dict = replace_assigned_values(inputs_dict, inputs_dict) - return inputs_dict - -def flatten_dictionary(dictionary, parent_key='', separator='_'): - flattened_dict = {} - for key, value in dictionary.items(): - new_key = f"{parent_key}{separator}{key}" if parent_key else key - if isinstance(value, dict): - flattened_dict.update(flatten_dictionary(value, new_key, separator)) - if isinstance(value, list): - flattened_dict[new_key] = '___'.join([str(i) for i in value]) - else: - flattened_dict[new_key] = value - return flattened_dict - -def get_scheduler_directives_from_input_form(inputs_dict): - """ - The parameter names are converted to scheduler directives - # Character mapping for special scheduler parameters: - # 1. _sch_ --> '' - # 2. _d_ --> '-' - # 3. _dd_ --> '--' - # 4. _e_ --> '=' - # 5. _colon_ --> ':' - # 5. 
___ --> ' ' (Not in this function) - # Get special scheduler parameters - """ - - scheduler_directives = [] - for k,v in inputs_dict.items(): - if k.startswith('_sch_'): - schd = k.replace('_sch_', '') - schd = schd.replace('_d_', '-') - schd = schd.replace('_dd_', '--') - schd = schd.replace('_e_', '=') - schd = schd.replace('_colon_', ':') - schd = schd.replace('___', ' ') - if v: - scheduler_directives.append(schd+str(v)) - - return scheduler_directives - - -def create_batch_header(inputs_dict, header_sh): - scheduler_directives = [] - - if 'scheduler_directives' in inputs_dict: - scheduler_directives = inputs_dict['scheduler_directives'].split(';') - - elif inputs_dict['jobschedulertype'] == 'SLURM': - if 'scheduler_directives_slurm' in inputs_dict: - scheduler_directives = inputs_dict['scheduler_directives_slurm'].split(';') - - elif inputs_dict['jobschedulertype'] == 'PBS': - if 'scheduler_directives_pbs' in inputs_dict: - scheduler_directives = inputs_dict['scheduler_directives_pbs'].split(';') - - if scheduler_directives: - scheduler_directives = [schd.lstrip() for schd in scheduler_directives] - - scheduler_directives += get_scheduler_directives_from_input_form(inputs_dict) - - jobdir = inputs_dict['resource']['jobdir'] - scheduler_directives += [f'-o {jobdir}/logs.out', f'-e {jobdir}/logs.out'] - jobschedulertype = inputs_dict['jobschedulertype'] - - if jobschedulertype == 'SLURM': - directive_prefix="#SBATCH" - scheduler_directives += ["--job-name={}".format(inputs_dict['job_name']), f"--chdir={jobdir}"] - elif jobschedulertype == 'PBS': - directive_prefix="#PBS" - scheduler_directives += ["-N {}".format(inputs_dict['job_name'])] - else: - return - - if 'shebang' in inputs_dict: - shebang = inputs_dict['shebang'] - else: - shebang = '#!/bin/bash' - - with open(header_sh, 'w') as f: - f.write(shebang + '\n') - for schd in scheduler_directives: - if schd: - schd.replace('___',' ') - f.write(f'{directive_prefix} {schd}\n') - -def 
convert_bool_to_string(bool_var): - if bool_var: - return "true" - return "false" - -def create_resource_directory(resource_inputs, resource_label): - dir = os.path.join(RESOURCES_DIR, resource_label) - inputs_json = os.path.join(dir, 'inputs.json') - inputs_sh = os.path.join(dir, 'inputs.sh') - header_sh = os.path.join(dir, 'batch_header.sh') - resource_inputs_flatten = flatten_dictionary(resource_inputs) - # Remove dictionaries - resource_inputs_flatten = {key: value for key, value in resource_inputs_flatten.items() if not isinstance(value, dict)} - - os.makedirs(dir, exist_ok=True) - - with open(inputs_json, 'w') as f: - json.dump(resource_inputs, f, indent = 4) - - with open(inputs_sh, 'w') as f: - for k,v in resource_inputs_flatten.items(): - if type(v) == bool: - v = convert_bool_to_string(v) - if type(v) == str: - v = v.replace('"', '\\"') - f.write(f"export {k}=\"{v}\"\n") - - create_batch_header(resource_inputs, header_sh) - - -def extract_resource_inputs(inputs_dict, resource_label): - """ - Extracts inputs from a dictionary, including the resource-specific data identified - by the provided resource label, along with any general inputs not associated with a resource label. - - Parameters: - inputs_dict (dict): The dictionary with the contents of /pw/jobs//inputs.json - label (str): The resource label identifying the resource-specific data to be extracted. - - Returns: - dict: A dictionary containing both the resource data corresponding to the provided label - and any general inputs not associated with a specific resource. 
- """ - resource_inputs = inputs_dict[f'pwrl_{resource_label}'] - - # Copy every other input with no resource label - for key, value in inputs_dict.items(): - if not key.startswith('pwrl_'): - resource_inputs[key] = value - - return resource_inputs - - - -def check_slurm(public_ip): - # Fail if slurmctld is not running - command = f'{SSH_CMD} {public_ip} ps aux | grep slurmctld | grep -v grep || echo' - is_slurmctld = get_command_output(command) - - if not is_slurmctld: - msg = f'slurmctld is not running in resource {public_ip}' - logger.error(msg) - print(f'ERROR: {msg}', flush = True) - raise(Exception(msg)) - - -def create_remote_job_directory(ip, jobdir): - mkdir_cmd =f"{SSH_CMD} {ip} mkdir -p {jobdir}" - get_command_output(mkdir_cmd) - - -def prepare_resource(inputs_dict, resource_label): - - resource_inputs = extract_resource_inputs(inputs_dict, resource_label) - - resource_inputs = complete_resource_information(resource_inputs) - resource_inputs['resource']['label'] = resource_label - - if resource_inputs['jobschedulertype'] == 'SLURM' and resource_inputs['resource']['type'] not in ONPREM_RESOURCE_TYPES: - check_slurm(resource_inputs['resource']['publicIp']) - - logger.info(json.dumps(resource_inputs, indent = 4)) - create_resource_directory(resource_inputs, resource_label) - - create_remote_job_directory(resource_inputs['resource']['publicIp'], resource_inputs['resource']['jobdir']) - - -def clean_inputs(inputs_dict): - """ - Some parameters have different items (like default value, help, type) depending on other parameters. For, - example, parameter p1 may have a different default value if the resource is onprem or cloud. The form does - not support this type of logic so instead we define a parameter p1_tag_onprem and p1_tag_cloud. The resource - wrapper removes everything after _tag_ and renames the parameter to p1. 
- """ - new_inputs_dict = deepcopy(inputs_dict) - - for ik,iv in inputs_dict.items(): - if '_tag_' in ik: - del new_inputs_dict[ik] - new_ik = ik.split('_tag_')[0] - else: - new_ik = ik - - if type(iv) == dict: - new_inputs_dict[new_ik] = clean_inputs(iv) - elif iv: - new_inputs_dict[new_ik] = iv - - return new_inputs_dict - -if __name__ == '__main__': - with open('inputs.json') as inputs_json: - inputs_dict = json.load(inputs_json) - - # FIXME: Remove this code when issue https://github.com/parallelworks/core/issues/5826 is resolved! - if len(sys.argv) == 2: - public_ip = sys.argv[1] - inputs_dict['pwrl_host']['resource']['publicIp'] = public_ip - ################################################################################ - - inputs_dict = clean_inputs(inputs_dict) - - # Add basic job info to inputs_dict: - pw_job_dir = os.getcwd() - inputs_dict['pw_job_dir'] = pw_job_dir - inputs_dict['job_number'] = get_pw_path(pw_job_dir).split('/')[3] - inputs_dict['job_number_int'] = int(inputs_dict['job_number']) - inputs_dict['workflow_name'] = get_pw_path(pw_job_dir).split('/')[2] - inputs_dict['job_name'] = "{}-{}".format(inputs_dict['workflow_name'], inputs_dict['job_number']) - inputs_dict['pw_user'] = os.environ.get('PW_USER') - inputs_dict['pw_platform_host'] = os.environ.get('PW_PLATFORM_HOST') - - # Find all resource labels - resource_labels = [label.replace('pwrl_','') for label in inputs_dict.keys() if label.startswith('pwrl_')] - - if not resource_labels: - logger.info('No resource labels found. 
Exiting wrapper.') - exit() - - logger.info('Resource labels: [{}]'.format(', '.join(resource_labels))) - - for label in resource_labels: - logger.info(f'Preparing resource <{label}>') - prepare_resource(inputs_dict, label) diff --git a/utils/kill_session.sh b/utils/kill_session.sh deleted file mode 100755 index b361456bf..000000000 --- a/utils/kill_session.sh +++ /dev/null @@ -1,11 +0,0 @@ - -# RUNS IN THE CONTROLLER NODE: -# - Kill the session script pid and its child processes -job_pid_file=${resource_jobdir}/${job_number}.pid -if [ -f "${job_pid_file}" ]; then - pid=$(cat ${job_pid_file}) - echo "Killing job=${job_number} pid=${pid}" - pkill -P ${pid} - kill ${pid} - rm ${job_pid_file} -fi diff --git a/utils/kill_tunnels.sh b/utils/kill_tunnels.sh deleted file mode 100755 index 26d4797f3..000000000 --- a/utils/kill_tunnels.sh +++ /dev/null @@ -1,10 +0,0 @@ - -# Required at least for servers running in controller node! -KILL_PORTS="__KILL_PORTS__" -for kill_port in ${KILL_PORTS}; do - tunnel_pid=$(ps -x | grep ssh | grep ${kill_port} | awk '{print $1}') - if ! [ -z "${tunnel_pid}" ]; then - echo "Killing tunnel pid ${tunnel_pid} running in ${HOSTNAME}" - kill ${tunnel_pid} - fi -done diff --git a/utils/load-env.sh b/utils/load-env.sh deleted file mode 100644 index dd3d132d8..000000000 --- a/utils/load-env.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash -# These lines should not really be necessary but they are needed on some platforms for some reason -source /etc/profile.d/parallelworks.sh -source /etc/profile.d/parallelworks-env.sh - -echod() { - echo $(date): $@ -} - - -displayErrorMessage() { - echo $(date): $1 - # Jobs use this file to determine if another job has failed - touch ERROR - exit 1 -} - -failIfError(){ - if [ -f ERROR ]; then - echo "One of the jobs failed. Exiting workflow..." 
- exit 1 - fi -} - -# export the users env file (for some reason not all systems are getting these upon execution) -while read LINE; do export "$LINE"; done < ~/.env - -# load kerberos if it exists -if [ -d /pw/kerberos ];then - echo "LOADING KERBEROS SSH PACKAGES" - source /pw/kerberos/source.env - which ssh kinit -fi diff --git a/utils/notify.sh b/utils/notify.sh deleted file mode 100755 index ac115dbbe..000000000 --- a/utils/notify.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash -# Run this script right before starting the server -# This script needs to run in the user container to access the environment variable PW_API_KEY -# Use "ssh usercontainer /absolute/path/to/notify.sh" to launch the script - -# Needed for now to get the PW_PLATFORM_HOST and PW_API_KEY -source /etc/profile.d/parallelworks-env.sh - -pw_job_dir=$(dirname $(dirname $0)) -status="$1" - -source ${pw_job_dir}/resources/host/inputs.sh - -url="/workflows/${workflow_name}/${job_number}/view" - -# Change job status -echo "Changing job status to ${status}" -sed -i "s/.*JOB_STATUS.*/ \"JOB_STATUS\": \"${status}\",/" ${pw_job_dir}/service.json - -if [[ "${status}" != "Running" ]]; then - exit 0 -fi - -# Send notification if status is Running -echo "Posting notification" -curl -s \ - -X POST -H "Content-Type: application/json" \ - -d "{\"title\": \"Interactive workflow ${workflow_name} job ${job_number} is ${status}\", \"href\": \"${url}\", \"type\": \"workflow\", \"subtype\": \"readyInteractive\"}" \ - https://${PW_PLATFORM_HOST}/api/v2/notifications \ - -H "Authorization: Basic $(echo ${PW_API_KEY}|base64)" - -exit 0 \ No newline at end of file diff --git a/utils/service.json b/utils/service.json deleted file mode 100644 index 354aac710..000000000 --- a/utils/service.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "URL": "", - "PORT": "", - "SLUG": "", - "JOB_STATUS": "Initializing", - "ERROR_MESSAGE": "" -} \ No newline at end of file diff --git a/utils/steps-v3/clean_and_exit.sh 
b/utils/steps-v3/clean_and_exit.sh deleted file mode 100755 index 8515054b5..000000000 --- a/utils/steps-v3/clean_and_exit.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -source utils/load-env.sh -source resources/host/inputs.sh -set -x - -if [ -f "kill.sh" ]; then - # Only run if file exists. The kill.sh file is moved to _kill.sh after execution. - # This is done to prevent the file form running twice which would generate errors. - bash kill.sh -fi diff --git a/utils/steps-v3/compute/create_session_script.sh b/utils/steps-v3/compute/create_session_script.sh deleted file mode 100755 index a534fe88d..000000000 --- a/utils/steps-v3/compute/create_session_script.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -source utils/load-env.sh -source resources/host/inputs.sh - -# Initiallize session batch file: -echo "Generating session script" -cp resources/host/batch_header.sh ${session_sh} - -echo >> ${session_sh} -cat resources/host/inputs.sh >> ${session_sh} - -cat >> ${session_sh} < job.id -hostname > target.hostname - -displayErrorMessage() { - echo \$(date): \$1 - exit 1 -} - -findAvailablePort() { - availablePort=\$(${pw_cmd_path} agent open-port) - echo \${availablePort} - if [ -z "\${availablePort}" ]; then - availablePort=ERROR - displayErrorMessage "ERROR: No service port found in the range \${minPort}-\${maxPort} -- exiting session" - fi -} - -cd ${resource_jobdir} -set -x - -# Find an available service_port. Could be anywhere in the form (_service_port) -service_port=$(env | grep service_port | cut -d'=' -f2) -if [ -z "\${service_port}" ]; then - service_port=\$(findAvailablePort) -fi -echo \${service_port} > service.port - -echo "Exit code: \$?" -echo "Starting session..." 
- -rm -f \${portFile} - -date -HERE - -# Add application-specific code -if [ -f "${service_name}/start-template-v3.sh" ]; then - cat ${service_name}/start-template-v3.sh >> ${session_sh} -fi - -# move the session file over -chmod +x ${session_sh} - diff --git a/utils/steps-v3/compute/launch_and_monitor_job.sh b/utils/steps-v3/compute/launch_and_monitor_job.sh deleted file mode 100755 index ce0b6495f..000000000 --- a/utils/steps-v3/compute/launch_and_monitor_job.sh +++ /dev/null @@ -1,126 +0,0 @@ -#!/bin/bash -source utils/load-env.sh -source resources/host/inputs.sh - -# TRANSFER FILES TO REMOTE DIRECTORY -scp -p ${session_sh} ${resource_publicIp}:${resource_jobdir}/session-${job_number}.sh - -echo -echo "Submitting ${submit_cmd} request (wait for node to become available before connecting)..." -echo -echo $sshcmd ${submit_cmd} ${resource_jobdir}/session-${job_number}.sh - -# START STREAMING -${sshcmd} touch ${resource_jobdir}/logs.out -${sshcmd} tail -f ${resource_jobdir}/logs.out & -echo "kill $! # kill streaming" >> ${kill_sh} - -# Submit job and get job id -if [[ ${jobschedulertype} == "SLURM" ]]; then - jobid=$($sshcmd ${submit_cmd} ${resource_jobdir}/session-${job_number}.sh | tail -1 | awk -F ' ' '{print $4}') -elif [[ ${jobschedulertype} == "PBS" ]]; then - jobid=$($sshcmd ${submit_cmd} ${resource_jobdir}/session-${job_number}.sh) - jobid=$(echo "$jobid" | cut -d'.' 
-f1) -fi - -if [[ "${jobid}" == "" ]];then - displayErrorMessage "ERROR submitting job - exiting the workflow" -fi - -sed -i "/set -x/a ${cancel_cmd} ${jobid}" ${kill_ssh} - -echo -echo "Submitted job: ${jobid}" - -get_slurm_job_status() { - # Get the header line to determine the column index corresponding to the job status - if [ -z "${SQUEUE_HEADER}" ]; then - export SQUEUE_HEADER="$(eval "$sshcmd ${status_cmd}" | awk 'NR==1')" - fi - status_column=$(echo "${SQUEUE_HEADER}" | awk '{ for (i=1; i<=NF; i++) if ($i ~ /^S/) { print i; exit } }') - status_response=$(eval $sshcmd ${status_cmd} | awk -v jobid="${jobid}" '$1 == jobid') - echo "${SQUEUE_HEADER}" - echo "${status_response}" - export job_status=$(echo ${status_response} | awk -v id="${jobid}" -v col="$status_column" '{print $col}') -} - -get_pbs_job_status() { - # Get the header line to determine the column index corresponding to the job status - if [ -z "${QSTAT_HEADER}" ]; then - export QSTAT_HEADER="$(eval "$sshcmd ${status_cmd}" | awk 'NR==1')" - fi - status_response=$(eval $sshcmd ${status_cmd} 2>/dev/null | grep "\<${jobid}\>") - echo "${QSTAT_HEADER}" - echo "${status_response}" - export job_status="$(eval $sshcmd ${status_cmd} -f ${jobid} 2>/dev/null | grep job_state | cut -d'=' -f2 | tr -d ' ')" - -} - - -# Job status file writen by remote script: -ssh_max_retries=15 -ssh_retry_count=0 -status_max_retries=2 -status_retry_count=0 -export sshcmd=$(echo ${sshcmd} | sed "s|ssh|ssh -o ConnectTimeout=10|g") -while true; do - sleep 15 - # squeue won't give you status of jobs that are not running or waiting to run - # qstat returns the status of all recent jobs - if [[ ${jobschedulertype} == "SLURM" ]]; then - get_slurm_job_status - # If job status is empty job is no longer running - if [ -z "${job_status}" ]; then - # Test ssh connection to support retries for disconnected clusters - ${sshcmd} exit - if [ $? 
-eq 0 ]; then - status_retry_count=$((status_retry_count + 1)) - echo "Job status is empty (status attempt ${status_retry_count}/${status_max_retries})" - if [ $status_retry_count -ge $status_max_retries ]; then - job_status=$($sshcmd sacct -j ${jobid} --format=state | tail -n1) - echo "Exiting job status loop" - break - fi - else - echo "ERROR: Failed to get SLURM job status using ${sshcmd}" - echo " (ssh attempt $((ssh_retry_count + 1))/$ssh_max_retries)" - ssh_retry_count=$((ssh_retry_count + 1)) - fi - else - ssh_retry_count=0 - status_retry_count=0 - fi - elif [[ ${jobschedulertype} == "PBS" ]]; then - get_pbs_job_status - if [[ "${job_status}" == "C" ]]; then - break - elif [ -z "${job_status}" ]; then - # Test ssh connection to support retries for disconnected clusters - ${sshcmd} exit - if [ $? -eq 0 ]; then - break - else - echo "ERROR: Failed to get SLURM job status using ${sshcmd}" - echo " (ssh attempt $((ssh_retry_count + 1))/$ssh_max_retries)" - ssh_retry_count=$((ssh_retry_count + 1)) - fi - else - ssh_retry_count=0 - status_retry_count=0 - fi - fi - if [ $ssh_retry_count -ge $ssh_max_retries ]; then - echo "[ $ssh_retry_count -lt $ssh_max_retries ]" - echo "ERROR: Reached maximum ssh retries for ${sshcmd} command" - echo " SSH connection to cluster failed" - echo " Exiting workflow" - exit 2 - fi -done - -echo "Job status: ${job_status}" - -if [[ ${jobschedulertype} == "SLURM" ]]; then - $sshcmd scontrol show job ${jobid} -dd - $sshcmd sacct -j ${jobid} -fi diff --git a/utils/steps-v3/controller/create_session_script.sh b/utils/steps-v3/controller/create_session_script.sh deleted file mode 100755 index cda1493ec..000000000 --- a/utils/steps-v3/controller/create_session_script.sh +++ /dev/null @@ -1,58 +0,0 @@ -#!/bin/bash -source utils/load-env.sh -source resources/host/inputs.sh - -# Initiallize session batch file: -echo "Generating session script" -echo "#!/bin/bash" > ${session_sh} -cat resources/host/inputs.sh >> ${session_sh} -# Need this on 
some systems when running code with ssh -# - CAREFUL! This command can change your ${PWD} directory -echo "source ~/.bashrc" >> ${session_sh} - -if ! [ -z "${resource_jobdir}" ] && ! [[ "${resource_jobdir}" == "default" ]]; then - echo "mkdir -p ${resource_jobdir}" >> ${session_sh} - echo "cd ${resource_jobdir}" >> ${session_sh} -fi - - -cat >> ${session_sh} < ${job_number}.pid - -if [ -z "\${service_port}" ]; then - service_port=\$(findAvailablePort) -fi -echo \${service_port} > service.port - -echo -echo -echo "STARTING SERVICE" -echo -date -HERE - - -# Add application-specific code -if [ -f "${service_name}/start-template-v3.sh" ]; then - cat "${service_name}/start-template-v3.sh" >> ${session_sh} -fi - -# Note that job is no longer running -echo >> ${session_sh} - -chmod +x ${session_sh} \ No newline at end of file diff --git a/utils/steps-v3/controller/launch_and_monitor_job.sh b/utils/steps-v3/controller/launch_and_monitor_job.sh deleted file mode 100755 index 9a833f689..000000000 --- a/utils/steps-v3/controller/launch_and_monitor_job.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash -source utils/load-env.sh -source resources/host/inputs.sh - -if [[ "${use_screen}" == "true" ]]; then - scp -p ${session_sh} ${resource_publicIp}:${resource_jobdir}/session-${job_number}.sh - - # START STREAMING - ${sshcmd} touch ${resource_jobdir}/logs.out - ${sshcmd} tail -f ${resource_jobdir}/logs.out & - echo "kill \$(ps -x | grep tail | grep ${resource_jobdir}/logs.out | awk '{print \$1}')" >> ${kill_ssh} - echo "kill $! 
# kill streaming" >> ${kill_sh} - - # Launch job - screen_session_name="${workflow_name}-${job_number}" - echo "Submitting session using screen command" - $sshcmd "screen -dmS ${screen_session_name} bash -c \"${resource_jobdir}/session-${job_number}.sh > ${resource_jobdir}/logs.out 2>&1\"" - - # Prepare cleanup script - echo "screen -X -S ${screen_session_name} quit" >> ${kill_ssh} - - # Initialize retry counter - retry_count=0 - max_retries=5 - while true; do - # Check if the screen session exists on the remote host - if ssh "${resource_publicIp}" screen -list | grep ${screen_session_name} > /dev/null 2>&1; then - echo "$(date) ${screen_session_name} screen session is running on ${resource_publicIp}" >> screen-session.log 2>&1 - retry_count=0 - else - echo "$(date) ${screen_session_name} screen session was not found on ${resource_publicIp}" 2>&1 | tee -a screen-session.log - retry_count=$((retry_count + 1)) - fi - - # Exit after 5 retries - if [ "$retry_count" -ge "$max_retries" ]; then - echo "$(date) Maximum retries reached, exiting." 2>&1 | tee -a screen-session.log - break - fi - - sleep 60 - done - -else - echo "Submitting ssh job (wait for node to become available before connecting)..." 
- echo "$sshcmd 'bash -s' < ${session_sh}" - echo - - # Run service - $sshcmd 'bash -s' < ${session_sh} #&> ${pw_job_dir}/session-${job_number}.out -fi diff --git a/utils/steps-v3/preprocessing/controller_preprocessing.sh b/utils/steps-v3/preprocessing/controller_preprocessing.sh deleted file mode 100755 index 24d510530..000000000 --- a/utils/steps-v3/preprocessing/controller_preprocessing.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -# Runs the /controller-v3.sh script in the controller node which is -# used to install software or run other speficic steps in the controller -source utils/load-env.sh -source resources/host/inputs.sh - -set -x - -if [ -f "${service_name}/controller-v3.sh" ]; then - echo; echo; echo "RUNNING PREPROCESSING STEP" - echo '#!/bin/bash' > controller-v3.sh - cat resources/host/inputs.sh >> controller-v3.sh - cat ${service_name}/controller-v3.sh >> controller-v3.sh - echo "$sshcmd 'bash -s' < controller-v3.sh" - $sshcmd 'bash -s' < controller-v3.sh -fi diff --git a/utils/steps-v3/preprocessing/initialize_cancel_script.sh b/utils/steps-v3/preprocessing/initialize_cancel_script.sh deleted file mode 100755 index ce148362c..000000000 --- a/utils/steps-v3/preprocessing/initialize_cancel_script.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -source utils/load-env.sh -source resources/host/inputs.sh - -set -x - -# CREATE KILL FILE: -# - NEEDS TO BE MADE BEFORE RUNNING SESSION SCRIPT! 
-# - When the job is killed PW runs ${pw_job_dir}/kill.sh - -# KILL_SSH: Part of the kill_sh that runs on the remote host with ssh -echo "#!/bin/bash" > ${kill_ssh} -cat resources/host/inputs.sh >> ${kill_ssh} -if [ -f "${service_name}/kill-template.sh" ]; then - echo "Adding kill server script ${service_name}/kill-template.sh to ${kill_ssh}" - cat ${service_name}/kill-template.sh >> ${kill_ssh} -fi -cat utils/kill_session.sh >> ${kill_ssh} - - -# KILL_SH: File that runs on the user space -job_number_to_clean=$((job_number_int-10)) -formatted_job_number_to_clean=$(printf "%05d\n" "${job_number_to_clean}") -job_to_clean="/pw/jobs/${workflow_name}/${formatted_job_number_to_clean}" -# Use this file to verify if the job to clean is completed or not -completed_kill_sh=${job_to_clean}/kill.sh.completed - -echo "#!/bin/bash" > ${kill_sh} -echo "cp ${kill_sh} ${kill_sh}.completed" >> ${kill_sh} -echo "echo ${kill_sh} was already executed" >> ${kill_sh} - -cat resources/host/inputs.sh >> ${kill_sh} -if [ "${job_number_to_clean}" -gt 0 ] && [ -f "${completed_kill_sh}" ]; then - echo "trap \"rm -rf ${job_to_clean}\" EXIT" >> ${kill_sh} -fi - -echo "echo Running ${kill_sh}" >> ${kill_sh} -# Add kill_ssh -cat >> ${kill_sh} <> ${kill_sh} -chmod 777 ${kill_sh} \ No newline at end of file diff --git a/utils/steps-v3/preprocessing/input_form_resource_wrapper.sh b/utils/steps-v3/preprocessing/input_form_resource_wrapper.sh deleted file mode 100755 index 4a01e8728..000000000 --- a/utils/steps-v3/preprocessing/input_form_resource_wrapper.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -source utils/load-env.sh -sed -i 's|\\\\|\\|g' inputs.sh -source inputs.sh - -set -x - -python3 ./utils/input_form_resource_wrapper.py $1 - -if [ $? -ne 0 ]; then - displayErrorMessage "ERROR - Resource wrapper failed" -fi - -if ! [ -f "resources/host/inputs.sh" ]; then - displayErrorMessage "ERROR - Missing file ./resources/host/inputs.sh. 
Resource wrapper failed" -fi diff --git a/utils/steps-v3/preprocessing/process_inputs_sh.sh b/utils/steps-v3/preprocessing/process_inputs_sh.sh deleted file mode 100755 index 8924250a0..000000000 --- a/utils/steps-v3/preprocessing/process_inputs_sh.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -source utils/load-env.sh -source resources/host/inputs.sh - -set -x - - -export sshcmd="ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=5 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${resource_publicIp}" -echo "export sshcmd=\"${sshcmd}\"" >> resources/host/inputs.sh - -# Obtain the service_name from any section of the XML -export service_name=$(cat resources/host/inputs.sh | grep service_name | cut -d'=' -f2 | tr -d '"') -echo "export service_name=${service_name}" >> resources/host/inputs.sh - -if ! [ -d "${service_name}" ]; then - displayErrorMessage "ERROR: Directory ${service_name} was not found --> Service ${service_name} is not supported --> Exiting workflow" - exit 1 -fi - -sed -i "s/__job_number__/${job_number}/g" resources/host/inputs.sh - -# Paths to the scripts to kill the jobs -echo "export kill_ssh=${pw_job_dir}/kill_ssh.sh" >> resources/host/inputs.sh -echo "export kill_sh=${pw_job_dir}/kill.sh" >> resources/host/inputs.sh - -# Path to the session script -echo "export session_sh=${pw_job_dir}/session.sh" >> resources/host/inputs.sh - -# Obtain path to pw command -#pw_cmd_path=$(${sshcmd} 'PATH=$HOME/pw:$PATH which pw') -pw_cmd_path=$(${sshcmd} 'bash -c "PATH=$HOME/pw:$PATH; which pw"') - -if [ -z "${pw_cmd_path}" ]; then - echo "$(date) ERROR: Unable to find pw client in the PATH" - exit 1 -fi - -echo "export pw_cmd_path=${pw_cmd_path}" >> resources/host/inputs.sh diff --git a/utils/steps-v3/preprocessing/transfer_files_to_controller.sh b/utils/steps-v3/preprocessing/transfer_files_to_controller.sh deleted file mode 100755 index 8ffb3ae5d..000000000 --- a/utils/steps-v3/preprocessing/transfer_files_to_controller.sh +++ /dev/null @@ -1,16 
+0,0 @@ -#!/bin/bash -# Runs the /rsync.sh script in the user container to transfer -# files to the controller nodes -source utils/load-env.sh -source resources/host/inputs.sh - -set -x - -if [ -f "${service_name}/transfer_files.sh" ]; then - echo; echo; echo "TRASFERRING FILES TO CONTROLLER" - echo '#!/bin/bash' > transfer_files.sh - cat resources/host/inputs.sh >> transfer_files.sh - cat ${service_name}/transfer_files.sh >> transfer_files.sh - chmod +x transfer_files.sh - ./transfer_files.sh -fi \ No newline at end of file diff --git a/vncserver/controller-v3.sh b/vncserver/controller-v3.sh deleted file mode 100644 index a55127c2c..000000000 --- a/vncserver/controller-v3.sh +++ /dev/null @@ -1,148 +0,0 @@ -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - -if [ -z "${service_vncserver_sif}" ]; then - service_vncserver_sif=${service_parent_install_dir}/vncserver.sif -fi - - -download_and_install_novnc() { - # 1. Clone the repository with --no-checkout - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' - # Needed for emed - git config --global --unset http.sslbackend - git clone --no-checkout https://github.com/parallelworks/interactive_session.git - - # 2. Navigate into the repository directory - cd interactive_session - #git checkout download-dependencies - - # 3. Initialize sparse-checkout - git sparse-checkout init - - # 4. Configure sparse-checkout to include only the desired directory - service_novnc_tgz_repo_path="downloads/vnc/${service_novnc_tgz_basename}" - echo "${service_novnc_tgz_repo_path}" > .git/info/sparse-checkout - - # 5. Perform the checkout - git checkout - - # 6. Extract tgz - tar -zxf ${service_novnc_tgz_repo_path} -C ${service_parent_install_dir} - - # 7. 
Clean - cd ../ - rm -rf interactive_session - -} - -download_singularity_container() { - local repo_path=$1 - local host_path=$2 - # 1. Clone the repository with --no-checkout - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' - # Needed for emed - git config --global --unset http.sslbackend - git clone --no-checkout https://github.com/parallelworks/interactive_session.git - - # 2. Navigate into the repository directory - cd interactive_session - #git checkout download-dependencies - - # 3. Initialize sparse-checkout - git sparse-checkout init - - # 4. Configure sparse-checkout to include only the desired file - echo ${repo_path} > .git/info/sparse-checkout - - # 5. Perform the checkout - git checkout - - # 6. Extract tgz - mv ${repo_path} ${host_path} - - # 7. Clean - cd ../ - rm -rf interactive_session -} - -download_oras(){ - VER="1.2.0" # example — replace with newest - wget https://github.com/oras-project/oras/releases/download/v${VER}/oras_${VER}_linux_amd64.tar.gz - mkdir -p ${service_parent_install_dir}/oras - tar -xvf oras_${VER}_linux_amd64.tar.gz -C ${service_parent_install_dir}/oras - rm oras_${VER}_linux_amd64.tar.gz -} - -oras_pull_file(){ - repo=$1 - repo_path=$2 - host_path=$3 - ${service_parent_install_dir}/oras/oras pull ${repo} - mv ${repo_path} ${host_path} -} - -displayErrorMessage() { - echo $(date): $1 -} - -echo; echo - -mkdir -p ${service_parent_install_dir} - -service_novnc_tgz_stem=$(echo ${service_novnc_tgz_basename} | sed "s|.tar.gz||g" | sed "s|.tgz||g") -service_novnc_install_dir=${service_parent_install_dir}/${service_novnc_tgz_stem} - -if ! [ -d "${service_novnc_install_dir}" ]; then - echo "Downloading and installing ${service_novnc_install_dir}" - download_and_install_novnc -fi - -# Download nginx singularity container -if ! 
[ -f "${service_nginx_sif}" ]; then - echo; echo "Downloading nginx singularity from Github" - download_singularity_container downloads/jupyter/nginx-unprivileged.sif ${service_nginx_sif} -fi - -if ! [ -d "${service_novnc_install_dir}" ]; then - echo - displayErrorMessage "Failed to install ${service_novnc_install_dir}" - exit 1 -fi - -# Download vnserver container if vncserver is missing -if [[ "${HOSTNAME}" == gaea* && -f /usr/lib/vncserver ]]; then - export service_vnc_exec=/usr/lib/vncserver -fi - -# The reason we need service_download_vncserver_container is: -# - vncserver can be installed in the compute nodes but not in the controlle nodes -# - Some compute nodes don't have access to the internet -if [[ ${service_download_vncserver_container} == "true" ]]; then - if [ ! -s ${service_vncserver_sif} ]; then - wget -O ${service_vncserver_sif} https://github.com/parallelworks/interactive_session/raw/main/downloads/vnc/vncserver.sif - fi - if [ ! -s ${service_vncserver_sif} ]; then - echo "$(date) WARNING: Failed to download file ${service_vncserver_sif} from GitHub repository" - echo "$(date) Using GitHub registry to download file" - download_oras - oras_pull_file ghcr.io/avidalto/vncserver-sif:1.0 downloads/vnc/vncserver.sif ${service_parent_install_dir}/vncserver.sif - fi - if [ ! -s ${service_vncserver_sif} ]; then - echo "$(date) ERROR: Failed to download file ${service_vncserver_sif}" - exit 1 - fi - chmod +x ${service_vncserver_sif} - - xterm_path=$(which xterm) - if ! 
[ -z ${xterm_path} ]; then - cp ${xterm_path} ${service_parent_install_dir}/xterm - chmod +x ${service_parent_install_dir}/xterm - fi -fi diff --git a/vncserver/kill-template.sh b/vncserver/kill-template.sh deleted file mode 100755 index dfc189839..000000000 --- a/vncserver/kill-template.sh +++ /dev/null @@ -1,11 +0,0 @@ -set -x -# Runs in the controller node: -if [[ ${jobschedulertype} == "CONTROLLER" ]]; then - echo "Running "${resource_jobdir}/cancel.sh"" - bash "${resource_jobdir}/cancel.sh" -else - compute_node=$(cat ${resource_jobdir}/target.hostname | sed "s/.cluster.local//g") - # Running the ssh command directly is not working - ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${compute_node} "${resource_jobdir}/cancel.sh" || echo "WARNING: ssh returned non-zero exit code but continuing..." -fi - diff --git a/vncserver/start-template-v3.sh b/vncserver/start-template-v3.sh deleted file mode 100755 index fbcf84b71..000000000 --- a/vncserver/start-template-v3.sh +++ /dev/null @@ -1,593 +0,0 @@ -# Make sure no conda environment is activated! -# https://github.com/parallelworks/issues/issues/1081 - - -start_rootless_docker() { - local MAX_RETRIES=20 - local RETRY_INTERVAL=2 - local ATTEMPT=1 - - export XDG_RUNTIME_DIR=/run/user/$(id -u) - dockerd-rootless-setuptool.sh install - - # Run Docker rootless daemon — use screen if available, otherwise run in background - if command -v screen >/dev/null 2>&1; then - echo "$(date): Starting Docker rootless daemon in a screen session..." - screen -dmS docker-rootless bash -c "PATH=/usr/bin:/sbin:/usr/sbin:\$PATH dockerd-rootless.sh --exec-opt native.cgroupdriver=cgroupfs > ~/docker-rootless.log 2>&1" - else - echo "$(date): 'screen' not found, starting Docker rootless daemon in background..." 
- PATH=/usr/bin:/sbin:/usr/sbin:$PATH dockerd-rootless.sh --exec-opt native.cgroupdriver=cgroupfs > ~/docker-rootless.log 2>&1 & - fi - - # Wait for Docker daemon to be ready - until docker info > /dev/null 2>&1; do - if [ $ATTEMPT -le $MAX_RETRIES ]; then - echo "$(date) Attempt $ATTEMPT of $MAX_RETRIES: Waiting for Docker daemon to start..." - sleep $RETRY_INTERVAL - ((ATTEMPT++)) - else - echo "$(date) ERROR: Docker daemon failed to start after $MAX_RETRIES attempts." - return 1 - fi - done - - echo "$(date): Docker daemon is ready!" - return 0 -} - -run_xterm_loop(){ - while true; do - echo "$(date): Starting xterm" - ${service_parent_install_dir}/xterm -fa "DejaVu Sans Mono" -fs 12 - sleep 1 - done -} - -################### -# PREPARE CLEANUP # -################### - -echo '#!/bin/bash' > "${resource_jobdir}/cancel.sh" -chmod +x "${resource_jobdir}/cancel.sh" -echo "mv "${resource_jobdir}/cancel.sh" "${resource_jobdir}/cancel.sh".executed" >> "${resource_jobdir}/cancel.sh" -if ![ -z "${SLURM_JOB_ID}" ]; then - echo "scancel ${SLURM_JOB_ID}" >> "${resource_jobdir}/cancel.sh" -fi -################### -################### - -start_gnome_session_with_retries() { - k=1 - while true; do - if xset q >/dev/null 2>&1; then - echo "(date) X server on $DISPLAY is alive." - sleep $((k*10)) - else - echo "(date) X server on $DISPLAY is unresponsive." 
- if [ $k -gt 1 ]; then - echo "$(date) Restarting vncserver" - ${service_vnc_exec} -kill ${DISPLAY} - sleep 3 - ${service_vnc_exec} ${DISPLAY} -SecurityTypes VncAuth -PasswordFile ${resource_jobdir}/.vncpasswd - fi - sleep 2 - echo "$(date) Starting gnome-session" - gnome-session --debug - sleep $((k*10)) - fi - k=$((k+1)) - done -} - - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -if [ -z "${service_nginx_sif}" ]; then - service_nginx_sif=${service_parent_install_dir}/nginx-unprivileged.sif -fi - -if [ -z "${service_vncserver_sif}" ]; then - service_vncserver_sif=${service_parent_install_dir}/vncserver.sif -fi - -service_novnc_tgz_stem=$(echo ${service_novnc_tgz_basename} | sed "s|.tar.gz||g" | sed "s|.tgz||g") -service_novnc_install_dir=${service_parent_install_dir}/${service_novnc_tgz_stem} - -# Determine if the service is running in windows using WSL -kernel_version=$(uname -r | tr '[:upper:]' '[:lower:]') - -# Deactive default conda environments (required for emed) -export $(env | grep CONDA_PREFIX) -echo ${CONDA_PREFIX} - -if ! [ -z "${CONDA_PREFIX}" ]; then - echo "Deactivating conda environment" - source ${CONDA_PREFIX}/etc/profile.d/conda.sh - conda deactivate - export LD_LIBRARY_PATH=$(echo "$LD_LIBRARY_PATH" | tr ':' '\n' | grep -v 'conda' | tr '\n' ':' | sed 's/:$//') -fi - -set -x - -# Find an available display port -minPort=5901 -maxPort=5999 -for port in $(seq ${minPort} ${maxPort} | shuf); do - out=$(netstat -aln | grep LISTEN | grep ${port}) - displayNumber=${port: -2} - XdisplayNumber=$(echo ${displayNumber} | sed 's/^0*//') - if [ -z "${out}" ] && ! [ -e /tmp/.X11-unix/X${XdisplayNumber} ] && ! [ -e /tmp/.X${XdisplayNumber}-lock ]; then - # To prevent multiple users from using the same available port --> Write file to reserve it - portFile=/tmp/${port}.port.used - if ! 
[ -f "${portFile}" ]; then - touch ${portFile} - echo "rm ${portFile}" >> cancel.sh - export displayPort=${port} - export DISPLAY=:${displayNumber#0} - break - fi - fi -done - -date - -if [[ "${HOSTNAME}" == gaea* && -f /usr/lib/vncserver ]]; then - export service_vnc_exec=/usr/lib/vncserver - # vncserver -list does not work - export service_vnc_type="TigerVNC" - mkdir -p ${HOME}/.vnc/ - if [ ! -f "${HOME}/.vnc/config" ]; then - echo "securitytypes=None" > "${HOME}/.vnc/config" - else - # Check if the line is already in the file - if ! grep -Fxq "securitytypes=None" "${HOME}/.vnc/config"; then - echo "securitytypes=None" >> "${HOME}/.vnc/config" - fi - fi -fi - -if [ -z "${service_vnc_exec}" ]; then - service_vnc_exec=$(which vncserver) -fi - -if [ -z ${service_vnc_exec} ] || ! [ -f "${service_vnc_exec}" ]; then - if [[ ${service_download_vncserver_container} != "true" ]]; then - echo "$(date) ERROR: No vncserver command found" - exit 1 - fi - if ! which singularity > /dev/null 2>&1; then - echo "(date) ERROR: No vncserver or singularity command found" - exit 1 - fi - echo "$(date): vncserver is not installed. Using singularity container..." 
- singularity_exec="singularity run --writable-tmpfs --bind /tmp/.X11-unix:/tmp/.X11-unix --bind ${HOME}:${HOME} ${service_vncserver_sif}" - service_vnc_exec="${singularity_exec} vncserver" - service_vnc_type="SingularityTurboVNC" - service_desktop="echo Starting no service desktop on the host" - mkdir -p /tmp/.X11-unix - rm -f ~/.vnc/xstartup.turbovnc -cat >> ~/.vnc/xstartup.turbovnc <> ${resource_jobdir}/vncserver.sh <> "${resource_jobdir}/cancel.sh" <> "${resource_jobdir}/cancel.sh" <> cancel.sh - - # To prevent the process from being killed at startime - if [ -f "${HOME}/.vnc/xstartup" ]; then - sed -i '/vncserver -kill $DISPLAY/ s/^#*/#/' ~/.vnc/xstartup - else - echo '#!/bin/sh' > ~/.vnc/xstartup - echo 'unset SESSION_MANAGER' >> ~/.vnc/xstartup - echo 'unset DBUS_SESSION_BUS_ADDRESS' >> ~/.vnc/xstartup - if grep -q 'ID="rocky"' /etc/os-release && grep -q 'VERSION_ID="9\.' /etc/os-release; then - # Rocky Linux 9. Prevent "Something has gone wrong" message - echo 'export XDG_SESSION_TYPE=x11' >> ~/.vnc/xstartup - echo 'export GDK_BACKEND=x11' >> ~/.vnc/xstartup - echo 'export LIBGL_ALWAYS_SOFTWARE=1' >> ~/.vnc/xstartup - else - echo '/etc/X11/xinit/xinitrc' >> ~/.vnc/xstartup - fi - chmod +x ~/.vnc/xstartup - fi - - # service_vnc_type needs to be an input to the workflow in the YAML - # if vncserver is not tigervnc - - # Set password - printf "${password}\n${password}\n\n" | vncpasswd -f > ${resource_jobdir}/.vncpasswd - chmod 600 ${resource_jobdir}/.vncpasswd - - if [[ "${HOSTNAME}" == gaea* && -f /usr/lib/vncserver ]]; then - # FIXME: Change ~/.vnc/config - ${service_vnc_exec} ${DISPLAY} &> ${resource_jobdir}/vncserver.log & - echo $! 
> ${resource_jobdir}/vncserver.pid - else - ${service_vnc_exec} ${DISPLAY} -SecurityTypes VncAuth -PasswordFile ${resource_jobdir}/.vncpasswd - fi - - rm -f ${resource_jobdir}/service.pid - touch ${resource_jobdir}/service.pid - - # Need this to activate pam_systemd when running under SLURM - # Otherwise we get permission denied messages when starting the - # desktop environment - if [[ ${jobschedulertype} == "SLURM" ]]; then - ssh -N -f localhost & - echo $! > ${resource_jobdir}/service.pid - fi - - mkdir -p /run/user/$(id -u)/dconf - chmod og+rx /run/user/$(id -u) - chmod 0700 /run/user/$(id -u)/dconf - - # Start desktop here too just in case - if [[ ${service_desktop} == "gnome-session" ]]; then - start_gnome_session_with_retries &> start_gnome_session_with_retries.out & - service_desktop_pid=$! - else - eval ${service_desktop} & - service_desktop_pid=$! - fi - echo "${service_desktop_pid}" >> ${resource_jobdir}/service.pid - - cd ${service_novnc_install_dir} - ./utils/novnc_proxy --vnc ${HOSTNAME}:${displayPort} --listen ${HOSTNAME}:${service_port} > ${resource_jobdir}/service.pid - pid=$(ps -x | grep vnc | grep ${displayPort} | awk '{print $1}') - echo ${pid} >> ${resource_jobdir}/service.pid - rm -f ${portFile} -elif [[ "${service_vnc_type}" == "SingularityTurboVNC" ]]; then - # Start service - mkdir -p ~/.vnc - echo "${service_vnc_exec} -kill ${DISPLAY}" >> cancel.sh - ${singularity_exec} ${resource_jobdir}/vncserver.sh | tee -a vncserver.out & - #echo "kill $! # singularity run" >> cancel.sh - - cd ${service_novnc_install_dir} - ./utils/novnc_proxy --vnc ${HOSTNAME}:${displayPort} --listen ${HOSTNAME}:${service_port} > cancel.sh - pid=$(ps -x | grep vnc | grep ${displayPort} | awk '{print $1}') - echo ${pid} >> ${resource_jobdir}/service.pid - rm -f ${portFile} - - # Run xterm in a loop so that users can access a terminal directly in the main host - cd - run_xterm_loop | tee -a ${resource_jobdir}/xterm.out & - echo "kill $! 
# run_xterm_loop" >> cancel.sh - - -elif [[ "${service_vnc_type}" == "KasmVNC" ]]; then - ########### - # KasmVNC # - ########### - export kasmvnc_port=$(findAvailablePort) - export XDG_RUNTIME_DIR="" - - if [ "${service_set_password}" != true ]; then - service_password=password - disableBasicAuth="-disableBasicAuth" - fi - #expect -c 'spawn vncpasswd -u '"${USER}"' -w -r; expect "Password:"; send "'"${service_password}"'\r"; expect "Verify:"; send "'"${service_password}"'\r"; expect eof' - printf "%s\n%s\n" "$service_password" "$service_password" | vncpasswd -u "$USER" -w -r - - - ${service_vnc_exec} -kill ${DISPLAY} - echo "${service_vnc_exec} -kill ${DISPLAY}" >> cancel.sh - - MAX_RETRIES=5 - RETRY_DELAY=5 - RETRY_COUNT=0 - - vncserver_cmd="${service_vnc_exec} ${DISPLAY} ${disableBasicAuth} -select-de gnome -websocketPort ${kasmvnc_port} -rfbport ${displayPort}" - echo Running: - echo ${vncserver_cmd} - while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do - ${vncserver_cmd} - if [ $? -eq 0 ]; then - echo "KasmVNC server started successfully." - break - else - echo "KasmVNC server failed to start. Retrying in $RETRY_DELAY seconds..." - ls -l /etc/pki/tls/private/kasmvnc.pem - sleep $RETRY_DELAY - fi - - RETRY_COUNT=$((RETRY_COUNT + 1)) - done - - rm -rf ${portFile} - - if ! [ -f "${HOME}/.vnc/$(hostname)${DISPLAY}.pid" ]; then - echo $(date): "KasmVNC server failed to start. Exiting workflow." - exit 1 - fi - - vncserver_pid=$(cat "${HOME}/.vnc/$(hostname)${DISPLAY}.pid") - echo "kill ${vncserver_pid} #${HOME}/.vnc/$(hostname)${DISPLAY}.pid" >> cancel.sh - cat "${HOME}/.vnc/$(hostname)${DISPLAY}.log" >> cancel.sh - echo "rm \"${HOME}/.vnc/$(hostname)${DISPLAY}*\"" >> cancel.sh - cat ${HOME}/.vnc/$(hostname)${DISPLAY}.log - - ####################### - # START NGINX WRAPPER # - ####################### - - proxy_port=${kasmvnc_port} - proxy_host="127.0.0.1" - if which docker >/dev/null 2>&1 && [[ "${service_rootless_docker}" == "true" ]]; then - if ! 
dockerd-rootless-setuptool.sh check; then - echo "$(date) ERROR: Rootless docker is NOT support on this system" - exit 1 - fi - if ! which socat >/dev/null 2>&1; then - echo "$(date) ERROR: socat is not installed" - exit 1 - fi - start_rootless_docker - # Need to run this for the container to be able to access the port on the host's network - proxy_port=$(findAvailablePort) - proxy_host=$(hostname -I | xargs) - - socat TCP-LISTEN:${proxy_port},reuseaddr,fork,bind=0.0.0.0 TCP:127.0.0.1:${kasmvnc_port} >> socat.logs 2>&1 & - pid=$! - echo "kill ${pid} #socat" >> cancel.sh - fi - echo "Starting nginx wrapper on service port ${service_port}" - - # Write config file - cat >> config.conf <> nginx.conf </dev/null 2>&1 && [[ "${service_rootless_docker}" == "true" ]]; then - container_name="nginx-${service_port}" - touch empty - touch nginx.logs - echo "docker volume rm ${container_name}" >> cancel.sh - docker volume create ${container_name} - echo "docker stop ${container_name}" >> cancel.sh - echo "docker rm ${container_name}" >> cancel.sh - chmod 644 ${PWD}/{nginx.conf,config.conf,empty} - docker run -d --name ${container_name} \ - --add-host=host.docker.internal:host-gateway \ - -p ${service_port}:${service_port} \ - -v $PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v $PWD/empty:/etc/nginx/conf.d/default.conf \ - nginxinc/nginx-unprivileged:1.25.3 - # Print logs - docker logs ${container_name} - elif sudo -n true 2>/dev/null && which docker >/dev/null 2>&1; then - container_name="nginx-${service_port}" - # Remove container when job is canceled - echo "sudo docker stop ${container_name}" >> cancel.sh - echo "sudo docker rm ${container_name}" >> cancel.sh - # Start container - sudo service docker start - touch empty - touch nginx.logs - # change ownership to nginx user - sudo chown 101:101 nginx.conf config.conf empty nginx.logs - sudo chmod 644 *.conf - sudo docker run -d --name ${container_name} \ - -v 
$PWD/config.conf:/etc/nginx/conf.d/config.conf \ - -v $PWD/nginx.conf:/etc/nginx/nginx.conf \ - -v $PWD/empty:/etc/nginx/conf.d/default.conf \ - -v $PWD/nginx.logs:/var/log/nginx/access.log \ - -v $PWD/nginx.logs:/var/log/nginx/error.log \ - --network=host nginxinc/nginx-unprivileged:1.25.3 - # Print logs - sudo docker logs ${container_name} - elif which singularity >/dev/null 2>&1; then - echo "Running singularity container ${service_nginx_sif}" - # We need to mount $PWD/tmp:/tmp because otherwise nginx writes the file /tmp/nginx.pid - # and other users cannot use the node. Was not able to change this in the config.conf. - mkdir -p ./tmp - # Need to overwrite default configuration! - touch empty - singularity run -B $PWD/tmp:/tmp -B $PWD/config.conf:/etc/nginx/conf.d/config.conf -B $PWD/nginx.conf:/etc/nginx/nginx.conf -B empty:/etc/nginx/conf.d/default.conf ${service_nginx_sif} >> nginx.logs 2>&1 & - pid=$! - echo "kill ${pid}" >> cancel.sh - else - displayErrorMessage "Need Docker or Singularity to start NGINX proxy" - fi -fi - - -sleep 6 # Need this specially in controller node or second software won't show up! - -date - -# Reload env in case it was deactivated in the step above (e.g.: conda activate) -eval "${service_load_env}" - -# Launch service -cd -if ! [ -z "${service_bin}" ]; then - if [[ ${service_background} == "False" ]]; then - echo "Running ${service_bin}" - eval ${service_bin} - else - echo "Running ${service_bin} in the background" - eval ${service_bin} & - echo $! >> ${resource_jobdir}/service.pid - fi -fi - -sleep inf diff --git a/webshell/controller-v3.sh b/webshell/controller-v3.sh deleted file mode 100644 index f24474b82..000000000 --- a/webshell/controller-v3.sh +++ /dev/null @@ -1,129 +0,0 @@ - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -#service_novnc_tgz_basename=noVNC-1.3.0.tgz - -download_and_install() { - # 1. 
Clone the repository with --no-checkout - export GIT_SSH_COMMAND='ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ' - # Needed for emed - git config --global --unset http.sslbackend - git clone --no-checkout https://github.com/parallelworks/interactive_session.git - - # 2. Navigate into the repository directory - cd interactive_session - #git checkout download-dependencies - - # 3. Initialize sparse-checkout - git sparse-checkout init - - # 4. Configure sparse-checkout to include only the desired directory - service_novnc_tgz_repo_path="downloads/vnc/${service_novnc_tgz_basename}" - echo "${service_novnc_tgz_repo_path}" > .git/info/sparse-checkout - - # 5. Perform the checkout - git checkout - - # 6. Extract tgz - tar -zxf ${service_novnc_tgz_repo_path} -C ${service_parent_install_dir} - - # 7. Clean - cd ../ - rm -rf interactive_session - -} - -download_and_install_juice() { - # Configuration - local OUTPUT_FILE="juice.tgz" - - # Step 1: Get download URL from JuiceLabs API - echo "Fetching JuiceLabs download URL..." - download=$(curl -s 'https://electra.juicelabs.co/v2/public/download/linux' | python3 -c "import sys, json; print(json.load(sys.stdin)['url'])") - - - if [ -z "$download" ]; then - echo "ERROR: Download URL is empty" - exit 1 - fi - echo "Found download URL: $download" - - # Step 2: Prepare install directory - mkdir -p "${juice_install_dir}" - cd "${juice_install_dir}" || exit 1 - - # Step 3: Install prerequisites - sudo dnf install -y wget libatomic numactl-libs || { - echo "ERROR: Failed to install dependencies" - exit 1 - } - - # Step 4: Download Juice agent - echo "Downloading Juice agent..." - wget -O "$OUTPUT_FILE" "$download" || { - echo "ERROR: Failed to download file" - exit 1 - } - - # Step 5: Extract archive - echo "Extracting Juice agent..." 
- tar -xzvf "$OUTPUT_FILE" || { - echo "ERROR: Failed to extract $OUTPUT_FILE" - exit 1 - } - - echo "Juice agent successfully installed in ${juice_install_dir}" -} - - -displayErrorMessage() { - echo $(date): $1 -} - -echo; echo - -mkdir -p ${service_parent_install_dir} - -service_novnc_tgz_stem=$(echo ${service_novnc_tgz_basename} | sed "s|.tar.gz||g" | sed "s|.tgz||g") -service_novnc_install_dir=${service_parent_install_dir}/${service_novnc_tgz_stem} - -if ! [ -d "${service_novnc_install_dir}" ]; then - echo "Downloading and installing ${service_novnc_install_dir}" - download_and_install -fi - -if ! [ -d "${service_novnc_install_dir}" ]; then - echo - displayErrorMessage "Failed to install ${service_novnc_install_dir}" - exit 1 -fi - - -# Check if the file exists -if ! [ -f "${service_novnc_install_dir}/ttyd.x86_64" ]; then - echo - displayErrorMessage "Missing file ${service_novnc_install_dir}/ttyd.x86_64" - exit 1 -else - chmod +x "${service_novnc_install_dir}/ttyd.x86_64" -fi - - -# Juice -if [[ "${juice_use_juice}" == "true" ]]; then - if [ -z "${juice_exec}" ]; then - juice_install_dir=${service_parent_install_dir}/juice - juice_exec=${service_parent_install_dir}/juice/juice - if ! [ -f ${juice_exec} ]; then - echo "INFO: Installing Juice" - mkdir -p ${juice_install_dir} - download_and_install_juice - fi - if ! 
[ -f ${juice_exec} ]; then - echo "ERROR: Juice installation failed" - exit 1 - fi - fi -fi \ No newline at end of file diff --git a/webshell/kill-template.sh b/webshell/kill-template.sh deleted file mode 100755 index 187ca25fa..000000000 --- a/webshell/kill-template.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Runs in the controller node: -bash ${resource_jobdir}/service-kill-${job_number}.sh - diff --git a/webshell/start-template-v3.sh b/webshell/start-template-v3.sh deleted file mode 100755 index 811f2eac5..000000000 --- a/webshell/start-template-v3.sh +++ /dev/null @@ -1,63 +0,0 @@ -# Runs via ssh + sbatch - -if [ -z ${service_parent_install_dir} ]; then - service_parent_install_dir=${HOME}/pw/software -fi - -service_novnc_tgz_stem=$(echo ${service_novnc_tgz_basename} | sed "s|.tar.gz||g" | sed "s|.tgz||g") -service_novnc_install_dir=${service_parent_install_dir}/${service_novnc_tgz_stem} - -# Prepare kill service script -# - Needs to be here because we need the hostname of the compute node. -# - kill-template.sh --> service-kill-${job_number}.sh --> service-kill-${job_number}-main.sh - -if [[ ${jobschedulertype} == "CONTROLLER" ]]; then - echo "bash ${PWD}/service-kill-${job_number}-main.sh" > service-kill-${job_number}.sh -else - # Remove .cluster.local for einteinmed! 
- hname=$(hostname | sed "s/.cluster.local//g") - echo "ssh ${hname} 'bash -s' < ${PWD}/service-kill-${job_number}-main.sh" > service-kill-${job_number}.sh -fi - -cat >> service-kill-${job_number}-main.sh <> ${PWD}/service.pid - -sleep 99999 \ No newline at end of file diff --git a/webshell/url.sh b/webshell/url.sh deleted file mode 100755 index 29a9f7bae..000000000 --- a/webshell/url.sh +++ /dev/null @@ -1 +0,0 @@ -export URLEND="\"" \ No newline at end of file diff --git a/workflow/.DS_Store b/workflow/.DS_Store new file mode 100644 index 000000000..96e3c2975 Binary files /dev/null and b/workflow/.DS_Store differ diff --git a/workflow/readmes/airflow/general.md b/workflow/readmes/airflow/general.md deleted file mode 100644 index 53557679b..000000000 --- a/workflow/readmes/airflow/general.md +++ /dev/null @@ -1,23 +0,0 @@ -## Airflow Interactive Session -This workflow starts an [Airflow server](https://airflow.apache.org/) [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - - -### Installation -Specify Airflow's home directory in the input form. By default, it is set to `__WORKDIR__/pw/airflow`, where `__WORKDIR__` is the user's home directory. - -If the specified directory does not exist, the workflow installs Airflow using pip in a Miniconda environment. The Miniconda installation directory defaults to: -``` -__WORKDIR__/pw/software/miniconda3- - -``` - -For example, if Airflow's home directory is `__WORKDIR__/pw/airflow`, Miniconda will be installed at: -``` -__WORKDIR__/pw/software/miniconda3-airflow -``` - - -### Dags -Files in `./airflow-host/dags/` are copied to Airflow’s DAGs folder and will appear in the Airflow UI. 
- - diff --git a/workflow/readmes/cesium/general.md b/workflow/readmes/cesium/general.md deleted file mode 100644 index a9b475d15..000000000 --- a/workflow/readmes/cesium/general.md +++ /dev/null @@ -1,41 +0,0 @@ -# Cesium Interactive Session -This workflow starts a Cesium app [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) a **Compute Cluster** (SLURM or PBS). It requires **Rocky Linux 9** - -The app is defined as an HTML. You may use the example below after replacing your access token: -``` - - - - - Cesium App - - - - - -
- - - -``` - - diff --git a/workflow/readmes/docker-service/docker-service.md b/workflow/readmes/docker-service/docker-service.md deleted file mode 100644 index 8cbfd7b72..000000000 --- a/workflow/readmes/docker-service/docker-service.md +++ /dev/null @@ -1,17 +0,0 @@ -## Docker Service -This workflow connects a service hosted in a Docker container to the Parallel Works platform using the [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) workflow framework. It accepts a Docker command to start the server, which can utilize the following special placeholders replaced by the workflow: -1. `__docker_port__`: This placeholder represents an available port that is selected by the workflow. -2. `__base_url__`: This placeholder signifies the base URL from which the server is served on the platform. -3. `__container_name__`: This placeholder represents a name that is assigned by the workflow, primarily to facilitate cleanup procedures once the job is cancelled. - -Two examples are provided below for Matlab and Tensorflow. 
- -**Matlab** -``` -sudo -n docker run -i --rm --name __container_name__ -v /home/alvaro:/home/alvaro -p __docker_port__:__docker_port__ --shm-size=512M --env MWI_ENABLE_WEB_LOGGING=True --env MWI_APP_HOST=0.0.0.0 --env MWI_APP_PORT=__docker_port__ --env MWI_ENABLE_TOKEN_AUTH=False --env MWI_BASE_URL=__base_url__ mathworks/matlab:r2022a -browser -``` - -**TensorFlow** -``` -sudo -n docker run -i --rm --name __container_name__ -v /home/alvaro:/home/alvaro -p __docker_port__:__docker_port__ tensorflow/tensorflow:latest-gpu-jupyter jupyter-notebook --port=__docker_port__ --ip=0.0.0.0 --no-browser --allow-root --ServerApp.trust_xheaders=True --ServerApp.allow_origin='*' --ServerApp.allow_remote_access=True --ServerApp.token="" --ServerApp.base_url=__base_url__ -``` diff --git a/workflow/readmes/h2o-3/general.md b/workflow/readmes/h2o-3/general.md deleted file mode 100644 index cf7a0fced..000000000 --- a/workflow/readmes/h2o-3/general.md +++ /dev/null @@ -1,3 +0,0 @@ -## H2O Flow Interactive Session -This workflow starts an [H2O Flow](https://docs.h2o.ai/h2o/latest-stable/h2o-docs/flow.html) server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - diff --git a/workflow/readmes/hammerspace/hammerspace.md b/workflow/readmes/hammerspace/hammerspace.md deleted file mode 100644 index c3a8148b1..000000000 --- a/workflow/readmes/hammerspace/hammerspace.md +++ /dev/null @@ -1,2 +0,0 @@ -## Hammerspace Dashboard -This workflow connects to the Hammerspace Dashboard through an [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). diff --git a/workflow/readmes/juice-server/general.md b/workflow/readmes/juice-server/general.md deleted file mode 100644 index 0cf535a6f..000000000 --- a/workflow/readmes/juice-server/general.md +++ /dev/null @@ -1,19 +0,0 @@ -## Juice Server Session -This workflow starts a Juice Server in the selected target. 
- -### Juice Client -To run the client in your user workspace you must first run the following commands as root: -``` -dnf install vulkan-loader -sudo ln -s /etc/ssl/certs/ca-bundle.crt /etc/ssl/certs/ca-certificates.crt -``` - -Follow [these instructions](https://github.com/Juice-Labs/Juice-Labs/wiki/Install-Juice) to install the Juice client. - -#### Examples -A hello-world PyTorch example is included in the service directory and other PyTorch examples are downloaded with the Juice Client. - -Run the example using: -``` -./path/to/client/juicify python example.py -``` \ No newline at end of file diff --git a/workflow/readmes/jupyter-docker/README.md b/workflow/readmes/jupyter-docker/README.md deleted file mode 100644 index 346ccba68..000000000 --- a/workflow/readmes/jupyter-docker/README.md +++ /dev/null @@ -1,11 +0,0 @@ -## Interactive Session - Jupyter Docker -This workflow starts a Jupyter session in a Docker container with optional GPU support. - - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). - diff --git a/workflow/readmes/jupyter-docker/jupyter-docker.md b/workflow/readmes/jupyter-docker/jupyter-docker.md deleted file mode 100644 index 8cd2ab7e9..000000000 --- a/workflow/readmes/jupyter-docker/jupyter-docker.md +++ /dev/null @@ -1,6 +0,0 @@ -## Jupyter Notebook Docker Interactive Session -This workflow starts a Jupyter Notebook [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) using the specified docker repository. - -### Examples -1. 
[TensorFlow](https://www.tensorflow.org/install/docker): `tensorflow/tensorflow:2.7.0-gpu-jupyter`. The latest version of TensorFlow may be incompatible with the cuda version. The older versions may contain incompatible jupyter notebook versions. -2. [PyTorch](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch): `nvcr.io/nvidia/pytorch:22.01-py3` diff --git a/workflow/readmes/jupyter-docker/jupyter_docker_tensorflow_gpu.md b/workflow/readmes/jupyter-docker/jupyter_docker_tensorflow_gpu.md deleted file mode 100644 index fd8dccdff..000000000 --- a/workflow/readmes/jupyter-docker/jupyter_docker_tensorflow_gpu.md +++ /dev/null @@ -1,12 +0,0 @@ -## Interactive Session - Docker Tensorflow GPU -This workflow starts a Jupyter session in a TensorFlow docker container with GPU support. **You must select a partition with GPU support in the input form!** - - - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). - diff --git a/workflow/readmes/jupyter-host/atnorth-onprem.md b/workflow/readmes/jupyter-host/atnorth-onprem.md deleted file mode 100644 index f02c9bc28..000000000 --- a/workflow/readmes/jupyter-host/atnorth-onprem.md +++ /dev/null @@ -1,10 +0,0 @@ -## Jupyter Interactive Session -This workflow starts a Jupyter server in a slurm partition or in the controller node. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. 
The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). - diff --git a/workflow/readmes/jupyter-host/cloud.md b/workflow/readmes/jupyter-host/cloud.md deleted file mode 100644 index 6ea5d8d77..000000000 --- a/workflow/readmes/jupyter-host/cloud.md +++ /dev/null @@ -1,3 +0,0 @@ -## Jupyter Interactive Session -This workflow starts a Jupyter Notebook server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). -This is a description test diff --git a/workflow/readmes/jupyter-host/emed-onprem.md b/workflow/readmes/jupyter-host/emed-onprem.md deleted file mode 100644 index 91bbd9f3d..000000000 --- a/workflow/readmes/jupyter-host/emed-onprem.md +++ /dev/null @@ -1,3 +0,0 @@ -## Jupyter Interactive Session -This workflow starts a Jupyter Notebook server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - diff --git a/workflow/readmes/jupyter-host/noaa-onprem.md b/workflow/readmes/jupyter-host/noaa-onprem.md deleted file mode 100644 index f02c9bc28..000000000 --- a/workflow/readmes/jupyter-host/noaa-onprem.md +++ /dev/null @@ -1,10 +0,0 @@ -## Jupyter Interactive Session -This workflow starts a Jupyter server in a slurm partition or in the controller node. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. 
-* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). - diff --git a/workflow/readmes/jupyter-host/podmt3.md b/workflow/readmes/jupyter-host/podmt3.md deleted file mode 100644 index 022ce48ef..000000000 --- a/workflow/readmes/jupyter-host/podmt3.md +++ /dev/null @@ -1,3 +0,0 @@ -## Jupyter Interactive Session -This workflow starts a Jupyter Notebook server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) in the POD MT3 cluster. - diff --git a/workflow/readmes/jupyter-host/workdir.md b/workflow/readmes/jupyter-host/workdir.md deleted file mode 100644 index 91bbd9f3d..000000000 --- a/workflow/readmes/jupyter-host/workdir.md +++ /dev/null @@ -1,3 +0,0 @@ -## Jupyter Interactive Session -This workflow starts a Jupyter Notebook server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - diff --git a/workflow/readmes/jupyter-singularity/README.md b/workflow/readmes/jupyter-singularity/README.md deleted file mode 100644 index 39272e467..000000000 --- a/workflow/readmes/jupyter-singularity/README.md +++ /dev/null @@ -1,11 +0,0 @@ -## Interactive Session - Jupyter Singularity -This workflow starts a Jupyter session in a singularity container with optional GPU support. - - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). 
- diff --git a/workflow/readmes/jupyter-singularity/tensorflow_emed.md b/workflow/readmes/jupyter-singularity/tensorflow_emed.md deleted file mode 100644 index 1b1e38789..000000000 --- a/workflow/readmes/jupyter-singularity/tensorflow_emed.md +++ /dev/null @@ -1,3 +0,0 @@ -## Jupyter Singularity Interactive Session -This workflow starts a Jupyter Notebook server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) in a Singularity container. - diff --git a/workflow/readmes/jupyterhub-host/general.md b/workflow/readmes/jupyterhub-host/general.md deleted file mode 100644 index 184cd50f7..000000000 --- a/workflow/readmes/jupyterhub-host/general.md +++ /dev/null @@ -1,7 +0,0 @@ -## JupyterHub Interactive Session -This workflow starts a JupyterHub server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - - -This workflow is designed for scenarios where a group of users needs to access a shared resource and collaborate within the same session. The user who starts the workflow will be assigned as the admin of the JupyterHub server. - -Upon first login, the admin must set up a password within JupyterHub. Other users must first connect to the cluster via SSH to create their home directories. After that, they can log into JupyterHub and set their own passwords there. Additionally, any new users must be authorized by the admin at the following URL: https://x.x.x/hub/authorize. 
\ No newline at end of file diff --git a/workflow/readmes/jupyterlab-host/dask-input-form.png b/workflow/readmes/jupyterlab-host/dask-input-form.png deleted file mode 100644 index 682b54315..000000000 Binary files a/workflow/readmes/jupyterlab-host/dask-input-form.png and /dev/null differ diff --git a/workflow/readmes/jupyterlab-host/general.md b/workflow/readmes/jupyterlab-host/general.md deleted file mode 100644 index 6aabaaf70..000000000 --- a/workflow/readmes/jupyterlab-host/general.md +++ /dev/null @@ -1,13 +0,0 @@ -## JupyterLab Interactive Session -This workflow starts a JupyterLab server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - - -### Dask Integration on Parallel Works -Refer to the included Jupyter notebook at `jupyterlab-host/dask-extension-jupyterlab-demo.ipynb` for a practical guide illustrating: - -1. Deployment of Dask on a SLURM cluster using the [SLURMCluster](https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html) object. -2. Data transfer to and from a PW storage resource, corresponding to an AWS S3 bucket. Authentication is streamlined through short-term credentials. -3. Integration of the [Dask extension for JupyterLab](https://github.com/dask/dask-labextension) - -A sample YAML file outlining Dask dependencies for PW is provided at `jupyterlab-host/dask-extension-jupyterlab.yaml`. These dependencies are automatically installed by selecting the input form parameters displayed in this [screenshot](https://raw.githubusercontent.com/parallelworks/interactive_session/jupyterlab-yaml-file/workflow/readmes/jupyterlab-host/dask-input-form.png). Alternatively, you have the option to use your own YAML file. 
- diff --git a/workflow/readmes/jupyterlab-host/general_k8s.md b/workflow/readmes/jupyterlab-host/general_k8s.md deleted file mode 100644 index 379139d55..000000000 --- a/workflow/readmes/jupyterlab-host/general_k8s.md +++ /dev/null @@ -1,75 +0,0 @@ -# JupyterLab Interactive Session -This workflow starts a JupyterLab server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md), on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - - -## Compute Cluster -Launches a JupyterLab server on a **Compute Cluster** using SLURM or PBS. - -### Dask Integration on Parallel Works -Refer to the included Jupyter notebook at `jupyterlab-host/dask-extension-jupyterlab-demo.ipynb` for a practical guide illustrating: - -1. Deployment of Dask on a SLURM cluster using the [SLURMCluster](https://jobqueue.dask.org/en/latest/generated/dask_jobqueue.SLURMCluster.html) object. -2. Data transfer to and from a PW storage resource, corresponding to an AWS S3 bucket. Authentication is streamlined through short-term credentials. -3. Integration of the [Dask extension for JupyterLab](https://github.com/dask/dask-labextension) - -A sample YAML file outlining Dask dependencies for PW is provided at `jupyterlab-host/dask-extension-jupyterlab.yaml`. These dependencies are automatically installed by selecting the input form parameters displayed in this [screenshot](https://raw.githubusercontent.com/parallelworks/interactive_session/jupyterlab-yaml-file/workflow/readmes/jupyterlab-host/dask-input-form.png). Alternatively, you have the option to use your own YAML file. - - - -## Kubernetes Cluster -Launches a JupyterLab server on a **Kubernetes Cluster** using a user-specified image and resource settings. The image must have JupyterLab pre-installed. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. 
-- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Use a JupyterLab-compatible image (default: jupyter/datascience-notebook). -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy JupyterLab and access it via a web interface. - -### Using Nvidia GPUs -For GPU-accelerated workloads, use images from the [Nvidia NGC Catalog](https://catalog.ngc.nvidia.com/containers). **Ensure that the driver version on the node meets the minimum driver requirement for that image.** - -Examples: -- **PyTorch:** `nvcr.io/nvidia/pytorch:24.09-py3` -- **TensorFlow:** `nvcr.io/nvidia/tensorflow:25.02-tf2-py3` - - -#### Test GPU Access in JupyterLab - -##### PyTorch -``` -import torch -if torch.cuda.is_available(): - num_gpus = torch.cuda.device_count() - print(f"GPU is available. Number of GPUs: {num_gpus}") - for i in range(num_gpus): - print(f" - GPU {i}: {torch.cuda.get_device_name(i)}") -else: - print("No GPU available. Using CPU only.") -``` - -##### TensorFlow -``` -import tensorflow as tf -physical_devices = tf.config.list_physical_devices('GPU') -if physical_devices: - print(f"TensorFlow detected {len(physical_devices)} GPU(s).") - for i, device in enumerate(physical_devices): - print(f" - GPU {i}: {device}") -else: - print("No GPU available. Using CPU only.") -``` - -##### Nvidia MIG Instances -To use more than one Multi-Instance GPUs (MIG) set the `CUDA_VISIBLE_DEVICES` environment variable. 
-``` -!nvidia-smi -L | grep MIG | grep -o 'MIG-[a-f0-9-]\+' -import os -# Replace with the MIG instance IDs -os.environ["CUDA_VISIBLE_DEVICES"] = ( - "MIG-5a9b896b-dbaa-50ca-bd8d-6c50ed9b31c1," - "MIG-9dc7b6fb-7215-536d-b2c3-5ee18463260c" -) -``` diff --git a/workflow/readmes/kasmvnc-proxy/kasmvnc.md b/workflow/readmes/kasmvnc-proxy/kasmvnc.md deleted file mode 100644 index 690fee9a4..000000000 --- a/workflow/readmes/kasmvnc-proxy/kasmvnc.md +++ /dev/null @@ -1,6 +0,0 @@ -## KasmVNC Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) using KasmVNC. - -## Compute Cluster -Launches a JupyterLab server on a **Compute Cluster** using SLURM or PBS. Default values are set for PW cloud resources. - diff --git a/workflow/readmes/kasmvnc-proxy/kasmvnc_k8s.md b/workflow/readmes/kasmvnc-proxy/kasmvnc_k8s.md deleted file mode 100644 index e39e14def..000000000 --- a/workflow/readmes/kasmvnc-proxy/kasmvnc_k8s.md +++ /dev/null @@ -1,18 +0,0 @@ -## KasmVNC Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) using KasmVNC, on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - -## Kubernetes Cluster -Launches KasmVNC on a Kubernetes cluster using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `kasmweb/desktop:1.16.0` from [this](https://hub.docker.com/r/kasmweb/desktop) DockerHub repository. Enter `kasm_user` when prompted for a user. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. 
- - -## Compute Cluster -Launches a KasamVNC on a **Compute Cluster** using SLURM or PBS. Default values are set for PW cloud resources. \ No newline at end of file diff --git a/workflow/readmes/kasmvnc/kasmvnc.md b/workflow/readmes/kasmvnc/kasmvnc.md deleted file mode 100644 index 690fee9a4..000000000 --- a/workflow/readmes/kasmvnc/kasmvnc.md +++ /dev/null @@ -1,6 +0,0 @@ -## KasmVNC Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) using KasmVNC. - -## Compute Cluster -Launches a JupyterLab server on a **Compute Cluster** using SLURM or PBS. Default values are set for PW cloud resources. - diff --git a/workflow/readmes/kasmvnc/kasmvnc_k8s.md b/workflow/readmes/kasmvnc/kasmvnc_k8s.md deleted file mode 100644 index e39e14def..000000000 --- a/workflow/readmes/kasmvnc/kasmvnc_k8s.md +++ /dev/null @@ -1,18 +0,0 @@ -## KasmVNC Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) using KasmVNC, on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - -## Kubernetes Cluster -Launches KasmVNC on a Kubernetes cluster using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `kasmweb/desktop:1.16.0` from [this](https://hub.docker.com/r/kasmweb/desktop) DockerHub repository. Enter `kasm_user` when prompted for a user. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. - - -## Compute Cluster -Launches a KasamVNC on a **Compute Cluster** using SLURM or PBS. 
Default values are set for PW cloud resources. \ No newline at end of file diff --git a/workflow/readmes/marimo-host/general.md b/workflow/readmes/marimo-host/general.md deleted file mode 100644 index 13eb59012..000000000 --- a/workflow/readmes/marimo-host/general.md +++ /dev/null @@ -1,2 +0,0 @@ -## Marimo Interactive Session -This workflow runs a [Marimo notebook](https://marimo.io/) in edit or run mode. diff --git a/workflow/readmes/matlab-docker/matlab-docker.md b/workflow/readmes/matlab-docker/matlab-docker.md deleted file mode 100644 index 82b85e034..000000000 --- a/workflow/readmes/matlab-docker/matlab-docker.md +++ /dev/null @@ -1,2 +0,0 @@ -## Matlab Docker Interactive Session -This workflow starts a MATLAB [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) using the official MATLAB docker [container](https://www.mathworks.com/help/cloudcenter/ug/matlab-container-on-docker-hub.html) diff --git a/workflow/readmes/metabase/general_k8s.md b/workflow/readmes/metabase/general_k8s.md deleted file mode 100644 index 35d876282..000000000 --- a/workflow/readmes/metabase/general_k8s.md +++ /dev/null @@ -1,20 +0,0 @@ -# Metabase Interactive Session -This workflow starts a Metabase server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md), on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - -## Compute Cluster -Launches a metabase server on a **Compute Cluster** using a user-specified docker image. Default is `metabase/metabase` from [this](https://hub.docker.com/r/metabase/metabase) DockerHub repository. - -## Kubernetes Cluster -This workflow launches Metabase on a **Kubernetes Cluster** using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). 
-- **Choose an Image:** Default is `metabase` from [this](https://hub.docker.com/r/metabase/metabase/) DockerHub repository. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Metabase and access it via a web interface. - - - diff --git a/workflow/readmes/mlflow/general.md b/workflow/readmes/mlflow/general.md deleted file mode 100644 index e00787fa4..000000000 --- a/workflow/readmes/mlflow/general.md +++ /dev/null @@ -1,17 +0,0 @@ -## MLflow Interactive Session -This workflow starts an [mlflow server](https://mlflow.org/docs/latest/cli.html#mlflow-server) [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md), on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - -## Compute Cluster -Launches MLFlow on a **Compute Cluster** using a user-specified install or load command. - -## Kubernetes Cluster -This workflow launches MLFlow on a **Kubernetes Cluster** using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `ubuntu/mlflow:2.1.1_1.0-22.04`. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. \ No newline at end of file diff --git a/workflow/readmes/n8n/general.md b/workflow/readmes/n8n/general.md deleted file mode 100644 index f3cba2693..000000000 --- a/workflow/readmes/n8n/general.md +++ /dev/null @@ -1,3 +0,0 @@ -## OpenVSCode Interactive Session -This workflow starts an [n8n](https://n8n.io/?) server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). 
- diff --git a/workflow/readmes/netcat-tester/README.md b/workflow/readmes/netcat-tester/README.md deleted file mode 100644 index a0f71f228..000000000 --- a/workflow/readmes/netcat-tester/README.md +++ /dev/null @@ -1,14 +0,0 @@ -## Netcat Interactive Session Tester - -This workflow starts submits a job to a slurm scheduler for running of a remote interactive "blocking" session on a given port. - -Specifically, this demo app uses netcat to start a mock web server on a remote compute node submitted and provisioned through slurm, and then generates a simple session.html file that can be viewed directly within the Parallel Works environment. - -This workflow's intent is to provide a tester for remote resource networking, and as a base template for other interactive session workflows like jupyter notebooks, R server and noVNC. - -#### Instructions - -* Enter form parameters for the submitted slurm job. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* The netcat webserver should return a hello world statement. \ No newline at end of file diff --git a/workflow/readmes/netcat-tester/noaa-onprem.md b/workflow/readmes/netcat-tester/noaa-onprem.md deleted file mode 100644 index e18b794d1..000000000 --- a/workflow/readmes/netcat-tester/noaa-onprem.md +++ /dev/null @@ -1,3 +0,0 @@ -## Netcat Interactive Session -This workflow starts a Netcat server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - diff --git a/workflow/readmes/ngencerf/README.md b/workflow/readmes/ngencerf/README.md new file mode 100644 index 000000000..2982993b0 --- /dev/null +++ b/workflow/readmes/ngencerf/README.md @@ -0,0 +1,92 @@ +# NGENCERF Interactive Session + +Launch the **NGENCERF** (Next Generation Engine for Community Research on Environmental Flows) application stack as a browser-based interactive session on an HPC cluster. 
+ +## Features + +- Full NGENCERF stack: server (Django/REST API), UI (React), and SLURM job submission API +- Runs NWM calibration, validation, forecast, hindcast, cold-start, and verification jobs on SLURM +- NGINX reverse proxy with WebSocket support, running as an unprivileged Singularity container +- SLURM wrapper Flask app for submitting ngen-cal/nwm-fcst-mgr/nwm-verf Singularity jobs +- Automatic callback retry — pending job callbacks resume when a session restarts +- Optional local Docker image build for server and UI components +- Connect-only mode to attach a new browser session to an already-running service + +## Use Cases + +- Running NWM calibration experiments and inspecting results in the browser +- Submitting and monitoring multi-step hydrological modeling workflows via SLURM +- Iterative forecast and hindcast runs with real-time status updates +- Collaborative analysis of NGENCERF run outputs on shared HPC storage + +## Requirements + +The target cluster must have: +- **Docker** (accessible to the session user, sudo required for certain operations) +- **Singularity/Apptainer** (for the NGINX proxy and NWM computation containers) +- **SLURM** with `scontrol`, `sbatch`, `squeue`, `sacct`, `scancel` available +- **Passwordless sudo** for the session user (`sudo -n true` must succeed) +- **Python 3** with `venv` module (for the SLURM wrapper app virtual environment) +- Pre-pulled Singularity containers: nginx-unprivileged, nwm-cal-mgr, nwm-fcst-mgr, nwm-verf +- Pre-cloned repositories on shared storage: ngencerf-server (with `production-pw.yaml`) and ngencerf-ui (with `compose.yaml`) +- Shared filesystem accessible from both login and compute nodes for data and software installs + +## Configuration + +### Compute Cluster Settings + +| Field | Description | +|-------|-------------| +| Service host | The cluster resource on which to run the session | +| Schedule Job? 
| Submit via SLURM (`sbatch`) or run on the login/controller node | +| SLURM partition | Partition to use when scheduling (optional) | +| Walltime | Maximum wall-clock time; default `08:00:00` | +| Scheduler Directives | Extra `#SBATCH` lines for GPU, node pinning, etc. | + +### NGENCERF Settings + +**Container Paths** — absolute paths on the cluster filesystem: + +| Field | Description | +|-------|-------------| +| NGINX Singularity Container Path | Path to `nginx-unprivileged.sif` | +| NWM Calibration Manager Container Path | Path to the nwm-cal-mgr `.sif` | +| NWM Forecast Manager Container Path | Path to the nwm-fcst-mgr `.sif` | +| NWM Verification Container Path | Path to the nwm-verf `.sif` | + +**Data Directories:** + +| Field | Description | +|-------|-------------| +| Data Directory (host path) | Shared filesystem path mounted into containers (e.g. `/ngencerf-app/data/`) | +| Data Directory (container path) | Bind-mount target inside containers; default `/ngencerf/data/` | + +**Application Repositories:** + +| Field | Description | +|-------|-------------| +| NGENCERF Server Repository Path | Path to the ngencerf-server checkout (must contain `production-pw.yaml`) | +| NGENCERF UI Repository Path | Path to the ngencerf-ui checkout (must contain `compose.yaml`) | + +**Build Options:** + +| Field | Description | +|-------|-------------| +| Build Server Image Locally? | Rebuild the ngencerf-server Docker image from source; default `No` | +| Build UI Image Locally? | Rebuild the ngencerf-ui Docker image from source; default `No` | + +**Runtime Options:** + +| Field | Description | +|-------|-------------| +| SLURM Wrapper App Workers | Gunicorn worker count for the job-submission API; default `4` | +| Connect to Existing Session? | Attach browser to an already-running service without relaunching containers | +| Python Install Directory | Location for the SLURM wrapper app virtual environment; default `${HOME}/pw/software` | + +## Getting Started + +1. 
Select the cluster resource and configure scheduler settings. +2. Fill in the paths to all required Singularity containers, the shared data directory, and the application repositories. +3. Click **Execute** to launch the session. +4. Wait for the session URL to appear — click it to open the NGENCERF application in your browser. +5. When finished, cancel the workflow job to stop all containers and free cluster resources. diff --git a/workflow/readmes/ngencerf/connect.md b/workflow/readmes/ngencerf/connect.md deleted file mode 100644 index 0eff60e28..000000000 --- a/workflow/readmes/ngencerf/connect.md +++ /dev/null @@ -1,6 +0,0 @@ -## Start ngenCERF -This connects to the ngenCERF service as a PW [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - -**Usage Instructions:** -- Launch this workflow on the shared cluster where the `ngencerf_start` workflow is already running. -- The workflow establishes an SSH tunnel for secure access to the UI from the platform. diff --git a/workflow/readmes/ngencerf/start.md b/workflow/readmes/ngencerf/start.md deleted file mode 100644 index 5002be01c..000000000 --- a/workflow/readmes/ngencerf/start.md +++ /dev/null @@ -1,14 +0,0 @@ -## Start ngenCERF -This workflow starts and connects the ngenCERF service as a PW [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - - -The ngencerf_start workflow runs on the controller node of a shared SLURM cluster and performs the following actions: -- Starting Docker Containers: Uses docker compose up to launch the service containers. -- NGINX Wrapper: Configures and runs an NGINX wrapper to manage HTTP requests. -- SLURM Wrapper: Initiates a SLURM wrapper application, enabling the main service to submit jobs to the SLURM scheduler via a REST API. 
-- SSH Tunnel: Creates an SSH tunnel for secure access to the UI from the platform - -Most of the inputs to the workflow are configured in this [file](https://github.com/parallelworks/interactive_session/blob/main/workflow/yamls/ngencerf/start.yaml). To share the session with other users please follow the instructions on this [link](https://parallelworks.com/docs/run/sessions/running-sessions). - - -Note that you will need to rebuild the app every time the session name changes or every time a different user starts the app. \ No newline at end of file diff --git a/workflow/readmes/nginx-docker/README.md b/workflow/readmes/nginx-docker/README.md deleted file mode 100644 index 7ee4bbeea..000000000 --- a/workflow/readmes/nginx-docker/README.md +++ /dev/null @@ -1,3 +0,0 @@ -## Nginx Interactive Session -This workflow starts an Nginx server in a Docker container. - diff --git a/workflow/readmes/novnc-docker/novnc_docker.md b/workflow/readmes/novnc-docker/novnc_docker.md deleted file mode 100644 index 9b568f8f6..000000000 --- a/workflow/readmes/novnc-docker/novnc_docker.md +++ /dev/null @@ -1,10 +0,0 @@ -## Interactive Session - noVNC -This workflow starts a noVNC session in the selected Docker container. Choose one of the compatible Docker repositories! - - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). 
diff --git a/workflow/readmes/ollama-openwebui/emed.md b/workflow/readmes/ollama-openwebui/emed.md deleted file mode 100644 index a91a03132..000000000 --- a/workflow/readmes/ollama-openwebui/emed.md +++ /dev/null @@ -1,8 +0,0 @@ -## Ollama through OpenWebUI -Automates deployment of Ollama (AI model server) and OpenWebUI (web interface) on a SLURM cluster. - - - - - - diff --git a/workflow/readmes/openvscode/cloud.md b/workflow/readmes/openvscode/cloud.md deleted file mode 100644 index c8acb76eb..000000000 --- a/workflow/readmes/openvscode/cloud.md +++ /dev/null @@ -1,3 +0,0 @@ -## OpenVSCode Interactive Session -This workflow starts an OpenVSCode server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - diff --git a/workflow/readmes/openvscode/emed-onprem.md b/workflow/readmes/openvscode/emed-onprem.md deleted file mode 100644 index c8acb76eb..000000000 --- a/workflow/readmes/openvscode/emed-onprem.md +++ /dev/null @@ -1,3 +0,0 @@ -## OpenVSCode Interactive Session -This workflow starts an OpenVSCode server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - diff --git a/workflow/readmes/openvscode/general_k8s.md b/workflow/readmes/openvscode/general_k8s.md deleted file mode 100644 index 6bbb635ed..000000000 --- a/workflow/readmes/openvscode/general_k8s.md +++ /dev/null @@ -1,20 +0,0 @@ -# OpenVSCode Interactive Session -This workflow starts an OpenVSCode server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md), on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - -## Compute Cluster -Launches a Code Server server on a **Compute Cluster** using a user-specified release. - -## Kubernetes Cluster -This workflow launches Code Server on a **Kubernetes Cluster** using a user-specified image and resource settings. 
- -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `codercom/code-server:latest` from [this](https://hub.docker.com/r/codercom/code-server) DockerHub repository. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. - - - diff --git a/workflow/readmes/openvscode/noaa-onprem.md b/workflow/readmes/openvscode/noaa-onprem.md deleted file mode 100644 index c8acb76eb..000000000 --- a/workflow/readmes/openvscode/noaa-onprem.md +++ /dev/null @@ -1,3 +0,0 @@ -## OpenVSCode Interactive Session -This workflow starts an OpenVSCode server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). - diff --git a/workflow/readmes/pgadmin4/general_k8s.md b/workflow/readmes/pgadmin4/general_k8s.md deleted file mode 100644 index fcac406bd..000000000 --- a/workflow/readmes/pgadmin4/general_k8s.md +++ /dev/null @@ -1,20 +0,0 @@ -# pgAdmin 4 Interactive Session -This workflow starts a pgAdmin 4 server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md), on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - -## Compute Cluster -Launches a pgAdmin 4 server on a **Compute Cluster** using a user-specified docker image. Default is `dpage/pgadmin4` from [this](https://hub.docker.com/r/dpage/pgadmin4/) DockerHub repository. - -## Kubernetes Cluster -This workflow launches pgAdmin 4 on a **Kubernetes Cluster** using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). 
-- **Choose an Image:** Default is `dpage/pgadmin4` from [this](https://hub.docker.com/r/dpage/pgadmin4/) DockerHub repository. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy pgAdmin 4 and access it via a web interface. - - - diff --git a/workflow/readmes/postgres/general_k8s.md b/workflow/readmes/postgres/general_k8s.md deleted file mode 100644 index 6ce3e85e9..000000000 --- a/workflow/readmes/postgres/general_k8s.md +++ /dev/null @@ -1,20 +0,0 @@ -# postgres Interactive Session -This workflow starts a postgres server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md), on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - -## Compute Cluster -Launches a postgres server on a **Compute Cluster** using a user-specified docker image. Default is `postgres:latest` from [this](https://hub.docker.com/_/postgres) DockerHub repository. - -## Kubernetes Cluster -This workflow launches postgres on a **Kubernetes Cluster** using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `postgres:latest` from [this](https://hub.docker.com/_/postgres) DockerHub repository. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy postgres and access it via a web interface. - - - diff --git a/workflow/readmes/pvweb-singularity/cloud.md b/workflow/readmes/pvweb-singularity/cloud.md deleted file mode 100644 index 2317677f7..000000000 --- a/workflow/readmes/pvweb-singularity/cloud.md +++ /dev/null @@ -1,11 +0,0 @@ -## Paraview Web Singularity -This workflow starts a [Paraview Web](https://www.paraview.org/web/) session in a singularity container with optional GPU support. 
- - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). - diff --git a/workflow/readmes/r-singularity/README.md b/workflow/readmes/r-singularity/README.md deleted file mode 100644 index f87f5c14c..000000000 --- a/workflow/readmes/r-singularity/README.md +++ /dev/null @@ -1 +0,0 @@ -# TODO \ No newline at end of file diff --git a/workflow/readmes/r-singularity/cloud.md b/workflow/readmes/r-singularity/cloud.md deleted file mode 100644 index 6fbeb029b..000000000 --- a/workflow/readmes/r-singularity/cloud.md +++ /dev/null @@ -1,11 +0,0 @@ -## RServer Singularity - Interactive Session -This workflow starts R server in a singularity container with optional GPU support. - - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). 
- diff --git a/workflow/readmes/screenshots/connected-desktop.png b/workflow/readmes/screenshots/connected-desktop.png deleted file mode 100644 index 4ca327f71..000000000 Binary files a/workflow/readmes/screenshots/connected-desktop.png and /dev/null differ diff --git a/workflow/readmes/screenshots/input-form.png b/workflow/readmes/screenshots/input-form.png deleted file mode 100644 index d51bcdee7..000000000 Binary files a/workflow/readmes/screenshots/input-form.png and /dev/null differ diff --git a/workflow/readmes/screenshots/readme-is-v3-1.png b/workflow/readmes/screenshots/readme-is-v3-1.png deleted file mode 100644 index 87edabd93..000000000 Binary files a/workflow/readmes/screenshots/readme-is-v3-1.png and /dev/null differ diff --git a/workflow/readmes/screenshots/readme-is-v3-2.png b/workflow/readmes/screenshots/readme-is-v3-2.png deleted file mode 100644 index 7da209019..000000000 Binary files a/workflow/readmes/screenshots/readme-is-v3-2.png and /dev/null differ diff --git a/workflow/readmes/screenshots/workflow-monitor.png b/workflow/readmes/screenshots/workflow-monitor.png deleted file mode 100644 index 5bd09eae6..000000000 Binary files a/workflow/readmes/screenshots/workflow-monitor.png and /dev/null differ diff --git a/workflow/readmes/streamlit-host/general.md b/workflow/readmes/streamlit-host/general.md deleted file mode 100644 index 05949cce1..000000000 --- a/workflow/readmes/streamlit-host/general.md +++ /dev/null @@ -1,9 +0,0 @@ -## Streamlit Interactive Session -This workflow starts Streamlit in the selected resource. - -#### Instructions -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. 
-* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). - diff --git a/workflow/readmes/turbovnc/abaqus.md b/workflow/readmes/turbovnc/abaqus.md deleted file mode 100644 index 84fec6f95..000000000 --- a/workflow/readmes/turbovnc/abaqus.md +++ /dev/null @@ -1,10 +0,0 @@ -## Abaqus Interactive Session - -This workflow starts an interactive session for Abaqus in a desktop environment. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -- Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -- Wait for node to be provisioned from slurm. -- Once provisioned, open the session.html file (double click) in the job directory. -- To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). diff --git a/workflow/readmes/turbovnc/allegro.md b/workflow/readmes/turbovnc/allegro.md deleted file mode 100644 index d7bb64c35..000000000 --- a/workflow/readmes/turbovnc/allegro.md +++ /dev/null @@ -1,2 +0,0 @@ -## Allegro Free Viewer -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and launches Allegro Free Viewer. diff --git a/workflow/readmes/turbovnc/ansys.md b/workflow/readmes/turbovnc/ansys.md deleted file mode 100644 index 9c65863e3..000000000 --- a/workflow/readmes/turbovnc/ansys.md +++ /dev/null @@ -1,10 +0,0 @@ -## Ansys Interactive Session - -This workflow starts an interactive session for Ansys in a desktop environment. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -- Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. 
The job files and logs are under the newly created `/pw/jobs///` directory. -- Wait for node to be provisioned from slurm. -- Once provisioned, open the session.html file (double click) in the job directory. -- To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). diff --git a/workflow/readmes/turbovnc/armforge-noaa-onprem.md b/workflow/readmes/turbovnc/armforge-noaa-onprem.md deleted file mode 100644 index 712f89cf0..000000000 --- a/workflow/readmes/turbovnc/armforge-noaa-onprem.md +++ /dev/null @@ -1,5 +0,0 @@ -## Arm Forge Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts Arm Forge. - -**Arm Forge must be installed, and the appropriate command (e.g., `module load armforge`) must be entered in the workflow's input form.** - diff --git a/workflow/readmes/turbovnc/cubit.md b/workflow/readmes/turbovnc/cubit.md deleted file mode 100644 index 9c65863e3..000000000 --- a/workflow/readmes/turbovnc/cubit.md +++ /dev/null @@ -1,10 +0,0 @@ -## Ansys Interactive Session - -This workflow starts an interactive session for Ansys in a desktop environment. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -- Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -- Wait for node to be provisioned from slurm. -- Once provisioned, open the session.html file (double click) in the job directory. -- To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). 
diff --git a/workflow/readmes/turbovnc/ecflow-noaa-onprem.md b/workflow/readmes/turbovnc/ecflow-noaa-onprem.md deleted file mode 100644 index 61aa41830..000000000 --- a/workflow/readmes/turbovnc/ecflow-noaa-onprem.md +++ /dev/null @@ -1,4 +0,0 @@ -## ecFlow Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts ecFlow. - -**ecFlow must be installed, and the appropriate command (e.g., `module load ecflow`) must be entered in the workflow's input form.** diff --git a/workflow/readmes/turbovnc/firefox.md b/workflow/readmes/turbovnc/firefox.md deleted file mode 100644 index 60ac26a65..000000000 --- a/workflow/readmes/turbovnc/firefox.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop with Firefox -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and runs firefox. diff --git a/workflow/readmes/turbovnc/fluent.md b/workflow/readmes/turbovnc/fluent.md deleted file mode 100644 index 6cd60137b..000000000 --- a/workflow/readmes/turbovnc/fluent.md +++ /dev/null @@ -1,10 +0,0 @@ -## Fluent Interactive Session - -This workflow starts an interactive session for Fluent in a desktop environment. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -- Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -- Wait for node to be provisioned from slurm. -- Once provisioned, open the session.html file (double click) in the job directory. -- To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). 
diff --git a/workflow/readmes/turbovnc/forge.md b/workflow/readmes/turbovnc/forge.md deleted file mode 100644 index 7b5c290cb..000000000 --- a/workflow/readmes/turbovnc/forge.md +++ /dev/null @@ -1,2 +0,0 @@ -## Forge Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts ARM Forge. diff --git a/workflow/readmes/turbovnc/fsl-emed-onprem.md b/workflow/readmes/turbovnc/fsl-emed-onprem.md deleted file mode 100644 index f24a8677b..000000000 --- a/workflow/readmes/turbovnc/fsl-emed-onprem.md +++ /dev/null @@ -1,2 +0,0 @@ -## FSL Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts FSL. \ No newline at end of file diff --git a/workflow/readmes/turbovnc/gtise.md b/workflow/readmes/turbovnc/gtise.md deleted file mode 100644 index 1e6447b7f..000000000 --- a/workflow/readmes/turbovnc/gtise.md +++ /dev/null @@ -1,4 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and runs GTISE. - -Run on a cluster in which the [start_scheduler workflow](https://github.com/parallelworks/start_scheduler/tree/cloud) is already running. \ No newline at end of file diff --git a/workflow/readmes/turbovnc/libreoffice.md b/workflow/readmes/turbovnc/libreoffice.md deleted file mode 100644 index 8936a5496..000000000 --- a/workflow/readmes/turbovnc/libreoffice.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and launches LibreOffice. 
diff --git a/workflow/readmes/turbovnc/matlab-emed-onprem.md b/workflow/readmes/turbovnc/matlab-emed-onprem.md deleted file mode 100644 index 7889efa33..000000000 --- a/workflow/readmes/turbovnc/matlab-emed-onprem.md +++ /dev/null @@ -1,2 +0,0 @@ -## Matlab Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts MATLAB. \ No newline at end of file diff --git a/workflow/readmes/turbovnc/matlab-noaa-onprem.md b/workflow/readmes/turbovnc/matlab-noaa-onprem.md deleted file mode 100644 index 1e465d45d..000000000 --- a/workflow/readmes/turbovnc/matlab-noaa-onprem.md +++ /dev/null @@ -1,4 +0,0 @@ -## Matlab Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts MATLAB. - -**MATLAB must be installed, and the appropriate command (e.g., `module load matlab`) must be entered in the workflow's input form.** diff --git a/workflow/readmes/turbovnc/matlab.md b/workflow/readmes/turbovnc/matlab.md deleted file mode 100644 index b2807e6d7..000000000 --- a/workflow/readmes/turbovnc/matlab.md +++ /dev/null @@ -1,4 +0,0 @@ -## Matlab Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts MATLAB. - -Default values are set for PW cloud resources. \ No newline at end of file diff --git a/workflow/readmes/turbovnc/matlab_cloud.md b/workflow/readmes/turbovnc/matlab_cloud.md deleted file mode 100644 index b2807e6d7..000000000 --- a/workflow/readmes/turbovnc/matlab_cloud.md +++ /dev/null @@ -1,4 +0,0 @@ -## Matlab Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts MATLAB. - -Default values are set for PW cloud resources. 
\ No newline at end of file diff --git a/workflow/readmes/turbovnc/ncdiff.md b/workflow/readmes/turbovnc/ncdiff.md deleted file mode 100644 index d8c497f41..000000000 --- a/workflow/readmes/turbovnc/ncdiff.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and launches a terminal to run `ncdiff`. diff --git a/workflow/readmes/turbovnc/nextflow-emed-onprem.md b/workflow/readmes/turbovnc/nextflow-emed-onprem.md deleted file mode 100644 index d7df68b83..000000000 --- a/workflow/readmes/turbovnc/nextflow-emed-onprem.md +++ /dev/null @@ -1,10 +0,0 @@ -## NextFlow Interactive Session -This workflow starts an interactive session for NextFlow in a desktop environment. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -* Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -* Wait for node to be provisioned from slurm. -* Once provisioned, open the session.html file (double click) in the job directory. -* To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). - diff --git a/workflow/readmes/turbovnc/novnc-emed-onprem.md b/workflow/readmes/turbovnc/novnc-emed-onprem.md deleted file mode 100644 index 19ae94cb1..000000000 --- a/workflow/readmes/turbovnc/novnc-emed-onprem.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). 
diff --git a/workflow/readmes/turbovnc/novnc-noaa-onprem.md b/workflow/readmes/turbovnc/novnc-noaa-onprem.md deleted file mode 100644 index 19ae94cb1..000000000 --- a/workflow/readmes/turbovnc/novnc-noaa-onprem.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). diff --git a/workflow/readmes/turbovnc/novnc.md b/workflow/readmes/turbovnc/novnc.md deleted file mode 100644 index 5f18c5715..000000000 --- a/workflow/readmes/turbovnc/novnc.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser using NoVNC. It utilizes either TurboVNC or TigerVNC, depending on which is installed on the target resource. diff --git a/workflow/readmes/turbovnc/octave.md b/workflow/readmes/turbovnc/octave.md deleted file mode 100644 index 7c2bc37ec..000000000 --- a/workflow/readmes/turbovnc/octave.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and launches Octave. diff --git a/workflow/readmes/turbovnc/panoply.md b/workflow/readmes/turbovnc/panoply.md deleted file mode 100644 index 89954f889..000000000 --- a/workflow/readmes/turbovnc/panoply.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and launches [Panoply](https://www.earthdata.nasa.gov/technology/panoply#:~:text=Panoply%2C%20developed%20by%20NASA's%20Goddard,and%20GRIdded%20Binary%20(GRIB).). 
diff --git a/workflow/readmes/turbovnc/paraview.md b/workflow/readmes/turbovnc/paraview.md deleted file mode 100644 index c44a4fa70..000000000 --- a/workflow/readmes/turbovnc/paraview.md +++ /dev/null @@ -1,4 +0,0 @@ -## Matlab Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts Paraview. - - diff --git a/workflow/readmes/turbovnc/qgis.md b/workflow/readmes/turbovnc/qgis.md deleted file mode 100644 index 60216d033..000000000 --- a/workflow/readmes/turbovnc/qgis.md +++ /dev/null @@ -1,2 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and launches QGIS. diff --git a/workflow/readmes/turbovnc/rstudio-emed-onprem.md b/workflow/readmes/turbovnc/rstudio-emed-onprem.md deleted file mode 100644 index b1dd28ac1..000000000 --- a/workflow/readmes/turbovnc/rstudio-emed-onprem.md +++ /dev/null @@ -1,2 +0,0 @@ -## RStudio Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts RStudio. \ No newline at end of file diff --git a/workflow/readmes/turbovnc/rstudio.md b/workflow/readmes/turbovnc/rstudio.md deleted file mode 100644 index edc252ea5..000000000 --- a/workflow/readmes/turbovnc/rstudio.md +++ /dev/null @@ -1,4 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and runs RStudio. - -Default values are set for PW cloud resources. 
\ No newline at end of file diff --git a/workflow/readmes/turbovnc/schrodinger-emed-onprem.md b/workflow/readmes/turbovnc/schrodinger-emed-onprem.md deleted file mode 100644 index 837f0de97..000000000 --- a/workflow/readmes/turbovnc/schrodinger-emed-onprem.md +++ /dev/null @@ -1,2 +0,0 @@ -## Schrodinger Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts Schrodinger. \ No newline at end of file diff --git a/workflow/readmes/turbovnc/starccm.md b/workflow/readmes/turbovnc/starccm.md deleted file mode 100644 index d507d918a..000000000 --- a/workflow/readmes/turbovnc/starccm.md +++ /dev/null @@ -1,10 +0,0 @@ -## STAR-CCM+ Interactive Session - -This workflow starts an interactive session for STAR-CCM+ in a desktop environment. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -- Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -- Wait for node to be provisioned from slurm. -- Once provisioned, open the session.html file (double click) in the job directory. -- To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). diff --git a/workflow/readmes/turbovnc/totalview.md b/workflow/readmes/turbovnc/totalview.md deleted file mode 100644 index 2c63820b5..000000000 --- a/workflow/readmes/turbovnc/totalview.md +++ /dev/null @@ -1,10 +0,0 @@ -## TotalView Interactive Session - -This workflow starts an interactive session for TotalView in a desktop environment. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -- Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. 
The job files and logs are under the newly created `/pw/jobs///` directory. -- Wait for node to be provisioned from slurm. -- Once provisioned, open the session.html file (double click) in the job directory. -- To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). diff --git a/workflow/readmes/turbovnc/visit.md b/workflow/readmes/turbovnc/visit.md deleted file mode 100644 index f64dc8005..000000000 --- a/workflow/readmes/turbovnc/visit.md +++ /dev/null @@ -1,10 +0,0 @@ -## VisIt Interactive Session - -This workflow starts an interactive session for VisIt in a desktop environment. The services are started in the selected slurm partition using an sbatch command. - -#### Instructions - -- Enter form parameters and click _Execute_ to launch a PW job. The job status can be monitored under COMPUTE > Workflow Monitor. The job files and logs are under the newly created `/pw/jobs///` directory. -- Wait for node to be provisioned from slurm. -- Once provisioned, open the session.html file (double click) in the job directory. -- To close a session kill the PW job by clicking on COMPUTE > Workflow Monitor > Cancel Job (red icon). diff --git a/workflow/readmes/turbovnc/vmd-emed-onprem.md b/workflow/readmes/turbovnc/vmd-emed-onprem.md deleted file mode 100644 index 8b623ee83..000000000 --- a/workflow/readmes/turbovnc/vmd-emed-onprem.md +++ /dev/null @@ -1,2 +0,0 @@ -## Visual Molecular Dynamics Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) and starts Visual Molecular Dynamics. 
\ No newline at end of file diff --git a/workflow/readmes/vncserver/armforge.md b/workflow/readmes/vncserver/armforge.md deleted file mode 100644 index b29e0d596..000000000 --- a/workflow/readmes/vncserver/armforge.md +++ /dev/null @@ -1,6 +0,0 @@ -## Arm FORGE Interactive Session -This workflow launches Arm FORGE in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS). - -It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. - - diff --git a/workflow/readmes/vncserver/firefox.md b/workflow/readmes/vncserver/firefox.md deleted file mode 100644 index 21e29fe8e..000000000 --- a/workflow/readmes/vncserver/firefox.md +++ /dev/null @@ -1,6 +0,0 @@ -## Firefox Interactive Session -This workflow launches Firefox in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS). - -It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. - - diff --git a/workflow/readmes/vncserver/fsl.md b/workflow/readmes/vncserver/fsl.md deleted file mode 100644 index 94500bcfa..000000000 --- a/workflow/readmes/vncserver/fsl.md +++ /dev/null @@ -1,6 +0,0 @@ -## FSL Interactive Session -This workflow launches FSL in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS). - -It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. 
- - diff --git a/workflow/readmes/vncserver/general.md b/workflow/readmes/vncserver/general.md deleted file mode 100644 index 228a1cf72..000000000 --- a/workflow/readmes/vncserver/general.md +++ /dev/null @@ -1,4 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser. - -It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. diff --git a/workflow/readmes/vncserver/general_k8s.md b/workflow/readmes/vncserver/general_k8s.md deleted file mode 100644 index 5f5226ad8..000000000 --- a/workflow/readmes/vncserver/general_k8s.md +++ /dev/null @@ -1,18 +0,0 @@ -## Desktop Interactive Session -This workflow starts a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser, on either a **Compute Cluster** (SLURM or PBS) or a **Kubernetes Cluster**. - -Use the `Target Type` input to select your environment. - -## Compute Cluster - It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. - -## Kubernetes Cluster -Launches KasmVNC on a Kubernetes cluster using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `kasmweb/desktop:1.16.0` from [this](https://hub.docker.com/r/kasmweb/desktop) DockerHub repository. Enter `kasm_user` when prompted for a user. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. 
- diff --git a/workflow/readmes/vncserver/matlab.md b/workflow/readmes/vncserver/matlab.md deleted file mode 100644 index 3008ef0cc..000000000 --- a/workflow/readmes/vncserver/matlab.md +++ /dev/null @@ -1,6 +0,0 @@ -## MATLAB Interactive Session -This workflow launches MATLAB in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS). - -It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. - - diff --git a/workflow/readmes/vncserver/matlab_k8s.md b/workflow/readmes/vncserver/matlab_k8s.md deleted file mode 100644 index 2598361f2..000000000 --- a/workflow/readmes/vncserver/matlab_k8s.md +++ /dev/null @@ -1,26 +0,0 @@ -## MATLAB Interactive Session -This workflow launches MATLAB in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS) or as a container on a **Kubernetes Cluster** using the official MATLAB docker [image](https://hub.docker.com/r/mathworks/matlab). - -Use the `Target Type` input to select your environment. - -## Compute Cluster -Runs MATLAB in a remote desktop session on a SLURM or PBS cluster using TurboVNC, TigerVNC, or KasmVNC, depending on the installed VNC software. - -### Requirements: -- MATLAB must be installed on the target resource. -- **Users must have access to a valid MATLAB license.** -- Users must provide a command to load and start MATLAB (e.g., matlab -desktop). - -## Kubernetes Cluster -Deploys a MATLAB container on a Kubernetes cluster with user-defined image and resource settings. -- Note: No remote desktop is used; MATLAB runs directly in the container. -- Requirement: **Users must provide their own MATLAB account or license.** - - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. 
-- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `mathworks/matlab:r2025a` from [this](https://hub.docker.com/r/mathworks/matlab) DockerHub repository. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. - diff --git a/workflow/readmes/vncserver/rstudio.md b/workflow/readmes/vncserver/rstudio.md deleted file mode 100644 index 85d62766f..000000000 --- a/workflow/readmes/vncserver/rstudio.md +++ /dev/null @@ -1,6 +0,0 @@ -## RStudio Interactive Session -This workflow launches RStudio in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS). - -It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. - - diff --git a/workflow/readmes/vncserver/rstudio_k8s.md b/workflow/readmes/vncserver/rstudio_k8s.md deleted file mode 100644 index dc3b3ce5d..000000000 --- a/workflow/readmes/vncserver/rstudio_k8s.md +++ /dev/null @@ -1,21 +0,0 @@ -## RStudio Interactive Session -This workflow launches RStudio in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS) or as a container on a **Kubernetes Cluster** using the official RStudio docker [image](https://hub.docker.com/r/rocker/rstudio). - -Use the `Target Type` input to select your environment. - -## Compute Cluster -Runs RStudio in a remote desktop session on a SLURM or PBS cluster using TurboVNC, TigerVNC, or KasmVNC, depending on the installed VNC software. - - -## Kubernetes Cluster -Deploys an RStudio container on a Kubernetes cluster with user-defined image and resource settings. -- Note: No remote desktop is used; RStudio runs directly in the container. 
- - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `rocker/rstudio` from [this](https://hub.docker.com/r/rocker/rstudio) DockerHub repository. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. - diff --git a/workflow/readmes/vncserver/schrodinger.md b/workflow/readmes/vncserver/schrodinger.md deleted file mode 100644 index 1b54940ea..000000000 --- a/workflow/readmes/vncserver/schrodinger.md +++ /dev/null @@ -1,6 +0,0 @@ -## Schrodinger Interactive Session -This workflow launches Schrodinger in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS). - -It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. - - diff --git a/workflow/readmes/vncserver/vmd.md b/workflow/readmes/vncserver/vmd.md deleted file mode 100644 index 498e731c7..000000000 --- a/workflow/readmes/vncserver/vmd.md +++ /dev/null @@ -1,6 +0,0 @@ -## VMD Interactive Session -This workflow launches VMD in a remote desktop [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md) accessible via a web browser on a **Compute Cluster** (SLURM or PBS). - -It utilizes either TurboVNC, TigerVNC or KasmVNC, depending on which is installed on the target resource. - - diff --git a/workflow/readmes/webshell/noaa-onprem.md b/workflow/readmes/webshell/noaa-onprem.md deleted file mode 100644 index af94b6313..000000000 --- a/workflow/readmes/webshell/noaa-onprem.md +++ /dev/null @@ -1,3 +0,0 @@ -## Webshell Interactive Session -This workflow starts a Webshell server [interactive session](https://github.com/parallelworks/interactive_session/blob/main/README-v3.md). 
- diff --git a/workflow/thumbnails/langflow.png b/workflow/thumbnails/langflow.png new file mode 100644 index 000000000..22fffab24 Binary files /dev/null and b/workflow/thumbnails/langflow.png differ diff --git a/workflow/thumbnails/librechat.png b/workflow/thumbnails/librechat.png new file mode 100644 index 000000000..f2da90c73 Binary files /dev/null and b/workflow/thumbnails/librechat.png differ diff --git a/workflow/thumbnails/n8n.png b/workflow/thumbnails/n8n.png index 43cbbaa80..f94a5e90b 100644 Binary files a/workflow/thumbnails/n8n.png and b/workflow/thumbnails/n8n.png differ diff --git a/workflow/thumbnails/open-notebook.png b/workflow/thumbnails/open-notebook.png new file mode 100644 index 000000000..bbc5231b9 Binary files /dev/null and b/workflow/thumbnails/open-notebook.png differ diff --git a/workflow/thumbnails/rocket.png b/workflow/thumbnails/rocket.png new file mode 100644 index 000000000..9542c8610 Binary files /dev/null and b/workflow/thumbnails/rocket.png differ diff --git a/workflow/yamls/airflow-host/general.yaml b/workflow/yamls/airflow-host/general.yaml deleted file mode 100644 index 95a2c62b4..000000000 --- a/workflow/yamls/airflow-host/general.yaml +++ /dev/null @@ -1,324 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - useCustomDomain: true - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: 
./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o 
UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=login" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Jupyter Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. 
Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Airflow Settings - items: - name: - type: string - hidden: true - default: airflow-host - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! 
- airflow_version: - label: Airflow Version - type: string - default: '2.10.5' - airflow_home: - label: Airflow Home Directory - type: string - default: __HOME__/pw/airflow - tooltip: Airflow is installed if the home directory is missing - username: - label: User Name - type: string - default: admin - tooltip: Username of the user - firstname: - label: First Name - type: string - default: FIRSTNAME - tooltip: First name of the user - lastname: - label: Last Name - type: string - default: LASTNAME - tooltip: Last name of the user - role: - label: User Role - type: dropdown - default: Admin - tooltip: Role of the user - options: - - value: Admin - label: Admin - - value: User - label: User - - value: Op - label: Op - - value: Viewer - label: Viewer - - value: Public - label: Public - email: - label: Email - type: string - default: user@domain.com - tooltip: Email of the user - password: - label: Password - type: password - tooltip: Password of the user diff --git a/workflow/yamls/cesium/general.yaml b/workflow/yamls/cesium/general.yaml deleted file mode 100644 index ded8f6533..000000000 --- a/workflow/yamls/cesium/general.yaml +++ /dev/null @@ -1,272 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - 
early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - 
echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - echo "slug=?folder=${service_directory}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. 
--mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Cesium Settings - items: - name: - type: string - hidden: true - default: cesium - html: - label: Copy Your HTML - type: editor diff --git a/workflow/yamls/h2o-3/general.yaml b/workflow/yamls/h2o-3/general.yaml deleted file mode 100644 index aac9b73ee..000000000 --- a/workflow/yamls/h2o-3/general.yaml +++ /dev/null @@ -1,273 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: 
./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo 
"target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=flow/index.html" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. 
Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: H2O Flow Settings - items: - name: - type: string - hidden: true - default: h2o-3 - download_url: - label: Download URL - type: string - default: https://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/6/h2o-3.46.0.6.zip - tooltip: Downloads or uses the h2o version specified in the URL diff --git a/workflow/yamls/h2o-3/hsp.yaml b/workflow/yamls/h2o-3/hsp.yaml deleted file mode 100644 index f180ce48d..000000000 --- a/workflow/yamls/h2o-3/hsp.yaml +++ /dev/null @@ -1,317 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: 
any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo 
"target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=flow/index.html" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - 
ignore: ${{ .hidden }} - optional: true - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: H2O Flow Settings - items: - name: - type: string - hidden: true - default: h2o-3 - download_url: - label: Download URL - type: string - default: https://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/6/h2o-3.46.0.6.zip - tooltip: Downloads or uses the h2o version specified in the URL diff --git a/workflow/yamls/h2o-3/noaa-v3.yaml b/workflow/yamls/h2o-3/noaa-v3.yaml deleted file mode 100644 index d9c3ba0e9..000000000 --- a/workflow/yamls/h2o-3/noaa-v3.yaml +++ /dev/null @@ -1,298 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: 
./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo 
"target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=flow/index.html" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ 
.hidden }} - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: H2O Flow Settings - items: - name: - type: string - hidden: true - default: h2o-3 - download_url: - label: Download URL - type: string - default: https://h2o-release.s3.amazonaws.com/h2o/rel-3.46.0/6/h2o-3.46.0.6.zip - tooltip: Downloads or uses the h2o version specified in the URL diff --git a/workflow/yamls/hammerspace/general.yaml b/workflow/yamls/hammerspace/general.yaml deleted file mode 100644 index 725905c2c..000000000 --- a/workflow/yamls/hammerspace/general.yaml +++ /dev/null @@ -1,186 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh 
- fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - echo "slug=#/auth/login" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: ${{ inputs.service.host }} - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Resource Target - items: - resource: - type: compute-clusters - label: Select Target - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - hidden: true - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - collapsed: false - service: - type: group - label: Hammespace Settings - items: - name: - type: string - hidden: true - default: hammerspace - port: - label: Port - type: number - default: 8443 - host: - label: Server Host - type: string diff --git a/workflow/yamls/juice-server/general.yaml b/workflow/yamls/juice-server/general.yaml deleted file mode 100644 index 519b7a50b..000000000 --- a/workflow/yamls/juice-server/general.yaml +++ /dev/null @@ -1,269 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: false - -jobs: - preprocessing: - steps: - - name: 
Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - 
name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" 
>&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: VS Code Settings - items: - name: - type: string - hidden: true - default: juice-server - download_url: - label: Download URL - type: string - default: https://github.com/Juice-Labs/Juice-Labs/releases/latest/download/JuiceServer-linux.tar.gz - tooltip: Downloads or uses the Juice server version specified in the URL diff --git a/workflow/yamls/jupyter-docker/general.yaml b/workflow/yamls/jupyter-docker/general.yaml deleted file mode 100644 index 9f6a0d282..000000000 --- a/workflow/yamls/jupyter-docker/general.yaml +++ /dev/null @@ -1,297 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: 
Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - 
target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=tree?dt=$(date +%s)" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - gclusterv2 - - pclusterv2 - - azclusterv2 - - aws-slurm - - google-slurm - - azure-slurm - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Jupyter Docker Settings - items: - name: - type: string - hidden: true - default: jupyter-docker - docker_repo: - label: Docker repository - type: string - default: tensorflow/tensorflow:2.7.0-gpu-jupyter - tooltip: Docker repository to start with docker run and containing jupyter-notebook - notebook_dir: - label: Directory to start Jupyter - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the JupyterHub graphical user interface starts. The default value here is your home directory. - mount_directories: - label: Docker mount volumnes - type: string - optional: true - tooltip: 'Type in the mount volume options for the docker command. E.g.: -v /lustre:/lustre -v /contrib:/contrib' - use_gpus: - label: Use GPUs? 
- type: boolean - default: false - tooltip: Select Yes to run a CUDA application inside a container diff --git a/workflow/yamls/jupyter-host/emed.yaml b/workflow/yamls/jupyter-host/emed.yaml deleted file mode 100644 index 84a019323..000000000 --- a/workflow/yamls/jupyter-host/emed.yaml +++ /dev/null @@ -1,366 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create 
Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" 
\"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=tree?dt=$(date +%s)" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} -'on': - execute: - inputs: - pwrl_host: - type: group - label: Jupyter Lab Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - 
type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum 
total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Jupyter Notebook Settings - items: - name: - type: string - hidden: true - default: jupyter-host - nginx_sif: - type: string - hidden: true - default: /public/apps/pw/nginx-unprivileged.sif - notebook_dir: - label: Directory to start Jupyter Notebook GUI - type: string - default: /gs/gsfs0/users/__USER__/pw/ - tooltip: This is the directory that you start with when the Jupyter graphical user interface starts. The default value here is your home directory. - load_env: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source /gs/gsfs0/hpc01/rhel8/apps/conda3/etc/profile.d/conda.sh; conda activate base; module load cuda - tooltip: Use a bash command - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password diff --git a/workflow/yamls/jupyter-host/general.yaml b/workflow/yamls/jupyter-host/general.yaml deleted file mode 100644 index 41c57e087..000000000 --- a/workflow/yamls/jupyter-host/general.yaml +++ /dev/null @@ -1,348 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - 
./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | 
rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyter after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=tree?dt=$(date +%s)" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Jupyter Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. 
--mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Jupyter Notebook Settings - items: - name: - type: string - hidden: true - default: jupyter-host - notebook_dir: - label: Directory to start Jupyter Notebook GUI - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the Jupyter graphical user interface starts. The default value here is your home directory. - conda_install: - label: Install miniconda environment if not there? - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. 
- conda_env: - label: Conda environment - type: string - default: base - hidden: '${{ inputs.service.conda_install == false }}' - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! - load_env: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - install_instructions: - label: Select Jupyter Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - default: latest - options: - - value: latest - label: Latest default versions of Jupyter Notebook and Python (not thoroughly tested) - - value: notebook7.2.2-python3.12.2 - label: Jupyter Notebook 7.2.2 with Python version 3.12.2 - - value: yaml - label: Provide custom Conda environment YAML file - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions != yaml || inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel diff --git a/workflow/yamls/jupyter-host/noaa-v3.yaml b/workflow/yamls/jupyter-host/noaa-v3.yaml deleted file mode 100644 index 0b056ad20..000000000 --- a/workflow/yamls/jupyter-host/noaa-v3.yaml +++ 
/dev/null @@ -1,401 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" 
>&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyter after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=tree?dt=$(date +%s)" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: 
${{ .hidden }} - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Jupyter Notebook Settings - items: - name: - type: string - hidden: true - default: jupyter-host - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - notebook_dir: - label: Directory to start Jupyter Notebook GUI - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the Jupyter graphical user interface starts. The default value here is your home directory. - conda_install: - label: Install Jupyter-Notebook If Not There? - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - use_conda: - label: Use Conda? - type: boolean - default: false - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Defaults to ~/pw/software. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. 
- conda_env: - label: Conda environment - type: string - default: base - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! - load_env_tag_cloud: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/software/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - load_env_tag_existing: - label: Command to load Jupyter Notebook to the PATH - type: string - default: module load python; export PATH=$PATH:/home/${USER}/.local/bin - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - install_instructions_tag_cloud: - label: Install Instructions - type: dropdown - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - default: notebook7.2.2-python3.12.2 - options: - - value: latest - label: Latest default versions of Jupyter Notebook and Python (not thoroughly tested) - - value: notebook7.2.2-python3.12.2 - label: Jupyter Notebook 7.2.2 with Python version 3.12.2 - - value: yaml - label: Provide custom Conda environment YAML file - install_instructions_tag_existing: - label: Install Instructions - type: string - default: install_command - hidden: true - ignore: ${{ inputs.service.conda_install == false || 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - yaml: - label: Paste 
Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions_tag_cloud != yaml || inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel - install_command: - label: Command to install Jupyter - type: string - default: module load python; pip install --upgrade "notebook>=7" --no-warn-script-location - hidden: ${{ inputs.service.conda_install == false || 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - ignore: ${{ .hidden }} diff --git a/workflow/yamls/jupyter-host/noaa.yaml b/workflow/yamls/jupyter-host/noaa.yaml deleted file mode 100644 index 524174cd9..000000000 --- a/workflow/yamls/jupyter-host/noaa.yaml +++ /dev/null @@ -1,227 +0,0 @@ -jobs: - main: - steps: - - name: Preprocessing - run: ./utils/steps/preprocessing.sh - - name: Validating Target Resource - run: ./utils/steps/input_form_resource_wrapper.sh - - name: Process Inputs - run: ./utils/steps/process_inputs_sh.sh - - name: Controller Preprocessing - run: ./utils/steps/controller_preprocessing.sh - - name: Prepare Service JSON - run: ./utils/steps/prepare_service_json.sh - - name: Initialize Cancel Script - run: ./utils/steps/initialize_cancel_script.sh - - name: Create Session Script - run: ./utils/steps/create_session_script.sh - - name: Launch Job and Wait - run: ./utils/steps/launch_job_and_wait.sh - cleanup: ./kill.sh - - name: Clean and Exit - run: 
./utils/steps/clean_and_exit.sh - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 
'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Jupyter Notebook Settings - items: - name: - type: string - hidden: true - default: jupyter-host - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - notebook_dir: - label: Directory to start Jupyter Lab GUI - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the JupyterHub graphical user interface starts. The default value here is your home directory. - conda_install: - label: Install Jupyter-Notebook If Not There? - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - use_conda: - label: Use Conda? 
- type: boolean - default: false - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Defaults to ~/pw/software. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. - conda_env: - label: Conda environment - type: string - default: base - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! 
- load_env_tag_cloud: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/software/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - load_env_tag_existing: - label: Command to load Jupyter Notebook - type: string - default: module load python; export PATH=$PATH:/home/${USER}/.local/bin - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - install_instructions_tag_cloud: - label: Install Instructions - type: dropdown - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - default: notebook7.2.2-python3.12.2 - options: - - value: latest - label: Latest default versions of Jupyter Notebook and Python (not thoroughly tested) - - value: notebook7.2.2-python3.12.2 - label: Jupyter Notebook 7.2.2 with Python version 3.12.2 - - value: yaml - label: Provide custom Conda environment YAML file - install_instructions_tag_existing: - label: Install Instructions - type: string - default: install_command - hidden: true - ignore: ${{ inputs.service.conda_install == false || 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions_tag_cloud != yaml || inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels: - label: Select additional kernels to install - 
type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel - install_command: - label: Command to install Jupyter - type: string - default: module load python; pip install notebook==6.5.7 --no-warn-script-location - hidden: ${{ inputs.service.conda_install == false || 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - ignore: ${{ .hidden }} diff --git a/workflow/yamls/jupyterhub-host/general.yaml b/workflow/yamls/jupyterhub-host/general.yaml deleted file mode 100644 index 2dcf6b5d4..000000000 --- a/workflow/yamls/jupyterhub-host/general.yaml +++ /dev/null @@ -1,350 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o 
UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyter after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=hub/login" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Jupyter Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - gclusterv2 - - pclusterv2 - - azclusterv2 - - aws-slurm - - google-slurm - - azure-slurm - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Jupyter Notebook Settings - items: - name: - type: string - hidden: true - default: jupyterhub-host - conda_install: - label: Install miniconda environment if not there? - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. 
- conda_env: - label: Conda environment - type: string - default: base - hidden: '${{ inputs.service.conda_install == false }}' - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! - load_env: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - password: - label: Password for Jupyter Hub - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - install_instructions: - label: Select Jupyter Hub Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - default: jupyterhub5.2.1-python3.13.1 - options: - - value: jupyterhub5.2.1-python3.13.1 - label: Jupyter Hub 5.2.1 with Python 3.13.1 - - value: latest - label: Latest versions of Jupyter Hub and Python (not thoroughly tested) - - value: yaml - label: Provide custom Conda environment YAML file - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions != yaml || inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel diff --git a/workflow/yamls/jupyterlab-host/emed.yaml b/workflow/yamls/jupyterlab-host/emed.yaml deleted file mode 100644 index d895660a4..000000000 --- a/workflow/yamls/jupyterlab-host/emed.yaml +++ 
/dev/null @@ -1,445 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 
'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" 
\"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=lab" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} -'on': - execute: - inputs: - pwrl_host: - type: group - label: Jupyter Lab Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: 
slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total 
memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Jupyter Lab Settings - items: - name: - type: string - hidden: true - default: jupyterlab-host - nginx_sif_tag_existing: - type: string - default: /public/apps/pw/nginx-unprivileged.sif - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore }} - notebook_dir_tag_existing: - label: Directory to start Jupyter Lab GUI - type: string - default: __HOME__/pw/ - tooltip: This is the directory that you start with when the Jupyter graphical user interface starts. The default value here is your home directory. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - load_env_tag_existing: - label: Command to load Jupyter Lab to the PATH - type: string - default: source /gs/gsfs0/hpc01/rhel8/apps/conda3/etc/profile.d/conda.sh; conda activate base; module load cuda - tooltip: Use a bash command - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - conda_install_tag_cloud: - label: Install miniconda environment if not there? 
- hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir_tag_cloud: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install_tag_cloud == false || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir_tag_cloud: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install_tag_cloud == false || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. - conda_env_tag_cloud: - label: Conda environment - type: string - default: base - hidden: "${{ inputs.service.conda_install_tag_cloud == false || 'existing' == inputs.pwrl_host.resource.provider }}" - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! 
- load_env_tag_cloud: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install_tag_cloud == true || 'existing' == inputs.pwrl_host.resource.provider }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - install_instructions_tag_cloud: - label: Select Jupyter Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install_tag_cloud == false || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - default: jupyterlab4.1.5-python3.11.5 - options: - - value: jupyterlab4.1.5-python3.11.5 - label: Jupyter Lab 4.1.5 with Python 3.11.5 - - value: latest - label: Latest versions of Jupyter Lab and Python (not thoroughly tested) - - value: dask-extension-jupyterlab - label: Dask dependencies for PW - - value: yaml - label: Provide custom Conda environment YAML file - yaml_tag_cloud: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions_tag_cloud != yaml || inputs.service.conda_install_tag_cloud == false || 'existing' == inputs.pwrl_host.resource.provider }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels_tag_cloud: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install_tag_cloud == false || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password diff --git a/workflow/yamls/jupyterlab-host/general.yaml b/workflow/yamls/jupyterlab-host/general.yaml deleted file 
mode 100644 index 2f982140d..000000000 --- a/workflow/yamls/jupyterlab-host/general.yaml +++ /dev/null @@ -1,351 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor 
Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=lab" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Jupyter Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. 
Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Jupyter Lab Settings - items: - name: - type: string - hidden: true - default: jupyterlab-host - notebook_dir: - label: Directory to start Jupyter Lab GUI - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the Jupyter graphical user interface starts. The default value here is your home directory. - conda_install: - label: Install miniconda environment if not there? - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. 
- conda_env: - label: Conda environment - type: string - default: base - hidden: '${{ inputs.service.conda_install == false }}' - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! - load_env: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - install_instructions: - label: Select Jupyter Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - default: jupyterlab4.1.5-python3.11.5 - options: - - value: jupyterlab4.1.5-python3.11.5 - label: Jupyter Lab 4.1.5 with Python 3.11.5 - - value: latest - label: Latest versions of Jupyter Lab and Python (not thoroughly tested) - - value: dask-extension-jupyterlab - label: Dask dependencies for PW - - value: yaml - label: Provide custom Conda environment YAML file - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions != yaml || inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel diff --git a/workflow/yamls/jupyterlab-host/general_k8s.yaml b/workflow/yamls/jupyterlab-host/general_k8s.yaml deleted file mode 
100644 index 480031150..000000000 --- a/workflow/yamls/jupyterlab-host/general_k8s.yaml +++ /dev/null @@ -1,899 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - if ! [ -z ${{ org.JUICE_TOKEN }} ]; then - echo "export JUICE_TOKEN=${{ org.JUICE_TOKEN }}" >> resources/host/inputs.sh - fi - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - 
cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o 
StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - echo "slug=lab" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? 
-ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ 
inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]] && kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - - if [[ "${{ inputs.service_k8s.use_token_auth }}" == "true" ]]; then - token="$(openssl rand -hex 16)" - fi - echo "token=${token}" | tee -a $OUTPUTS | tee -a OUTPUTS - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: jupyter-storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - - command: ["jupyter", "lab"] - args: - - "--allow-root" - - "--no-browser" - - "--ip=0.0.0.0" - - "--NotebookApp.default_url='/lab'" - - "--ServerApp.trust_xheaders=True" - - "--ServerApp.allow_origin='*'" - - 
"--ServerApp.allow_remote_access=True" - - "--IdentityProvider.token='${token}'" - - "--ServerApp.password=''" - - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: jupyter-storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: jupyter-storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - run: kubectl apply -f app.yaml - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? 
- kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - jupyter_pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "jupyter_pod=$jupyter_pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." 
- exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Get SLUG - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - if [ -z "${token}" ]; then - slug="lab" - else - slug="lab?token=${token}" - fi - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - slug: ${{ needs.create_k8s_session.outputs.slug }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} - -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - hidden: ${{ inputs.targetType != 'compute-cluster' }} - type: group - label: Compute Settings - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - optional: ${{ inputs.targetType != 'compute-cluster' }} - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - 
type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Jupyter Lab Settings - collapsed: true - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - name: - type: string - hidden: true - default: jupyterlab-host - notebook_dir: - label: Directory to start Jupyter Lab GUI - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the Jupyter graphical user interface starts. The default value here is your home directory. - conda_install: - label: Install miniconda environment if not there? 
- type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. - conda_env: - label: Conda environment - type: string - default: base - hidden: ${{ inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! - load_env: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - install_instructions: - label: Select Jupyter Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - default: jupyterlab4.1.5-python3.11.5 - options: - - value: jupyterlab4.1.5-python3.11.5 - label: Jupyter Lab 4.1.5 with Python 3.11.5 - - value: latest - label: Latest versions of Jupyter Lab and Python (not thoroughly tested) - - value: dask-extension-jupyterlab - label: Dask dependencies for PW - - value: yaml - label: Provide custom Conda environment YAML file - 
yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions != yaml || inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: Jupyter Lab Image - type: string - default: jupyter/datascience-notebook - tooltip: | - Sample containers from https://catalog.ngc.nvidia.com/containers: - - TensorFlow: nvcr.io/nvidia/tensorflow:25.02-tf2-py3 - - Pytorch: nvcr.io/nvidia/pytorch:24.09-py3 - image_port: - label: Jupyter Lab Port - type: number - default: 8888 - tooltip: Define the port on which the Jupyter Lab server runs inside the container. Default is 8888, which is standard for Jupyter Lab. 
- use_token_auth: - label: Use Token Authentication? - type: boolean - tooltip: Enable token-based authentication. Some containers may not support it. - juice: - type: group - label: Attached GPU Settings - collapsed: true - hidden: ${{ org.JUICE_TOKEN == "" || inputs.targetType != 'compute-cluster' }} - items: - use_juice: - label: Enable Juice? - type: boolean - default: false - tooltip: Enable Juice to access and share remote GPUs over a network for your workload. - pool_ids: - label: Pool IDs - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Comma separated list of pool ids from which to allocate the session resources, when empty any available pool you have access to is used - vram: - label: VRAM - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Amount of VRAM requested in gibibytes. Can return a session with fewer bytes if there is not enough space on the device. Suffixes can be provided for convenience. E.g. 4 GiB (default "0") - cmd_args: - label: Juice Run Command Arguments - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Optional arguments for the juice run command to customize workload execution, e.g., "--gpu-ids string". 
diff --git a/workflow/yamls/jupyterlab-host/hsp.yaml b/workflow/yamls/jupyterlab-host/hsp.yaml deleted file mode 100644 index 43b93b570..000000000 --- a/workflow/yamls/jupyterlab-host/hsp.yaml +++ /dev/null @@ -1,400 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != 
inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=lab" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 
.hidden }} - optional: true - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. 
Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Jupyter Lab Settings - items: - name: - type: string - hidden: true - default: jupyterlab-host - notebook_dir: - label: Directory to start Jupyter Lab GUI - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the Jupyter graphical user interface starts. The default value here is your home directory. - rootless_docker: - label: Use Rootless Docker? - type: boolean - default: true - hidden: true - conda_install: - label: Install miniconda environment if not there? - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. - conda_env: - label: Conda environment - type: string - default: base - hidden: '${{ inputs.service.conda_install == false }}' - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! 
- load_env: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - install_instructions: - label: Select Jupyter Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - default: jupyterlab4.1.5-python3.11.5 - options: - - value: jupyterlab4.1.5-python3.11.5 - label: Jupyter Lab 4.1.5 with Python 3.11.5 - - value: latest - label: Latest versions of Jupyter Lab and Python (not thoroughly tested) - - value: dask-extension-jupyterlab - label: Dask dependencies for PW - - value: yaml - label: Provide custom Conda environment YAML file - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions != yaml || inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel diff --git a/workflow/yamls/jupyterlab-host/noaa-v3.yaml b/workflow/yamls/jupyterlab-host/noaa-v3.yaml deleted file mode 100644 index 312188baf..000000000 --- a/workflow/yamls/jupyterlab-host/noaa-v3.yaml +++ /dev/null @@ -1,403 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: 
any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: 
any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" 
>&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=lab" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - 
qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Jupyter Lab Settings - items: - name: - type: string - hidden: true - default: jupyterlab-host - password: - label: Password for Jupyter Lab session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - notebook_dir: - label: Directory to start Jupyter Lab GUI - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the Jupyter graphical user interface starts. The default value here is your home directory. - conda_install: - label: Install Jupyter-Lab If Not There? - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - use_conda: - label: Use Conda? - type: boolean - default: false - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Defaults to ~/pw/software. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. 
- conda_env: - label: Conda environment - type: string - default: base - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! - load_env_tag_cloud: - label: Command to load Jupyter Lab to the PATH - type: string - default: source __HOME__/pw/software/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - load_env_tag_existing: - label: Command to load Jupyter Lab to the PATH - type: string - default: module load python; export PATH=$PATH:/home/${USER}/.local/bin - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - install_instructions_tag_cloud: - label: Select Jupyter Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - default: jupyterlab4.1.5-python3.11.5 - options: - - value: jupyterlab4.1.5-python3.11.5 - label: Jupyter Lab 4.1.5 with Python 3.11.5 - - value: latest - label: Latest versions of Jupyter Lab and Python (not thoroughly tested) - - value: dask-extension-jupyterlab - label: Dask dependencies for PW - - value: yaml - label: Provide custom Conda environment YAML file - install_instructions_tag_existing: - label: Install Instructions - type: string - default: install_command - hidden: true - ignore: ${{ inputs.service.conda_install == false || 'existing' != inputs.pwrl_host.resource.provider || 
inputs.service.use_conda == true }} - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions_tag_cloud != yaml || inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - install_kernels: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel - install_command: - label: Command to install Jupyter - type: string - default: module load python; pip install jupyterlab --no-warn-script-location - hidden: ${{ inputs.service.conda_install == false || 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - ignore: ${{ .hidden }} diff --git a/workflow/yamls/jupyterlab-host/noaa.yaml b/workflow/yamls/jupyterlab-host/noaa.yaml deleted file mode 100644 index 709858ccf..000000000 --- a/workflow/yamls/jupyterlab-host/noaa.yaml +++ /dev/null @@ -1,229 +0,0 @@ -jobs: - main: - steps: - - name: Preprocessing - run: ./utils/steps/preprocessing.sh - - name: Validating Target Resource - run: ./utils/steps/input_form_resource_wrapper.sh - - name: Process Inputs - run: ./utils/steps/process_inputs_sh.sh - - name: Controller Preprocessing - run: ./utils/steps/controller_preprocessing.sh - - name: Prepare Service JSON - run: ./utils/steps/prepare_service_json.sh - - name: Initialize Cancel Script - run: ./utils/steps/initialize_cancel_script.sh - - name: Create Session Script - run: ./utils/steps/create_session_script.sh - - name: Launch Job and Wait - run: ./utils/steps/launch_job_and_wait.sh - 
cleanup: ./kill.sh - - name: Clean and Exit - run: ./utils/steps/clean_and_exit.sh - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: 
number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Jupyter Notebook Settings - items: - name: - type: string - hidden: true - default: jupyterlab-host - password: - label: Password for notebook session - type: password - optional: true - hidden: true - ignore: true - tooltip: Enter password or leave blank for no password - notebook_dir: - label: Directory to start Jupyter Lab GUI - type: string - default: __HOME__ - tooltip: This is the directory that you start with when the JupyterHub graphical user interface starts. The default value here is your home directory. - conda_install: - label: Install Jupyter-Notebook If Not There? - type: boolean - default: true - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - use_conda: - label: Use Conda? 
- type: boolean - default: false - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Select Yes to install Jupyter in miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Defaults to ~/pw/software. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3c - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. - conda_env: - label: Conda environment - type: string - default: base - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! 
- load_env_tag_cloud: - label: Command to load Jupyter Notebook to the PATH - type: string - default: source __HOME__/pw/software/.miniconda3c/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - load_env_tag_existing: - label: Command to load Jupyter Notebook - type: string - default: module load python; export PATH=$PATH:/home/${USER}/.local/bin - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - install_instructions_tag_cloud: - label: Select Jupyter Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - default: jupyterlab4.1.5-python3.11.5 - options: - - value: jupyterlab4.1.5-python3.11.5 - label: Jupyter Lab 4.1.5 with Python 3.11.5 - - value: latest - label: Latest versions of Jupyter Lab and Python (not thoroughly tested) - - value: dask-extension-jupyterlab - label: Dask dependencies for PW - - value: yaml - label: Provide custom Conda environment YAML file - install_instructions_tag_existing: - label: Install Instructions - type: string - default: install_command - hidden: true - ignore: ${{ inputs.service.conda_install == false || 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions_tag_cloud != yaml || inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file - 
install_kernels: - label: Select additional kernels to install - type: multi-dropdown - optional: true - hidden: ${{ inputs.service.conda_install == false || ('existing' == inputs.pwrl_host.resource.provider && inputs.service.use_conda == false) }} - ignore: ${{ .hidden }} - options: - - value: julia-kernel - label: Julia Kernel - - value: R-kernel - label: R Kernel - install_command: - label: Command to install Jupyter - type: string - default: module load python; pip install jupyterlab --no-warn-script-location - hidden: ${{ inputs.service.conda_install == false || 'existing' != inputs.pwrl_host.resource.provider || inputs.service.use_conda == true }} - ignore: ${{ .hidden }} diff --git a/workflow/yamls/k8s/jupyter/general.md b/workflow/yamls/k8s/jupyter/general.md deleted file mode 100644 index 790fd8bc3..000000000 --- a/workflow/yamls/k8s/jupyter/general.md +++ /dev/null @@ -1,55 +0,0 @@ -## JupyterLab on Kubernetes -This workflow launches a JupyterLab server on a Kubernetes cluster using a user-specified image and resource settings. The image must have JupyterLab pre-installed. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Use a JupyterLab-compatible image (default: jupyter/datascience-notebook). -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy JupyterLab and access it via a web interface. - -### Using Nvidia GPUs -For GPU-accelerated workloads, use images from the [Nvidia NGC Catalog](https://catalog.ngc.nvidia.com/containers). 
**Ensure that the driver version on the node meets the minimum driver requirement for that image.** - -Examples: -- **PyTorch:** `nvcr.io/nvidia/pytorch:24.09-py3` -- **TensorFlow:** `nvcr.io/nvidia/tensorflow:25.02-tf2-py3` - - -#### Test GPU Access in JupyterLab - -##### PyTorch -``` -import torch -if torch.cuda.is_available(): - num_gpus = torch.cuda.device_count() - print(f"GPU is available. Number of GPUs: {num_gpus}") - for i in range(num_gpus): - print(f" - GPU {i}: {torch.cuda.get_device_name(i)}") -else: - print("No GPU available. Using CPU only.") -``` - -##### TensorFlow -``` -import tensorflow as tf -physical_devices = tf.config.list_physical_devices('GPU') -if physical_devices: - print(f"TensorFlow detected {len(physical_devices)} GPU(s).") - for i, device in enumerate(physical_devices): - print(f" - GPU {i}: {device}") -else: - print("No GPU available. Using CPU only.") -``` - -##### Nvidia MIG Instances -To use more than one Multi-Instance GPUs (MIG) set the `CUDA_VISIBLE_DEVICES` environment variable. 
-``` -!nvidia-smi -L | grep MIG | grep -o 'MIG-[a-f0-9-]\+' -import os -# Replace with the MIG instance IDs -os.environ["CUDA_VISIBLE_DEVICES"] = ( - "MIG-5a9b896b-dbaa-50ca-bd8d-6c50ed9b31c1," - "MIG-9dc7b6fb-7215-536d-b2c3-5ee18463260c" -) -``` diff --git a/workflow/yamls/k8s/jupyter/general.yaml b/workflow/yamls/k8s/jupyter/general.yaml deleted file mode 100644 index f070dce6c..000000000 --- a/workflow/yamls/k8s/jupyter/general.yaml +++ /dev/null @@ -1,496 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: true - -jobs: - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." 
- exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - if [[ "${{ inputs.service_k8s.use_token_auth }}" == "true" ]]; then - token="$(openssl rand -hex 16)" - fi - echo "token=${token}" | tee -a $OUTPUTS | tee -a OUTPUTS - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - 
pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: jupyter-storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - - command: ["jupyter", "lab"] - args: - - "--allow-root" - - "--no-browser" - - "--ip=0.0.0.0" - - "--NotebookApp.default_url='/lab'" - - "--ServerApp.trust_xheaders=True" - - "--ServerApp.allow_origin='*'" - - "--ServerApp.allow_remote_access=True" - - "--IdentityProvider.token='${token}'" - - "--ServerApp.password=''" - - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: jupyter-storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: jupyter-storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is 
provided as an input - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - jupyter_pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "jupyter_pod=$jupyter_pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - run: | - kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - echo Existing - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Get SLUG - early-cancel: any-job-failed - run: | - source OUTPUTS - if [ -z "${token}" ]; then - slug="lab" - else - slug="lab?token=${token}" - fi - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - slug: ${{ needs.create_k8s_session.outputs.slug }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - k8s: - type: group - label: Kubernetes Settings - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for 
Persistent Volume Claims (PVCs), including size, storage class, and mount path. - items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). 
Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). - limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - items: - image: - label: Jupyter Lab Image - type: string - default: jupyter/datascience-notebook - tooltip: | - Sample containers from https://catalog.ngc.nvidia.com/containers: - - TensorFlow: nvcr.io/nvidia/tensorflow:25.02-tf2-py3 - - Pytorch: nvcr.io/nvidia/pytorch:24.09-py3 - image_port: - label: Jupyter Lab Port - type: number - default: 8888 - tooltip: Define the port on which the Jupyter Lab server runs inside the container. Default is 8888, which is standard for Jupyter Lab. 
- use_token_auth: - label: Use Token Authentication? - type: boolean - tooltip: Enable token-based authentication. Some containers may not support it. diff --git a/workflow/yamls/k8s/kasmvnc/general.md b/workflow/yamls/k8s/kasmvnc/general.md deleted file mode 100644 index 8010b60ab..000000000 --- a/workflow/yamls/k8s/kasmvnc/general.md +++ /dev/null @@ -1,10 +0,0 @@ -## KasmVNC on Kubernetes -This workflow launches KasmVNC on a Kubernetes cluster using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `kasmweb/desktop:1.16.0` from [this](https://hub.docker.com/r/kasmweb/desktop) DockerHub repository. Enter `kasm_user` when prompted for a user. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. - diff --git a/workflow/yamls/k8s/kasmvnc/general.yaml b/workflow/yamls/k8s/kasmvnc/general.yaml deleted file mode 100644 index e1b43ba26..000000000 --- a/workflow/yamls/k8s/kasmvnc/general.yaml +++ /dev/null @@ -1,469 +0,0 @@ -permissions: - - '*' -sessions: - session: - redirect: true - useCustomDomain: true - useTLS: true - - -jobs: - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev | tr '_' '-' | tr '.' 
'-') - pvc_name="${PW_USER}-${workflow_name}-${job_number}-pvc" - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - run: | - echo "Performing dry run..." 
- kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - run: | - set -x - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: VNC_PW - value: ${{ inputs.service_k8s.password }} - securityContext: - capabilities: - add: ["NET_ADMIN"] # Might be required depending on kasmvnc container usage - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: 
storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - k8s: - type: group - label: Kubernetes Settings - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - items: - image: - label: KasmVNC Image - type: string - default: kasmweb/desktop:1.16.0 - tooltip: Sample container from https://hub.docker.com/r/kasmweb/desktop - image_port: - label: KasmVNC Port - type: number - default: 6901 - tooltip: Define the port on which the KasmVNC runs inside the container. Default is 6901. 
- password: - label: Password - type: password - tooltip: Type in a password for user kasm_user diff --git a/workflow/yamls/k8s/mlflow/general.md b/workflow/yamls/k8s/mlflow/general.md deleted file mode 100644 index 10db31dd2..000000000 --- a/workflow/yamls/k8s/mlflow/general.md +++ /dev/null @@ -1,10 +0,0 @@ -## MLFlow on Kubernetes -This workflow launches MLFlow on a Kubernetes cluster using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `ubuntu/mlflow:2.1.1_1.0-22.04`. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. - diff --git a/workflow/yamls/k8s/mlflow/general.yaml b/workflow/yamls/k8s/mlflow/general.yaml deleted file mode 100644 index a1e086fff..000000000 --- a/workflow/yamls/k8s/mlflow/general.yaml +++ /dev/null @@ -1,461 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: true - -jobs: - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep 
'(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass 
is available" - runtimeClassName="runtimeClassName: nvidia" - fi - if [[ "${{ inputs.service_k8s.use_token_auth }}" == "true" ]]; then - token="$(openssl rand -hex 16)" - fi - echo "token=${token}" | tee -a $OUTPUTS | tee -a OUTPUTS - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - command: ["mlflow", "ui"] - args: - - "-h" - - "0.0.0.0" - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes 
PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - k8s: - type: group - label: Kubernetes Settings - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - items: - image: - label: MLFlow Image - type: string - default: ubuntu/mlflow:2.1.1_1.0-22.04 - image_port: - label: MLFlow Port - type: number - default: 5000 - tooltip: Define the port on which the MLFlow server runs inside the container. 
diff --git a/workflow/yamls/k8s/ollama-openwebui/general.md b/workflow/yamls/k8s/ollama-openwebui/general.md deleted file mode 100644 index 53e0628ce..000000000 --- a/workflow/yamls/k8s/ollama-openwebui/general.md +++ /dev/null @@ -1,11 +0,0 @@ -## Ollama through OpenWebUI on Kubernetes -- Automates deployment of Ollama (AI model server) and OpenWebUI (web interface) on a Kubernetes cluster -- Supports GPU and dynamic resource allocation -- Exposes OpenWebUI via a service for external access -- Pulls Ollama models -- Streams logs for both services - - - - - diff --git a/workflow/yamls/k8s/ollama-openwebui/general.yaml b/workflow/yamls/k8s/ollama-openwebui/general.yaml deleted file mode 100644 index b52506d56..000000000 --- a/workflow/yamls/k8s/ollama-openwebui/general.yaml +++ /dev/null @@ -1,576 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: true - - -jobs: - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? 
-ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - run: | - if [[ "${{ inputs.k8s.ollama_resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.ollama_resources.limits.gpu_resource_key }}: ${{ inputs.k8s.ollama_resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.ollama_resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.ollama_resources.limits.select_gpu }}: ${{ inputs.k8s.ollama_resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if kubectl get 
runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - --- - # Deployment for ollama - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.ollama_resources.requests.memory }}" - cpu: "${{ inputs.k8s.ollama_resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.ollama_resources.limits.memory }}" - cpu: "${{ inputs.k8s.ollama_resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.ollama_k8s.image }} - ports: - - containerPort: ${{ inputs.ollama_k8s.image_port }} - env: - - name: NVIDIA_VISIBLE_DEVICES - value: "all" - - name: NVIDIA_DRIVER_CAPABILITIES - value: "compute,utility" - resources: - requests: - memory: "${{ inputs.k8s.ollama_resources.requests.memory }}" - cpu: "${{ inputs.k8s.ollama_resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.ollama_resources.limits.memory }}" - cpu: "${{ inputs.k8s.ollama_resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - 
persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - --- - # Service for ollama - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.ollama_k8s.image_port }} - targetPort: ${{ inputs.ollama_k8s.image_port }} - - --- - # Deployment for openwebui - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui - spec: - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui - image: ${{ inputs.openwebui_k8s.image }} - ports: - - containerPort: ${{ inputs.openwebui_k8s.image_port }} - env: - - name: OLLAMA_BASE_URL - value: "http://${{ needs.prepare_k8s_deployment.outputs.app_name }}.${{ inputs.k8s.namespace }}.svc.cluster.local.:${{ inputs.ollama_k8s.image_port }}" - - name: WEBUI_AUTH - value: "False" - resources: - requests: - memory: "${{ inputs.k8s.openwebui_resources.requests.memory }}" - cpu: "${{ inputs.k8s.openwebui_resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.openwebui_resources.limits.memory }}" - cpu: "${{ inputs.k8s.openwebui_resources.limits.cpu }}" - - --- - # Service for openwebui - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-openwebui - ports: - - protocol: TCP - port: ${{ inputs.openwebui_k8s.image_port }} - targetPort: ${{ 
inputs.openwebui_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." 
- kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Pull Ollama Models - early-cancel: any-job-failed - env: - pod_name: ${{ needs.apply_k8s_deployment.outputs.pod }} - namespace: ${{ inputs.k8s.namespace }} - run: | - set -x - kubectl -n $namespace exec $pod_name -- /bin/sh -c "ollama pull llama3 && ollama pull mistral && ollama pull phi3" & - - name: Stream Ollama Logs - run: | - kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} & - ollama_stream_pid=$? 
- echo ${ollama_stream_pid} > ollama_stream.pid - cleanup: kill $(cat ollama_stream.pid) - - name: Stream OpenWebUI Logs - early-cancel: any-job-failed - run: | - kubectl logs -f --selector=app=${{ needs.apply_k8s_deployment.outputs.app_name }}-openwebui -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.openwebui_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - k8s: - type: group - label: Kubernetes Settings - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - volumes: - type: group - label: Ollama Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /root/.ollama - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 20Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - ollama_resources: - type: group - label: Ollama Resources - collapsed: true - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - default: nvidia.com/gpu - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.ollama_resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.ollama_resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - openwebui_resources: - type: group - label: OpenWebUI Resources - collapsed: true - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - ollama_k8s: - type: group - label: Ollama Settings - collapsed: true - items: - image: - label: Ollama Image - type: string - default: ollama/ollama:latest - image_port: - label: Ollama Port - type: number - default: 11434 - openwebui_k8s: - type: group - label: OpenWebUI Settings - collapsed: true - items: - image: - label: OpenWebUI Image - type: string - default: ghcr.io/open-webui/open-webui:main - image_port: - label: OpenWebUI Port - type: number - default: 8080 diff --git a/workflow/yamls/k8s/vscode/general.md b/workflow/yamls/k8s/vscode/general.md deleted file mode 100644 index 630a2ed4e..000000000 --- a/workflow/yamls/k8s/vscode/general.md +++ /dev/null @@ -1,10 +0,0 @@ -## Code Server on Kubernetes -This workflow launches Code Server on a Kubernetes cluster using a user-specified image and resource settings. - -### Quick Start -- **Select a Kubernetes Cluster:** Choose your target K8s cluster. -- **Set Namespace:** Specify a namespace (e.g., default). -- **Choose an Image:** Default is `codercom/code-server:latest` from [this](https://hub.docker.com/r/codercom/code-server) DockerHub repository. -- **Configure Resources:** Set CPU, memory, and optional GPU requests/limits. -- **Run the Workflow:** Deploy Code Server and access it via a web interface. 
- diff --git a/workflow/yamls/k8s/vscode/general.yaml b/workflow/yamls/k8s/vscode/general.yaml deleted file mode 100644 index 60d5d34e0..000000000 --- a/workflow/yamls/k8s/vscode/general.yaml +++ /dev/null @@ -1,478 +0,0 @@ -permissions: - - '*' -sessions: - session: - redirect: true - useCustomDomain: true - - -jobs: - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." 
- exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: PASSWORD - value: ${{ inputs.service_k8s.password }} - - name: GITHUB_TOKEN - value: ${{ inputs.service_k8s.github_token }} - args: - - --auth - - password - - --bind-addr - - 0.0.0.0:8080 - - --disable-telemetry - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ 
inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - early-cancel: any-job-failed - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - k8s: - type: group - label: Kubernetes Settings - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - items: - image: - label: Code Server Image - type: string - default: codercom/code-server:latest - tooltip: Sample container from https://hub.docker.com/r/codercom/code-server - image_port: - label: Code Server Port - type: number - default: 8080 - tooltip: Define the port on which the Code Server runs inside the container. Default is 8080. 
- password: - label: Password - type: password - tooltip: Type in a password - github_token: - label: GitHub Authentication Token - type: password - optional: true diff --git a/workflow/yamls/kasmvnc-proxy/general.yaml b/workflow/yamls/kasmvnc-proxy/general.yaml deleted file mode 100644 index b115674c9..000000000 --- a/workflow/yamls/kasmvnc-proxy/general.yaml +++ /dev/null @@ -1,296 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: true - useCustomDomain: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="?resize=remote&autoconnect=true&show_dot=true&path=websockify&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - gclusterv2 - - pclusterv2 - - azclusterv2 - - aws-slurm - - google-slurm - - azure-slurm - tooltip: Resource to host the service. Only supported in cloud clusters. 
- nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. 
- collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - name: - type: string - label: Select remote display protocol - hidden: true - default: kasmvnc-proxy - set_password: - label: Set Password - type: boolean - default: false - tooltip: | - Select 'Yes' to enable password authentication for KasmVNC, requiring users to enter a password to access the session. - Select 'No' to disable password authentication, allowing users to access the session without a password. - password: - label: Password - type: password - hidden: ${{ inputs.service.set_password == false }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: The password applies to all sessions on the same target. Changing it affects active sessions, requiring the new password. diff --git a/workflow/yamls/kasmvnc-proxy/general_k8s.yaml b/workflow/yamls/kasmvnc-proxy/general_k8s.yaml deleted file mode 100644 index f5c74c9df..000000000 --- a/workflow/yamls/kasmvnc-proxy/general_k8s.yaml +++ /dev/null @@ -1,790 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: ${{ inputs.targetType == 'kubernetes-cluster' }} - redirect: true - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: 
./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' 
}} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" 
>&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="?resize=remote&autoconnect=true&show_dot=true&path=websockify&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? 
-ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ 
inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! [ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: VNC_PW - value: ${{ inputs.service_k8s.password }} - securityContext: - capabilities: - add: ["NET_ADMIN"] # Might be required depending on kasmvnc container usage - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - 
${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? 
- kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." 
- exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} - -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - type: group - label: Service Host - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - resource: - type: compute-clusters - optional: ${{ inputs.targetType != 'compute-cluster' }} - label: Service host - include-workspace: false - tooltip: Resource to host the service. Only supported in cloud clusters. - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service Settings - collapsed: true - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - name: - type: string - label: Select remote display protocol - hidden: true - default: kasmvnc-proxy - set_password: - label: Set Password - type: boolean - default: false - tooltip: | - Select 'Yes' to enable password authentication for KasmVNC, requiring users to enter a password to access the session. - Select 'No' to disable password authentication, allowing users to access the session without a password. - password: - label: Password - type: password - hidden: ${{ inputs.service.set_password == false }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: The password applies to all sessions on the same target. Changing it affects active sessions, requiring the new password. 
- k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. - items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. 
- pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). - limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. 
- options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: KasmVNC Image - type: string - default: kasmweb/desktop:1.16.0 - tooltip: Sample container from https://hub.docker.com/r/kasmweb/desktop - image_port: - label: KasmVNC Port - type: number - default: 6901 - tooltip: Define the port on which the KasmVNC runs inside the container. Default is 6901. 
- password: - label: Password - type: password - tooltip: Type in a password for user kasm_user - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} diff --git a/workflow/yamls/kasmvnc/general.yaml b/workflow/yamls/kasmvnc/general.yaml deleted file mode 100644 index c8f476463..000000000 --- a/workflow/yamls/kasmvnc/general.yaml +++ /dev/null @@ -1,289 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: true - useCustomDomain: true - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - 
- compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - gclusterv2 - - pclusterv2 - - azclusterv2 - - aws-slurm - - google-slurm - - azure-slurm - tooltip: Resource to host the service. Only supported in cloud clusters. - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - name: - type: string - label: Select remote display protocol - hidden: true - default: kasmvnc - set_password: - label: Set Password - type: boolean - default: false - tooltip: | - Select 'Yes' to enable password authentication for KasmVNC, requiring users to enter a password to access the session. - Select 'No' to disable password authentication, allowing users to access the session without a password. - password: - label: Password - type: password - hidden: ${{ inputs.service.set_password == false }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: The password applies to all sessions on the same target. Changing it affects active sessions, requiring the new password. 
diff --git a/workflow/yamls/kasmvnc/general_k8s.yaml b/workflow/yamls/kasmvnc/general_k8s.yaml deleted file mode 100644 index d27f1fd5f..000000000 --- a/workflow/yamls/kasmvnc/general_k8s.yaml +++ /dev/null @@ -1,789 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: true - useCustomDomain: true - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == 
"CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ 
inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. 
You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: VNC_PW - value: ${{ inputs.service_k8s.password }} - securityContext: - capabilities: - add: ["NET_ADMIN"] # Might be required depending on kasmvnc container usage - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: 
storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} - -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: kubernetes-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - type: group - label: Service Host - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - resource: - type: compute-clusters - optional: ${{ inputs.targetType != 
'compute-cluster' }} - label: Service host - include-workspace: false - provider: - - gclusterv2 - - pclusterv2 - - azclusterv2 - - aws-slurm - - google-slurm - - azure-slurm - tooltip: Resource to host the service. Only supported in cloud clusters. - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. 
- collapsed: false - service: - type: group - label: Service - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - name: - type: string - label: Select remote display protocol - hidden: true - default: kasmvnc - set_password: - label: Set Password - type: boolean - default: false - tooltip: | - Select 'Yes' to enable password authentication for KasmVNC, requiring users to enter a password to access the session. - Select 'No' to disable password authentication, allowing users to access the session without a password. - password: - label: Password - type: password - hidden: ${{ inputs.service.set_password == false }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: The password applies to all sessions on the same target. Changing it affects active sessions, requiring the new password. - - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: KasmVNC Image - type: string - default: kasmweb/desktop:1.16.0 - tooltip: Sample container from https://hub.docker.com/r/kasmweb/desktop - image_port: - label: KasmVNC Port - type: number - default: 6901 - tooltip: Define the port on which the KasmVNC runs inside the container. Default is 6901. 
- password: - label: Password - type: password - tooltip: Type in a password for user kasm_user - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} diff --git a/workflow/yamls/marimo-host/general.yaml b/workflow/yamls/marimo-host/general.yaml deleted file mode 100644 index f1ea79de4..000000000 --- a/workflow/yamls/marimo-host/general.yaml +++ /dev/null @@ -1,334 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Marimo Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Marimo Lab Settings - items: - name: - type: string - hidden: true - default: marimo-host - mode: - label: Notebook Mode - tooltip: Choose whether to edit or run the Marimo notebook. - type: dropdown - default: edit - options: - - value: edit - label: Edit - - value: run - label: Run - script: - label: Marimo Script Path - tooltip: "Path to the Marimo notebook (.py) file to open or run. If left empty, Marimo will launch the default tutorial notebook: marimo tutorial intro" - type: string - optional: ${{ inputs.service.mode == edit }} - conda_install: - label: Install miniconda environment if not there? - type: boolean - default: true - tooltip: Select Yes to install Marimo in Miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3-marimo - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. 
- conda_env: - label: Conda environment - type: string - default: base - hidden: '${{ inputs.service.conda_install == false }}' - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. The base environment enables changing kernel to other environments! - load_env: - label: Command to load Marimo to the PATH - type: string - default: source __HOME__/pw/.miniconda3-marimo/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - install_instructions: - label: Select Marimo Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - default: latest - options: - - value: latest - label: Latest versions of Marimo and Miniconda - - value: nov2025 - label: November 2025 - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions != yaml || inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file diff --git a/workflow/yamls/marimo-host/hsp.yaml b/workflow/yamls/marimo-host/hsp.yaml deleted file mode 100644 index fb113dfe7..000000000 --- a/workflow/yamls/marimo-host/hsp.yaml +++ /dev/null @@ -1,376 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: 
./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - 
if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - 
tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! 
Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Marimo Lab Settings - items: - name: - type: string - hidden: true - default: marimo-host - mode: - label: Notebook Mode - tooltip: Choose whether to edit or run the Marimo notebook. - type: dropdown - default: edit - options: - - value: edit - label: Edit - - value: run - label: Run - script: - label: Marimo Script Path - tooltip: "Path to the Marimo notebook (.py) file to open or run. If left empty, Marimo will launch the default tutorial notebook: marimo tutorial intro" - type: string - optional: ${{ inputs.service.mode == edit }} - conda_install: - label: Install miniconda environment if not there? - type: boolean - default: true - tooltip: Select Yes to install Marimo in Miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3-marimo - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. - conda_env: - label: Conda environment - type: string - default: base - hidden: '${{ inputs.service.conda_install == false }}' - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. 
The base environment enables changing kernel to other environments! - load_env: - label: Command to load Marimo to the PATH - type: string - default: source __HOME__/pw/.miniconda3-marimo/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - install_instructions: - label: Select Marimo Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - default: latest - options: - - value: latest - label: Latest versions of Marimo and Miniconda - - value: nov2025 - label: November 2025 - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions != yaml || inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file diff --git a/workflow/yamls/marimo-host/noaa.yaml b/workflow/yamls/marimo-host/noaa.yaml deleted file mode 100644 index 3b25722ee..000000000 --- a/workflow/yamls/marimo-host/noaa.yaml +++ /dev/null @@ -1,377 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: 
Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - 
target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - 
tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! 
Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Marimo Lab Settings - items: - name: - type: string - hidden: true - default: marimo-host - mode: - label: Notebook Mode - tooltip: Choose whether to edit or run the Marimo notebook. - type: dropdown - default: edit - options: - - value: edit - label: Edit - - value: run - label: Run - script: - label: Marimo Script Path - tooltip: "Path to the Marimo notebook (.py) file to open or run. If left empty, Marimo will launch the default tutorial notebook: marimo tutorial intro" - type: string - optional: ${{ inputs.service.mode == edit }} - conda_install: - label: Install miniconda environment if not there? - type: boolean - default: true - tooltip: Select Yes to install Marimo in Miniconda environment and No to load an existing python environment - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Software dependencies are installed in this directory. Ensure the directory has sufficient space! - conda_install_dir: - label: Name of the Conda Installation Directory - type: string - default: .miniconda3-marimo - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - tooltip: Ensure the directory has sufficient space for Conda and its packages. - conda_env: - label: Conda environment - type: string - default: base - hidden: '${{ inputs.service.conda_install == false }}' - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Environment to active. 
The base environment enables changing kernel to other environments! - load_env: - label: Command to load Marimo to the PATH - type: string - default: source __HOME__/pw/.miniconda3-marimo/etc/profile.d/conda.sh; conda activate base - hidden: ${{ inputs.service.conda_install == true }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Use a bash command - install_instructions: - label: Select Marimo Lab Installation - type: dropdown - hidden: ${{ inputs.service.conda_install == false }} - ignore: ${{ .hidden }} - default: latest - options: - - value: latest - label: Latest versions of Marimo and Miniconda - - value: nov2025 - label: November 2025 - yaml: - label: Paste Conda Environment Defition YAML - type: editor - hidden: ${{ inputs.service.install_instructions != yaml || inputs.service.conda_install == false }} - optional: ${{ .hidden }} - ignore: ${{ .hidden }} - tooltip: Copy and paste a custom Conda environment definition YAML file diff --git a/workflow/yamls/matlab-docker/general.yaml b/workflow/yamls/matlab-docker/general.yaml deleted file mode 100644 index 46345c91f..000000000 --- a/workflow/yamls/matlab-docker/general.yaml +++ /dev/null @@ -1,290 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: 
Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - 
target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - gclusterv2 - - pclusterv2 - - azclusterv2 - - aws-slurm - - google-slurm - - azure-slurm - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Matlab Docker Settings - items: - name: - type: string - hidden: true - default: matlab-docker - docker_repo: - label: Docker repository - type: string - default: mathworks/matlab:r2022a - tooltip: Docker repository to start with docker run - mount_directories: - label: Docker mount volumnes - type: string - optional: true - tooltip: 'Type in the mount volume options for the docker command. E.g.: -v /lustre:/lustre -v /contrib:/contrib' - use_gpus: - label: Use GPUs? 
- type: boolean - default: false - tooltip: Select Yes to run a CUDA application inside a container diff --git a/workflow/yamls/metabase/general_k8s.yaml b/workflow/yamls/metabase/general_k8s.yaml deleted file mode 100644 index b6604503e..000000000 --- a/workflow/yamls/metabase/general_k8s.yaml +++ /dev/null @@ -1,750 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: false - prompt-for-name: - default: 'metabase' - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 
'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype 
|| 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. 
You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: 80 - args: - - --auth - - password - - --bind-addr - - 0.0.0.0:8080 - - --disable-telemetry - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 
- kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: 80 - targetPort: 80 - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '80' - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - hidden: ${{ inputs.targetType != 'compute-cluster' }} - type: group - label: Compute Settings - items: - resource: - type: compute-clusters - optional: ${{ inputs.targetType != 'compute-cluster' }} - label: Service host 
- include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. 
- collapsed: false - service: - type: group - label: Metabase Settings - collapsed: false - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - name: - type: string - hidden: true - default: metabase - image: - label: Metabase Image - type: string - default: metabase/metabase - tooltip: Sample container from https://hub.docker.com/r/metabase/metabase - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. - items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. 
- pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). - limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - default: None - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. 
- options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: Metabase Image - type: string - default: metabase/metabase - tooltip: Sample container from https://hub.docker.com/r/metabase/metabase diff --git a/workflow/yamls/mlflow/general.yaml b/workflow/yamls/mlflow/general.yaml deleted file mode 100644 index 285099352..000000000 --- a/workflow/yamls/mlflow/general.yaml +++ /dev/null @@ -1,284 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - 
- name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - 
env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: MLflow - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - hidden: true - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: MLFlow Settings - items: - name: - type: string - hidden: true - default: mlflow - install_mlflow: - label: Install MLflow? - type: boolean - default: true - tooltip: Select Yes to install MLflow and No to load MLflow into the environment - mlflow_install_cmd: - label: Command to Install MLflow - type: string - default: pip3 install mlflow - hidden: ${{ inputs.service.install_mlflow == false }} - ignore: ${{ .hidden }} - mlflow_load_cmd: - label: Command to Load MLflow - type: string - hidden: ${{ inputs.service.install_mlflow == true }} - ignore: ${{ .hidden }} - port: - label: Port - type: number - min: 1024 - max: 65535 - default: 5000 - tooltip: Port number to listen on - additional_flags: - label: Additional Flags - type: string - optional: true - tooltip: Additional flags to pass to the mlflow server command. 
See https://mlflow.org/docs/latest/cli.html#mlflow-server diff --git a/workflow/yamls/mlflow/general_k8s.yaml b/workflow/yamls/mlflow/general_k8s.yaml deleted file mode 100644 index fe0f30a58..000000000 --- a/workflow/yamls/mlflow/general_k8s.yaml +++ /dev/null @@ -1,777 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: 
./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == 
inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the target.hostname file exists on the remote host and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. 
You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat <<EOF > pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - if [[ "${{ inputs.service_k8s.use_token_auth }}" == 
"true" ]]; then - token="$(openssl rand -hex 16)" - fi - echo "token=${token}" | tee -a $OUTPUTS | tee -a OUTPUTS - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat <<EOF > app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - command: ["mlflow", "ui"] - args: - - "-h" - - "0.0.0.0" - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ 
needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} - -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: kubernetes-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - hidden: ${{ inputs.targetType != 'compute-cluster' }} - type: group - label: MLflow - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - 
optional: ${{ inputs.targetType != 'compute-cluster' }} - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - hidden: true - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: MLFlow Settings - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - name: - type: string - hidden: true - default: mlflow - install_mlflow: - label: Install MLflow? 
- type: boolean - default: true - tooltip: Select Yes to install MLflow and No to load MLflow into the environment - mlflow_install_cmd: - label: Command to Install MLflow - type: string - default: pip3 install mlflow - hidden: ${{ inputs.service.install_mlflow == false }} - ignore: ${{ .hidden }} - mlflow_load_cmd: - label: Command to Load MLflow - type: string - hidden: ${{ inputs.service.install_mlflow == true }} - ignore: ${{ .hidden }} - port: - label: Port - type: number - min: 1024 - max: 65535 - default: 5000 - tooltip: Port number to listen on - additional_flags: - label: Additional Flags - type: string - optional: true - tooltip: Additional flags to pass to the mlflow server command. See https://mlflow.org/docs/latest/cli.html#mlflow-server - - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: MLFlow Image - type: string - default: ubuntu/mlflow:2.1.1_1.0-22.04 - image_port: - label: MLFlow Port - type: number - default: 5000 - tooltip: Define the port on which the MLFlow server runs inside the container. 
diff --git a/workflow/yamls/n8n/general.yaml b/workflow/yamls/n8n/general.yaml deleted file mode 100644 index 88e278a9b..000000000 --- a/workflow/yamls/n8n/general.yaml +++ /dev/null @@ -1,268 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the target.hostname file exists on the remote host and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: n8n Settings - items: - name: - type: string - hidden: true - default: n8n - docker_repo: - label: Docker Repository - type: string - default: docker.io/n8nio/n8n:1.123.4 diff --git a/workflow/yamls/ngencerf/general_v4.yaml b/workflow/yamls/ngencerf/general_v4.yaml new file mode 100644 index 000000000..f9140b298 --- /dev/null +++ b/workflow/yamls/ngencerf/general_v4.yaml @@ -0,0 +1,246 @@ +# yaml-language-server: $schema=https://activate.parallel.works/workflow.schema.json +permissions: + - '*' +sessions: + session: + useTLS: false + redirect: true + prompt-for-name: + default: 'ngencerf' + +jobs: + preprocessing: + ssh: + remoteHost: ${{ inputs.cluster.resource.ip }} + steps: + - name: Checkout + uses: parallelworks/checkout + with: + repo: https://github.com/avidalto/interactive_session.git + branch: alvaro/v2 # NOTE(review): looks like a personal fork and dev branch, while session_runner uses github/parallelworks/interactive_session@main; confirm the intended repo/branch before merging + sparse_checkout: + - ${{ inputs.service.name }} + - name: Create Inputs + run: | + set -x + # Write PW platform environment variables (excludes API key) + env | grep '^PW_' | grep -v 'PW_API_KEY' > inputs.sh + # Encase values in quotes + sed -i 's/=\(.*\)/="\1"/' inputs.sh + # Write form inputs and session-specific variables + cat <<'EOF' >> inputs.sh + basepath=/me/session/${PW_USER}/${{ sessions.session }} + PATH=$HOME/pw:$PATH + service_name="${{ inputs.service.name }}" + service_nginx_sif="${{ 
inputs.service.nginx_sif }}" + service_slurm_app_workers="${{ inputs.service.slurm_app_workers }}" + local_data_dir="${{ inputs.service.local_data_dir }}" + container_data_dir="${{ inputs.service.container_data_dir }}" + nwm_cal_mgr_singularity_container_path="${{ inputs.service.nwm_cal_mgr_sif }}" + nwm_fcst_mgr_singularity_container_path="${{ inputs.service.nwm_fcst_mgr_sif }}" + nwm_verf_singularity_container_path="${{ inputs.service.nwm_verf_sif }}" + service_ngencerf_server_dir="${{ inputs.service.ngencerf_server_dir }}" + service_ngencerf_ui_dir="${{ inputs.service.ngencerf_ui_dir }}" + service_ngencerf_docker_dir="${{ inputs.service.ngencerf_docker_dir }}" + service_build_server="${{ inputs.service.build_server }}" + service_build_ui="${{ inputs.service.build_ui }}" + pw_platform_host="${PW_PLATFORM_HOST}" + EOF + # Remove empty/undefined variables (values are double-quoted above, so match the quotes too) and export all + sed -i '/=\s*$\|="\?undefined"\?\s*$/d' inputs.sh + sed -i '/=""/d' inputs.sh + sed -i 's/^/export /' inputs.sh + + session_runner: + needs: + - preprocessing + ssh: + remoteHost: ${{ inputs.cluster.resource.ip }} + steps: + - uses: github/parallelworks/interactive_session@main + early-cancel: any-job-failed + with: + $yaml: workflow/session_runner/v1.4/general.yaml + session: ${{ sessions.session }} + resource: ${{ inputs.cluster.resource }} + cluster: + scheduler: ${{ inputs.cluster.scheduler }} + slurm: + is_enabled: ${{ inputs.cluster.slurm.is_enabled }} + partition: ${{ inputs.cluster.slurm.partition }} + scheduler_directives: ${{ inputs.cluster.slurm.scheduler_directives }} + time: ${{ inputs.cluster.slurm.time }} + pbs: + is_enabled: ${{ inputs.cluster.pbs.is_enabled }} + scheduler_directives: ${{ inputs.cluster.pbs.scheduler_directives }} + service: + start_service_script: ${PW_PARENT_JOB_DIR}/${{ inputs.service.name }}/start-template-v3.sh + controller_script: ${PW_PARENT_JOB_DIR}/${{ inputs.service.name }}/controller-v3.sh + inputs_sh: ${PW_PARENT_JOB_DIR}/inputs.sh + slug: "" + rundir: 
${PW_PARENT_JOB_DIR} + +'on': + execute: + inputs: + cluster: + type: group + label: Compute Cluster Settings + items: + resource: + type: compute-clusters + label: Service host + include-workspace: false + tooltip: Resource to host the NGENCERF service + scheduler: + type: boolean + default: false + hidden: true + label: Schedule Job? + tooltip: | + Yes → Job is submitted to the SLURM scheduler via sbatch + No → Job runs directly on the controller/login node + slurm: + type: group + label: SLURM Directives + hidden: ${{ inputs.cluster.resource.schedulerType != 'slurm' || inputs.cluster.scheduler == false }} + ignore: ${{ inputs.cluster.resource.schedulerType != 'slurm' || inputs.cluster.scheduler == false }} + items: + is_enabled: + type: boolean + hidden: true + default: true + label: Is SLURM Enabled? + partition: + type: slurm-partitions + label: SLURM partition + optional: true + resource: ${{ inputs.cluster.resource }} + tooltip: Select a SLURM partition from the drop-down menu. + time: + label: Walltime + type: string + default: '08:00:00' + tooltip: '--time= SLURM directive. Set the maximum wall-clock time for the session.' + scheduler_directives: + type: editor + optional: true + tooltip: | + Add extra SLURM directives, one per line, using the #SBATCH prefix. For example: + #SBATCH --exclusive + #SBATCH --nodelist= + pbs: + type: group + label: PBS Directives + hidden: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.cluster.scheduler == false }} + ignore: ${{ inputs.cluster.resource.schedulerType != 'pbs' || inputs.cluster.scheduler == false }} + items: + is_enabled: + type: boolean + hidden: true + default: true + label: Is PBS Enabled? + scheduler_directives: + label: Scheduler Directives + type: editor + tooltip: | + Add extra PBS directives, one per line, using the #PBS prefix. 
For example: + #PBS -l mem=16gb + #PBS -l ngpus=1 + + # service: NGENCERF options; each value is written to inputs.sh by the preprocessing job + service: + type: group + label: NGENCERF Settings + collapsed: true + items: + # Hidden constant: names the repo sub-directory that is sparse-checked-out and that holds the start/controller scripts + name: + type: string + hidden: true + default: ngencerf + + # ── Container paths ────────────────────────────────────────────────── + nginx_sif: + label: NGINX Singularity Container Path + type: string + default: /ngencerf-app/singularity/nginx-unprivileged.sif + hidden: true + tooltip: Absolute path to the nginx-unprivileged Singularity (.sif) container on the cluster. + + nwm_cal_mgr_sif: + label: NWM Calibration Manager Container Path + type: string + default: /ngencerf-app/singularity/nwm-cal-mgr.sif + hidden: true + tooltip: Absolute path to the nwm-cal-mgr Singularity container on the cluster. + + nwm_fcst_mgr_sif: + label: NWM Forecast Manager Container Path + type: string + default: /ngencerf-app/singularity/nwm-fcst-mgr.sif + hidden: true + tooltip: Absolute path to the nwm-fcst-mgr Singularity container on the cluster. + + nwm_verf_sif: + label: NWM Verification Container Path + type: string + default: /ngencerf-app/singularity/nwm-verf.sif + hidden: true + tooltip: Absolute path to the nwm-verf Singularity container on the cluster. + + # ── Data directories ───────────────────────────────────────────────── + local_data_dir: + label: Data Directory (host path) + type: string + default: /ngencerf-app/data/ngen-cal-data/ + hidden: true + tooltip: Absolute path to the shared data directory on the cluster filesystem. + + container_data_dir: + label: Data Directory (container path) + type: string + default: /ngencerf/data/ + hidden: true + tooltip: Path inside the Singularity containers where the data directory is bind-mounted.
+ + # ── Application source directories ─────────────────────────────────── + ngencerf_server_dir: + label: NGENCERF Server Repository Path + type: string + default: /ngencerf-app/ngencerf-server + hidden: true + tooltip: Absolute path to the ngencerf-server repository on the cluster (contains production-pw.yaml). + + ngencerf_ui_dir: + label: NGENCERF UI Repository Path + type: string + default: /ngencerf-app/ngencerf-ui + hidden: true + tooltip: Absolute path to the ngencerf-ui repository on the cluster (contains compose.yaml). + + ngencerf_docker_dir: + label: NGENCERF Docker Repository Path + type: string + default: /ngencerf-app/ngencerf-docker + hidden: true + tooltip: Absolute path to the ngencerf-docker repository on the cluster. + + # ── Build options ──────────────────────────────────────────────────── + # The only inputs in this group that are not hidden + build_server: + label: Build ngencerf-server container? + type: boolean + default: true + tooltip: Set to Yes to rebuild the ngencerf-server Docker image from source before starting. + + build_ui: + label: Build ngencerf-ui container? + type: boolean + default: true + tooltip: Set to Yes to rebuild the ngencerf-ui Docker image from source before starting. + + # ── Runtime options ────────────────────────────────────────────────── + slurm_app_workers: + label: Workers for SLURM wrapper app + type: number + default: 2 + # A worker count below 1 cannot serve requests, so reject it in the form + min: 1 + hidden: true + tooltip: Number of Gunicorn worker processes for the SLURM job submission API (port 5000).
diff --git a/workflow/yamls/ngencerf/start.yaml b/workflow/yamls/ngencerf/start.yaml deleted file mode 100644 index cae9dc53c..000000000 --- a/workflow/yamls/ngencerf/start.yaml +++ /dev/null @@ -1,284 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - prompt-for-name: - default: 'ngencerf' - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 
'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - use_screen: - label: Use screen to run controller sessions - type: boolean - default: true - hidden: true - nwm_cal_mgr_singularity_container_path: - label: Path to the nwm-cal-mgr singularity container - type: string - default: /ngencerf-app/singularity/nwm-cal-mgr.sif - hidden: true - nwm_fcst_mgr_singularity_container_path: - label: Path to the nwm-fcst-mgr singularity container - type: string - default: /ngencerf-app/singularity/nwm-fcst-mgr.sif - hidden: true - nwm_verf_singularity_container_path: - label: Path to the nwm-verf singularity container - type: string - default: /ngencerf-app/singularity/nwm-verf.sif - hidden: true - container_data_dir: - label: Path to the data directory within the container - type: string - default: /ngencerf/data/ - hidden: true - local_data_dir: - label: Path to the data directory on the cluster - type: string - default: /ngencerf-app/data/ngen-cal-data/ - hidden: true - pwrl_host: - type: group - label: ngenCERF Host - collapsed: false - items: - resource: - type: compute-clusters - label: 
Service host - include-workspace: false - tooltip: Resource to host the service - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - hidden: true - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - service: - type: group - label: ngenCERF - items: - name: - type: string - hidden: true - default: ngencerf - build_server: - label: Build ngencerf-server container? - type: boolean - default: true - tooltip: Select Yes to build the ngencerf-server Docker container running docker compose up --build - build_ui: - label: Build ngencerf-ui container? - type: boolean - default: true - tooltip: Select Yes to build the ngencerf-ui Docker container running docker compose up --build - ngencerf_server_dir: - label: Directory with ngencerf-server - type: string - default: /ngencerf-app/ngencerf-server - hidden: true - ngencerf_ui_dir: - label: Directory with ngencerf-ui - type: string - default: /ngencerf-app/ngencerf-ui - hidden: true - ngencerf_docker_dir: - label: Directory with ngencerf-docker - type: string - default: /ngencerf-app/ngencerf-docker - hidden: true - nginx_sif: - label: NGINX Singularity container - type: string - default: /ngencerf-app/singularity/nginx-unprivileged.sif - hidden: true - slurm_app_workers: - label: Workers for SLURM wrapper app - type: number - default: 2 - hidden: true diff --git a/workflow/yamls/nginx-docker/general.yaml b/workflow/yamls/nginx-docker/general.yaml deleted file mode 100644 index bb4acc95f..000000000 --- a/workflow/yamls/nginx-docker/general.yaml +++ /dev/null @@ -1,287 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: 
./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 
'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" 
>&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - echo "slug=" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - gclusterv2 - - pclusterv2 - - azclusterv2 - - aws-slurm - - google-slurm - - azure-slurm - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. 
--mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Nginx Docker Settings - items: - name: - type: string - hidden: true - default: nginx-docker - docker_repo: - label: Docker repository - type: string - default: nginx - tooltip: Nginx Docker repository for the docker command - mount_directories: - label: Docker mount volumnes - type: string - optional: true - tooltip: 'Type in the mount volume options for the docker command. 
E.g.: -v /lustre:/lustre -v /contrib:/contrib' diff --git a/workflow/yamls/ollama-openwebui/emed.yaml b/workflow/yamls/ollama-openwebui/emed.yaml deleted file mode 100644 index 44353beb4..000000000 --- a/workflow/yamls/ollama-openwebui/emed.yaml +++ /dev/null @@ -1,279 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script 
- early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - items: - resource: - type: compute-clusters - label: Service host - provider: - - existing - include-workspace: false - autoselect: true - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_gres_e_gpu_colon_: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e_ != 'gpu' && inputs.pwrl_host._sch__dd_partition_e_ != 'gpu-quick' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e_: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: --cpus-per-task=value slurm directive - _sch__dd_mem_e_: - type: string - label: Minimum total memory required - default: 8GB - tooltip: --mem=value slurm directive - optional: true - _sch__dd_time_e_: - type: string - label: Walltime - default: 01:00:00 - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - optional: true - scheduler_directives: - type: string - label: Scheduler directives - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service Settings - items: - name: - type: string - hidden: true - default: ollama-openwebui - models: - label: Ollama Models - type: string - default: /public/apps/ollama/models - num_parallel: - label: Ollama Num Parallel - type: number - default: 2 - min: 1 - max_loaded_models: - label: Ollama Max Loaded Models - type: number - default: 2 - min: 1 - default_keep_alive: - label: Ollama Default Keep Alive - type: string - default: 120m diff --git a/workflow/yamls/openvscode/emed.yaml b/workflow/yamls/openvscode/emed.yaml deleted file mode 100644 index f0861f48a..000000000 --- a/workflow/yamls/openvscode/emed.yaml +++ /dev/null @@ -1,363 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: 
${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ 
inputs.pwrl_host.resource.ip }} - run: | - set -x - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - echo "slug=?folder=${service_directory}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} 
- optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: VS Code Settings - items: - name: - type: string - hidden: true - default: openvscode - download_url: - label: Download URL - type: string - default: https://github.com/coder/code-server/releases/download/v4.105.1/code-server-4.105.1-linux-amd64.tar.gz - tooltip: Downloads or uses the code-server server version specified in the URL - password: - label: Password for IDE session - type: password - optional: true - tooltip: Enter password or leave blank for no password - directory: - label: Directory to open in VS Code - type: string - default: /gs/gsfs0/users/__USER__/pw/ - tooltip: Directory to open in VS code \ No newline at end of file diff --git a/workflow/yamls/openvscode/general.yaml b/workflow/yamls/openvscode/general.yaml deleted file mode 100644 index 378e8d023..000000000 --- a/workflow/yamls/openvscode/general.yaml +++ /dev/null @@ -1,285 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: 
any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o 
UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - echo "slug=?folder=${service_directory}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. 
--mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: VS Code Settings - items: - name: - type: string - hidden: true - default: openvscode - download_url: - label: Download URL - type: string - default: https://github.com/coder/code-server/releases/download/v4.105.1/code-server-4.105.1-linux-amd64.tar.gz - tooltip: Downloads or uses the code-server server version specified in the URL - password: - label: Password for IDE session - type: password - optional: true - tooltip: Enter password or leave blank for no password - directory: - label: Directory to open in VS Code - type: string - default: __HOME__ - tooltip: Directory to open in VS code diff --git a/workflow/yamls/openvscode/general_k8s.yaml b/workflow/yamls/openvscode/general_k8s.yaml deleted file mode 100644 index a07d2a3bc..000000000 --- a/workflow/yamls/openvscode/general_k8s.yaml +++ /dev/null @@ -1,814 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ 
inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - if ! [ -z ${{ org.JUICE_TOKEN }} ]; then - echo "export JUICE_TOKEN=${{ org.JUICE_TOKEN }}" >> resources/host/inputs.sh - fi - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && 
inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - source resources/host/inputs.sh - echo "slug=?folder=${service_directory}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? 
-ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ 
inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! [ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: PASSWORD - value: ${{ inputs.service_k8s.password }} - - args: - - --auth - - password - - --bind-addr - - 0.0.0.0:8080 - - --disable-telemetry - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - 
volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? 
- kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." 
- exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - hidden: ${{ inputs.targetType != 'compute-cluster' }} - type: group - label: Compute Settings - items: - resource: - type: compute-clusters - optional: ${{ inputs.targetType != 'compute-cluster' }} - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: VS Code Settings - collapsed: true - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - name: - type: string - hidden: true - default: openvscode - download_url: - label: Download URL - type: string - default: https://github.com/coder/code-server/releases/download/v4.105.1/code-server-4.105.1-linux-amd64.tar.gz - tooltip: Downloads or uses the code-server server version specified in the URL - password: - label: Password for IDE session - type: password - optional: true - tooltip: Enter password or leave blank for no password - directory: - label: Directory to open in VS Code - type: string - default: __HOME__ - tooltip: Directory to open in VS code - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ 
inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. - items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. 
- items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). - limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. 
- hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: Code Server Image - type: string - default: codercom/code-server:latest - tooltip: Sample container from https://hub.docker.com/r/codercom/code-server - image_port: - label: Code Server Port - type: number - default: 8080 - tooltip: Define the port on which the Code Server runs inside the container. Default is 8080. - password: - label: Password - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - type: password - tooltip: Type in a password - - juice: - type: group - label: Attached GPU Settings - collapsed: true - hidden: ${{ org.JUICE_TOKEN == "" || inputs.targetType != 'compute-cluster' }} - items: - use_juice: - label: Enable Juice? - type: boolean - default: false - tooltip: Enable Juice to access and share remote GPUs over a network for your workload. - pool_ids: - label: Pool IDs - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Comma separated list of pool ids from which to allocate the session resources, when empty any available pool you have access to is used - vram: - label: VRAM - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Amount of VRAM requested in gibibytes. Can return a session with fewer bytes if there is not enough space on the device. Suffixes can be provided for convenience. E.g. 4 GiB (default "0") - cmd_args: - label: Juice Run Command Arguments - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Optional arguments for the juice run command to customize workload execution, e.g., "--gpu-ids string". 
diff --git a/workflow/yamls/openvscode/hsp.yaml b/workflow/yamls/openvscode/hsp.yaml deleted file mode 100644 index 287bc3719..000000000 --- a/workflow/yamls/openvscode/hsp.yaml +++ /dev/null @@ -1,330 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype 
}} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - echo "slug=?folder=${service_directory}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 
'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: VS Code Settings - items: - name: - type: string - hidden: true - default: openvscode - download_url: - label: Download URL - type: string - default: https://github.com/coder/code-server/releases/download/v4.105.1/code-server-4.105.1-linux-amd64.tar.gz - tooltip: Downloads or uses the code-server server version specified in the URL - password: - label: Password for IDE session - type: password - optional: true - tooltip: Enter password or leave blank for no password - directory: - label: Directory to open in VS Code - type: string - default: __HOME__ - tooltip: Directory to open in VS code - diff --git a/workflow/yamls/openvscode/noaa-v3.yaml b/workflow/yamls/openvscode/noaa-v3.yaml deleted file mode 100644 index 0fac810b0..000000000 --- a/workflow/yamls/openvscode/noaa-v3.yaml +++ /dev/null @@ -1,314 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer 
Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: 
Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - echo "slug=?folder=${service_directory}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != 
inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: VS Code Settings - collapsed: true - items: - name: - type: string - hidden: true - default: openvscode - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - tooltip: Defaults to ~/pw/software. Make sure there is enough disk space available! - download_url: - label: Download URL - type: string - default: https://github.com/coder/code-server/releases/download/v4.105.1/code-server-4.105.1-linux-amd64.tar.gz - tooltip: Downloads or uses the code-server server version specified in the URL - password: - label: Password for IDE session - type: password - optional: true - tooltip: Enter password or leave blank for no password - directory: - label: Directory to open in VS Code - type: string - default: __HOME__ - tooltip: Directory to open in VS code diff --git a/workflow/yamls/openvscode/noaa.yaml b/workflow/yamls/openvscode/noaa.yaml deleted file mode 100644 index 9cf0fcb85..000000000 --- a/workflow/yamls/openvscode/noaa.yaml +++ /dev/null @@ -1,144 +0,0 @@ -jobs: - main: - steps: - - name: Preprocessing - run: ./utils/steps/preprocessing.sh - - name: Validating Target Resource - run: ./utils/steps/input_form_resource_wrapper.sh - - name: Process Inputs - run: ./utils/steps/process_inputs_sh.sh - - name: Controller Preprocessing - run: ./utils/steps/controller_preprocessing.sh - - name: Prepare Service JSON - run: ./utils/steps/prepare_service_json.sh - - name: Initialize Cancel Script - run: ./utils/steps/initialize_cancel_script.sh - - name: Create Session Script - run: ./utils/steps/create_session_script.sh - - name: Launch Job and Wait - run: ./utils/steps/launch_job_and_wait.sh - cleanup: ./kill.sh - - name: Clean and Exit - run: ./utils/steps/clean_and_exit.sh - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the 
service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype 
|| 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: VS Code Settings - collapsed: true - items: - name: - type: string - hidden: true - default: openvscode - parent_install_dir: - label: Parent Install Directory - type: string - default: __HOME__/pw/software - tooltip: Defaults to ~/pw/software. Make sure there is enough disk space available! 
- download_url: - label: Download URL - type: string - default: https://github.com/coder/code-server/releases/download/v4.105.1/code-server-4.105.1-linux-amd64.tar.gz - tooltip: Downloads or uses the code-server server version specified in the URL - password: - label: Password for IDE session - type: password - optional: true - tooltip: Enter password or leave blank for no password - directory: - label: Directory to open in VS Code - type: string - default: __HOME__ - tooltip: Directory to open in VS code diff --git a/workflow/yamls/pgadmin4/general_k8s.yaml b/workflow/yamls/pgadmin4/general_k8s.yaml deleted file mode 100644 index 5838565c8..000000000 --- a/workflow/yamls/pgadmin4/general_k8s.yaml +++ /dev/null @@ -1,778 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - 
name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 
'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. 
You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." - kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: 80 - env: - - name: PGADMIN_DEFAULT_PASSWORD - value: ${{ inputs.service_k8s.password }} - - name: PGADMIN_DEFAULT_EMAIL - value: ${{ inputs.service_k8s.email }} - args: - - --auth - - password - - --bind-addr - - 0.0.0.0:8080 - - --disable-telemetry - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ 
inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: 80 - targetPort: 80 - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '80' - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - hidden: ${{ inputs.targetType != 'compute-cluster' }} - type: group - label: Compute Settings - items: - resource: - type: compute-clusters - optional: ${{ inputs.targetType != 'compute-cluster' }} - label: Service host 
- include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. 
- collapsed: false - service: - type: group - label: Pgadmin4 Settings - collapsed: false - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - name: - type: string - hidden: true - default: pgadmin4 - image: - label: Pgadmin4 Image - type: string - default: dpage/pgadmin4 - tooltip: Sample container from https://hub.docker.com/r/dpage/pgadmin4/ - mount_directories: - label: Docker mount volumes - type: string - optional: true - tooltip: 'Type in the mount volume options for the docker command. E.g.: -v /mnt:/mnt -v /data:/data' - email: - label: Email - optional: ${{ inputs.targetType != 'compute-cluster' }} - type: string - tooltip: Enter email for PGADMIN_DEFAULT_EMAIL variable - password: - label: Password - optional: ${{ inputs.targetType != 'compute-cluster' }} - type: password - tooltip: Enter password for PGADMIN_DEFAULT_PASSWORD variable - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - default: None - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. 
- hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: Pgadmin4 Image - type: string - default: dpage/pgadmin4 - tooltip: Sample container from https://hub.docker.com/r/dpage/pgadmin4/ - email: - label: Email - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - type: string - tooltip: Enter email for PGADMIN_DEFAULT_EMAIL variable - password: - label: Password - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - type: password - tooltip: Enter password for PGADMIN_DEFAULT_PASSWORD variable diff --git a/workflow/yamls/postgres/general_k8s.yaml b/workflow/yamls/postgres/general_k8s.yaml deleted file mode 100644 index 5954633ff..000000000 --- a/workflow/yamls/postgres/general_k8s.yaml +++ /dev/null @@ -1,769 +0,0 @@ -# yaml-language-server: $schema=https://activate.parallel.works/workflow.schema.json ---- -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: false - useCustomDomain: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: 
./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 
'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" 
>&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - # - name: Select Local Port - # early-cancel: any-job-failed - # if: ${{ inputs.targetType == 'compute-cluster' }} - # run: | - # local_port=$(pw agent open-port) - # echo "local_port=${local_port}" | tee -a $OUTPUTS - # - name: Expose Port - # early-cancel: any-job-failed - # if: ${{ inputs.targetType == 'compute-cluster' }} - # uses: parallelworks/update-session - # with: - # remotePort: ${{ needs.create_session.outputs.remote_port }} - # localPort: ${{ needs.create_session.outputs.local_port }} - # remoteHost: ${{ needs.create_session.outputs.target_hostname }} - # target: ${{ inputs.pwrl_host.resource.id }} - # name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New 
}} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." 
- kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - if [[ "${{ inputs.service_k8s.db}" == "undefined" ]]; then - service_db="" - else - service_db="${{ inputs.service_k8s.db }}" - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: 80 - env: - - name: POSTGRES_PASSWORD - value: ${{ inputs.service_k8s.password }} - - name: POSTGRES_USER - value: ${{ inputs.service_k8s.user }} - - name: POSTGRES_DB - value: "${service_db}" - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - 
${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: 80 - targetPort: 80 - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? 
- kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." 
- exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '80' - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - hidden: ${{ inputs.targetType != 'compute-cluster' }} - type: group - label: Compute Settings - items: - resource: - type: compute-clusters - optional: ${{ inputs.targetType != 'compute-cluster' }} - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: postgres Settings - collapsed: false - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - name: - type: string - hidden: true - default: postgres - image: - label: postgres Image - type: string - default: postgres:latest - tooltip: Sample container from https://hub.docker.com/_/postgres - mount_directories: - label: Docker Flags - type: string - default: --network=host - optional: true - tooltip: 'Type in the docker flags and mount volume options for the docker command. 
E.g.: -v /mnt:/mnt -v /data:/data' - user: - label: postgres User - optional: ${{ inputs.targetType != 'compute-cluster' }} - type: string - default: postgres - tooltip: Enter email for POSTGRES_USER variable - password: - label: Password - optional: ${{ inputs.targetType != 'compute-cluster' }} - type: password - tooltip: Enter password for POSTGRES_PASSWORD variable - db: - label: DB - type: string - default: postgres - tooltip: Enter email for POSTGRES_DB variable - optional: true - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. - items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. 
- pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 512Mi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '1' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). - limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 1Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - default: None - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. 
- options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: postgres Image - type: string - default: postgres:latest - tooltip: Sample container from https://hub.docker.com/_/postgres - mount_directories: - label: Docker mount volumes - type: string - optional: true - tooltip: 'Type in the mount volume options for the docker command. 
E.g.: -v /mnt:/mnt -v /data:/data' - user: - label: postgres User - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - type: string - tooltip: Enter email for POSTGRES_USER variable - password: - label: Password - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - type: password - tooltip: Enter password for POSTGRES_PASSWORD variable - db: - label: DB - type: string - tooltip: Enter email for POSTGRES_DB variable - optional: true diff --git a/workflow/yamls/turbovnc/allegro.yaml b/workflow/yamls/turbovnc/allegro.yaml deleted file mode 100644 index 6ebeae928..000000000 --- a/workflow/yamls/turbovnc/allegro.yaml +++ /dev/null @@ -1,297 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - load_env: - label: Command to load Allegro to the PATH - type: string - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. 
- bin: - label: Launch command - type: string - default: flatpak run --command=bottles-cli com.usebottles.bottles run -b Cadence -p allegro_free_viewer - tooltip: Command to start Allegro diff --git a/workflow/yamls/turbovnc/armforge-noaa-v3.yaml b/workflow/yamls/turbovnc/armforge-noaa-v3.yaml deleted file mode 100644 index 9aeb837fe..000000000 --- a/workflow/yamls/turbovnc/armforge-noaa-v3.yaml +++ /dev/null @@ -1,354 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - 
run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - 
hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - _sch__dd_partition_e__tag_ursa: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name !includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: u1-service - value: u1-service - default: u1-service - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - vnc_type: - label: VNC Server Type - type: string - default: turbovnc - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - load_env: - label: Command to load ARM Forge to the PATH - type: string - default: module load forge - tooltip: 'To load the environment, enter the appropriate command, for example: module load module-name or source path/to/env.sh.' 
- bin: - label: Command to launch ARM Forge - type: string - default: forge - tooltip: Command to start ARM Forge - background: - label: Run service in background - type: boolean - default: true - hidden: true diff --git a/workflow/yamls/turbovnc/armforge-noaa.yaml b/workflow/yamls/turbovnc/armforge-noaa.yaml deleted file mode 100644 index 54faaa61b..000000000 --- a/workflow/yamls/turbovnc/armforge-noaa.yaml +++ /dev/null @@ -1,173 +0,0 @@ -jobs: - main: - steps: - - name: Preprocessing - run: ./utils/steps/preprocessing.sh - - name: Validating Target Resource - run: ./utils/steps/input_form_resource_wrapper.sh - - name: Process Inputs - run: ./utils/steps/process_inputs_sh.sh - - name: Controller Preprocessing - run: ./utils/steps/controller_preprocessing.sh - - name: Prepare Service JSON - run: ./utils/steps/prepare_service_json.sh - - name: Initialize Cancel Script - run: ./utils/steps/initialize_cancel_script.sh - - name: Create Session Script - run: ./utils/steps/create_session_script.sh - - name: Launch Job and Wait - run: ./utils/steps/launch_job_and_wait.sh - cleanup: ./kill.sh - - name: Clean and Exit - run: ./utils/steps/clean_and_exit.sh - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != 
inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - vnc_type: - label: VNC Server Type - type: string - default: turbovnc - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - load_env: - label: Command to load ARM Forge to the PATH - type: string - default: module load forge - tooltip: 'To load the environment, enter the appropriate command, for example: module load module-name or source path/to/env.sh.' 
- bin: - label: Command to launch ARM Forge - type: string - default: forge - tooltip: Command to start ARM Forge - background: - label: Run service in background - type: boolean - default: true - hidden: true diff --git a/workflow/yamls/turbovnc/firefox-emed.yaml b/workflow/yamls/turbovnc/firefox-emed.yaml deleted file mode 100644 index f4cd2a46f..000000000 --- a/workflow/yamls/turbovnc/firefox-emed.yaml +++ /dev/null @@ -1,289 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == 
inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Desktop Host - items: - resource: - type: compute-clusters - label: Service host - provider: - - existing - include-workspace: false - autoselect: true - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_gres_e_gpu_colon_: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e_ != 'gpu' && inputs.pwrl_host._sch__dd_partition_e_ != 'gpu-quick' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e_: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: --cpus-per-task=value slurm directive - _sch__dd_mem_e_: - type: string - label: Minimum total memory required - default: 8GB - tooltip: --mem=value slurm directive - optional: true - _sch__dd_time_e_: - type: string - label: Walltime - default: 01:00:00 - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - optional: true - scheduler_directives: - type: string - label: Scheduler directives - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: turbovnc - use_tls: - label: Use TLS - type: boolean - hidden: true - default: false - bin: - label: Service Binary - type: string - hidden: true - default: firefox diff --git a/workflow/yamls/turbovnc/fsl-emed.yaml b/workflow/yamls/turbovnc/fsl-emed.yaml deleted file mode 100644 index afe0cdb69..000000000 --- a/workflow/yamls/turbovnc/fsl-emed.yaml +++ /dev/null @@ -1,292 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and 
Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - 
while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: FSL Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - existing - autoselect: true - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_gres_e_gpu_colon_: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e_ != 'gpu' && inputs.pwrl_host._sch__dd_partition_e_ != 'gpu-quick' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e_: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: --cpus-per-task=value slurm directive - _sch__dd_mem_e_: - type: string - label: Minimum total memory required - default: 8GB - tooltip: --mem=value slurm directive - optional: true - _sch__dd_time_e_: - type: string - label: Walltime - default: 01:00:00 - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - optional: true - scheduler_directives: - type: string - label: Scheduler directives - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: turbovnc - use_tls: - label: Use TLS - type: boolean - hidden: true - default: false - load_env: - label: Command to load FSL - type: string - default: module load fsl/6.0.5_cpu - bin: - label: Service Binary - type: string - hidden: true - default: fsl diff --git a/workflow/yamls/turbovnc/gtise.yaml b/workflow/yamls/turbovnc/gtise.yaml deleted file mode 100644 index 72e59c65b..000000000 --- a/workflow/yamls/turbovnc/gtise.yaml +++ /dev/null @@ -1,298 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - 
run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - load_env: - label: Command to load GTISE Environment - type: string - default: source /var/opt/gtsuite/load-env.sh - optional: true - tooltip: Ensure you launch the scheduler workflow before! - bin: - label: Launch command - type: string - default: gtise - tooltip: Ensure you launch the scheduler workflow before! 
diff --git a/workflow/yamls/turbovnc/libreoffice-mate.yaml b/workflow/yamls/turbovnc/libreoffice-mate.yaml deleted file mode 100644 index dbc170871..000000000 --- a/workflow/yamls/turbovnc/libreoffice-mate.yaml +++ /dev/null @@ -1,305 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != 
inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - load_env: - label: Command to load or install libreoffice - type: string - default: sudo yum install libreoffice-calc.x86_64 -y - hidden: true - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. - bin: - label: Launch command - type: string - hidden: true - default: libreoffice --calc - tooltip: Command to start LibreOffice - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true diff --git a/workflow/yamls/turbovnc/libreoffice.yaml b/workflow/yamls/turbovnc/libreoffice.yaml deleted file mode 100644 index d765a899c..000000000 --- a/workflow/yamls/turbovnc/libreoffice.yaml +++ /dev/null @@ -1,300 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' 
== inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for 
target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" 
>/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' 
!= inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - load_env: - label: Command to load or install libreoffice - type: string - default: sudo yum install libreoffice-calc.x86_64 -y - hidden: true - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. - bin: - label: Launch command - type: string - hidden: true - default: libreoffice --calc - tooltip: Command to start LibreOffice diff --git a/workflow/yamls/turbovnc/matlab-emed.yaml b/workflow/yamls/turbovnc/matlab-emed.yaml deleted file mode 100644 index 44c733b18..000000000 --- a/workflow/yamls/turbovnc/matlab-emed.yaml +++ /dev/null @@ -1,292 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Matlab Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - existing - autoselect: true - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_gres_e_gpu_colon_: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e_ != 'gpu' && inputs.pwrl_host._sch__dd_partition_e_ != 'gpu-quick' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e_: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: --cpus-per-task=value slurm directive - _sch__dd_mem_e_: - type: string - label: Minimum total memory required - default: 8GB - tooltip: --mem=value slurm directive - optional: true - _sch__dd_time_e_: - type: string - label: Walltime - default: 01:00:00 - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - optional: true - scheduler_directives: - type: string - label: Scheduler directives - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: turbovnc - use_tls: - label: Use TLS - type: boolean - hidden: true - default: false - load_env: - label: Command to load Matlab - type: string - default: module load matlab - bin: - label: Service Binary - type: string - hidden: true - default: matlab -desktop diff --git a/workflow/yamls/turbovnc/matlab-noaa-v3.yaml b/workflow/yamls/turbovnc/matlab-noaa-v3.yaml deleted file mode 100644 index 08671588c..000000000 --- a/workflow/yamls/turbovnc/matlab-noaa-v3.yaml +++ /dev/null @@ -1,354 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == 
inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o 
UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != 
inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - _sch__dd_partition_e__tag_ursa: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name !includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: u1-service - value: u1-service - default: u1-service - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - vnc_type: - label: VNC Server Type - type: string - default: turbovnc - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - load_env: - label: Command to load MATLAB to the PATH - type: string - default: module load matlab - tooltip: 'To load the environment, enter the appropriate command, for example: module load module-name or source path/to/env.sh.' 
- bin: - label: Command to launch MATLAB - type: string - default: matlab -desktop - tooltip: Command to start MATLAB - background: - label: Run service in background - type: boolean - default: true - hidden: true diff --git a/workflow/yamls/turbovnc/matlab-noaa.yaml b/workflow/yamls/turbovnc/matlab-noaa.yaml deleted file mode 100644 index 94a493b47..000000000 --- a/workflow/yamls/turbovnc/matlab-noaa.yaml +++ /dev/null @@ -1,173 +0,0 @@ -jobs: - main: - steps: - - name: Preprocessing - run: ./utils/steps/preprocessing.sh - - name: Validating Target Resource - run: ./utils/steps/input_form_resource_wrapper.sh - - name: Process Inputs - run: ./utils/steps/process_inputs_sh.sh - - name: Controller Preprocessing - run: ./utils/steps/controller_preprocessing.sh - - name: Prepare Service JSON - run: ./utils/steps/prepare_service_json.sh - - name: Initialize Cancel Script - run: ./utils/steps/initialize_cancel_script.sh - - name: Create Session Script - run: ./utils/steps/create_session_script.sh - - name: Launch Job and Wait - run: ./utils/steps/launch_job_and_wait.sh - cleanup: ./kill.sh - - name: Clean and Exit - run: ./utils/steps/clean_and_exit.sh - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != 
inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - vnc_type: - label: VNC Server Type - type: string - default: turbovnc - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - load_env: - label: Command to load MATLAB to the PATH - type: string - default: module load matlab - tooltip: 'To load the environment, enter the appropriate command, for example: module load module-name or source path/to/env.sh.' 
- bin: - label: Command to launch MATLAB - type: string - default: matlab -desktop - tooltip: Command to start MATLAB - background: - label: Run service in background - type: boolean - default: true - hidden: true diff --git a/workflow/yamls/turbovnc/ncdiff-mate.yaml b/workflow/yamls/turbovnc/ncdiff-mate.yaml deleted file mode 100644 index dec522ef9..000000000 --- a/workflow/yamls/turbovnc/ncdiff-mate.yaml +++ /dev/null @@ -1,302 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == 
inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - load_env: - label: Command to load or install ncdiff - type: string - optional: true - value: 'sudo dnf install nco -y' - bin: - label: Launch command - type: string - default: gnome-terminal -- bash -c \"ncdiff --help; exec bash\" - tooltip: Command to start ncdiff - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true diff --git a/workflow/yamls/turbovnc/ncdiff.yaml b/workflow/yamls/turbovnc/ncdiff.yaml deleted file mode 100644 index de77c2ed4..000000000 --- a/workflow/yamls/turbovnc/ncdiff.yaml +++ /dev/null @@ -1,297 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == 
"CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - load_env: - label: Command to load or install ncdiff - type: string - optional: true - value: 'sudo dnf install nco -y' - bin: - label: Launch command - type: string - default: gnome-terminal -- bash -c \"ncdiff --help; exec bash\" - tooltip: Command to start ncdiff diff --git a/workflow/yamls/turbovnc/novnc-emed.yaml b/workflow/yamls/turbovnc/novnc-emed.yaml deleted file mode 100644 index 0fbcbfed6..000000000 --- a/workflow/yamls/turbovnc/novnc-emed.yaml +++ /dev/null @@ -1,284 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: 
any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Desktop Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - existing - autoselect: true - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_gres_e_gpu_colon_: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e_ != 'gpu' && inputs.pwrl_host._sch__dd_partition_e_ != 'gpu-quick' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e_: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: --cpus-per-task=value slurm directive - _sch__dd_mem_e_: - type: string - label: Minimum total memory required - default: 8GB - tooltip: --mem=value slurm directive - optional: true - _sch__dd_time_e_: - type: string - label: Walltime - default: 01:00:00 - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - optional: true - scheduler_directives: - type: string - label: Scheduler directives - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: turbovnc - use_tls: - label: Use TLS - type: boolean - hidden: true - default: false diff --git a/workflow/yamls/turbovnc/novnc-mate.yaml b/workflow/yamls/turbovnc/novnc-mate.yaml deleted file mode 100644 index 2299b3b65..000000000 --- a/workflow/yamls/turbovnc/novnc-mate.yaml +++ /dev/null @@ -1,294 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == 
inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - use_tls: - label: Use TLS - type: boolean - hidden: true - default: ${{ inputs.service.name == nicedcv && true || false }} - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true diff --git a/workflow/yamls/turbovnc/novnc-noaa-v3.yaml b/workflow/yamls/turbovnc/novnc-noaa-v3.yaml deleted file mode 100644 index 1beb10085..000000000 --- a/workflow/yamls/turbovnc/novnc-noaa-v3.yaml +++ /dev/null @@ -1,326 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == 
inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 
'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - _sch__dd_partition_e__tag_ursa: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name !includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: u1-service - value: u1-service - default: u1-service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: '01:00:00' - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - vnc_type: - label: VNC Server Type - type: string - default: turbovnc - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} diff --git a/workflow/yamls/turbovnc/novnc-noaa.yaml b/workflow/yamls/turbovnc/novnc-noaa.yaml deleted file mode 100644 index ca11e5b8e..000000000 --- a/workflow/yamls/turbovnc/novnc-noaa.yaml +++ /dev/null @@ -1,158 +0,0 @@ -jobs: - main: - steps: - - name: Preprocessing - run: ./utils/steps/preprocessing.sh - - name: Validating Target Resource - run: ./utils/steps/input_form_resource_wrapper.sh - - name: Process Inputs - run: ./utils/steps/process_inputs_sh.sh - - name: Controller Preprocessing - run: ./utils/steps/controller_preprocessing.sh - - name: Prepare Service JSON - run: ./utils/steps/prepare_service_json.sh - - name: Initialize Cancel Script - run: ./utils/steps/initialize_cancel_script.sh - - name: Create Session Script - run: ./utils/steps/create_session_script.sh - - name: Launch Job and Wait - run: ./utils/steps/launch_job_and_wait.sh - cleanup: ./kill.sh - - name: Clean and Exit - run: ./utils/steps/clean_and_exit.sh - -'on': - execute: - inputs: - pwrl_host: - type: group - label: 
Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - vnc_type: - label: VNC Server Type - type: string - default: turbovnc - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} diff --git a/workflow/yamls/turbovnc/novnc.yaml b/workflow/yamls/turbovnc/novnc.yaml deleted file mode 100644 index 1750306c0..000000000 --- a/workflow/yamls/turbovnc/novnc.yaml +++ /dev/null @@ -1,280 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize 
Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} 
hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: turbovnc diff --git a/workflow/yamls/turbovnc/octave-mate.yaml b/workflow/yamls/turbovnc/octave-mate.yaml deleted file mode 100644 index d8fda1f64..000000000 --- a/workflow/yamls/turbovnc/octave-mate.yaml +++ /dev/null @@ -1,303 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export 
basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - 
early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - load_env: - label: Command to load RStudio to the PATH - type: string - default: sudo yum install -y octave; sudo yum install -y qt5-qttools-libs-help - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. 
- bin: - label: Launch command - type: string - default: octave --force-gui - tooltip: Command to start RStudio - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true diff --git a/workflow/yamls/turbovnc/octave.yaml b/workflow/yamls/turbovnc/octave.yaml deleted file mode 100644 index e3ebf21de..000000000 --- a/workflow/yamls/turbovnc/octave.yaml +++ /dev/null @@ -1,298 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - load_env: - label: Command to load RStudio to the PATH - type: string - default: sudo yum install -y octave; sudo yum install -y qt5-qttools-libs-help - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. - bin: - label: Launch command - type: string - default: octave --force-gui - tooltip: Command to start RStudio diff --git a/workflow/yamls/turbovnc/panoply-mate.yaml b/workflow/yamls/turbovnc/panoply-mate.yaml deleted file mode 100644 index bcedd0897..000000000 --- a/workflow/yamls/turbovnc/panoply-mate.yaml +++ /dev/null @@ -1,303 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - load_env: - label: Command to load or install Panoply - type: string - optional: true - default: /contrib/software/Panoply/set_panoply.sh - tooltip: Make sure you have access to the files! 
- bin: - label: Command to launch Panoply - type: string - default: /contrib/software/Panoply/PanoplyJ/panoply.sh - tooltip: Make sure you have access to the files! - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true diff --git a/workflow/yamls/turbovnc/panoply.yaml b/workflow/yamls/turbovnc/panoply.yaml deleted file mode 100644 index 6bf780319..000000000 --- a/workflow/yamls/turbovnc/panoply.yaml +++ /dev/null @@ -1,286 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? 
- } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit 
the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - load_env: - label: Command to load or install Panoply - type: string - optional: true - default: /contrib/software/Panoply/set_panoply.sh - tooltip: Make sure you have access to the files! 
- bin: - label: Command to launch Panoply - type: string - default: /contrib/software/Panoply/PanoplyJ/panoply.sh - tooltip: Make sure you have access to the files! diff --git a/workflow/yamls/turbovnc/qgis-mate.yaml b/workflow/yamls/turbovnc/qgis-mate.yaml deleted file mode 100644 index 29e2243e2..000000000 --- a/workflow/yamls/turbovnc/qgis-mate.yaml +++ /dev/null @@ -1,303 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - 
compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - load_env: - label: Command to load QGIS - type: string - default: source /contrib/software/miniconda/miniconda/etc/profile.d/conda.sh; conda activate qgis_stable - optional: true - tooltip: 'To load the environment, enter the appropriate command, for exampl, module load module-name or source path/to/env.sh.' - bin: - label: Launch command - type: string - default: qgis - tooltip: Command to start QGIS - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true diff --git a/workflow/yamls/turbovnc/qgis.yaml b/workflow/yamls/turbovnc/qgis.yaml deleted file mode 100644 index 76488ad56..000000000 --- a/workflow/yamls/turbovnc/qgis.yaml +++ /dev/null @@ -1,298 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype 
}} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - load_env: - label: Command to load QGIS - type: string - default: source /contrib/software/miniconda/miniconda/etc/profile.d/conda.sh; conda activate qgis_stable - optional: true - tooltip: 'To load the environment, enter the appropriate command, for exampl, module load module-name or source path/to/env.sh.' - bin: - label: Launch command - type: string - default: qgis - tooltip: Command to start QGIS diff --git a/workflow/yamls/turbovnc/recursive_diff-mate.yaml b/workflow/yamls/turbovnc/recursive_diff-mate.yaml deleted file mode 100644 index 9d23115d0..000000000 --- a/workflow/yamls/turbovnc/recursive_diff-mate.yaml +++ /dev/null @@ -1,298 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: "${{ needs.create_session.outputs.remote_port }}" - localPort: "${{ needs.create_session.outputs.local_port }}" - remoteHost: "${{ needs.create_session.outputs.target_hostname }}" - slug: "${{ needs.create_session.outputs.slug }}" - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - bin: - label: Command to launch terminal with ncdiff - type: string - default: "gnome-terminal -- bash -c \"source /contrib/software/miniconda/miniconda/etc/profile.d/conda.sh; conda activate recursive_diff_latest -; ncdiff --help; exec bash\"" - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true \ No newline at end of file diff --git a/workflow/yamls/turbovnc/recursive_diff.yaml b/workflow/yamls/turbovnc/recursive_diff.yaml deleted file mode 100644 index f506a8f8d..000000000 --- a/workflow/yamls/turbovnc/recursive_diff.yaml +++ /dev/null @@ -1,293 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ 
inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: "${{ needs.create_session.outputs.remote_port }}" - localPort: "${{ needs.create_session.outputs.local_port }}" - remoteHost: "${{ needs.create_session.outputs.target_hostname }}" - slug: "${{ needs.create_session.outputs.slug }}" - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! 
- default: turbovnc - bin: - label: Command to launch terminal with ncdiff - type: string - default: "gnome-terminal -- bash -c \"source /contrib/software/miniconda/miniconda/etc/profile.d/conda.sh; conda activate recursive_diff_latest -; ncdiff --help; exec bash\"" \ No newline at end of file diff --git a/workflow/yamls/turbovnc/rstudio-emed.yaml b/workflow/yamls/turbovnc/rstudio-emed.yaml deleted file mode 100644 index 6ddd5df7c..000000000 --- a/workflow/yamls/turbovnc/rstudio-emed.yaml +++ /dev/null @@ -1,292 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - 
early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: RStudio Host - items: - resource: - type: compute-clusters - label: Service host - provider: - - existing - include-workspace: false - autoselect: true - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_gres_e_gpu_colon_: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e_ != 'gpu' && inputs.pwrl_host._sch__dd_partition_e_ != 'gpu-quick' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e_: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: --cpus-per-task=value slurm directive - _sch__dd_mem_e_: - type: string - label: Minimum total memory required - default: 8GB - tooltip: --mem=value slurm directive - optional: true - _sch__dd_time_e_: - type: string - label: Walltime - default: 01:00:00 - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - optional: true - scheduler_directives: - type: string - label: Scheduler directives - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: turbovnc - use_tls: - label: Use TLS - type: boolean - hidden: true - default: false - load_env: - label: Command to load RStudio - type: string - default: module load rstudio - bin: - label: Service Binary - type: string - hidden: true - default: rstudio diff --git a/workflow/yamls/turbovnc/rstudio-mate.yaml b/workflow/yamls/turbovnc/rstudio-mate.yaml deleted file mode 100644 index c301044fe..000000000 --- a/workflow/yamls/turbovnc/rstudio-mate.yaml +++ /dev/null @@ -1,302 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype 
}} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ 
inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - load_env: - label: Command to load RStudio to the PATH - type: string - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. 
- bin: - label: Launch command - type: string - default: rstudio - tooltip: Command to start RStudio - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true diff --git a/workflow/yamls/turbovnc/rstudio.yaml b/workflow/yamls/turbovnc/rstudio.yaml deleted file mode 100644 index 8dfb031c9..000000000 --- a/workflow/yamls/turbovnc/rstudio.yaml +++ /dev/null @@ -1,297 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: 
./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != 
inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: dropdown - label: Select remote display protocol - options: - - label: VNC Server - value: turbovnc - - label: Nice DCV - value: nicedcv - - label: Scyld Cloud Workstation - value: scw - tooltip: Make sure the display protocol is installed in the host! - default: turbovnc - load_env: - label: Command to load RStudio to the PATH - type: string - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. 
- bin: - label: Launch command - type: string - default: rstudio - tooltip: Command to start RStudio diff --git a/workflow/yamls/turbovnc/schrodinger-emed.yaml b/workflow/yamls/turbovnc/schrodinger-emed.yaml deleted file mode 100644 index ee0d402a3..000000000 --- a/workflow/yamls/turbovnc/schrodinger-emed.yaml +++ /dev/null @@ -1,292 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - 
steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - 
return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." - sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Schrodinger Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - existing - autoselect: true - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_gres_e_gpu_colon_: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e_ != 'gpu' && inputs.pwrl_host._sch__dd_partition_e_ != 'gpu-quick' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e_: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: --cpus-per-task=value slurm directive - _sch__dd_mem_e_: - type: string - label: Minimum total memory required - default: 8GB - tooltip: --mem=value slurm directive - optional: true - _sch__dd_time_e_: - type: string - label: Walltime - default: 01:00:00 - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - optional: true - scheduler_directives: - type: string - label: Scheduler directives - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: turbovnc - use_tls: - label: Use TLS - type: boolean - hidden: true - default: false - load_env: - label: Command to load Schrodinger - type: string - default: module load schrodinger - bin: - label: Service Binary - type: string - hidden: true - default: maestro diff --git a/workflow/yamls/turbovnc/vmd-emed.yaml b/workflow/yamls/turbovnc/vmd-emed.yaml deleted file mode 100644 index 615ce6382..000000000 --- a/workflow/yamls/turbovnc/vmd-emed.yaml +++ /dev/null @@ -1,292 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} 
- run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ 
inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=headless&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: VMD Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - provider: - - existing - autoselect: true - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_gres_e_gpu_colon_: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e_ != 'gpu' && inputs.pwrl_host._sch__dd_partition_e_ != 'gpu-quick' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e_: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: --cpus-per-task=value slurm directive - _sch__dd_mem_e_: - type: string - label: Minimum total memory required - default: 8GB - tooltip: --mem=value slurm directive - optional: true - _sch__dd_time_e_: - type: string - label: Walltime - default: 01:00:00 - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - optional: true - scheduler_directives: - type: string - label: Scheduler directives - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: turbovnc - use_tls: - label: Use TLS - type: boolean - hidden: true - default: false - load_env: - label: Command to load VMD - type: string - default: module load vmd - bin: - label: Service Binary - type: string - hidden: true - default: vmd diff --git a/workflow/yamls/vncserver/emed-firefox.yaml b/workflow/yamls/vncserver/emed-firefox.yaml deleted file mode 100644 index a3464703a..000000000 --- a/workflow/yamls/vncserver/emed-firefox.yaml +++ /dev/null @@ -1,372 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: 
${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == 
inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - sleep 20 - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} 
- optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - bin: - label: Service Binary - type: string - hidden: true - default: firefox - nginx_sif_tag_existing: - type: string - default: /public/apps/pw/nginx-unprivileged.sif - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore }} \ No newline at end of file diff --git a/workflow/yamls/vncserver/emed-fsl.yaml b/workflow/yamls/vncserver/emed-fsl.yaml deleted file mode 100644 index c4bb08027..000000000 --- a/workflow/yamls/vncserver/emed-fsl.yaml +++ /dev/null @@ -1,376 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - 
early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a 
$OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - sleep 20 - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} 
- optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env: - label: Command to load FSL - type: string - default: module load fsl/6.0.5_cpu - bin: - label: Service Binary - type: string - hidden: true - default: fsl - nginx_sif_tag_existing: - type: string - default: /public/apps/pw/nginx-unprivileged.sif - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore }} diff --git a/workflow/yamls/vncserver/emed-matlab.yaml b/workflow/yamls/vncserver/emed-matlab.yaml deleted file mode 100644 index 1283301fc..000000000 --- a/workflow/yamls/vncserver/emed-matlab.yaml +++ /dev/null @@ -1,376 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> resources/host/inputs.sh - 
./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - 
session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - sleep 20 - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} 
- optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env: - label: Command to load Matlab - type: string - default: module load matlab - bin: - label: Service Binary - type: string - hidden: true - default: matlab -desktop - nginx_sif_tag_existing: - type: string - default: /public/apps/pw/nginx-unprivileged.sif - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore }} \ No newline at end of file diff --git a/workflow/yamls/vncserver/emed-rstudio.yaml b/workflow/yamls/vncserver/emed-rstudio.yaml deleted file mode 100644 index 063bf6268..000000000 --- a/workflow/yamls/vncserver/emed-rstudio.yaml +++ /dev/null @@ -1,384 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> 
resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - 
run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - sleep 20 - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} 
- optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Service - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env_tag_cloud: - label: Command to load RStudio - type: string - optional: true - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - load_env_tag_existing: - label: Command to load RStudio - type: string - default: module load rstudio - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - bin: - label: Service Binary - type: string - hidden: true - default: rstudio - nginx_sif_tag_existing: - type: string - default: /public/apps/pw/nginx-unprivileged.sif - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore }} \ No newline at end of file diff --git a/workflow/yamls/vncserver/emed-schrodinger.yaml b/workflow/yamls/vncserver/emed-schrodinger.yaml deleted file mode 100644 index cb2e68d32..000000000 --- a/workflow/yamls/vncserver/emed-schrodinger.yaml +++ /dev/null @@ -1,377 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: 
./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - 
early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" 
>&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - sleep 20 - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} 
- optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env: - label: Command to load Schrodinger - type: string - default: module load schrodinger - bin: - label: Service Binary - type: string - hidden: true - default: maestro - nginx_sif_tag_existing: - type: string - default: /public/apps/pw/nginx-unprivileged.sif - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore }} \ No newline at end of file diff --git a/workflow/yamls/vncserver/emed-vmd.yaml b/workflow/yamls/vncserver/emed-vmd.yaml deleted file mode 100644 index 639f34a47..000000000 --- a/workflow/yamls/vncserver/emed-vmd.yaml +++ /dev/null @@ -1,378 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> 
resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - 
run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - sleep 20 - - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} 
- optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env: - label: Command to load VMD - type: string - default: module load vmd - bin: - label: Service Binary - type: string - hidden: true - default: vmd - nginx_sif_tag_existing: - type: string - default: /public/apps/pw/nginx-unprivileged.sif - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore }} \ No newline at end of file diff --git a/workflow/yamls/vncserver/emed.yaml b/workflow/yamls/vncserver/emed.yaml deleted file mode 100644 index 06190b8ff..000000000 --- a/workflow/yamls/vncserver/emed.yaml +++ /dev/null @@ -1,367 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - echo "module load singularity" >> resources/host/inputs.sh - 
./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: ./kill.sh - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'existing' == inputs.pwrl_host.resource.provider || 'CONTROLLER' != inputs.pwrl_host.jobschedulertype_tag_cloud }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - 
session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_existing || 'SLURM' == inputs.pwrl_host.jobschedulertype_tag_cloud }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue ${{ inputs.pwrl_host.slurm_options }} -j "${job_id}" --noheader --format="%N") - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - # Sleep to wait for jupyterlab after nginx connects - sleep 20 - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - sleep 20 - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Host - collapsed: false - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - autoselect: true - tooltip: Resource to host the service - slurm_options: - type: dropdown - label: Select Cluster - optional: true - default: '' - options: - - value: '' - label: Default - - value: -M hpc4 - label: HPC4 - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype_tag_existing: - type: string - label: Select Controller, SLURM Partition or PBS Queue - default: SLURM - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore}} - _sch__dd_partition_e__tag_default: - type: slurm-partitions - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' == inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - _sch__dd_partition_e__tag_hpc4: - type: dropdown - label: SLURM partition - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider || '-M hpc4' != inputs.pwrl_host.slurm_options }} - ignore: ${{ .hidden }} - default: normal - options: - - normal - - gpu - - gpu-h200 - - gpu-quick - - ht - - large-mem - - quick - - test - - unlimited - _sch__dd_gres_e_gpu_colon__tag_default: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_default != 'gpu-quick' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_gres_e_gpu_colon__tag_hpc4: - type: number - label: Number of GPUs - hidden: ${{ inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-quick' && inputs.pwrl_host._sch__dd_partition_e__tag_hpc4 != 'gpu-h200' || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - min: 1 - max: 4 - default: 1 - tooltip: '--gres=gpu:X slurm directive' - _sch__dd_cpus_d_per_d_task_e__tag_existing: - type: number - label: CPUs per task - min: 1 - max: 32 - default: 1 - tooltip: '--cpus-per-task=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_mem_e__tag_existing: - type: string - label: Minimum total memory required - default: 8GB - tooltip: '--mem=value slurm directive' - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} 
- optional: true - _sch__dd_time_e__tag_existing: - type: string - label: Walltime - default: '01:00:00' - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - scheduler_directives_tag_existing: - type: string - label: Scheduler directives - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - jobschedulertype_tag_cloud: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e__tag_cloud: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. 
- scheduler_directives_tag_cloud: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype_tag_cloud || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - nginx_sif_tag_existing: - type: string - default: /public/apps/pw/nginx-unprivileged.sif - hidden: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - optional: ${{ .ignore }} diff --git a/workflow/yamls/vncserver/general.yaml b/workflow/yamls/vncserver/general.yaml deleted file mode 100644 index 1d5a1fe21..000000000 --- a/workflow/yamls/vncserver/general.yaml +++ /dev/null @@ -1,285 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: true - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: 
./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - 
if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\" >/dev/null 2>&1" - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver diff --git a/workflow/yamls/vncserver/general_k8s.yaml b/workflow/yamls/vncserver/general_k8s.yaml deleted file mode 100644 index 36d28a190..000000000 --- a/workflow/yamls/vncserver/general_k8s.yaml +++ /dev/null @@ -1,785 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: ${{ inputs.targetType == 'kubernetes-cluster' }} - redirect: true - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: 
./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: 
${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\" >/dev/null 2>&1" - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - 
default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." 
- kubectl apply -f pvc.yaml --dry-run=client - - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: VNC_PW - value: ${{ inputs.service_k8s.password }} - securityContext: - capabilities: - add: ["NET_ADMIN"] # Might be required depending on kasmvnc container usage - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: 
storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} - -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - type: group - label: Service Host - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - resource: - type: compute-clusters - label: Service host - optional: ${{ 
inputs.targetType != 'compute-cluster' }} - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. 
- collapsed: false - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. - items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. 
- pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). - limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. 
- options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: KasmVNC Image - type: string - default: kasmweb/desktop:1.16.0 - tooltip: Sample container from https://hub.docker.com/r/kasmweb/desktop - image_port: - label: KasmVNC Port - type: number - default: 6901 - tooltip: Define the port on which the KasmVNC runs inside the container. Default is 6901. 
- password: - label: Password - type: password - tooltip: Type in a password for user kasm_user - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} diff --git a/workflow/yamls/vncserver/hsp.yaml b/workflow/yamls/vncserver/hsp.yaml deleted file mode 100644 index e0b2ad5ec..000000000 --- a/workflow/yamls/vncserver/hsp.yaml +++ /dev/null @@ -1,339 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 
'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\" >/dev/null 2>&1" - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - 
tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - hidden: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - rootless_docker: - label: Use Rootless Docker? 
- type: boolean - default: true - hidden: true - download_vncserver_container: - label: Use singularity vncserver if necessary - type: boolean - default: true - hidden: true diff --git a/workflow/yamls/vncserver/matlab_k8s.yaml b/workflow/yamls/vncserver/matlab_k8s.yaml deleted file mode 100644 index f79571e4c..000000000 --- a/workflow/yamls/vncserver/matlab_k8s.yaml +++ /dev/null @@ -1,784 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - 
run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ 
inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\" >/dev/null 2>&1" - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get 
storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." 
- kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - args: ["-browser"] - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: MWI_ENABLE_WEB_LOGGING - value: "True" - - name: MWI_APP_HOST - value: "0.0.0.0" - - name: MWI_APP_PORT - value: "${{ inputs.service_k8s.image_port }}" - - name: MWI_ENABLE_TOKEN_AUTH - value: "False" - - name: MWI_BASE_URL - value: "/" - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - 
volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? 
- kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." 
- exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - slug: index.html - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: kubernetes-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - type: group - label: Service Host - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - resource: - type: compute-clusters - label: Service host - optional: ${{ inputs.targetType != 'compute-cluster' }} - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env: - label: Command to load Matlab - type: string - optional: true - bin: - label: Service Binary - type: string - optional: ${{ inputs.targetType != 'compute-cluster' }} - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 
'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. - items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). 
- cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). - limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. 
- hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: Matlab Image - type: string - default: mathworks/matlab:r2025a - tooltip: Sample container from https://hub.docker.com/r/mathworks/matlab - image_port: - label: Matlab Port - type: number - default: 8888 - hidden: true diff --git a/workflow/yamls/vncserver/noaa-armforge.yaml b/workflow/yamls/vncserver/noaa-armforge.yaml deleted file mode 100644 index e35eb5354..000000000 --- a/workflow/yamls/vncserver/noaa-armforge.yaml +++ /dev/null @@ -1,338 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and 
Monitor Controller Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ 
inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != 
inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - _sch__dd_partition_e__tag_ursa: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name !includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: u1-service - value: u1-service - default: u1-service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: '01:00:00' - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Service - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - tooltip: Make sure the display protocol is installed in the host! - default: vncserver - hidden: true - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - load_env: - label: Command to load ARM Forge to the PATH - type: string - default: module load forge - tooltip: 'To load the environment, enter the appropriate command, for example: module load module-name or source path/to/env.sh.' 
- bin: - label: Command to launch ARM Forge - type: string - default: forge - tooltip: Command to start ARM Forge - background: - label: Run service in background - type: boolean - default: true - hidden: true diff --git a/workflow/yamls/vncserver/noaa-matlab.yaml b/workflow/yamls/vncserver/noaa-matlab.yaml deleted file mode 100644 index 40dffdde5..000000000 --- a/workflow/yamls/vncserver/noaa-matlab.yaml +++ /dev/null @@ -1,338 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == 
"CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != 
inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - _sch__dd_partition_e__tag_ursa: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name !includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: u1-service - value: u1-service - default: u1-service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: '01:00:00' - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Service - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - tooltip: Make sure the display protocol is installed in the host! - default: vncserver - hidden: true - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - load_env: - label: Command to load MATLAB to the PATH - type: string - default: module load matlab - tooltip: 'To load the environment, enter the appropriate command, for example: module load module-name or source path/to/env.sh.' 
- bin: - label: Command to launch MATLAB - type: string - default: matlab -desktop - tooltip: Command to start MATLAB - background: - label: Run service in background - type: boolean - default: true - hidden: true diff --git a/workflow/yamls/vncserver/noaa-rstudio.yaml b/workflow/yamls/vncserver/noaa-rstudio.yaml deleted file mode 100644 index 4d97ffc97..000000000 --- a/workflow/yamls/vncserver/noaa-rstudio.yaml +++ /dev/null @@ -1,334 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == 
"CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != 
inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - _sch__dd_partition_e__tag_ursa: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name !includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: u1-service - value: u1-service - default: u1-service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: '01:00:00' - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Service - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - tooltip: Make sure the display protocol is installed in the host! - default: vncserver - hidden: true - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} - load_env: - label: Command to load RStudio to the PATH - type: string - default: module load RStudio - tooltip: 'To load the environment, enter the appropriate command, for example: module load module-name or source path/to/env.sh.' 
- bin: - label: Command to launch RStudio - type: string - default: rstudio - tooltip: Command to start RStudio - diff --git a/workflow/yamls/vncserver/noaa.yaml b/workflow/yamls/vncserver/noaa.yaml deleted file mode 100644 index 3b2f4e8a5..000000000 --- a/workflow/yamls/vncserver/noaa.yaml +++ /dev/null @@ -1,320 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype 
}} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != 
inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: service - value: service - default: service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - _sch__dd_partition_e__tag_ursa: - type: dropdown - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider || (inputs.pwrl_host.resource.name !includes 'ursa') }} - ignore: ${{ .hidden }} - options: - - label: u1-service - value: u1-service - default: u1-service - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: '01:00:00' - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - label: Service - hidden: ${{ 'existing' == inputs.pwrl_host.resource.provider }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - tooltip: Make sure the display protocol is installed in the host! 
- default: vncserver - hidden: true - desktop: - label: Desktop Session - type: string - default: mate-session - hidden: true - ignore: ${{ 'existing' != inputs.pwrl_host.resource.provider }} diff --git a/workflow/yamls/vncserver/pointwise.yaml b/workflow/yamls/vncserver/pointwise.yaml deleted file mode 100644 index fb8f44779..000000000 --- a/workflow/yamls/vncserver/pointwise.yaml +++ /dev/null @@ -1,766 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: ${{ inputs.targetType == 'kubernetes-cluster' }} - redirect: true - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh\ - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == 
inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o 
UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\" >/dev/null 2>&1" - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - 
default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." 
- kubectl apply -f pvc.yaml --dry-run=client - - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: VNC_PW - value: ${{ inputs.service_k8s.password }} - securityContext: - capabilities: - add: ["NET_ADMIN"] # Might be required depending on kasmvnc container usage - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: 
storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - kubectl delete -f pvc.yaml - touch pvc.deleted - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - kubectl delete -f app.yaml - touch app.deleted - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." 
- kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} - -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - type: group - label: Service Host - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - resource: - type: compute-clusters - label: Service host - optional: ${{ 
inputs.targetType != 'compute-cluster' }} - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. 
- collapsed: false - service: - type: group - label: Service - hidden: false - collapsed: true - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env: - label: Command to load Pointwise to the PATH - type: string - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. - bin: - label: Launch command - type: string - default: /apps/Pointwise2024.2/pointwise - tooltip: Command to start Pointwise - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: KasmVNC Image - type: string - default: kasmweb/desktop:1.16.0 - tooltip: Sample container from https://hub.docker.com/r/kasmweb/desktop - image_port: - label: KasmVNC Port - type: number - default: 6901 - tooltip: Define the port on which the KasmVNC runs inside the container. Default is 6901. 
- password: - label: Password - type: password - tooltip: Type in a password for user kasm_user - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} diff --git a/workflow/yamls/vncserver/rstudio_k8s.yaml b/workflow/yamls/vncserver/rstudio_k8s.yaml deleted file mode 100644 index d63f05d59..000000000 --- a/workflow/yamls/vncserver/rstudio_k8s.yaml +++ /dev/null @@ -1,790 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: 
./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ 
inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\" >/dev/null 2>&1" - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ needs.create_session.outputs.remote_port }} - localPort: ${{ needs.create_session.outputs.local_port }} - remoteHost: ${{ needs.create_session.outputs.target_hostname }} - slug: ${{ needs.create_session.outputs.slug }} - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - default_class=$(kubectl get 
storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." 
- kubectl apply -f pvc.yaml --dry-run=client - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - if [ -z "${{ inputs.service_k8s.password }}" ] || [ "${{ inputs.service_k8s.password }}" = "undefined" ]; then - auth_env_var_name="DISABLE_AUTH" - auth_env_var_value="true" - else - auth_env_var_name="PASSWORD" - auth_env_var_value="${{ inputs.service_k8s.password }}" - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: ${auth_env_var_name} - value: "${auth_env_var_value}" - - name: ROOT - value: "true" - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ 
inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - set -x - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f pvc.yaml; then - echo "PVC deleted successfully" - touch pvc.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete PVC after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete PVC failed. Retrying in 5 seconds..." 
- sleep 5 - ((ATTEMPT++)) - fi - done - fi - - name: Apply Deployment and Service - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - set -x - MAX_ATTEMPTS=5 - ATTEMPT=1 - while true; do - if kubectl delete -f app.yaml; then - echo "Resources deleted successfully" - touch app.deleted - break - elif [ $ATTEMPT -gt $MAX_ATTEMPTS ]; then - echo "Failed to delete resources after $MAX_ATTEMPTS attempts" - exit 1 - else - echo "Attempt $ATTEMPT to delete resources failed. Retrying in 5 seconds..." - sleep 5 - ((ATTEMPT++)) - fi - done - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." - kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? 
- kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." - kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." 
- exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - type: group - label: Service Host - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - resource: - type: compute-clusters - label: Service host - optional: ${{ inputs.targetType != 'compute-cluster' }} - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option. 
- resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: Queue to submit the interactive job. Must select one! Use [qstat -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - label: Service - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env: - label: Command to load RStudio - type: string - optional: true - bin: - label: Service Binary - type: string - default: rstudio - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - 
label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. - items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). 
- cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). - limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. 
- hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: RStudio Image - type: string - default: rocker/rstudio - tooltip: Sample container from https://hub.docker.com/r/rocker/rstudio - image_port: - label: RStudio Port - type: number - default: 8787 - hidden: true - password: - label: Password - type: password - tooltip: Type in a password for user rstudio - optional: true diff --git a/workflow/yamls/vncserver/virtuoso.yaml b/workflow/yamls/vncserver/virtuoso.yaml deleted file mode 100644 index 0c6ca1046..000000000 --- a/workflow/yamls/vncserver/virtuoso.yaml +++ /dev/null @@ -1,767 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: ${{ inputs.targetType == 'kubernetes-cluster' }} - redirect: true - useCustomDomain: ${{ inputs.targetType == 'kubernetes-cluster' }} - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - password=$(openssl rand -base64 12 | head -c 12) - echo "export password=${password}" >> resources/host/inputs.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - 
name: Initialize Cancel Script - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - session_name=$(pwd | rev | 
cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype && inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." - - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" 
>&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." - - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\" >/dev/null 2>&1" - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Set URL SLUG - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - run: | - source resources/host/inputs.sh - slug="vnc.html?resize=remote&autoconnect=true&show_dot=true&path=websockify&password=${password}&host=${PW_PLATFORM_HOST}${basepath}/&dt=0" - echo "slug=${slug}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'compute-cluster' }} - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - slug: '${{ needs.create_session.outputs.slug }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - - auth_k8s: - steps: - - name: Authenticate kubectl - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: pw kube auth ${{ inputs.k8s.cluster }} - prepare_k8s_pvc: - needs: - - auth_k8s - steps: - - name: Creating New PVC YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - if [[ ${{ inputs.k8s.volumes.pvc_persist }} == "true" ]]; then - pvc_name="${{ inputs.k8s.volumes.pvc_name }}" - else - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - pvc_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56)-pvc - fi - pvc_storage_class=${{ inputs.k8s.volumes.pvc_storage_class }} - if [ -z "${pvc_storage_class}" ] || [[ "${pvc_storage_class}" == "undefined" ]]; then - 
default_class=$(kubectl get storageclass -n ${{ inputs.k8s.namespace }} | grep '(default)') - if [ $? -ne 0 ]; then - echo "WARNING: Could not obtain default storageClass with command:" - echo " kubectl get storageclass -n ${{ inputs.k8s.namespace }}" - echo " You might need to provide a storage class input" - elif [ -z "${default_class}" ]; then - echo "ERROR: No default storage class found. You must specify one explicitly." - exit 1 - fi - else - storageClassName="storageClassName: ${{ inputs.k8s.volumes.pvc_storage_class }}" - fi - echo "${pvc_name}" > pvc_name - cat < pvc.yaml - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: ${pvc_name} - namespace: ${{ inputs.k8s.namespace }} - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: ${{ inputs.k8s.volumes.pvc_storage_size }} - ${storageClassName} - EOF - cat pvc.yaml - - name: Dry Run PVC - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: | - echo "Performing dry run..." 
- kubectl apply -f pvc.yaml --dry-run=client - - prepare_k8s_deployment: - if: ${{ always }} - needs: - - prepare_k8s_pvc - steps: - - name: Defining App Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - job_number=$(pwd | rev | cut -d "/" -f1 | rev) - workflow_name=$(pwd | rev | cut -d "/" -f2 | rev) - app_name=$(echo "${PW_USER}${workflow_name}${job_number}" | sed 's|_||g' | sed 's|\.||g' | tr '[:upper:]' '[:lower:]' | tail -c 56) - echo "app_name=${app_name}" | tee -a $OUTPUTS | tee -a OUTPUTS - - name: Creating Deployment and Service YAML - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - if [[ "${{ inputs.k8s.resources.limits.select_gpu }}" == "Custom" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.gpu_resource_key }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - elif [[ "${{ inputs.k8s.resources.limits.select_gpu }}" != "None" ]]; then - gpu_limits="${{ inputs.k8s.resources.limits.select_gpu }}: ${{ inputs.k8s.resources.limits.number_of_gpus }}" - fi - # Attach RuntimeClass if it's available and using NVIDIA - if ! 
[ -z "${gpu_limits}" ]; then - if kubectl get runtimeclass nvidia &>/dev/null; then - echo "nvidia RuntimeClass is available" - runtimeClassName="runtimeClassName: nvidia" - fi - fi - - if [[ "${{ inputs.k8s.volumes.pvc }}" == "Existing" ]]; then - pvc_name=${{ inputs.k8s.volumes.pvc_existing }} - else - pvc_name=$(cat pvc_name) - fi - - cat < app.yaml - - apiVersion: apps/v1 - kind: Deployment - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - spec: - replicas: 1 - selector: - matchLabels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - template: - metadata: - labels: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - spec: - ${runtimeClassName} - initContainers: - - name: set-permissions - image: busybox - command: ["sh", "-c", "chmod 777 ${{ inputs.k8s.volumes.pvc_mount_path }} -R"] - securityContext: - runAsUser: 0 - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - containers: - - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - image: ${{ inputs.service_k8s.image }} - ports: - - containerPort: ${{ inputs.service_k8s.image_port }} - env: - - name: VNC_PW - value: ${{ inputs.service_k8s.password }} - securityContext: - capabilities: - add: ["NET_ADMIN"] # Might be required depending on kasmvnc container usage - resources: - requests: - memory: "${{ inputs.k8s.resources.requests.memory }}" - cpu: "${{ inputs.k8s.resources.requests.cpu }}" - limits: - memory: "${{ inputs.k8s.resources.limits.memory }}" - cpu: "${{ inputs.k8s.resources.limits.cpu }}" - ${gpu_limits} - volumeMounts: - - name: storage - mountPath: ${{ inputs.k8s.volumes.pvc_mount_path }} - volumes: - - name: 
storage - persistentVolumeClaim: - claimName: ${pvc_name} # Assumes PVC name is provided as an input - - --- - apiVersion: v1 - kind: Service - metadata: - name: ${{ needs.prepare_k8s_deployment.outputs.app_name }}-lb - namespace: ${{ inputs.k8s.namespace }} - spec: - selector: - app: ${{ needs.prepare_k8s_deployment.outputs.app_name }} - ports: - - protocol: TCP - port: ${{ inputs.service_k8s.image_port }} - targetPort: ${{ inputs.service_k8s.image_port }} - EOF - - name: Dry Run Deployment - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - echo "Performing dry run..." - kubectl apply -f app.yaml --dry-run=client - apply_k8s_deployment: - needs: - - prepare_k8s_deployment - steps: - - name: Load outputs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: cat OUTPUTS >> $OUTPUTS - - name: Apply PVC - if: ${{ inputs.targetType == 'kubernetes-cluster' && inputs.k8s.volumes.pvc == New }} - run: kubectl apply -f pvc.yaml - cleanup: | - if [[ "${{ inputs.k8s.volumes.pvc_persist }}" == "false" ]]; then - kubectl delete -f pvc.yaml - touch pvc.deleted - fi - - name: Apply Deployment and Service - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl apply -f app.yaml - cleanup: | - kubectl delete -f app.yaml - touch app.deleted - - name: Wait for Deployment to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - - log() { - while true; do - echo - echo; echo "[INFO] $(date) - Checking deployment status for ${app_name} in namespace ${namespace}..." 
- kubectl get deployment "${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get deployment" - - echo; echo "[INFO] $(date) - Pods status:" - kubectl get pods -l app="${app_name}" -n "${namespace}" -o wide || echo "[WARN] Unable to get pods" - - pod_name=$(kubectl get pods -l app="${app_name}" -n "${namespace}" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -n "$pod_name" ]]; then - echo; echo "[INFO] $(date) - Describing pod ${pod_name}..." - kubectl describe pod "${pod_name}" -n "${namespace}" | grep -A20 "Events" - fi - - echo "---------------------------------------------" - sleep 10 - done - } - - log & - log_pid=$! - trap "kill ${log_pid}" EXIT SIGINT SIGTERM - set -x - kubectl wait --for=condition=available --timeout=600s deployment/${app_name} -n ${namespace} - exit_code=$? - kubectl get deployment ${app_name} -n ${namespace} -o wide - kubectl describe deployment ${app_name} -n ${namespace} - exit ${exit_code} - - name: Wait for Pod to be Ready - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - env: - app_name: ${{ needs.apply_k8s_deployment.outputs.app_name }} - namespace: ${{ inputs.k8s.namespace }} - run: | - echo "Waiting for pod to be ready..." 
- kubectl wait --for=condition=Ready pod -l app=${app_name} -n ${namespace} --timeout=600s - pod=$(kubectl get pods -n ${namespace} -l app=${app_name} --field-selector=status.phase=Running -o jsonpath="{.items[0].metadata.name}") - echo "pod=$pod" | tee -a $OUTPUTS | tee -a OUTPUTS - touch pod.running - - name: Stream Logs - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: kubectl logs -f deployment/${{ needs.apply_k8s_deployment.outputs.app_name }} -n ${{ inputs.k8s.namespace }} - create_k8s_session: - needs: - - prepare_k8s_deployment - steps: - - name: Wait until the Kubernetes deployment reaches its final stage - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - while true; do - if [ -f "app.deleted" ]; then - echo "File app.deleted was detected. Exiting..." - exit 0 - elif [ -f "pod.running" ]; then - echo "Pod is ready" - break - fi - sleep 2 - done - - name: Get Service Name - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - run: | - source OUTPUTS - echo "service_name=${app_name}-lb" | tee -a $OUTPUTS - - name: Expose port - early-cancel: any-job-failed - if: ${{ inputs.targetType == 'kubernetes-cluster' }} - uses: parallelworks/update-session - with: - remotePort: ${{ inputs.service_k8s.image_port }} - name: ${{ sessions.session }} - targetInfo: - name: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - resourceType: services - resourceName: ${{ needs.create_k8s_session.outputs.service_name }} - -'on': - execute: - inputs: - targetType: - label: Target Type - type: dropdown - default: compute-cluster - options: - - label: Compute Cluster - value: compute-cluster - - label: Kubernetes Cluster - value: kubernetes-cluster - pwrl_host: - type: group - label: Service Host - hidden: ${{ inputs.targetType != 'compute-cluster' }} - items: - resource: - type: compute-clusters - label: Service host - optional: ${{ 
inputs.targetType != 'compute-cluster' }} - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: >- - Queue to submit the interactive job. Must select one! Use [qstat - -f -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; - to separate parameters. Do not include the PBS keyword. 
- collapsed: false - service: - type: group - label: Service - hidden: false - collapsed: false - items: - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.5.0.tgz - name: - type: string - label: Select remote display protocol - hidden: true - default: vncserver - load_env: - label: Command to load Virtuoso to the PATH - type: string - optional: true - tooltip: To load the environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh. - bin: - label: Launch command - type: string - default: cd /apps/extract_sim && /apps/IC23.10.140/tools/dfII/bin/virtuoso - tooltip: Command to start Virtuoso - k8s: - type: group - label: Kubernetes Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - cluster: - label: Kubernetes cluster - type: kubernetes-clusters - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - namespace: - label: Namespace - type: kubernetes-namespaces - clusterName: ${{ inputs.k8s.cluster }} - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} - volumes: - type: group - label: Volumes - collapsed: true - tooltip: Specify storage settings for Persistent Volume Claims (PVCs), including size, storage class, and mount path. 
- items: - pvc: - label: Persistent Volume Claim - type: dropdown - default: New - options: - - value: Existing - label: Select Existing PVC - - value: New - label: Create New PVC - pvc_mount_path: - label: Mount Path - type: string - default: /mnt - pvc_existing: - label: Select PVC Name - type: kubernetes-pvc - clusterName: ${{ inputs.k8s.cluster }} - namespace: ${{ inputs.k8s.namespace }} - hidden: ${{ inputs.k8s.volumes.pvc != Existing }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - pvc_storage_size: - label: Enter PVC Size - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - default: 10Gi - pvc_storage_class: - label: Enter PVC Storage Class - type: string - hidden: ${{ inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: true - tooltip: Leave blank to use the default storage class configured in the cluster. - pvc_persist: - label: Persist PVC After Completion - type: boolean - default: false - hidden: ${{ inputs.k8s.volumes.pvc != 'New' }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: If true, the PVC will persist after the job is canceled or completed. If false, it will be deleted. - pvc_name: - label: Enter PVC Name - type: string - hidden: ${{ inputs.k8s.volumes.pvc_persist == false || inputs.k8s.volumes.pvc != New }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - resources: - type: group - label: Resources - collapsed: true - tooltip: Configure CPU, memory, and GPU settings to define the computational resources allocated to the pod. - items: - requests: - type: group - label: Requests - items: - memory: - label: Memory - type: string - default: 2Gi - tooltip: Specify the minimum memory required for the pod (e.g., 512Mi, 1Gi). - cpu: - label: CPU - type: string - default: '2' - tooltip: Specify the minimum CPU required for the pod (e.g., 0.5, 1, 100m). Use decimal values for partial CPUs or "m" for millicores (e.g., 100m = 0.1 CPU). 
- limits: - type: group - label: Limits - items: - memory: - label: Memory - type: string - default: 4Gi - tooltip: Set the maximum memory the pod can use (e.g., 1Gi, 2Gi). - cpu: - label: CPU - type: string - default: '4' - tooltip: Set the maximum CPU the pod can use (e.g., 1, 2, 500m). - select_gpu: - label: Select GPU Device - type: dropdown - tooltip: Choose the type of GPU device for the deployment, if needed. Select "None" for CPU-only workloads or "Custom" to specify a custom GPU resource key. - options: - - value: None - label: None - - value: nvidia.com/gpu - label: Nvidia GPU - - value: amd.com/gpu - label: AMD GPU - - value: cloud-tpus.google.com/v3 - label: Google TPU - - value: Custom - label: Custom GPU Resource Key - gpu_resource_key: - label: Custom GPU Resource Key - type: string - hidden: ${{ inputs.k8s.resources.limits.select_gpu != Custom }} - ignore: ${{ .hidden }} - tooltip: | - Specify a custom GPU resource key for Kubernetes, such as: - - nvidia.com/gpu - - amd.com/gpu - - cloud-tpus.google.com/v3 - - nvidia.com/mig-1g.5gb - - nvidia.com/mig-2g.10gb - - nvidia.com/mig-3g.20gb - number_of_gpus: - label: Number of GPUs - type: number - step: 1 - default: 1 - min: 1 - tooltip: Specify the number of GPUs to allocate for the deployment. - hidden: ${{ inputs.k8s.resources.limits.select_gpu == None }} - ignore: ${{ .hidden }} - service_k8s: - type: group - label: Service Settings - hidden: ${{ inputs.targetType != 'kubernetes-cluster' }} - items: - image: - label: KasmVNC Image - type: string - default: kasmweb/desktop:1.16.0 - tooltip: Sample container from https://hub.docker.com/r/kasmweb/desktop - image_port: - label: KasmVNC Port - type: number - default: 6901 - tooltip: Define the port on which the KasmVNC runs inside the container. Default is 6901. 
- password: - label: Password - type: password - tooltip: Type in a password for user kasm_user - optional: ${{ inputs.targetType != 'kubernetes-cluster' }} diff --git a/workflow/yamls/webshell/general.yaml b/workflow/yamls/webshell/general.yaml deleted file mode 100644 index 96a0c028d..000000000 --- a/workflow/yamls/webshell/general.yaml +++ /dev/null @@ -1,310 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - if ! [ -z ${{ org.JUICE_TOKEN }} ]; then - echo "export JUICE_TOKEN=${{ org.JUICE_TOKEN }}" >> resources/host/inputs.sh - fi - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed 
- if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Server Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - optional: true - jobschedulertype: - type: dropdown - label: Select Controller, SLURM Partition or PBS Queue - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_partition_e_: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - Partition to submit the interactive job. Leave empty to let SLURM pick - the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; to - separate parameters. Do not include the SBATCH keyword. 
- _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - hidden: true - label: Webshell - items: - name: - type: string - hidden: true - default: webshell - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.3.0.tgz - juice: - type: group - label: Attached GPU Settings - collapsed: true - hidden: ${{ org.JUICE_TOKEN == "" }} - items: - use_juice: - label: Enable Juice? - type: boolean - default: false - tooltip: Enable Juice to access and share remote GPUs over a network for your workload. - pool_ids: - label: Pool IDs - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Comma separated list of pool ids from which to allocate the session resources, when empty any available pool you have access to is used - vram: - label: VRAM - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Amount of VRAM requested in gibibytes. Can return a session with fewer bytes if there is not enough space on the device. Suffixes can be provided for convenience. E.g. 
4 GiB (default "0") - cmd_args: - label: Juice Run Command Arguments - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Optional arguments for the juice run command to customize workload execution, e.g., "--gpu-ids string". diff --git a/workflow/yamls/webshell/hsp.yaml b/workflow/yamls/webshell/hsp.yaml deleted file mode 100644 index 923b4a9b7..000000000 --- a/workflow/yamls/webshell/hsp.yaml +++ /dev/null @@ -1,355 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - if ! [ -z ${{ org.JUICE_TOKEN }} ]; then - echo "export JUICE_TOKEN=${{ org.JUICE_TOKEN }}" >> resources/host/inputs.sh - fi - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ 
inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - - value: PBS - label: PBS Queue - tooltip: Job will be submitted using SSH, sbatch or qsub, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - 
tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - _sch__d_q___: - type: string - label: PBS queue - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - tooltip: - Queue to submit the interactive job. Must select one! 
Use [qstat -f - -Q] to list all queues on the system - scheduler_directives_pbs: - type: string - label: Scheduler directives - hidden: ${{ 'PBS' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: - e.g. -l mem=1000;-l nodes=1:ppn=4 - Use the semicolon character ; to - separate parameters. Do not include the PBS keyword. - collapsed: false - service: - type: group - hidden: true - label: Webshell - items: - name: - type: string - hidden: true - default: webshell - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.3.0.tgz - juice: - type: group - label: Attached GPU Settings - collapsed: true - hidden: ${{ org.JUICE_TOKEN == "" }} - items: - use_juice: - label: Enable Juice? - type: boolean - default: false - tooltip: Enable Juice to access and share remote GPUs over a network for your workload. - pool_ids: - label: Pool IDs - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Comma separated list of pool ids from which to allocate the session resources, when empty any available pool you have access to is used - vram: - label: VRAM - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Amount of VRAM requested in gibibytes. Can return a session with fewer bytes if there is not enough space on the device. Suffixes can be provided for convenience. E.g. 4 GiB (default "0") - cmd_args: - label: Juice Run Command Arguments - type: string - hidden: ${{ inputs.juice.use_juice == false }} - ignore: ${{ .hidden }} - optional: true - tooltip: Optional arguments for the juice run command to customize workload execution, e.g., "--gpu-ids string". 
diff --git a/workflow/yamls/webshell/noaa-v3.yaml b/workflow/yamls/webshell/noaa-v3.yaml deleted file mode 100644 index 4621b7389..000000000 --- a/workflow/yamls/webshell/noaa-v3.yaml +++ /dev/null @@ -1,299 +0,0 @@ -permissions: - - '*' -sessions: - session: - useTLS: false - redirect: true - -jobs: - preprocessing: - steps: - - name: Validating Target Resource - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/input_form_resource_wrapper.sh ${{ inputs.pwrl_host.resource.ip }} - - name: Process Inputs - early-cancel: any-job-failed - run: | - set -e - echo "export basepath=/me/session/${PW_USER}/${{ sessions.session }}" >> resources/host/inputs.sh - ./utils/steps-v3/preprocessing/process_inputs_sh.sh - - name: Transfer Files to Controller - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/transfer_files_to_controller.sh - - name: Controller Preprocessing - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/controller_preprocessing.sh - - name: Initialize Cancel Script - early-cancel: any-job-failed - run: ./utils/steps-v3/preprocessing/initialize_cancel_script.sh - - controller_job: - needs: - - preprocessing - steps: - - name: Create Controller Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/create_session_script.sh - - name: Launch and Monitor Controller Job - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/controller/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - compute_job: - needs: - - preprocessing - steps: - - name: Create Compute Session Script - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != 
inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/create_session_script.sh - - name: Launch and Monitor Compute Job - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/compute/launch_and_monitor_job.sh - cleanup: | - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" != "CONTROLLER" ]]; then - ./kill.sh - fi - - name: Clean and Exit - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' != inputs.pwrl_host.jobschedulertype }} - run: ./utils/steps-v3/clean_and_exit.sh - - create_session: - needs: - - preprocessing - steps: - - name: Set Session Name - early-cancel: any-job-failed - run: | - session_name=$(pwd | rev | cut -d'/' -f1-2 | tr '/' '-' | rev) - echo "session_name=${session_name}" | tee -a $OUTPUTS - - name: Get Controller Hostname - early-cancel: any-job-failed - if: ${{ 'CONTROLLER' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - target_hostname=$(${sshcmd} hostname) - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Compute Hostname - early-cancel: any-job-failed - if: ${{ 'SLURM' == inputs.pwrl_host.jobschedulertype || 'PBS' == inputs.pwrl_host.jobschedulertype }} - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source utils/load-env.sh - source resources/host/inputs.sh - while true; do - failIfError - echo "Waiting for target hostname..." 
- - # Check if the service.port file exists and read its contents - target_hostname=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/target.hostname ]; then cat ${resource_jobdir}/target.hostname; fi\"") - - # Exit the loop if file was found and read - if [ -n "${target_hostname}" ]; then - echo "Target's hostname found: ${target_hostname}" - break - fi - - # Wait before the next check - sleep 5 - done - if [[ "${{ inputs.pwrl_host.jobschedulertype }}" == "SLURM" ]]; then - job_id=$(${sshcmd} cat ${resource_jobdir}/job.id) - if [ -z "${job_id}" ]; then - echo "Error: SLURM job ID is empty!" >&2 - exit 1 - fi - target_hostname=$(${sshcmd} squeue -j "${job_id}" --noheader --format="%N") - fi - echo "target_hostname=${target_hostname}" | tee -a $OUTPUTS - - - name: Get Remote Port - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - run: | - source resources/host/inputs.sh - while true; do - echo "Waiting for service port file..." 
- - # Check if the service.port file exists and read its contents - remote_port=$(${sshcmd} "bash -c \"if [ -f ${resource_jobdir}/service.port ]; then cat ${resource_jobdir}/service.port; fi\"") - - # Exit the loop if remote_port is successfully set (file was found and read) - if [ -n "$remote_port" ]; then - echo "Service port found: $remote_port" - break - fi - - # Wait before the next check - sleep 5 - done - echo "remote_port=${remote_port}" | tee -a $OUTPUTS - - name: Waiting for Server to Start - early-cancel: any-job-failed - env: - sshcmd: ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ${{ inputs.pwrl_host.resource.ip }} - remote_port: ${{ needs.create_session.outputs.remote_port }} - remote_host: ${{ needs.create_session.outputs.target_hostname }} - run: | - TIMEOUT=5 - RETRY_INTERVAL=3 - - # Function to check if server is listening - check_server() { - ${sshcmd} "unset LD_LIBRARY_PATH && curl --silent --connect-timeout \"$TIMEOUT\" \"http://${remote_host}:${remote_port}\"" >/dev/null 2>&1 - return $? - } - - # Main loop - attempt=1 - while true; do - echo "Attempt $attempt: Checking if server is listening on ${remote_host}:${remote_port}..." - - if check_server; then - echo "Success: Server is listening on ${remote_host}:${remote_port}!" - exit 0 - else - echo "Server not responding. Retrying in ${RETRY_INTERVAL} seconds..." 
- sleep "$RETRY_INTERVAL" - ((attempt++)) - fi - done - - name: Select Local Port - early-cancel: any-job-failed - run: | - local_port=$(pw agent open-port) - echo "local_port=${local_port}" | tee -a $OUTPUTS - - name: Expose Port - early-cancel: any-job-failed - uses: parallelworks/update-session - with: - remotePort: '${{ needs.create_session.outputs.remote_port }}' - localPort: '${{ needs.create_session.outputs.local_port }}' - remoteHost: '${{ needs.create_session.outputs.target_hostname }}' - target: ${{ inputs.pwrl_host.resource.id }} - name: ${{ sessions.session }} - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the 
drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. 
- collapsed: false - service: - type: group - hidden: true - label: Webshell - items: - name: - type: string - hidden: true - default: webshell - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.3.0.tgz diff --git a/workflow/yamls/webshell/noaa.yaml b/workflow/yamls/webshell/noaa.yaml deleted file mode 100644 index 652594b0a..000000000 --- a/workflow/yamls/webshell/noaa.yaml +++ /dev/null @@ -1,134 +0,0 @@ -jobs: - main: - steps: - - name: Preprocessing - run: ./utils/steps/preprocessing.sh - - name: Validating Target Resource - run: ./utils/steps/input_form_resource_wrapper.sh - - name: Process Inputs - run: ./utils/steps/process_inputs_sh.sh - - name: Controller Preprocessing - run: ./utils/steps/controller_preprocessing.sh - - name: Prepare Service JSON - run: ./utils/steps/prepare_service_json.sh - - name: Initialize Cancel Script - run: ./utils/steps/initialize_cancel_script.sh - - name: Create Session Script - run: ./utils/steps/create_session_script.sh - - name: Launch Job and Wait - run: ./utils/steps/launch_job_and_wait.sh - cleanup: ./kill.sh - - name: Clean and Exit - run: ./utils/steps/clean_and_exit.sh - -'on': - execute: - inputs: - pwrl_host: - type: group - label: Service Host - items: - resource: - type: compute-clusters - label: Service host - include-workspace: false - tooltip: Resource to host the service - nports: - type: number - label: Number of Ports to Reserve - hidden: true - default: 1 - jobschedulertype: - type: dropdown - label: Select Controller or SLURM Partition - default: CONTROLLER - options: - - value: CONTROLLER - label: Controller - - value: SLURM - label: SLURM Partition - tooltip: Job will be submitted using SSH or sbatch, respectively - _sch__dd_account_e__tag_existing: - label: SLURM account - type: slurm-accounts - resource: ${{ 
inputs.pwrl_host.resource }} - tooltip: Account to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_partition_e__tag_existing: - label: SLURM partition - type: slurm-partitions - resource: ${{ inputs.pwrl_host.resource }} - tooltip: SLURM partition to submit the interactive job - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - qos_tag_existing: - label: Quality of Service [QoS] - type: slurm-qos - resource: ${{ inputs.pwrl_host.resource }} - tooltip: Select a QOS from the drop down menu - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: ${{ .hidden }} - _sch__dd_ntasks_e__tag_existing: - label: Number of tasks - type: number - min: 1 - max: 100 - default: 1 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - _sch__dd_nodes_e__tag_existing: - label: Number of nodes - type: number - default: 1 - hidden: true - ignore: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' != inputs.pwrl_host.resource.provider }} - _sch__dd_partition_e__tag_cloud: - type: slurm-partitions - label: SLURM partition - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype || 'existing' == inputs.pwrl_host.resource.provider }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - Partition to submit the interactive job. Leave empty to let SLURM - pick the optimal option. - resource: ${{ inputs.pwrl_host.resource }} - _sch__dd_time_e_: - label: Walltime - type: string - default: 01:00:00 - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - tooltip: e.g. 
01:00:00 - Amount of time slurm will honor the interactive session. - scheduler_directives_slurm: - type: string - label: Scheduler directives - hidden: ${{ 'SLURM' != inputs.pwrl_host.jobschedulertype }} - ignore: ${{ .hidden }} - optional: true - tooltip: >- - e.g. --mem=1000;--gpus-per-node=1 - Use the semicolon character ; - to separate parameters. Do not include the SBATCH keyword. - collapsed: false - service: - type: group - hidden: true - label: Webshell - items: - name: - type: string - hidden: true - default: webshell - novnc_parent_install_dir: - label: noVNC installation directory - type: string - hidden: true - default: __HOME__/pw/software - novnc_tgz_basename: - label: Basename of the novnc tgz file - type: string - hidden: true - default: noVNC-1.3.0.tgz