diff --git a/agent/cmd/agent/main.go b/agent/cmd/agent/main.go index d013f7b..8275e2e 100644 --- a/agent/cmd/agent/main.go +++ b/agent/cmd/agent/main.go @@ -20,6 +20,7 @@ import ( "techulus/cloud-agent/internal/dns" agenthttp "techulus/cloud-agent/internal/http" "techulus/cloud-agent/internal/logs" + "techulus/cloud-agent/internal/metrics" "techulus/cloud-agent/internal/network" "techulus/cloud-agent/internal/paths" "techulus/cloud-agent/internal/reconcile" @@ -34,17 +35,19 @@ var ( func main() { var ( - controlPlaneURL string - token string - isProxy bool - logsEndpointFlag string - disableDNS bool + controlPlaneURL string + token string + isProxy bool + logsEndpointFlag string + metricsEndpointFlag string + disableDNS bool ) flag.StringVar(&controlPlaneURL, "url", "", "Control plane URL (required)") flag.StringVar(&token, "token", "", "Registration token (required for first run)") flag.BoolVar(&isProxy, "proxy", false, "Run as proxy node (handles TLS and public traffic)") flag.StringVar(&logsEndpointFlag, "logs-endpoint", "", "Override logs endpoint URL (optional)") + flag.StringVar(&metricsEndpointFlag, "metrics-endpoint", "", "Override metrics endpoint URL (optional)") flag.BoolVar(&disableDNS, "no-dns", false, "Disable local DNS server") flag.Parse() @@ -53,6 +56,7 @@ func main() { } var logsEndpoint string + var metricsEndpoint string if isProxy && runtime.GOOS != "linux" { log.Fatal("--proxy flag is only supported on Linux") @@ -112,6 +116,11 @@ func main() { } else if config.LoggingEndpoint != "" { logsEndpoint = config.LoggingEndpoint } + if metricsEndpointFlag != "" { + metricsEndpoint = metricsEndpointFlag + } else if config.MetricsEndpoint != "" { + metricsEndpoint = config.MetricsEndpoint + } if err = container.EnsureNetwork(config.SubnetID); err != nil { log.Printf("Warning: Failed to ensure container network: %v", err) @@ -162,6 +171,11 @@ func main() { respLoggingEndpoint = *resp.LoggingEndpoint } + var respMetricsEndpoint string + if resp.MetricsEndpoint != nil { + respMetricsEndpoint = *resp.MetricsEndpoint + } + var registryURL, registryUsername, registryPassword string if resp.RegistryURL != nil { registryURL = *resp.RegistryURL @@ -180,6 +194,7 @@ func main() { EncryptionKey: resp.EncryptionKey, IsProxy: isProxy, LoggingEndpoint: respLoggingEndpoint, + MetricsEndpoint: respMetricsEndpoint, RegistryURL: registryURL, RegistryUsername: registryUsername, RegistryPassword: registryPassword, @@ -191,6 +206,11 @@ func main() { } else if respLoggingEndpoint != "" { logsEndpoint = respLoggingEndpoint } + if metricsEndpointFlag != "" { + metricsEndpoint = metricsEndpointFlag + } else if respMetricsEndpoint != "" { + metricsEndpoint = respMetricsEndpoint + } if err = configuration.Save(config); err != nil { log.Fatalf("Failed to save config: %v", err) @@ -251,6 +271,7 @@ func main() { var traefikLogCollector *logs.TraefikCollector var logsSender *logs.VictoriaLogsSender var agentLogWriter *logs.AgentLogWriter + var metricsSender agent.MetricsSender if logsEndpoint != "" { log.Println("[logs] log collection enabled, endpoint:", logsEndpoint) @@ -265,6 +286,11 @@ func main() { log.Println("[agent-logs] Agent log collection enabled") } + if metricsEndpoint != "" { + log.Println("[metrics] metrics collection enabled, endpoint:", metricsEndpoint) + metricsSender = metrics.NewVictoriaMetricsSender(metricsEndpoint, config.ServerID) + } + var builder *build.Builder if logsSender != nil { builder = build.NewBuilder(dataDir, logsSender) @@ -294,7 +320,7 @@ func main() { privateIP := network.PrivateIP() log.Printf("Agent v%s started. Public IP: %s, Private IP: %s. Tick interval: %v", agent.Version, publicIP, privateIP, agent.TickInterval) - agentInstance := agent.NewAgent(client, reconciler, config, publicIP, privateIP, dataDir, logCollector, traefikLogCollector, builder, config.IsProxy, disableDNS) + agentInstance := agent.NewAgent(client, reconciler, config, publicIP, privateIP, dataDir, logCollector, traefikLogCollector, metricsSender, builder, config.IsProxy, disableDNS) agentInstance.Run(ctx) if agentLogFlusherDone != nil { diff --git a/agent/internal/agent/agent.go b/agent/internal/agent/agent.go index 3c56756..fcb7826 100644 --- a/agent/internal/agent/agent.go +++ b/agent/internal/agent/agent.go @@ -6,6 +6,7 @@ import ( "techulus/cloud-agent/internal/build" "techulus/cloud-agent/internal/container" + "techulus/cloud-agent/internal/health" agenthttp "techulus/cloud-agent/internal/http" "techulus/cloud-agent/internal/logs" "techulus/cloud-agent/internal/reconcile" @@ -42,6 +43,7 @@ type Config struct { EncryptionKey string `json:"encryptionKey"` IsProxy bool `json:"isProxy"` LoggingEndpoint string `json:"loggingEndpoint,omitempty"` + MetricsEndpoint string `json:"metricsEndpoint,omitempty"` RegistryURL string `json:"registryUrl,omitempty"` RegistryUsername string `json:"registryUsername,omitempty"` RegistryPassword string `json:"registryPassword,omitempty"` @@ -78,6 +80,7 @@ type Agent struct { processingStart time.Time LogCollector *logs.Collector TraefikLogCollector *logs.TraefikCollector + MetricsSender MetricsSender Builder *build.Builder isBuilding bool buildMutex sync.Mutex @@ -94,6 +97,7 @@ func NewAgent( publicIP, privateIP, dataDir string, logCollector *logs.Collector, traefikLogCollector *logs.TraefikCollector, + metricsSender MetricsSender, builder *build.Builder, isProxy bool, disableDNS bool, @@ -110,12 +114,17 @@ func NewAgent( DataDir: dataDir, LogCollector: logCollector, TraefikLogCollector: traefikLogCollector, + MetricsSender: metricsSender, Builder: builder, IsProxy: isProxy, DisableDNS: disableDNS, } } +type MetricsSender interface { + SendSystemStats(stats *health.SystemStats, collectedAt time.Time) error +} + func (a *Agent) GetState() AgentState { a.stateMutex.RLock() defer a.stateMutex.RUnlock() diff --git a/agent/internal/agent/backup.go b/agent/internal/agent/backup.go index 2397ee0..5b121c7 100644 --- a/agent/internal/agent/backup.go +++ b/agent/internal/agent/backup.go @@ -7,12 +7,14 @@ import ( "crypto/sha256" "encoding/hex" "encoding/json" + "errors" "fmt" "io" "log" "os" "path/filepath" "strings" + "syscall" "techulus/cloud-agent/internal/container" agenthttp "techulus/cloud-agent/internal/http" @@ -240,7 +242,7 @@ func (a *Agent) processVolumeRestore(backupID, serviceID, containerID, volumeNam return reportFailure(fmt.Errorf("failed to create volume parent directory: %w", err)) } - if err := os.Rename(tempExtractPath, volumePath); err != nil { + if err := moveDir(tempExtractPath, volumePath); err != nil { startContainerWithRetry() return reportFailure(fmt.Errorf("failed to move restored data to volume path: %w", err)) } @@ -256,6 +258,96 @@ func (a *Agent) processVolumeRestore(backupID, serviceID, containerID, volumeNam return nil } +func moveDir(src, dst string) error { + if err := os.Rename(src, dst); err == nil { + return nil + } else if !errors.Is(err, syscall.EXDEV) { + return err + } + + if err := copyDir(src, dst); err != nil { + return err + } + + return os.RemoveAll(src) +} + +func copyDir(src, dst string) error { + info, err := os.Stat(src) + if err != nil { + return err + } + if !info.IsDir() { + return fmt.Errorf("source is not a directory: %s", src) + } + + if err := os.MkdirAll(dst, info.Mode()); err != nil { + return err + } + + entries, err := os.ReadDir(src) + if err != nil { + return err + } + + for _, entry := range entries { + srcPath := filepath.Join(src, entry.Name()) + dstPath := filepath.Join(dst, entry.Name()) + entryInfo, err := os.Lstat(srcPath) + if err != nil { + return err + } + + if entryInfo.IsDir() { + if err := copyDir(srcPath, dstPath); err != nil { + return err + } + continue + } + + if entryInfo.Mode()&os.ModeSymlink != 0 { + linkTarget, err := os.Readlink(srcPath) + if err != nil { + return err + } + if err := os.Symlink(linkTarget, dstPath); err != nil { + return err + } + continue + } + + if !entryInfo.Mode().IsRegular() { + return fmt.Errorf("unsupported file type in restore archive: %s", srcPath) + } + + if err := copyFile(srcPath, dstPath, entryInfo.Mode()); err != nil { + return err + } + } + + return os.Chmod(dst, info.Mode()) +} + +func copyFile(src, dst string, mode os.FileMode) error { + in, err := os.Open(src) + if err != nil { + return err + } + defer in.Close() + + out, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, mode) + if err != nil { + return err + } + + _, copyErr := io.Copy(out, in) + closeErr := out.Close() + if copyErr != nil { + return copyErr + } + return closeErr +} + func createS3Client(cfg StorageConfig) (*s3.Client, error) { awsCfg, err := config.LoadDefaultConfig(context.Background(), config.WithRegion(cfg.Region), diff --git a/agent/internal/agent/reporting.go b/agent/internal/agent/reporting.go index 4513604..16b23a3 100644 --- a/agent/internal/agent/reporting.go +++ b/agent/internal/agent/reporting.go @@ -39,7 +39,15 @@ func (a *Agent) BuildStatusReport(includeResources bool) *agenthttp.StatusReport healthCollectMu.Lock() if time.Since(lastHealthCollect) >= 60*time.Second { - report.HealthStats = health.CollectSystemStats() + collectedAt := time.Now() + systemStats := health.CollectSystemStats() + if a.MetricsSender != nil { + go func() { + if err := a.MetricsSender.SendSystemStats(systemStats, collectedAt); err != nil { + log.Printf("[metrics] failed to send system stats: %v", err) + } + }() + } report.NetworkHealth = health.CollectNetworkHealth("wg0") report.ContainerHealth = health.CollectContainerHealth() report.AgentHealth = &agenthttp.AgentHealth{ @@ -48,8 +56,8 @@ func (a *Agent) BuildStatusReport(includeResources bool) *agenthttp.StatusReport } lastHealthCollect = time.Now() log.Printf("[health] collected: cpu=%.1f%%, mem=%.1f%%, disk=%.1f%%, network=%v, containers=%d running", - report.HealthStats.CpuUsagePercent, report.HealthStats.MemoryUsagePercent, - report.HealthStats.DiskUsagePercent, report.NetworkHealth.TunnelUp, + systemStats.CpuUsagePercent, systemStats.MemoryUsagePercent, + systemStats.DiskUsagePercent, report.NetworkHealth.TunnelUp, report.ContainerHealth.RunningContainers) } healthCollectMu.Unlock() diff --git a/agent/internal/api/client.go b/agent/internal/api/client.go index 18877b4..187e7b1 100644 --- a/agent/internal/api/client.go +++ b/agent/internal/api/client.go @@ -38,6 +38,7 @@ type RegisterResponse struct { WireGuardIP string `json:"wireguardIp"` EncryptionKey string `json:"encryptionKey"` LoggingEndpoint *string `json:"loggingEndpoint"` + MetricsEndpoint *string `json:"metricsEndpoint"` RegistryURL *string `json:"registryUrl"` RegistryUsername *string `json:"registryUsername"` RegistryPassword *string `json:"registryPassword"` diff --git a/agent/internal/http/client.go b/agent/internal/http/client.go index 15a4f42..838ebc9 100644 --- a/agent/internal/http/client.go +++ b/agent/internal/http/client.go @@ -251,7 +251,6 @@ type StatusReport struct { Meta map[string]string `json:"meta,omitempty"` Containers []ContainerStatus `json:"containers"` DnsInSync bool `json:"dnsInSync,omitempty"` - HealthStats *health.SystemStats `json:"healthStats,omitempty"` NetworkHealth *health.NetworkHealth `json:"networkHealth,omitempty"` ContainerHealth *health.ContainerHealth `json:"containerHealth,omitempty"` AgentHealth *AgentHealth `json:"agentHealth,omitempty"` diff --git a/agent/internal/metrics/victoria.go b/agent/internal/metrics/victoria.go new file mode 100644 index 0000000..8d90e67 --- /dev/null +++ b/agent/internal/metrics/victoria.go @@ -0,0 +1,90 @@ +package metrics + +import ( + "bytes" + "fmt" + "net/http" + "net/url" + "strings" + "time" + + "techulus/cloud-agent/internal/health" +) + +type VictoriaMetricsSender struct { + endpoint string + serverID string + username string + password string + client *http.Client +} + +func NewVictoriaMetricsSender(endpoint, serverID string) *VictoriaMetricsSender { + var username, password string + cleanEndpoint := strings.TrimRight(endpoint, "/") + + if parsedURL, err := url.Parse(endpoint); err == nil && parsedURL.User != nil { + username = parsedURL.User.Username() + password, _ = parsedURL.User.Password() + parsedURL.User = nil + cleanEndpoint = strings.TrimRight(parsedURL.String(), "/") + } + + return &VictoriaMetricsSender{ + endpoint: cleanEndpoint, + serverID: serverID, + username: username, + password: password, + client: &http.Client{ + Timeout: 15 * time.Second, + }, + } +} + +func (v *VictoriaMetricsSender) SendSystemStats(stats *health.SystemStats, collectedAt time.Time) error { + if stats == nil { + return nil + } + + timestampMs := collectedAt.UnixMilli() + serverID := escapeLabelValue(v.serverID) + + var buf bytes.Buffer + writeGauge(&buf, "techulus_node_cpu_usage_percent", serverID, stats.CpuUsagePercent, timestampMs) + writeGauge(&buf, "techulus_node_memory_usage_percent", serverID, stats.MemoryUsagePercent, timestampMs) + writeGauge(&buf, "techulus_node_memory_used_bytes", serverID, float64(stats.MemoryUsedMb)*1024*1024, timestampMs) + writeGauge(&buf, "techulus_node_disk_usage_percent", serverID, stats.DiskUsagePercent, timestampMs) + writeGauge(&buf, "techulus_node_disk_used_bytes", serverID, float64(stats.DiskUsedGb)*1024*1024*1024, timestampMs) + + req, err := http.NewRequest("POST", v.endpoint+"/api/v1/import/prometheus", &buf) + if err != nil { + return fmt.Errorf("failed to create metrics request: %w", err) + } + req.Header.Set("Content-Type", "text/plain; version=0.0.4") + if v.username != "" { + req.SetBasicAuth(v.username, v.password) + } + + resp, err := v.client.Do(req) + if err != nil { + return fmt.Errorf("failed to send metrics: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNoContent { + return fmt.Errorf("unexpected metrics status code: %d", resp.StatusCode) + } + + return nil +} + +func writeGauge(buf *bytes.Buffer, name, serverID string, value float64, timestampMs int64) { + fmt.Fprintf(buf, "%s{server_id=\"%s\"} %f %d\n", name, serverID, value, timestampMs) +} + +func escapeLabelValue(value string) string { + value = strings.ReplaceAll(value, "\\", "\\\\") + value = strings.ReplaceAll(value, "\n", "\\n") + value = strings.ReplaceAll(value, "\"", "\\\"") + return value +} diff --git a/agent/scripts/install-debug-orbstack-agent.sh b/agent/scripts/install-debug-orbstack-agent.sh new file mode 100755 index 0000000..c67f08d --- /dev/null +++ b/agent/scripts/install-debug-orbstack-agent.sh @@ -0,0 +1,184 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +SERVICE_NAME="${SERVICE_NAME:-techulus-agent}" +INSTALL_PATH="${INSTALL_PATH:-/usr/local/bin/techulus-agent}" +VERSION="${VERSION:-debug-fast-deploy-$(date +%Y%m%d%H%M%S)}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +AGENT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +BUILD_DIR="${BUILD_DIR:-$AGENT_DIR/bin/debug-orbstack}" + +if [ "$#" -gt 0 ]; then + MACHINES=("$@") +elif [ -n "${ORBSTACK_MACHINES:-}" ]; then + # shellcheck disable=SC2206 + MACHINES=(${ORBSTACK_MACHINES}) +else + MACHINES=(ubuntu-1 ubuntu-2) +fi + +log() { + printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*" +} + +die() { + log "ERROR: $*" + exit 1 +} + +run() { + log "+ $*" + "$@" +} + +goarch_for_uname() { + case "$1" in + x86_64 | amd64) + printf 'amd64' + ;; + aarch64 | arm64) + printf 'arm64' + ;; + *) + return 1 + ;; + esac +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || die "missing required command: $1" +} + +log "Techulus debug agent installer for OrbStack" +log "Agent source: $AGENT_DIR" +log "Build output: $BUILD_DIR" +log "Debug version: $VERSION" +log "Service name: $SERVICE_NAME" +log "Install path: $INSTALL_PATH" +log "Target machines: ${MACHINES[*]}" + +require_cmd go +require_cmd orb + +mkdir -p "$BUILD_DIR" +BUILD_DIR="$(cd "$BUILD_DIR" && pwd)" +MAC_BUILD_DIR="/mnt/mac${BUILD_DIR}" + +MACHINE_ARCHES=() +ARCHES_BUILT=" " + +for machine in "${MACHINES[@]}"; do + log "Checking OrbStack machine '$machine'" + if ! uname_value="$(orb -m "$machine" uname -m 2>&1)"; then + printf '%s\n' "$uname_value" + die "failed to run uname in OrbStack machine '$machine'" + fi + + uname_value="$(printf '%s' "$uname_value" | tr -d '\r\n')" + if ! goarch="$(goarch_for_uname "$uname_value")"; then + die "unsupported architecture '$uname_value' for machine '$machine'" + fi + + MACHINE_ARCHES+=("$machine:$goarch") + log "Machine '$machine' reports '$uname_value'; using GOARCH=$goarch" +done + +build_arch_if_needed() { + goarch="$1" + if [[ "$ARCHES_BUILT" == *" $goarch "* ]]; then + log "Debug agent for linux/$goarch already built" + return + fi + + output="$BUILD_DIR/techulus-agent-debug-linux-$goarch" + log "Building debug agent for linux/$goarch" + ( + cd "$AGENT_DIR" + run env CGO_ENABLED=0 GOOS=linux GOARCH="$goarch" go build \ + -ldflags "-X techulus/cloud-agent/internal/agent.Version=$VERSION-$goarch" \ + -o "$output" \ + ./cmd/agent + ) + run ls -lh "$output" + ARCHES_BUILT="$ARCHES_BUILT$goarch " +} + +for entry in "${MACHINE_ARCHES[@]}"; do + goarch="${entry#*:}" + build_arch_if_needed "$goarch" +done + +for entry in "${MACHINE_ARCHES[@]}"; do + machine="${entry%%:*}" + goarch="${entry#*:}" + remote_binary="$MAC_BUILD_DIR/techulus-agent-debug-linux-$goarch" + + log "Installing debug agent on '$machine'" + log "Remote binary path: $remote_binary" + + orb -m "$machine" -u root sh -s -- "$SERVICE_NAME" "$INSTALL_PATH" "$remote_binary" "$VERSION-$goarch" <<'REMOTE' +set -Eeuo pipefail + +service_name="$1" +install_path="$2" +remote_binary="$3" +version="$4" + +log() { + printf '[remote:%s] %s\n' "$(hostname)" "$*" +} + +log "Starting install for $service_name" +log "Expected version label: $version" +log "Using mounted binary: $remote_binary" + +if [ ! -f "$remote_binary" ]; then + log "ERROR: mounted binary does not exist: $remote_binary" + exit 1 +fi + +log "Binary details:" +ls -lh "$remote_binary" + +if command -v systemctl >/dev/null 2>&1; then + if systemctl status "$service_name" >/dev/null 2>&1; then + log "Stopping $service_name" + systemctl stop "$service_name" + else + log "$service_name is not currently active; continuing" + fi +else + log "ERROR: systemctl not found" + exit 1 +fi + +if [ -e "$install_path" ]; then + backup_path="${install_path}.backup.$(date +%Y%m%d%H%M%S)" + log "Backing up existing agent to $backup_path" + cp -a "$install_path" "$backup_path" +else + log "No existing agent found at $install_path" +fi + +log "Installing new agent to $install_path" +install -m 0755 "$remote_binary" "$install_path" + +log "Installed binary details:" +ls -lh "$install_path" + +log "Starting $service_name" +systemctl start "$service_name" + +log "Service status:" +systemctl status "$service_name" --no-pager + +log "Recent service logs:" +journalctl -u "$service_name" -n 30 --no-pager +REMOTE + + log "Finished install on '$machine'" +done + +log "All done" +log "Tail logs with: orb -m ${MACHINES[0]} -u root journalctl -u $SERVICE_NAME -f" diff --git a/compose.dev.yml b/compose.dev.yml index 8143b70..4419b83 100644 --- a/compose.dev.yml +++ b/compose.dev.yml @@ -41,6 +41,17 @@ services: - "-retentionPeriod=7d" restart: unless-stopped + victoria-metrics: + image: victoriametrics/victoria-metrics:latest + ports: + - "8428:8428" + volumes: + - victoria-metrics-data:/vmdata + command: + - "-storageDataPath=/vmdata" + - "-retentionPeriod=${VM_RETENTION:-30d}" + restart: unless-stopped + minio: image: minio/minio:latest ports: @@ -61,15 +72,16 @@ services: - "8289:8289" command: inngest dev -u http://host.docker.internal:3000/api/inngest healthcheck: - test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8288/health"] + test: ["CMD", "inngest", "alpha", "doctor", "healthcheck"] interval: 30s timeout: 10s retries: 3 - start_period: 30s + start_period: 40s restart: unless-stopped volumes: registry-data: victoria-logs-data: + victoria-metrics-data: postgres_data: minio_data: diff --git a/deployment/.env.example b/deployment/.env.example index df2b5ef..9425024 100644 --- a/deployment/.env.example +++ b/deployment/.env.example @@ -18,6 +18,11 @@ VL_USERNAME=admin VL_PASSWORD=your-secure-logs-password VL_RETENTION=7d +# Victoria Metrics Authentication +VM_USERNAME=admin +VM_PASSWORD=your-secure-metrics-password +VM_RETENTION=30d + # Registry REGISTRY_USERNAME=admin REGISTRY_PASSWORD=your-registry-password diff --git a/deployment/compose.postgres.yml b/deployment/compose.postgres.yml index 3c8d0dd..db8b1bf 100644 --- a/deployment/compose.postgres.yml +++ b/deployment/compose.postgres.yml @@ -79,6 +79,8 @@ services: - APP_URL=https://${ROOT_DOMAIN} - VICTORIA_LOGS_URL=https://${VL_USERNAME}:${VL_PASSWORD}@logs.${ROOT_DOMAIN} - VICTORIA_LOGS_PRIVATE_URL=http://${VL_USERNAME}:${VL_PASSWORD}@victoria-logs:9428 + - VICTORIA_METRICS_URL=https://${VM_USERNAME}:${VM_PASSWORD}@metrics.${ROOT_DOMAIN} + - VICTORIA_METRICS_PRIVATE_URL=http://${VM_USERNAME}:${VM_PASSWORD}@victoria-metrics:8428 - REGISTRY_URL=registry:5000 - REGISTRY_HOST=registry.${ROOT_DOMAIN} - INNGEST_BASE_URL=http://inngest:8288 @@ -102,6 +104,8 @@ services: - APP_URL=https://${ROOT_DOMAIN} - VICTORIA_LOGS_URL=https://${VL_USERNAME}:${VL_PASSWORD}@logs.${ROOT_DOMAIN} - VICTORIA_LOGS_PRIVATE_URL=http://${VL_USERNAME}:${VL_PASSWORD}@victoria-logs:9428 + - VICTORIA_METRICS_URL=https://${VM_USERNAME}:${VM_PASSWORD}@metrics.${ROOT_DOMAIN} + - VICTORIA_METRICS_PRIVATE_URL=http://${VM_USERNAME}:${VM_PASSWORD}@victoria-metrics:8428 - REGISTRY_URL=registry:5000 - REGISTRY_HOST=registry.${ROOT_DOMAIN} - INNGEST_BASE_URL=http://inngest:8288 @@ -113,6 +117,8 @@ services: condition: service_completed_successfully victoria-logs: condition: service_started + victoria-metrics: + condition: service_started registry: condition: service_started labels: @@ -178,6 +184,30 @@ services: retries: 3 restart: unless-stopped + victoria-metrics: + image: victoriametrics/victoria-metrics:latest + env_file: + - ./.env + volumes: + - victoria-metrics-data:/vmdata + command: + - "-storageDataPath=/vmdata" + - "-retentionPeriod=${VM_RETENTION:-30d}" + - "-httpAuth.username=${VM_USERNAME}" + - "-httpAuth.password=${VM_PASSWORD}" + labels: + - "traefik.enable=true" + - "traefik.http.routers.metrics.rule=Host(`metrics.${ROOT_DOMAIN}`)" + - "traefik.http.routers.metrics.entrypoints=websecure" + - "traefik.http.routers.metrics.tls.certresolver=letsencrypt" + - "traefik.http.services.metrics.loadbalancer.server.port=8428" + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:8428/health || wget -q --spider http://127.0.0.1:8428/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + restart: unless-stopped + inngest: image: inngest/inngest:latest env_file: @@ -196,11 +226,11 @@ services: web: condition: service_healthy healthcheck: - test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8288/health"] + test: ["CMD", "inngest", "alpha", "doctor", "healthcheck"] interval: 30s timeout: 10s retries: 3 - start_period: 30s + start_period: 40s restart: unless-stopped volumes: @@ -208,4 +238,5 @@ volumes: postgres-data: registry-data: victoria-logs-data: + victoria-metrics-data: inngest-data: diff --git a/deployment/compose.production.yml b/deployment/compose.production.yml index e95a9b5..b32e054 100644 --- a/deployment/compose.production.yml +++ b/deployment/compose.production.yml @@ -61,6 +61,8 @@ services: - APP_URL=https://${ROOT_DOMAIN} - VICTORIA_LOGS_URL=https://${VL_USERNAME}:${VL_PASSWORD}@logs.${ROOT_DOMAIN} - VICTORIA_LOGS_PRIVATE_URL=http://${VL_USERNAME}:${VL_PASSWORD}@victoria-logs:9428 + - VICTORIA_METRICS_URL=https://${VM_USERNAME}:${VM_PASSWORD}@metrics.${ROOT_DOMAIN} + - VICTORIA_METRICS_PRIVATE_URL=http://${VM_USERNAME}:${VM_PASSWORD}@victoria-metrics:8428 - REGISTRY_URL=registry:5000 - REGISTRY_HOST=registry.${ROOT_DOMAIN} - INNGEST_BASE_URL=http://inngest:8288 @@ -81,6 +83,8 @@ services: - APP_URL=https://${ROOT_DOMAIN} - VICTORIA_LOGS_URL=https://${VL_USERNAME}:${VL_PASSWORD}@logs.${ROOT_DOMAIN} - VICTORIA_LOGS_PRIVATE_URL=http://${VL_USERNAME}:${VL_PASSWORD}@victoria-logs:9428 + - VICTORIA_METRICS_URL=https://${VM_USERNAME}:${VM_PASSWORD}@metrics.${ROOT_DOMAIN} + - VICTORIA_METRICS_PRIVATE_URL=http://${VM_USERNAME}:${VM_PASSWORD}@victoria-metrics:8428 - REGISTRY_URL=registry:5000 - REGISTRY_HOST=registry.${ROOT_DOMAIN} - INNGEST_BASE_URL=http://inngest:8288 @@ -92,6 +96,8 @@ services: condition: service_completed_successfully victoria-logs: condition: service_started + victoria-metrics: + condition: service_started registry: condition: service_started labels: @@ -157,6 +163,30 @@ services: retries: 3 restart: unless-stopped + victoria-metrics: + image: victoriametrics/victoria-metrics:latest + env_file: + - ./.env + volumes: + - victoria-metrics-data:/vmdata + command: + - "-storageDataPath=/vmdata" + - "-retentionPeriod=${VM_RETENTION:-30d}" + - "-httpAuth.username=${VM_USERNAME}" + - "-httpAuth.password=${VM_PASSWORD}" + labels: + - "traefik.enable=true" + - "traefik.http.routers.metrics.rule=Host(`metrics.${ROOT_DOMAIN}`)" + - "traefik.http.routers.metrics.entrypoints=websecure" + - "traefik.http.routers.metrics.tls.certresolver=letsencrypt" + - "traefik.http.services.metrics.loadbalancer.server.port=8428" + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:8428/health || wget -q --spider http://127.0.0.1:8428/-/healthy"] + interval: 30s + timeout: 10s + retries: 3 + restart: unless-stopped + inngest: image: inngest/inngest:latest env_file: @@ -175,15 +205,16 @@ services: web: condition: service_healthy healthcheck: - test: ["CMD", "curl", "-fsS", "http://127.0.0.1:8288/health"] + test: ["CMD", "inngest", "alpha", "doctor", "healthcheck"] interval: 30s timeout: 10s retries: 3 - start_period: 30s + start_period: 40s restart: unless-stopped volumes: letsencrypt: registry-data: victoria-logs-data: + victoria-metrics-data: inngest-data: diff --git a/web/.env.example b/web/.env.example index d7864d8..67fac89 100644 --- a/web/.env.example +++ b/web/.env.example @@ -15,6 +15,13 @@ APP_URL=https://your-domain.com VICTORIA_LOGS_URL=http://username:password@victoria-logs:9428 VICTORIA_LOGS_PRIVATE_URL=http://username:password@victoria-logs:9428 +# Metrics (optional) +VICTORIA_METRICS_URL=http://username:password@victoria-metrics:8428 +VICTORIA_METRICS_PRIVATE_URL=http://username:password@victoria-metrics:8428 +VM_USERNAME=username +VM_PASSWORD=password +VM_RETENTION=30d + # Docker Registry for builds (optional) REGISTRY_HOST=registry.example.com @@ -47,4 +54,4 @@ SMTP_USERNAME=smtp-user SMTP_PASSWORD=secret SMTP_ENCRYPTION=starttls SMTP_TIMEOUT=10000 -SMTP_ALERT_EMAILS=alerts@example.com,admin@example.com \ No newline at end of file +SMTP_ALERT_EMAILS=alerts@example.com,admin@example.com diff --git a/web/actions/backups.ts b/web/actions/backups.ts index 00817e0..28657cc 100644 --- a/web/actions/backups.ts +++ b/web/actions/backups.ts @@ -66,7 +66,10 @@ export async function restoreBackup( return { success: true }; } -export async function deleteBackup(backupId: string) { +export async function deleteBackup( + backupId: string, + options: { revalidate?: boolean } = {}, +) { const backup = await db .select({ status: volumeBackups.status, @@ -77,7 +80,9 @@ export async function deleteBackup(backupId: string) { .then((r) => r[0]); await db.delete(volumeBackups).where(eq(volumeBackups.id, backupId)); - revalidatePath(`/dashboard/projects`); + if (options.revalidate ?? true) { + revalidatePath(`/dashboard/projects`); + } if (backup?.status === "completed" && backup.storagePath) { const storageConfig = await getBackupStorageConfig(); diff --git a/web/actions/builds.ts b/web/actions/builds.ts index dc6a2cd..d480fed 100644 --- a/web/actions/builds.ts +++ b/web/actions/builds.ts @@ -1,6 +1,6 @@ "use server"; -import { eq } from "drizzle-orm"; +import { and, eq, isNull } from "drizzle-orm"; import { db } from "@/db"; import { builds, githubRepos, services } from "@/db/schema"; import { inngest } from "@/lib/inngest/client"; @@ -46,6 +46,15 @@ export async function retryBuild(buildId: string) { throw new Error("Build not found"); } + const [service] = await db + .select({ id: services.id }) + .from(services) + .where(and(eq(services.id, build.serviceId), isNull(services.deletedAt))); + + if (!service) { + throw new Error("Service not found"); + } + if (build.status !== "failed" && build.status !== "cancelled") { throw new Error(`Cannot retry build in ${build.status} status`); } @@ -72,7 +81,7 @@ export async function triggerBuild( const [service] = await db .select() .from(services) - .where(eq(services.id, serviceId)); + .where(and(eq(services.id, serviceId), isNull(services.deletedAt))); if (!service) { throw new Error("Service not found"); diff --git a/web/actions/projects.ts b/web/actions/projects.ts index b855b9d..3bf3938 100644 --- a/web/actions/projects.ts +++ b/web/actions/projects.ts @@ -1,17 +1,17 @@ "use server"; import { randomUUID } from "node:crypto"; -import { z, ZodError } from "zod"; -import { and, eq, inArray } from "drizzle-orm"; -import { - nameSchema, - replicaCountSchema, - volumeNameSchema, - containerPathSchema, - githubRepoUrlSchema, -} from "@/lib/schemas"; -import { getZodErrorMessage } from "@/lib/utils"; +import cronstrue from "cronstrue"; +import { and, desc, eq, inArray } from "drizzle-orm"; +import { revalidatePath } from "next/cache"; +import { ZodError, z } from "zod"; import { db } from "@/db"; +import { + getBackupStorageConfig, + getEnvironment, + getProject, + getService, +} from "@/db/queries"; import { deploymentPorts, deployments, @@ -25,22 +25,32 @@ import { serviceReplicas, services, serviceVolumes, + volumeBackups, workQueue, } from "@/db/schema"; -import { revalidatePath } from "next/cache"; -import { enqueueWork } from "@/lib/work-queue"; -import { - type HealthCheckConfig as ServiceHealthCheckConfig, - type PortConfig, -} from "@/lib/service-config"; -import { slugify } from "@/lib/utils"; -import { getEnvironment, getProject, getService } from "@/db/queries"; -import { allocatePort } from "@/lib/port-allocation"; -import cronstrue from "cronstrue"; -import { startMigration } from "./migrations"; import { DEFAULT_RESOURCE_LIMITS } from "@/lib/constants"; import { inngest } from "@/lib/inngest/client"; import { inngestEvents } from "@/lib/inngest/events"; +import { + calculateResourceAwarePlacement, + replaceServiceReplicaPlacements, +} from "@/lib/placement"; +import { allocatePort } from "@/lib/port-allocation"; +import { + containerPathSchema, + githubRepoUrlSchema, + nameSchema, + replicaCountSchema, + volumeNameSchema, +} from "@/lib/schemas"; +import type { + PortConfig, + HealthCheckConfig as ServiceHealthCheckConfig, +} from "@/lib/service-config"; +import { getZodErrorMessage, slugify } from "@/lib/utils"; +import { enqueueWork } from "@/lib/work-queue"; +import { deleteBackup } from "./backups"; +import { startMigration } from "./migrations"; function isValidImageReferencePart(reference: string): boolean { const tagPattern = /^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$/; @@ -278,10 +288,26 @@ export async function deleteProject(id: string) { } } + await deleteBackupsForServices(projectServices.map((service) => service.id)); await db.delete(projects).where(eq(projects.id, id)); return { success: true }; } +async function deleteBackupsForServices(serviceIds: string[]) { + if (serviceIds.length === 0) { + return; + } + + const backups = await db + .select({ id: volumeBackups.id }) + .from(volumeBackups) + .where(inArray(volumeBackups.serviceId, serviceIds)); + + for (const backup of backups) { + await deleteBackup(backup.id, { revalidate: false }); + } +} + export async function updateProjectName(projectId: string, name: string) { try { const validatedName = nameSchema.parse(name); @@ -364,6 +390,12 @@ export async function deleteEnvironment(environmentId: string) { throw new Error("Cannot delete the production environment"); } + const envServices = await db + .select({ id: services.id }) + .from(services) + .where(eq(services.environmentId, environmentId)); + + await deleteBackupsForServices(envServices.map((service) => service.id)); await db.delete(environments).where(eq(environments.id, environmentId)); return { success: true }; } @@ -455,8 +487,12 @@ export async function createService(input: CreateServiceInput) { return { id, name, image: finalImage, sourceType }; } -export async function deleteService(serviceId: string) { - const service = await getService(serviceId); +async function hardDeleteService(serviceId: string) { + const service = await db + .select() + .from(services) + .where(eq(services.id, serviceId)) + .then((r) => r[0]); if (!service) { throw new Error("Service not found"); } @@ -467,7 +503,10 @@ export async function deleteService(serviceId: string) { .where(eq(deployments.serviceId, serviceId)); for (const dep of allDeployments) { - if (dep.status === "running" && dep.containerId) { + if ( + (dep.status === "running" || dep.status === "healthy") && + dep.containerId + ) { await db .update(deployments) .set({ status: "stopping" }) @@ -499,12 +538,244 @@ export async function deleteService(serviceId: string) { } } + const backups = await db + .select({ id: volumeBackups.id }) + .from(volumeBackups) + .where(eq(volumeBackups.serviceId, serviceId)); + + for (const backup of backups) { + await deleteBackup(backup.id, { revalidate: false }); + } + await db.delete(secrets).where(eq(secrets.serviceId, serviceId)); await db.delete(services).where(eq(services.id, serviceId)); return { success: true }; } +export async function deleteService(serviceId: string) { + const service = await getService(serviceId); + if (!service) { + throw new Error("Service not found"); + } + + if (!service.stateful) { + return hardDeleteService(serviceId); + } + + const volumes = await db + .select() + .from(serviceVolumes) + .where(eq(serviceVolumes.serviceId, serviceId)); + + if (volumes.length === 0) { + return hardDeleteService(serviceId); + } + + const storageConfig = await getBackupStorageConfig(); + if (!storageConfig) { + throw new Error( + "Backup storage must be configured before deleting a stateful service", + ); + } + + if (service.deletionStatus && service.deletionStatus !== "failed") { + throw new Error("Deletion is already in progress for this service"); + } + + const runningDeployment = await db + .select({ + id: deployments.id, + serverId: deployments.serverId, + containerId: deployments.containerId, + }) + .from(deployments) + .where( + and( + eq(deployments.serviceId, serviceId), + inArray(deployments.status, ["running", "healthy"]), + ), + ) + .then((r) => r[0]); + + const reusableBackupIds: string[] = []; + + if (!runningDeployment || !runningDeployment.containerId) { + for (const volume of volumes) { + const latestBackup = await db + .select({ id: volumeBackups.id }) + .from(volumeBackups) + .where( + and( + eq(volumeBackups.volumeId, volume.id), + eq(volumeBackups.status, "completed"), + ), + ) + .orderBy(desc(volumeBackups.createdAt)) + .limit(1) + .then((r) => r[0]); + + if (!latestBackup) { + throw new Error( + "Stateful service must be running long enough to create a recoverable backup before deletion", + ); + } + + await db + .update(volumeBackups) + .set({ isDeletionBackup: true }) + .where(eq(volumeBackups.id, latestBackup.id)); + reusableBackupIds.push(latestBackup.id); + } + } + + await db + .update(services) + .set({ deletionStatus: "backing_up", deletionError: null }) + .where(eq(services.id, serviceId)); + + try { + await inngest.send( + inngestEvents.serviceDeletionStarted.create({ + serviceId, + reusableBackupIds, + }), + ); + } catch (error) { + await db + .update(services) + .set({ + deletionStatus: "failed", + deletionError: + error instanceof Error ? error.message : "Deletion workflow failed", + }) + .where(eq(services.id, serviceId)); + throw error; + } + + revalidatePath("/dashboard/projects"); + return { success: true, softDeleteStarted: true }; +} + +export async function restoreDeletedService(serviceId: string) { + const service = await db + .select() + .from(services) + .where(eq(services.id, serviceId)) + .then((r) => r[0]); + + if (!service || !service.deletedAt) { + throw new Error("Deleted service not found"); + } + + if (service.deletionStatus && service.deletionStatus !== "failed") { + throw new Error("A deletion or restore operation is already in progress"); + } + + if (service.originalHostname) { + const existingHostname = await db + .select({ id: services.id }) + .from(services) + .where(eq(services.hostname, service.originalHostname)); + + if (existingHostname.some((s) => s.id !== serviceId)) { + throw new Error( + "Cannot restore because another service is using the original hostname", + ); + } + } + + const volumes = await db + .select({ id: serviceVolumes.id }) + .from(serviceVolumes) + .where(eq(serviceVolumes.serviceId, serviceId)); + + const backupIds: string[] = []; + for (const volume of volumes) { + const backup = await db + .select({ id: volumeBackups.id }) + .from(volumeBackups) + .where( + and( + eq(volumeBackups.volumeId, volume.id), + eq(volumeBackups.isDeletionBackup, true), + eq(volumeBackups.status, "completed"), + ), + ) + .orderBy(desc(volumeBackups.createdAt)) + .limit(1) + .then((r) => r[0]); + + if (!backup) { + throw new Error("Cannot restore because a retained backup is missing"); + } + + backupIds.push(backup.id); + } + + let targetServerId: string | null = null; + + if (service.stateful) { + const existingReplicas = await db + .select({ + id: serviceReplicas.id, + serverId: serviceReplicas.serverId, + count: serviceReplicas.count, + serverStatus: servers.status, + }) + .from(serviceReplicas) + .leftJoin(servers, eq(serviceReplicas.serverId, servers.id)) + .where(eq(serviceReplicas.serviceId, serviceId)); + + const activeReplica = existingReplicas.find((r) => r.count > 0); + + if (activeReplica?.serverStatus === "online") { + targetServerId = activeReplica.serverId; + } else { + const placements = await calculateResourceAwarePlacement(service, 1); + const targetPlacement = placements[0]; + if (!targetPlacement) { + throw new Error("Cannot restore because no online server is available"); + } + + targetServerId = targetPlacement.serverId; + await replaceServiceReplicaPlacements(serviceId, placements); + } + } + + await db + .update(services) + .set({ + deletionStatus: "restoring", + deletionError: null, + lockedServerId: targetServerId ?? service.lockedServerId, + }) + .where(eq(services.id, serviceId)); + + try { + await inngest.send( + inngestEvents.serviceRestoreStarted.create({ + serviceId, + targetServerId, + backupIds, + }), + ); + } catch (error) { + await db + .update(services) + .set({ + deletionStatus: "failed", + deletionError: + error instanceof Error ? error.message : "Restore workflow failed", + }) + .where(eq(services.id, serviceId)); + throw error; + } + + revalidatePath("/dashboard/projects"); + return { success: true }; +} + export async function updateServiceHostname( serviceId: string, hostname: string, diff --git a/web/app/(dashboard)/dashboard/projects/[slug]/[env]/deleted/page.tsx b/web/app/(dashboard)/dashboard/projects/[slug]/[env]/deleted/page.tsx new file mode 100644 index 0000000..c14075a --- /dev/null +++ b/web/app/(dashboard)/dashboard/projects/[slug]/[env]/deleted/page.tsx @@ -0,0 +1,62 @@ +import { ArrowLeft } from "lucide-react"; +import Link from "next/link"; +import { notFound } from "next/navigation"; +import { SetBreadcrumbs } from "@/components/core/breadcrumb-data"; +import { DeletedServicesPanel } from "@/components/service/deleted-services-panel"; +import { Button } from "@/components/ui/button"; +import { + getEnvironmentByName, + getProjectBySlug, + listDeletedServices, +} from "@/db/queries"; + +export default async function DeletedServicesPage({ + params, +}: { + params: Promise<{ slug: string; env: string }>; +}) { + const { slug, env: envName } = await params; + const project = await getProjectBySlug(slug); + + if (!project) { + notFound(); + } + + const environment = await getEnvironmentByName(project.id, envName); + + if (!environment) { + notFound(); + } + + const deletedServices = await listDeletedServices(project.id, environment.id); + const environmentHref = `/dashboard/projects/${slug}/${envName}`; + + return ( + <> + +
+
+
+

Deleted services

+

+ Restore stateful services before their 7-day retention expires. +

+
+ + + +
+ +
+ + ); +} diff --git a/web/app/(dashboard)/dashboard/projects/[slug]/[env]/services/[serviceId]/configuration/page.tsx b/web/app/(dashboard)/dashboard/projects/[slug]/[env]/services/[serviceId]/configuration/page.tsx index 0cd418e..c014b42 100644 --- a/web/app/(dashboard)/dashboard/projects/[slug]/[env]/services/[serviceId]/configuration/page.tsx +++ b/web/app/(dashboard)/dashboard/projects/[slug]/[env]/services/[serviceId]/configuration/page.tsx @@ -1,21 +1,22 @@ "use client"; -import { useCallback, useState } from "react"; +import { Trash2 } from "lucide-react"; import { useRouter } from "next/navigation"; +import { useCallback, useState } from "react"; import { toast } from "sonner"; import { useSWRConfig } from "swr"; import { deleteService } from "@/actions/projects"; -import { useService } from "@/components/service/service-layout-client"; -import { SourceSection } from "@/components/service/details/source-section"; -import { ReplicasSection } from "@/components/service/details/replicas-section"; -import { VolumesSection } from "@/components/service/details/volumes-section"; -import { SecretsSection } from "@/components/service/details/secrets-section"; -import { PortsSection } from "@/components/service/details/ports-section"; -import { TCPProxySection } from "@/components/service/details/tcp-proxy-section"; import { HealthCheckSection } from "@/components/service/details/health-check-section"; +import { PortsSection } from "@/components/service/details/ports-section"; +import { ReplicasSection } from "@/components/service/details/replicas-section"; import { ResourceLimitsSection } from "@/components/service/details/resource-limits-section"; -import { StartCommandSection } from "@/components/service/details/start-command-section"; import { ScheduleSection } from "@/components/service/details/schedule-section"; +import { SecretsSection } from "@/components/service/details/secrets-section"; +import { SourceSection } from "@/components/service/details/source-section"; +import { StartCommandSection } from "@/components/service/details/start-command-section"; +import { TCPProxySection } from "@/components/service/details/tcp-proxy-section"; +import { VolumesSection } from "@/components/service/details/volumes-section"; +import { useService } from "@/components/service/service-layout-client"; import { AlertDialog, AlertDialogAction, @@ -29,13 +30,32 @@ import { } from "@/components/ui/alert-dialog"; import { Button } from "@/components/ui/button"; import { Item, ItemContent, ItemMedia, ItemTitle } from "@/components/ui/item"; -import { Trash2 } from "lucide-react"; + +const ACTIVE_DELETE_BACKUP_STATUSES = ["running", "healthy"] as const; + +function formatBackupDate(value: Date | string | null | undefined) { + if (!value) return "an unknown time"; + return new Date(value).toLocaleString(); +} export default function ConfigurationPage() { const router = useRouter(); const { mutate: globalMutate } = useSWRConfig(); const { service, projectSlug, envName, proxyDomain, onUpdate } = useService(); const [isDeleting, setIsDeleting] = useState(false); + const hasActiveDeploymentForBackup = service.deployments.some( + (deployment) => + ACTIVE_DELETE_BACKUP_STATUSES.includes( + deployment.status as (typeof ACTIVE_DELETE_BACKUP_STATUSES)[number], + ) && !!deployment.containerId, + ); + const hasVolumes = (service.volumes?.length ?? 0) > 0; + const willReuseCompletedBackups = + service.stateful && hasVolumes && !hasActiveDeploymentForBackup; + const hasCompletedBackupForEveryVolume = + service.deletionBackupFallback && + service.deletionBackupFallback.backedUpVolumeCount === + service.deletionBackupFallback.volumeCount; const handleConfigSave = useCallback(() => { onUpdate(); @@ -47,6 +67,9 @@ export default function ConfigurationPage() { try { await deleteService(service.id); await globalMutate(`/api/projects/${service.projectId}/services`); + toast.success( + service.stateful ? "Delete workflow started" : "Service deleted", + ); router.push(`/dashboard/projects/${projectSlug}/${envName}`); } finally { setIsDeleting(false); @@ -89,8 +112,9 @@ export default function ConfigurationPage() { Delete this service

- Once deleted, this service and all its deployments will be - permanently removed. + {service.stateful + ? "Stateful services are backed up and retained for 7 days before permanent purge." + : "Once deleted, this service and all its deployments will be permanently removed."}

@@ -101,8 +125,45 @@ export default function ConfigurationPage() { Delete {service.name}? - This action cannot be undone. This will permanently delete - the service and all its deployments. + {service.stateful ? ( + <> + This starts a backup-first delete workflow. The service + will be restorable from Deleted services until its + retention window expires. + {willReuseCompletedBackups && + hasCompletedBackupForEveryVolume && ( + <> +
+
+ + This service is not currently running. + {" "} + Restore will use the latest completed backups for + its volumes. The oldest selected backup is from{" "} + {formatBackupDate( + service.deletionBackupFallback + ?.oldestLatestBackupAt, + )} + ; changes after that backup will not be restored. + + )} + {willReuseCompletedBackups && + !hasCompletedBackupForEveryVolume && ( + <> +
+
+ + No completed backup is available for every + volume. + {" "} + Delete will fail unless the service is running so + a fresh deletion backup can be created. + + )} + + ) : ( + "This action cannot be undone. This will permanently delete the service and all its deployments." + )}
diff --git a/web/app/(dashboard)/dashboard/servers/[id]/page.tsx b/web/app/(dashboard)/dashboard/servers/[id]/page.tsx index 16fe704..cd95ecd 100644 --- a/web/app/(dashboard)/dashboard/servers/[id]/page.tsx +++ b/web/app/(dashboard)/dashboard/servers/[id]/page.tsx @@ -1,5 +1,11 @@ import { notFound } from "next/navigation"; -import { getServerDetails } from "@/db/queries"; +import { SetBreadcrumbs } from "@/components/core/breadcrumb-data"; +import { LogViewer } from "@/components/logs/log-viewer"; +import { AgentUpdateNudge } from "@/components/server/agent-update-nudge"; +import { ServerDangerZone } from "@/components/server/server-danger-zone"; +import { ServerHeader } from "@/components/server/server-header"; +import { ServerHealthDetails } from "@/components/server/server-health-details"; +import { ServerServices } from "@/components/server/server-services"; import { Card, CardContent, @@ -7,15 +13,10 @@ import { CardHeader, CardTitle, } from "@/components/ui/card"; -import { SetBreadcrumbs } from "@/components/core/breadcrumb-data"; -import { LogViewer } from "@/components/logs/log-viewer"; import { Label } from "@/components/ui/label"; -import { ServerDangerZone } from "@/components/server/server-danger-zone"; -import { ServerHeader } from "@/components/server/server-header"; -import { ServerHealthDetails } from "@/components/server/server-health-details"; -import { ServerServices } from "@/components/server/server-services"; -import { AgentUpdateNudge } from "@/components/server/agent-update-nudge"; +import { getServerDetails, metricSnapshotToHealthStats } from "@/db/queries"; import { formatRelativeTime } from "@/lib/date"; +import { queryNodeMetricsSnapshot } from "@/lib/victoria-metrics"; async function getLatestAgentVersion(): Promise { try { @@ -39,9 +40,10 @@ export default async function ServerDetailPage({ params: Promise<{ id: string }>; }) { const { id } = await params; - const [server, latestVersion] = await Promise.all([ + const [server, latestVersion, metricsSnapshot] = await Promise.all([ getServerDetails(id), getLatestAgentVersion(), + queryNodeMetricsSnapshot(id).catch(() => null), ]); if (!server) { @@ -172,7 +174,7 @@ export default async function ServerDetailPage({ 0) { + const completedBackups = await db + .select({ + volumeId: volumeBackups.volumeId, + createdAt: volumeBackups.createdAt, + completedAt: volumeBackups.completedAt, + }) + .from(volumeBackups) + .where( + and( + eq(volumeBackups.serviceId, service.id), + eq(volumeBackups.status, "completed"), + ), + ) + .orderBy(desc(volumeBackups.createdAt)); + + const latestByVolume = new Map(); + for (const backup of completedBackups) { + if (!latestByVolume.has(backup.volumeId)) { + latestByVolume.set( + backup.volumeId, + backup.completedAt ?? backup.createdAt, + ); + } + } + + const latestBackupTimes = volumes + .map((volume) => latestByVolume.get(volume.id) ?? null) + .filter((value): value is Date | string => value !== null); + + deletionBackupFallback = { + volumeCount: volumes.length, + backedUpVolumeCount: latestBackupTimes.length, + oldestLatestBackupAt: + latestBackupTimes.length > 0 + ? latestBackupTimes.reduce((oldest, value) => + getBackupTime(value) < getBackupTime(oldest) ? value : oldest, + ) + : null, + newestLatestBackupAt: + latestBackupTimes.length > 0 + ? latestBackupTimes.reduce((newest, value) => + getBackupTime(value) > getBackupTime(newest) ? value : newest, + ) + : null, + }; + } + return { ...service, ports, @@ -150,6 +205,7 @@ export async function GET( volumes, lockedServer, latestBuild, + deletionBackupFallback, }; }), ); diff --git a/web/app/api/servers/[id]/metrics/route.ts b/web/app/api/servers/[id]/metrics/route.ts new file mode 100644 index 0000000..8ae62d1 --- /dev/null +++ b/web/app/api/servers/[id]/metrics/route.ts @@ -0,0 +1,81 @@ +import { headers } from "next/headers"; +import { auth } from "@/lib/auth"; +import { + emptyHistory, + isMetricsEnabled, + queryNodeMetricsHistory, + queryNodeMetricsSnapshot, +} from "@/lib/victoria-metrics"; + +const RANGE_OPTIONS = { + "1h": { durationMs: 60 * 60 * 1000, stepSeconds: 30 }, + "6h": { durationMs: 6 * 60 * 60 * 1000, stepSeconds: 60 }, + "24h": { durationMs: 24 * 60 * 60 * 1000, stepSeconds: 5 * 60 }, + "7d": { durationMs: 7 * 24 * 60 * 60 * 1000, stepSeconds: 30 * 60 }, +} as const; + +type RangeKey = keyof typeof RANGE_OPTIONS; + +export async function GET( + request: Request, + { params }: { params: Promise<{ id: string }> }, +) { + const session = await auth.api.getSession({ + headers: await headers(), + }); + + if (!session) { + return new Response("Unauthorized", { status: 401 }); + } + + const { id: serverId } = await params; + const url = new URL(request.url); + const range = parseRange(url.searchParams.get("range")); + + if (!isMetricsEnabled()) { + return Response.json({ + current: null, + history: emptyHistory(), + range, + enabled: false, + }); + } + + const end = new Date(); + const option = RANGE_OPTIONS[range]; + const start = new Date(end.getTime() - option.durationMs); + + try { + const [current, history] = await Promise.all([ + queryNodeMetricsSnapshot(serverId), + queryNodeMetricsHistory({ + serverId, + start, + end, + stepSeconds: option.stepSeconds, + }), + ]); + + return Response.json({ + current, + history, + range, + enabled: true, + }); + } catch (error) { + console.error("[metrics:server] failed to query metrics:", error); + return Response.json({ + current: null, + history: emptyHistory(), + range, + enabled: true, + }); + } +} + +function parseRange(value: string | null): RangeKey { + if (value && value in RANGE_OPTIONS) { + return value as RangeKey; + } + return "1h"; +} diff --git a/web/app/api/v1/agent/builds/[id]/route.ts b/web/app/api/v1/agent/builds/[id]/route.ts index ea2d4cb..dc6f1e0 100644 --- a/web/app/api/v1/agent/builds/[id]/route.ts +++ b/web/app/api/v1/agent/builds/[id]/route.ts @@ -1,4 +1,4 @@ -import { and, eq } from "drizzle-orm"; +import { and, eq, isNull } from "drizzle-orm"; import { type NextRequest, NextResponse } from "next/server"; import { db } from "@/db"; import { getSetting } from "@/db/queries"; @@ -51,10 +51,18 @@ export async function GET( const service = await db .select() .from(services) - .where(eq(services.id, build.serviceId)) + .where(and(eq(services.id, build.serviceId), isNull(services.deletedAt))) .then((r) => r[0]); if (!service) { + await db + .update(builds) + .set({ + status: "failed", + error: "Service not found", + completedAt: new Date(), + }) + .where(eq(builds.id, buildId)); return NextResponse.json({ error: "Service not found" }, { status: 404 }); } diff --git a/web/app/api/v1/agent/builds/[id]/status/route.ts b/web/app/api/v1/agent/builds/[id]/status/route.ts index 84586ff..eafcbd8 100644 --- a/web/app/api/v1/agent/builds/[id]/status/route.ts +++ b/web/app/api/v1/agent/builds/[id]/status/route.ts @@ -1,4 +1,4 @@ -import { and, eq } from "drizzle-orm"; +import { and, eq, isNull } from "drizzle-orm"; import { type NextRequest, NextResponse } from "next/server"; import { deployService } from "@/actions/projects"; import { db } from "@/db"; @@ -184,10 +184,14 @@ export async function POST( const service = await db .select() .from(services) - .where(eq(services.id, build.serviceId)) + .where(and(eq(services.id, build.serviceId), isNull(services.deletedAt))) .then((r) => r[0]); if (!service) { + await db + .update(builds) + .set({ error: "Service not found" }) + .where(eq(builds.id, buildId)); return NextResponse.json({ error: "Service not found" }, { status: 404 }); } @@ -243,7 +247,9 @@ export async function POST( await db .update(services) .set({ image: baseImageUri }) - .where(eq(services.id, build.serviceId)); + .where( + and(eq(services.id, build.serviceId), isNull(services.deletedAt)), + ); await sendBuildCompletedEvent({ buildId, @@ -305,7 +311,9 @@ export async function POST( await db .update(services) .set({ image: baseImageUri }) - .where(eq(services.id, build.serviceId)); + .where( + and(eq(services.id, build.serviceId), isNull(services.deletedAt)), + ); } await sendBuildCompletedEvent({ @@ -328,7 +336,9 @@ export async function POST( await db .update(services) .set({ image: baseImageUri }) - .where(eq(services.id, build.serviceId)); + .where( + and(eq(services.id, build.serviceId), isNull(services.deletedAt)), + ); const replicas = await db .select() diff --git a/web/app/api/v1/agent/expected-state/route.ts b/web/app/api/v1/agent/expected-state/route.ts index f364cd4..8cca75c 100644 --- a/web/app/api/v1/agent/expected-state/route.ts +++ b/web/app/api/v1/agent/expected-state/route.ts @@ -1,4 +1,4 @@ -import { and, eq, inArray } from "drizzle-orm"; +import { and, eq, inArray, isNull } from "drizzle-orm"; import { type NextRequest, NextResponse } from "next/server"; import { db } from "@/db"; import { @@ -62,7 +62,7 @@ export async function GET(request: NextRequest) { const service = await db .select() .from(services) - .where(eq(services.id, dep.serviceId)) + .where(and(eq(services.id, dep.serviceId), isNull(services.deletedAt))) .then((r) => r[0]); if (!service) continue; @@ -134,7 +134,10 @@ export async function GET(request: NextRequest) { }); } - const allServices = await db.select().from(services); + const allServices = await db + .select() + .from(services) + .where(isNull(services.deletedAt)); const dnsRecords = []; for (const service of allServices) { diff --git a/web/app/api/v1/agent/register/route.ts b/web/app/api/v1/agent/register/route.ts index d220662..5233a42 100644 --- a/web/app/api/v1/agent/register/route.ts +++ b/web/app/api/v1/agent/register/route.ts @@ -1,10 +1,10 @@ +import { and, eq, gt, isNull } from "drizzle-orm"; import { type NextRequest, NextResponse } from "next/server"; import { db } from "@/db"; import { servers } from "@/db/schema"; -import { eq, and, isNull, gt } from "drizzle-orm"; -import { assignSubnet } from "@/lib/wireguard"; import { agentRegisterSchema } from "@/lib/schemas"; import { formatZodErrors } from "@/lib/utils"; +import { assignSubnet } from "@/lib/wireguard"; const TOKEN_EXPIRY_HOURS = 24; @@ -78,6 +78,7 @@ export async function POST(request: NextRequest) { wireguardIp, encryptionKey: process.env.ENCRYPTION_KEY, loggingEndpoint: process.env.VICTORIA_LOGS_URL ?? null, + metricsEndpoint: process.env.VICTORIA_METRICS_URL ?? null, registryUrl: process.env.REGISTRY_URL ?? null, registryUsername: process.env.REGISTRY_USERNAME ?? null, registryPassword: process.env.REGISTRY_PASSWORD ?? null, diff --git a/web/app/api/webhooks/github/route.ts b/web/app/api/webhooks/github/route.ts index cbaf324..2b6b0e5 100644 --- a/web/app/api/webhooks/github/route.ts +++ b/web/app/api/webhooks/github/route.ts @@ -1,4 +1,5 @@ -import { NextRequest, NextResponse } from "next/server"; +import { and, eq, isNull } from "drizzle-orm"; +import { type NextRequest, NextResponse } from "next/server"; import { db } from "@/db"; import { builds, @@ -6,11 +7,10 @@ import { githubRepos, services, } from "@/db/schema"; -import { eq, and } from "drizzle-orm"; import { - verifyWebhookSignature, createGitHubDeployment, updateGitHubDeploymentStatus, + verifyWebhookSignature, } from "@/lib/github"; import { inngest } from "@/lib/inngest/client"; import { inngestEvents } from "@/lib/inngest/events"; @@ -140,7 +140,9 @@ async function handlePushEvent(payload: PushPayload) { const service = await db .select() .from(services) - .where(eq(services.id, githubRepo.serviceId)) + .where( + and(eq(services.id, githubRepo.serviceId), isNull(services.deletedAt)), + ) .then((r) => r[0]); if (!service) { diff --git a/web/app/fonts/IoskeleyMono-Regular.woff2 b/web/app/fonts/IoskeleyMono-Regular.woff2 new file mode 100644 index 0000000..a9351a9 Binary files /dev/null and b/web/app/fonts/IoskeleyMono-Regular.woff2 differ diff --git a/web/app/fonts/Lilex.woff2 b/web/app/fonts/Lilex.woff2 deleted file mode 100644 index 84f58fd..0000000 Binary files a/web/app/fonts/Lilex.woff2 and /dev/null differ diff --git a/web/app/globals.css b/web/app/globals.css index b07102b..24cec57 100644 --- a/web/app/globals.css +++ b/web/app/globals.css @@ -8,8 +8,8 @@ --font-sans: var(--font-inter), ui-sans-serif, system-ui, sans-serif; --font-sans--font-feature-settings: "cv11", "ss03"; --font-mono: - var(--font-lilex), ui-monospace, SFMono-Regular, "SF Mono", Menlo, Consolas, - monospace; + var(--font-ioskeley-mono), ui-monospace, SFMono-Regular, "SF Mono", + Menlo, Consolas, monospace; --color-background: var(--background); --color-foreground: var(--foreground); --color-sidebar-ring: var(--sidebar-ring); diff --git a/web/app/layout.tsx b/web/app/layout.tsx index 477d926..d9e029e 100644 --- a/web/app/layout.tsx +++ b/web/app/layout.tsx @@ -10,9 +10,10 @@ const inter = localFont({ weight: "100 900", }); -const lilex = localFont({ - src: "./fonts/Lilex.woff2", - variable: "--font-lilex", +const ioskeleyMono = localFont({ + src: "./fonts/IoskeleyMono-Regular.woff2", + variable: "--font-ioskeley-mono", + weight: "400", }); export const metadata: Metadata = { @@ -35,7 +36,7 @@ export default function RootLayout({ return ( {children} diff --git a/web/components/server/server-health-details.tsx b/web/components/server/server-health-details.tsx index 021955e..de6973e 100644 --- a/web/components/server/server-health-details.tsx +++ b/web/components/server/server-health-details.tsx @@ -9,15 +9,15 @@ import { Network, } from "lucide-react"; import useSWR from "swr"; -import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; -import { ResourceBar } from "@/components/cluster/resource-bar"; import { HealthIndicator } from "@/components/cluster/health-indicator"; +import { ResourceBar } from "@/components/cluster/resource-bar"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; import { Separator } from "@/components/ui/separator"; +import type { HealthStats, Server } from "@/db/types"; import { fetcher } from "@/lib/fetcher"; -import type { Server } from "@/db/types"; type ServerHealthData = { - healthStats: Server["healthStats"]; + healthStats: HealthStats | null; networkHealth: Server["networkHealth"]; containerHealth: Server["containerHealth"]; agentHealth: Server["agentHealth"]; diff --git a/web/components/service/deleted-services-panel.tsx b/web/components/service/deleted-services-panel.tsx new file mode 100644 index 0000000..957b8f6 --- /dev/null +++ b/web/components/service/deleted-services-panel.tsx @@ -0,0 +1,150 @@ +"use client"; + +import { RotateCcw } from "lucide-react"; +import { useRouter } from "next/navigation"; +import { useState } from "react"; +import { toast } from "sonner"; +import { restoreDeletedService } from "@/actions/projects"; +import { + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, + AlertDialogTrigger, +} from "@/components/ui/alert-dialog"; +import { Button } from "@/components/ui/button"; +import { Empty, EmptyDescription, EmptyTitle } from "@/components/ui/empty"; +import type { Service } from "@/db/types"; + +type DeletedService = Pick< + Service, + | "id" + | "name" + | "image" + | "deletedAt" + | "purgeAfter" + | "deletionStatus" + | "deletionError" +>; + +function formatDate(value: Date | string | null) { + if (!value) return "Unknown"; + return new Date(value).toLocaleString(); +} + +export function DeletedServicesPanel({ + services, +}: { + services: DeletedService[]; +}) { + const router = useRouter(); + const [restoreId, setRestoreId] = useState(null); + const [openRestoreId, setOpenRestoreId] = useState(null); + + const handleRestore = async (serviceId: string) => { + setRestoreId(serviceId); + setOpenRestoreId(null); + try { + await restoreDeletedService(serviceId); + toast.success("Restore started"); + router.refresh(); + } catch (error) { + toast.error( + error instanceof Error ? error.message : "Failed to restore service", + ); + } finally { + setRestoreId(null); + } + }; + + if (services.length === 0) { + return ( + + No deleted services + + Stateful services deleted from this environment will appear here until + their retention window expires. + + + ); + } + + return ( +
+ {services.map((service) => { + const restoreDisabled = + restoreId === service.id || + (!!service.deletionStatus && service.deletionStatus !== "failed"); + const restoreLabel = + restoreId === service.id || service.deletionStatus === "restoring" + ? "Restoring..." + : "Restore"; + + return ( +
+
+
+

{service.name}

+ {service.deletionStatus && ( + + {service.deletionStatus} + + )} +
+

+ {service.image} +

+

+ Deleted {formatDate(service.deletedAt)}. Purges{" "} + {formatDate(service.purgeAfter)}. +

+ {service.deletionError && ( +

+ {service.deletionError} +

+ )} +
+ + setOpenRestoreId(open ? service.id : null) + } + > + } + > + + {restoreLabel} + + + + Restore {service.name}? + + This will recreate the service deployment and restore its + retained volumes from the deletion backup. + + + + Cancel + handleRestore(service.id)} + disabled={restoreDisabled} + > + Restore + + + + +
+ ); + })} +
+ ); +} diff --git a/web/components/service/service-canvas.tsx b/web/components/service/service-canvas.tsx index c9e3fe3..f488be4 100644 --- a/web/components/service/service-canvas.tsx +++ b/web/components/service/service-canvas.tsx @@ -1,9 +1,5 @@ "use client"; -import { useMemo, useState } from "react"; -import { useRouter } from "next/navigation"; -import Link from "next/link"; -import useSWR from "swr"; import { ArrowLeftRight, Box, @@ -13,30 +9,15 @@ import { Lock, Network, Settings, + Trash2, Upload, } from "lucide-react"; -import type { Environment, ServiceWithDetails } from "@/db/types"; -import { fetcher } from "@/lib/fetcher"; -import { cn } from "@/lib/utils"; -import { - AddServiceMenu, - CreateDockerServiceDialog, - CreateGitHubServiceDialog, -} from "./create-service-dialog"; -import { Button } from "@/components/ui/button"; -import { getStatusColorFromDeployments } from "@/components/ui/canvas-wrapper"; +import Link from "next/link"; +import { useRouter } from "next/navigation"; +import { useMemo, useState } from "react"; +import useSWR from "swr"; import { buttonVariants } from "@/components/ui/button"; -import { - NativeSelect, - NativeSelectOption, -} from "@/components/ui/native-select"; -import { - Empty, - EmptyContent, - EmptyDescription, - EmptyMedia, - EmptyTitle, -} from "@/components/ui/empty"; +import { getStatusColorFromDeployments } from "@/components/ui/canvas-wrapper"; import { ContextMenu, ContextMenuContent, @@ -47,6 +28,25 @@ import { ContextMenuSubTrigger, ContextMenuTrigger, } from "@/components/ui/context-menu"; +import { + Empty, + EmptyContent, + EmptyDescription, + EmptyMedia, + EmptyTitle, +} from "@/components/ui/empty"; +import { + NativeSelect, + NativeSelectOption, +} from "@/components/ui/native-select"; +import type { Environment, ServiceWithDetails } from "@/db/types"; +import { fetcher } from "@/lib/fetcher"; +import { cn } from "@/lib/utils"; +import { + AddServiceMenu, + CreateDockerServiceDialog, + CreateGitHubServiceDialog, +} from "./create-service-dialog"; function ServiceCardSkeleton() { return ( @@ -105,6 +105,16 @@ function EnvironmentSelector({ ))} + + + Deleted + s.status === "online"); - const serversWithHealth = onlineServers.filter((s) => s.healthStats); + const metricsByServer = await queryNodeMetricsSnapshots( + onlineServers.map((server) => server.id), + ).catch((error) => { + console.error("[cluster-health] failed to query metrics:", error); + return new Map(); + }); + + const serversWithHealth = allServers.map((server) => ({ + ...server, + healthStats: metricSnapshotToHealthStats(metricsByServer.get(server.id)), + })); + const serversWithCurrentMetrics = serversWithHealth.filter( + (server) => server.status === "online" && server.healthStats, + ); let avgCpuUsage = 0; let avgMemoryUsage = 0; - if (serversWithHealth.length > 0) { - const cpuSum = serversWithHealth.reduce( + if (serversWithCurrentMetrics.length > 0) { + const cpuSum = serversWithCurrentMetrics.reduce( (sum, s) => sum + (s.healthStats?.cpuUsagePercent ?? 0), 0, ); - const memSum = serversWithHealth.reduce( + const memSum = serversWithCurrentMetrics.reduce( (sum, s) => sum + (s.healthStats?.memoryUsagePercent ?? 0), 0, ); - avgCpuUsage = cpuSum / serversWithHealth.length; - avgMemoryUsage = memSum / serversWithHealth.length; + avgCpuUsage = cpuSum / serversWithCurrentMetrics.length; + avgMemoryUsage = memSum / serversWithCurrentMetrics.length; } const networkHealthy = onlineServers.filter( @@ -220,7 +269,39 @@ export async function getClusterHealth() { networkHealthy, containerHealthy, }, - servers: allServers, + servers: serversWithHealth, + }; +} + +export function metricSnapshotToHealthStats( + snapshot: + | { + cpuUsagePercent: number | null; + memoryUsagePercent: number | null; + memoryUsedBytes: number | null; + diskUsagePercent: number | null; + diskUsedBytes: number | null; + } + | null + | undefined, +): HealthStats | null { + if (!snapshot) return null; + if ( + snapshot.cpuUsagePercent === null && + snapshot.memoryUsagePercent === null && + snapshot.memoryUsedBytes === null && + snapshot.diskUsagePercent === null && + snapshot.diskUsedBytes === null + ) { + return null; + } + + return { + cpuUsagePercent: snapshot.cpuUsagePercent ?? 0, + memoryUsagePercent: snapshot.memoryUsagePercent ?? 0, + memoryUsedMb: Math.round((snapshot.memoryUsedBytes ?? 0) / 1024 / 1024), + diskUsagePercent: snapshot.diskUsagePercent ?? 0, + diskUsedGb: Math.round((snapshot.diskUsedBytes ?? 0) / 1024 / 1024 / 1024), }; } diff --git a/web/db/schema.ts b/web/db/schema.ts index fef32b5..f5d6460 100644 --- a/web/db/schema.ts +++ b/web/db/schema.ts @@ -182,14 +182,6 @@ type ServerMeta = { hostname?: string; }; -export type HealthStats = { - cpuUsagePercent: number; - memoryUsagePercent: number; - memoryUsedMb: number; - diskUsagePercent: number; - diskUsedGb: number; -}; - export type NetworkPeer = { id: string; lastSeenSecs: number; @@ -232,7 +224,6 @@ export const servers = pgTable("servers", { resourcesMemory: integer("resources_memory"), resourcesDisk: integer("resources_disk"), meta: jsonb("meta").$type(), - healthStats: jsonb("health_stats").$type(), networkHealth: jsonb("network_health").$type(), containerHealth: jsonb("container_health").$type(), agentHealth: jsonb("agent_health").$type(), @@ -311,6 +302,13 @@ export const services = pgTable("services", { }), backupEnabled: boolean("backup_enabled").default(false), backupSchedule: text("backup_schedule"), + deletedAt: timestamp("deleted_at", { withTimezone: true }), + purgeAfter: timestamp("purge_after", { withTimezone: true }), + originalHostname: text("original_hostname"), + deletionStatus: text("deletion_status", { + enum: ["backing_up", "deleting", "restoring", "failed"], + }), + deletionError: text("deletion_error"), migrationStatus: text("migration_status", { enum: [ "stopping", @@ -397,6 +395,7 @@ export const volumeBackups = pgTable( checksum: text("checksum"), errorMessage: text("error_message"), isMigrationBackup: boolean("is_migration_backup").default(false), + isDeletionBackup: boolean("is_deletion_backup").default(false), createdAt: timestamp("created_at", { withTimezone: true }) .defaultNow() .notNull(), diff --git a/web/db/types.ts b/web/db/types.ts index 548c1fa..33cd916 100644 --- a/web/db/types.ts +++ b/web/db/types.ts @@ -8,11 +8,11 @@ import type { projects, rollouts, secrets, - serviceReplicas, + servers, servicePorts, + serviceReplicas, services, serviceVolumes, - servers, volumeBackups, workQueue, } from "./schema"; @@ -39,6 +39,14 @@ export type HealthStatus = Deployment["healthStatus"]; export type RolloutStatus = NonNullable; export type BuildStatus = NonNullable; +export type HealthStats = { + cpuUsagePercent: number; + memoryUsagePercent: number; + memoryUsedMb: number; + diskUsagePercent: number; + diskUsedGb: number; +}; + export type ServiceWithDetails = Service & { ports: ServicePort[]; configuredReplicas: Array; @@ -55,4 +63,10 @@ export type ServiceWithDetails = Service & { rollouts?: Rollout[]; lockedServer?: Pick | null; latestBuild?: Pick | null; + deletionBackupFallback?: { + volumeCount: number; + backedUpVolumeCount: number; + oldestLatestBackupAt: Date | string | null; + newestLatestBackupAt: Date | string | null; + } | null; }; diff --git a/web/lib/agent-status.ts b/web/lib/agent-status.ts index e1586ed..65e200c 100644 --- a/web/lib/agent-status.ts +++ b/web/lib/agent-status.ts @@ -5,7 +5,6 @@ import { type ContainerHealth, deploymentPorts, deployments, - type HealthStats, type NetworkHealth, rollouts, servers, @@ -37,7 +36,6 @@ export type StatusReport = { meta?: Record; containers: ContainerStatus[]; dnsInSync?: boolean; - healthStats?: HealthStats; networkHealth?: NetworkHealth; containerHealth?: ContainerHealth; agentHealth?: AgentHealth; @@ -74,9 +72,6 @@ export async function applyStatusReport( updateData.meta = report.meta; } - if (report.healthStats) { - updateData.healthStats = report.healthStats; - } if (report.networkHealth) { updateData.networkHealth = report.networkHealth; } diff --git a/web/lib/backup-scheduler.ts b/web/lib/backup-scheduler.ts index 01d77eb..6a0a219 100644 --- a/web/lib/backup-scheduler.ts +++ b/web/lib/backup-scheduler.ts @@ -1,4 +1,4 @@ -import { and, desc, eq, lt } from "drizzle-orm"; +import { and, desc, eq, isNull, lt } from "drizzle-orm"; import { deleteBackup } from "@/actions/backups"; import { db } from "@/db"; import { getBackupStorageConfig } from "@/db/queries"; @@ -61,7 +61,7 @@ export async function runScheduledBackups() { backupSchedule: services.backupSchedule, }) .from(services) - .where(eq(services.backupEnabled, true)); + .where(and(eq(services.backupEnabled, true), isNull(services.deletedAt))); for (const service of servicesWithBackup) { if (!service.backupSchedule) { @@ -134,6 +134,7 @@ export async function cleanupOldBackups() { and( lt(volumeBackups.createdAt, cutoffDate), eq(volumeBackups.isMigrationBackup, false), + eq(volumeBackups.isDeletionBackup, false), ), ); @@ -146,6 +147,6 @@ export async function cleanupOldBackups() { ); for (const backup of oldBackups) { - await deleteBackup(backup.id); + await deleteBackup(backup.id, { revalidate: false }); } } diff --git a/web/lib/inngest/events/index.ts b/web/lib/inngest/events/index.ts index 01a3019..32de783 100644 --- a/web/lib/inngest/events/index.ts +++ b/web/lib/inngest/events/index.ts @@ -1,22 +1,25 @@ import { eventType, staticSchema } from "inngest"; -export type { RolloutEvents } from "./rollout"; -export type { MigrationEvents } from "./migration"; export type { BackupEvents } from "./backup"; -export type { RestoreEvents } from "./restore"; export type { BuildEvents } from "./build"; +export type { MigrationEvents } from "./migration"; +export type { RestoreEvents } from "./restore"; +export type { RolloutEvents } from "./rollout"; +export type { ServiceDeletionEvents } from "./service-deletion"; -import type { RolloutEvents } from "./rollout"; -import type { MigrationEvents } from "./migration"; import type { BackupEvents } from "./backup"; -import type { RestoreEvents } from "./restore"; import type { BuildEvents } from "./build"; +import type { MigrationEvents } from "./migration"; +import type { RestoreEvents } from "./restore"; +import type { RolloutEvents } from "./rollout"; +import type { ServiceDeletionEvents } from "./service-deletion"; export type Events = RolloutEvents & MigrationEvents & BackupEvents & RestoreEvents & - BuildEvents; + BuildEvents & + ServiceDeletionEvents; type EventName = keyof Events & string; type EventData = Events[TName]["data"]; @@ -48,6 +51,9 @@ export const inngestEvents = { restoreCompleted: defineEvent("restore/completed"), restoreFailed: defineEvent("restore/failed"), + serviceDeletionStarted: defineEvent("service-deletion/started"), + serviceRestoreStarted: defineEvent("service-restore/started"), + buildTrigger: defineEvent("build/trigger"), buildStarted: defineEvent("build/started"), buildCancelled: defineEvent("build/cancelled"), diff --git a/web/lib/inngest/events/service-deletion.ts b/web/lib/inngest/events/service-deletion.ts new file mode 100644 index 0000000..5af9739 --- /dev/null +++ b/web/lib/inngest/events/service-deletion.ts @@ -0,0 +1,15 @@ +export type ServiceDeletionEvents = { + "service-deletion/started": { + data: { + serviceId: string; + reusableBackupIds: string[]; + }; + }; + "service-restore/started": { + data: { + serviceId: string; + targetServerId: string | null; + backupIds: string[]; + }; + }; +}; diff --git a/web/lib/inngest/functions/build-workflow.ts b/web/lib/inngest/functions/build-workflow.ts index 0c056b4..a1147f1 100644 --- a/web/lib/inngest/functions/build-workflow.ts +++ b/web/lib/inngest/functions/build-workflow.ts @@ -1,9 +1,9 @@ -import { eq, and } from "drizzle-orm"; +import { and, eq, isNull } from "drizzle-orm"; +import { deployService } from "@/actions/projects"; import { db } from "@/db"; -import { builds, services, projects, serviceReplicas } from "@/db/schema"; +import { builds, serviceReplicas, services } from "@/db/schema"; import { inngest } from "../client"; import { inngestEvents } from "../events"; -import { deployService } from "@/actions/projects"; export const buildWorkflow = inngest.createFunction( { @@ -61,11 +61,12 @@ export const buildWorkflow = inngest.createFunction( const service = await db .select() .from(services) - .where(eq(services.id, serviceId)) + .where(and(eq(services.id, serviceId), isNull(services.deletedAt))) .then((r) => r[0]); return ( - replicas.length > 0 || (service?.autoPlace && service?.replicas > 0) + !!service && + (replicas.length > 0 || (service.autoPlace && service.replicas > 0)) ); }); @@ -139,11 +140,12 @@ export const buildWorkflow = inngest.createFunction( const service = await db .select() .from(services) - .where(eq(services.id, serviceId)) + .where(and(eq(services.id, serviceId), isNull(services.deletedAt))) .then((r) => r[0]); return ( - replicas.length > 0 || (service?.autoPlace && service?.replicas > 0) + !!service && + (replicas.length > 0 || (service.autoPlace && service.replicas > 0)) ); }); diff --git a/web/lib/inngest/functions/index.ts b/web/lib/inngest/functions/index.ts index 5a6172d..df59c8c 100644 --- a/web/lib/inngest/functions/index.ts +++ b/web/lib/inngest/functions/index.ts @@ -15,3 +15,8 @@ export { onDeploymentFailed } from "./on-deployment-failed"; export { restoreTriggerWorkflow } from "./restore-trigger-workflow"; export { onRestoreFailed, restoreWorkflow } from "./restore-workflow"; export { rolloutWorkflow } from "./rollout-workflow"; +export { + expiredDeletedServicesPurge, + serviceDeletionWorkflow, + serviceRestoreWorkflow, +} from "./service-deletion-workflow"; diff --git a/web/lib/inngest/functions/service-deletion-workflow.ts b/web/lib/inngest/functions/service-deletion-workflow.ts new file mode 100644 index 0000000..bf38820 --- /dev/null +++ b/web/lib/inngest/functions/service-deletion-workflow.ts @@ -0,0 +1,528 @@ +import { randomUUID } from "node:crypto"; +import { and, eq, inArray, isNotNull, isNull, lte, or } from "drizzle-orm"; +import { cron } from "inngest"; +import { deleteBackup } from "@/actions/backups"; +import { deployService } from "@/actions/projects"; +import { db } from "@/db"; +import { getBackupStorageConfig } from "@/db/queries"; +import { + deploymentPorts, + deployments, + secrets, + services, + serviceVolumes, + volumeBackups, +} from "@/db/schema"; +import { enqueueWork } from "@/lib/work-queue"; +import { inngest } from "../client"; +import { inngestEvents } from "../events"; + +const DELETED_SERVICE_RETENTION_DAYS = 7; + +async function markServiceDeletionFailed(serviceId: string, error: unknown) { + await db + .update(services) + .set({ + deletionStatus: "failed", + deletionError: + error instanceof Error ? error.message : "Service operation failed", + }) + .where(eq(services.id, serviceId)); +} + +function purgeDateFrom(date: Date) { + const purgeAfter = new Date(date); + purgeAfter.setDate(purgeAfter.getDate() + DELETED_SERVICE_RETENTION_DAYS); + return purgeAfter; +} + +function restoreDate(value: Date | string | null) { + return value ? new Date(value) : null; +} + +export const serviceDeletionWorkflow = inngest.createFunction( + { + id: "service-deletion-workflow", + triggers: [inngestEvents.serviceDeletionStarted], + }, + async ({ event, step, group }) => { + const { serviceId, reusableBackupIds } = event.data; + + try { + const setup = await step.run("setup-delete", async () => { + const storageConfig = await getBackupStorageConfig(); + if (!storageConfig) { + throw new Error("Backup storage not configured"); + } + + const service = await db + .select() + .from(services) + .where(eq(services.id, serviceId)) + .then((r) => r[0]); + + if (!service) { + throw new Error("Service not found"); + } + + const volumes = await db + .select() + .from(serviceVolumes) + .where(eq(serviceVolumes.serviceId, serviceId)); + + const runningDeployment = await db + .select({ + id: deployments.id, + serverId: deployments.serverId, + containerId: deployments.containerId, + }) + .from(deployments) + .where( + and( + eq(deployments.serviceId, serviceId), + inArray(deployments.status, ["running", "healthy"]), + ), + ) + .then((r) => r[0]); + + return { service, storageConfig, volumes, runningDeployment }; + }); + + let backupIds = [...reusableBackupIds]; + let newBackupIds: string[] = []; + + if (backupIds.length === 0) { + const createdBackupIds = await step.run( + "start-delete-backups", + async () => { + const deployment = setup.runningDeployment; + if (!deployment?.containerId) { + throw new Error("No active deployment found for deletion backup"); + } + + const ids: string[] = []; + for (const volume of setup.volumes) { + const backupId = randomUUID(); + ids.push(backupId); + const storagePath = `deleted-services/${serviceId}/${volume.name}/${backupId}.tar.gz`; + + await db.insert(volumeBackups).values({ + id: backupId, + volumeId: volume.id, + volumeName: volume.name, + serviceId, + serverId: deployment.serverId, + status: "pending", + storagePath, + isDeletionBackup: true, + }); + + await enqueueWork(deployment.serverId, "backup_volume", { + backupId, + serviceId, + containerId: deployment.containerId, + volumeName: volume.name, + storagePath, + storageConfig: { + provider: setup.storageConfig.provider, + bucket: setup.storageConfig.bucket, + region: setup.storageConfig.region, + endpoint: setup.storageConfig.endpoint, + accessKey: setup.storageConfig.accessKey, + secretKey: setup.storageConfig.secretKey, + }, + }); + } + return ids; + }, + ); + + newBackupIds = createdBackupIds; + backupIds = createdBackupIds; + } + + if (newBackupIds.length > 0) { + const backupResults = await Promise.all( + newBackupIds.map((backupId) => + group.parallel(() => { + const completed = step + .waitForEvent(`wait-delete-backup-${backupId}`, { + event: inngestEvents.backupCompleted, + timeout: "30m", + if: `async.data.backupId == "${backupId}" && async.data.serviceId == "${serviceId}"`, + }) + .then((result) => ({ status: "completed" as const, result })); + + const failed = step + .waitForEvent(`wait-delete-backup-failed-${backupId}`, { + event: inngestEvents.backupFailed, + timeout: "30m", + if: `async.data.backupId == "${backupId}" && async.data.serviceId == "${serviceId}"`, + }) + .then((result) => ({ status: "failed" as const, result })); + + return Promise.race([completed, failed]); + }), + ), + ); + + const timedOut = backupResults.some((r) => r.result === null); + const failed = backupResults.find((r) => r.status === "failed"); + if (timedOut || failed) { + await step.run("mark-delete-backup-failed", async () => { + await db + .update(services) + .set({ + deletionStatus: "failed", + deletionError: + failed?.result?.data.error || "Deletion backup timed out", + }) + .where(eq(services.id, serviceId)); + }); + return { status: "failed", reason: timedOut ? "timeout" : "backup" }; + } + } + + await step.run("cleanup-service", async () => { + await db + .update(services) + .set({ deletionStatus: "deleting", deletionError: null }) + .where(eq(services.id, serviceId)); + + const allDeployments = await db + .select() + .from(deployments) + .where(eq(deployments.serviceId, serviceId)); + + for (const deployment of allDeployments) { + if ( + (deployment.status === "running" || + deployment.status === "healthy") && + deployment.containerId + ) { + await db + .update(deployments) + .set({ status: "stopping" }) + .where(eq(deployments.id, deployment.id)); + + await enqueueWork(deployment.serverId, "stop", { + deploymentId: deployment.id, + containerId: deployment.containerId, + }); + } + + await db + .delete(deploymentPorts) + .where(eq(deploymentPorts.deploymentId, deployment.id)); + } + + await db + .delete(deployments) + .where(eq(deployments.serviceId, serviceId)); + + const cleanupServerId = + setup.service.lockedServerId ?? + setup.runningDeployment?.serverId ?? + allDeployments.find((deployment) => deployment.serverId)?.serverId; + if (cleanupServerId && setup.volumes.length > 0) { + await enqueueWork(cleanupServerId, "cleanup_volumes", { serviceId }); + } + + const deletedAt = new Date(); + await db + .update(services) + .set({ + deletedAt, + purgeAfter: purgeDateFrom(deletedAt), + originalHostname: setup.service.hostname, + hostname: null, + deletionStatus: null, + deletionError: null, + }) + .where(eq(services.id, serviceId)); + }); + + return { status: "deleted", serviceId, backupIds }; + } catch (error) { + await step.run("mark-unhandled-delete-failed", async () => { + await markServiceDeletionFailed(serviceId, error); + }); + return { + status: "failed", + reason: error instanceof Error ? error.message : "delete_failed", + }; + } + }, +); + +export const serviceRestoreWorkflow = inngest.createFunction( + { + id: "service-restore-workflow", + triggers: [inngestEvents.serviceRestoreStarted], + }, + async ({ event, step, group }) => { + const { serviceId, targetServerId, backupIds } = event.data; + + try { + const setup = await step.run("setup-restore", async () => { + const storageConfig = await getBackupStorageConfig(); + if (!storageConfig) { + throw new Error("Backup storage not configured"); + } + + const service = await db + .select() + .from(services) + .where(eq(services.id, serviceId)) + .then((r) => r[0]); + + if (!service || !service.deletedAt) { + throw new Error("Deleted service not found"); + } + + const resolvedTargetServerId = targetServerId ?? service.lockedServerId; + if (!resolvedTargetServerId) { + throw new Error( + "Cannot restore because no target server is selected", + ); + } + + const backups = await db + .select() + .from(volumeBackups) + .where(inArray(volumeBackups.id, backupIds)); + + if (backups.length !== backupIds.length) { + throw new Error( + "Cannot restore because a retained backup is missing", + ); + } + + for (const backup of backups) { + if (!backup.storagePath || !backup.checksum) { + throw new Error("Backup data is incomplete"); + } + } + + return { + storageConfig, + service, + targetServerId: resolvedTargetServerId, + backups, + }; + }); + + await step.run("restore-deletion-backups", async () => { + for (const backup of setup.backups) { + await enqueueWork(setup.targetServerId, "restore_volume", { + backupId: backup.id, + serviceId, + volumeName: backup.volumeName, + storagePath: backup.storagePath, + expectedChecksum: backup.checksum, + isMigrationRestore: false, + storageConfig: { + provider: setup.storageConfig.provider, + bucket: setup.storageConfig.bucket, + region: setup.storageConfig.region, + endpoint: setup.storageConfig.endpoint, + accessKey: setup.storageConfig.accessKey, + secretKey: setup.storageConfig.secretKey, + }, + }); + } + }); + + const restoreResults = await Promise.all( + backupIds.map((backupId) => + group.parallel(() => { + const completed = step + .waitForEvent(`wait-delete-restore-${backupId}`, { + event: inngestEvents.restoreCompleted, + timeout: "30m", + if: `async.data.backupId == "${backupId}" && async.data.serviceId == "${serviceId}"`, + }) + .then((result) => ({ status: "completed" as const, result })); + + const failed = step + .waitForEvent(`wait-delete-restore-failed-${backupId}`, { + event: inngestEvents.restoreFailed, + timeout: "30m", + if: `async.data.backupId == "${backupId}" && async.data.serviceId == "${serviceId}"`, + }) + .then((result) => ({ status: "failed" as const, result })); + + return Promise.race([completed, failed]); + }), + ), + ); + + const failed = restoreResults.find((r) => r.status === "failed"); + const timedOut = restoreResults.some((r) => r.result === null); + if (failed || timedOut) { + await step.run("mark-restore-failed", async () => { + await db + .update(services) + .set({ + deletionStatus: "failed", + deletionError: + failed?.result?.data.error || "Volume restore timed out", + }) + .where(eq(services.id, serviceId)); + }); + return { status: "failed", reason: failed ? "restore" : "timeout" }; + } + + const deployResult = await step.run( + "start-restored-deployment", + async () => { + await db + .update(services) + .set({ + deletedAt: null, + purgeAfter: null, + hostname: setup.service.originalHostname, + originalHostname: null, + deletionStatus: "restoring", + deletionError: null, + lockedServerId: setup.targetServerId, + }) + .where(eq(services.id, serviceId)); + + try { + const result = await deployService(serviceId); + if (!("rolloutId" in result) || !result.rolloutId) { + throw new Error("Restore could not start a deployment"); + } + return result; + } catch (error) { + await db + .update(services) + .set({ + deletedAt: restoreDate(setup.service.deletedAt), + purgeAfter: restoreDate(setup.service.purgeAfter), + hostname: null, + originalHostname: setup.service.originalHostname, + deletionStatus: "failed", + deletionError: + error instanceof Error + ? error.message + : "Restore deployment failed", + }) + .where(eq(services.id, serviceId)); + throw error; + } + }, + ); + + const deploymentResult = await group.parallel(() => { + const healthy = step + .waitForEvent("wait-restore-deployment-healthy", { + event: inngestEvents.deploymentHealthy, + timeout: "15m", + if: `async.data.rolloutId == "${deployResult.rolloutId}" && async.data.serviceId == "${serviceId}"`, + }) + .then((result) => ({ status: "healthy" as const, result })); + + const failed = step + .waitForEvent("wait-restore-deployment-failed", { + event: inngestEvents.deploymentFailed, + timeout: "15m", + if: `async.data.rolloutId == "${deployResult.rolloutId}" && async.data.serviceId == "${serviceId}"`, + }) + .then((result) => ({ status: "failed" as const, result })); + + return Promise.race([healthy, failed]); + }); + + if (!deploymentResult.result || deploymentResult.status === "failed") { + await step.run("mark-restore-deployment-failed", async () => { + const errorMessage = + deploymentResult.status === "failed" + ? deploymentResult.result?.data.reason + : undefined; + await db + .update(services) + .set({ + deletedAt: restoreDate(setup.service.deletedAt), + purgeAfter: restoreDate(setup.service.purgeAfter), + hostname: null, + originalHostname: setup.service.originalHostname, + deletionStatus: "failed", + deletionError: + errorMessage || "Restore deployment did not become healthy", + }) + .where(eq(services.id, serviceId)); + }); + return { status: "failed", reason: "deployment" }; + } + + await step.run("mark-restore-complete", async () => { + await db + .update(volumeBackups) + .set({ isDeletionBackup: false }) + .where( + and( + eq(volumeBackups.serviceId, serviceId), + eq(volumeBackups.isDeletionBackup, true), + ), + ); + + await db + .update(services) + .set({ deletionStatus: null, deletionError: null }) + .where(eq(services.id, serviceId)); + }); + + return { status: "restored", serviceId }; + } catch (error) { + await step.run("mark-unhandled-restore-failed", async () => { + await markServiceDeletionFailed(serviceId, error); + }); + return { + status: "failed", + reason: error instanceof Error ? error.message : "restore_failed", + }; + } + }, +); + +export const expiredDeletedServicesPurge = inngest.createFunction( + { + id: "cron-expired-deleted-services-purge", + triggers: [cron("0 4 * * *")], + singleton: { mode: "skip" }, + }, + async ({ step }) => { + await step.run("purge-expired-deleted-services", async () => { + const expiredServices = await db + .select({ id: services.id }) + .from(services) + .where( + and( + isNotNull(services.deletedAt), + isNotNull(services.purgeAfter), + or( + isNull(services.deletionStatus), + eq(services.deletionStatus, "failed"), + ), + lte(services.purgeAfter, new Date()), + ), + ); + + for (const service of expiredServices) { + const backups = await db + .select({ id: volumeBackups.id }) + .from(volumeBackups) + .where(eq(volumeBackups.serviceId, service.id)); + + for (const backup of backups) { + await deleteBackup(backup.id, { revalidate: false }); + } + + await db.delete(secrets).where(eq(secrets.serviceId, service.id)); + await db.delete(services).where(eq(services.id, service.id)); + } + }); + }, +); diff --git a/web/lib/placement-planner.ts b/web/lib/placement-planner.ts index a84e1f5..5d41801 100644 --- a/web/lib/placement-planner.ts +++ b/web/lib/placement-planner.ts @@ -1,4 +1,5 @@ import { createHash } from "node:crypto"; +import type { HealthStats } from "@/db/types"; export type PlacementResult = { serverId: string; count: number }[]; @@ -6,13 +7,7 @@ export type PlacementServerSnapshot = { id: string; status: string; wireguardIp: string | null; - healthStats?: { - cpuUsagePercent: number; - memoryUsagePercent: number; - memoryUsedMb: number; - diskUsagePercent: number; - diskUsedGb: number; - } | null; + healthStats?: HealthStats | null; containerHealth?: { runtimeResponsive: boolean; runningContainers: number; diff --git a/web/lib/placement.ts b/web/lib/placement.ts index fe3755f..89a6a28 100644 --- a/web/lib/placement.ts +++ b/web/lib/placement.ts @@ -1,6 +1,7 @@ import { randomUUID } from "node:crypto"; import { and, eq, isNotNull } from "drizzle-orm"; import { db } from "@/db"; +import { metricSnapshotToHealthStats } from "@/db/queries"; import { servers, serviceReplicas, settings } from "@/db/schema"; import type { Service } from "@/db/types"; import { @@ -9,6 +10,10 @@ import { type PlacementServerSnapshot, } from "@/lib/placement-planner"; import { SETTING_KEYS } from "@/lib/settings-keys"; +import { + type NodeMetricsSnapshot, + queryNodeMetricsSnapshots, +} from "@/lib/victoria-metrics"; export type { PlacementResult }; @@ -24,7 +29,6 @@ export async function calculateResourceAwarePlacement( id: servers.id, status: servers.status, wireguardIp: servers.wireguardIp, - healthStats: servers.healthStats, containerHealth: servers.containerHealth, }) .from(servers) @@ -41,10 +45,24 @@ export async function calculateResourceAwarePlacement( getExcludedFromWorkloadPlacement(), ]); + const metricsByServer = await queryNodeMetricsSnapshots( + candidateServers.map((server) => server.id), + ).catch((error) => { + console.error("[placement] failed to query metrics:", error); + return new Map(); + }); + const serversWithMetrics = candidateServers.map((server) => { + const metrics = metricsByServer.get(server.id); + return { + ...server, + healthStats: metricSnapshotToHealthStats(metrics), + }; + }); + return calculateResourceAwarePlacementFromSnapshot({ serviceId: service.id, totalReplicas, - servers: candidateServers satisfies PlacementServerSnapshot[], + servers: serversWithMetrics satisfies PlacementServerSnapshot[], existingReplicas: allocatedReplicas, excludeServerIds: [...(excludeServerIds ?? []), ...excludedFromWorkload], }); diff --git a/web/lib/scheduler.ts b/web/lib/scheduler.ts index fb4bef9..263722b 100644 --- a/web/lib/scheduler.ts +++ b/web/lib/scheduler.ts @@ -1,5 +1,5 @@ import { CronExpressionParser } from "cron-parser"; -import { and, eq, inArray, isNotNull, lt, ne, sql } from "drizzle-orm"; +import { and, eq, inArray, isNotNull, isNull, lt, ne, sql } from "drizzle-orm"; import { triggerBuild } from "@/actions/builds"; import { deployService } from "@/actions/projects"; import { db } from "@/db"; @@ -46,6 +46,7 @@ export async function triggerRecoveryForOfflineServers( inArray(deployments.status, activeStatuses), eq(services.autoPlace, true), eq(services.stateful, false), + isNull(services.deletedAt), ), ); @@ -161,7 +162,9 @@ export async function checkAndRunScheduledDeployments(): Promise { lastScheduledDeploymentRunAt: services.lastScheduledDeploymentRunAt, }) .from(services) - .where(isNotNull(services.deploymentSchedule)); + .where( + and(isNotNull(services.deploymentSchedule), isNull(services.deletedAt)), + ); if (scheduledServices.length === 0) return; diff --git a/web/lib/victoria-metrics.ts b/web/lib/victoria-metrics.ts new file mode 100644 index 0000000..5b34c95 --- /dev/null +++ b/web/lib/victoria-metrics.ts @@ -0,0 +1,303 @@ +const VICTORIA_METRICS_URL = process.env.VICTORIA_METRICS_URL; +const VICTORIA_METRICS_PRIVATE_URL = process.env.VICTORIA_METRICS_PRIVATE_URL; + +type EndpointConfig = { + url: string; + username?: string; + password?: string; +}; + +type VictoriaInstantResponse = { + status: string; + data?: { + result?: Array<{ + metric?: Record; + value?: [number, string]; + }>; + }; + error?: string; +}; + +type VictoriaMatrixResponse = { + status: string; + data?: { + result?: Array<{ + metric: Record; + values: Array<[number, string]>; + }>; + }; + error?: string; +}; + +export type NodeMetricsSnapshot = { + cpuUsagePercent: number | null; + memoryUsagePercent: number | null; + memoryUsedBytes: number | null; + diskUsagePercent: number | null; + diskUsedBytes: number | null; +}; + +export type NodeMetricPoint = { + timestamp: string; + value: number; +}; + +export type NodeMetricsHistory = { + cpuUsagePercent: NodeMetricPoint[]; + memoryUsagePercent: NodeMetricPoint[]; + memoryUsedBytes: NodeMetricPoint[]; + diskUsagePercent: NodeMetricPoint[]; + diskUsedBytes: NodeMetricPoint[]; +}; + +const METRIC_NAMES = { + cpuUsagePercent: "techulus_node_cpu_usage_percent", + memoryUsagePercent: "techulus_node_memory_usage_percent", + memoryUsedBytes: "techulus_node_memory_used_bytes", + diskUsagePercent: "techulus_node_disk_usage_percent", + diskUsedBytes: "techulus_node_disk_used_bytes", +} as const; + +function parseEndpoint(endpoint: string): EndpointConfig { + const parsed = new URL(endpoint); + const username = parsed.username || undefined; + const password = parsed.password || undefined; + parsed.username = ""; + parsed.password = ""; + return { url: parsed.toString().replace(/\/$/, ""), username, password }; +} + +function getQueryEndpoint(): EndpointConfig | undefined { + const endpoint = VICTORIA_METRICS_PRIVATE_URL || VICTORIA_METRICS_URL; + if (!endpoint) return undefined; + return parseEndpoint(endpoint); +} + +function buildFetchOptions(config: EndpointConfig): RequestInit { + if (config.username) { + const credentials = Buffer.from( + `${config.username}:${config.password || ""}`, + ).toString("base64"); + return { headers: { Authorization: `Basic ${credentials}` } }; + } + return {}; +} + +export function isMetricsEnabled(): boolean { + return !!(VICTORIA_METRICS_PRIVATE_URL || VICTORIA_METRICS_URL); +} + +export async function queryNodeMetricsSnapshots( + serverIds: string[], +): Promise> { + const endpoint = getQueryEndpoint(); + if (!endpoint) return new Map(); + + // 5 total queries (one per metric), results grouped by server_id label. + // This collapses the previous N × 5 fan-out. + const [cpuMap, memPctMap, memBytesMap, diskPctMap, diskBytesMap] = + await Promise.all([ + queryInstantMetricGroup(endpoint, METRIC_NAMES.cpuUsagePercent).catch( + () => new Map(), + ), + queryInstantMetricGroup(endpoint, METRIC_NAMES.memoryUsagePercent).catch( + () => new Map(), + ), + queryInstantMetricGroup(endpoint, METRIC_NAMES.memoryUsedBytes).catch( + () => new Map(), + ), + queryInstantMetricGroup(endpoint, METRIC_NAMES.diskUsagePercent).catch( + () => new Map(), + ), + queryInstantMetricGroup(endpoint, METRIC_NAMES.diskUsedBytes).catch( + () => new Map(), + ), + ]); + + const result = new Map(); + for (const serverId of serverIds) { + result.set(serverId, { + cpuUsagePercent: cpuMap.get(serverId) ?? null, + memoryUsagePercent: memPctMap.get(serverId) ?? null, + memoryUsedBytes: memBytesMap.get(serverId) ?? null, + diskUsagePercent: diskPctMap.get(serverId) ?? null, + diskUsedBytes: diskBytesMap.get(serverId) ?? null, + }); + } + return result; +} + +export async function queryNodeMetricsSnapshot( + serverId: string, +): Promise { + const endpoint = getQueryEndpoint(); + if (!endpoint) return null; + + const snapshot: NodeMetricsSnapshot = { + cpuUsagePercent: null, + memoryUsagePercent: null, + memoryUsedBytes: null, + diskUsagePercent: null, + diskUsedBytes: null, + }; + + await Promise.all( + Object.entries(METRIC_NAMES).map(async ([key, metricName]) => { + const value = await queryInstantMetric( + endpoint, + metricName, + serverId, + ).catch(() => null); + snapshot[key as keyof NodeMetricsSnapshot] = value; + }), + ); + + return snapshot; +} + +export async function queryNodeMetricsHistory(options: { + serverId: string; + start: Date; + end: Date; + stepSeconds: number; +}): Promise { + const endpoint = getQueryEndpoint(); + if (!endpoint) return emptyHistory(); + + const entries = await Promise.all( + Object.entries(METRIC_NAMES).map(async ([key, metricName]) => { + const series = await queryRangeMetric(endpoint, { + metricName, + serverId: options.serverId, + start: options.start, + end: options.end, + stepSeconds: options.stepSeconds, + }).catch(() => []); + return [key, series] as const; + }), + ); + + return Object.fromEntries(entries) as NodeMetricsHistory; +} + +async function queryInstantMetric( + endpoint: EndpointConfig, + metricName: string, + serverId: string, +) { + const url = new URL(`${endpoint.url}/api/v1/query`); + url.searchParams.set( + "query", + `${metricName}{server_id="${escapePromQL(serverId)}"}`, + ); + + const response = await fetch(url.toString(), buildFetchOptions(endpoint)); + if (!response.ok) { + throw new Error( + `Failed to query metrics: ${response.status} ${response.statusText}`, + ); + } + + const data = (await response.json()) as VictoriaInstantResponse; + if (data.status !== "success") { + throw new Error(data.error || "Failed to query metrics"); + } + + const rawValue = data.data?.result?.[0]?.value?.[1]; + if (rawValue === undefined) return null; + const value = Number.parseFloat(rawValue); + return Number.isFinite(value) ? value : null; +} + +async function queryInstantMetricGroup( + endpoint: EndpointConfig, + metricName: string, +): Promise> { + const url = new URL(`${endpoint.url}/api/v1/query`); + url.searchParams.set("query", metricName); + + const response = await fetch(url.toString(), buildFetchOptions(endpoint)); + if (!response.ok) { + throw new Error( + `Failed to query metrics: ${response.status} ${response.statusText}`, + ); + } + + const data = (await response.json()) as VictoriaInstantResponse; + if (data.status !== "success") { + throw new Error(data.error || "Failed to query metrics"); + } + + const byServer = new Map(); + for (const res of data.data?.result ?? []) { + const serverId = res.metric?.server_id; + if (!serverId) continue; + const rawValue = res.value?.[1]; + if (rawValue === undefined) { + byServer.set(serverId, null); + continue; + } + const value = Number.parseFloat(rawValue); + byServer.set(serverId, Number.isFinite(value) ? value : null); + } + return byServer; +} + +async function queryRangeMetric( + endpoint: EndpointConfig, + options: { + metricName: string; + serverId: string; + start: Date; + end: Date; + stepSeconds: number; + }, +): Promise { + const url = new URL(`${endpoint.url}/api/v1/query_range`); + url.searchParams.set( + "query", + `${options.metricName}{server_id="${escapePromQL(options.serverId)}"}`, + ); + url.searchParams.set( + "start", + String(Math.floor(options.start.getTime() / 1000)), + ); + url.searchParams.set("end", String(Math.floor(options.end.getTime() / 1000))); + url.searchParams.set("step", String(options.stepSeconds)); + + const response = await fetch(url.toString(), buildFetchOptions(endpoint)); + if (!response.ok) { + throw new Error( + `Failed to query metrics range: ${response.status} ${response.statusText}`, + ); + } + + const data = (await response.json()) as VictoriaMatrixResponse; + if (data.status !== "success") { + throw new Error(data.error || "Failed to query metrics range"); + } + + return (data.data?.result?.[0]?.values ?? []) + .map(([timestamp, rawValue]) => ({ + timestamp: new Date(timestamp * 1000).toISOString(), + value: Number.parseFloat(rawValue), + })) + .filter((point) => Number.isFinite(point.value)); +} + +export function emptyHistory(): NodeMetricsHistory { + return { + cpuUsagePercent: [], + memoryUsagePercent: [], + memoryUsedBytes: [], + diskUsagePercent: [], + diskUsedBytes: [], + }; +} + +function escapePromQL(value: string) { + return value + .replace(/\\/g, "\\\\") + .replace(/"/g, '\\"') + .replace(/\n/g, "\\n"); +} diff --git a/web/package.json b/web/package.json index 00d80f6..ca882f0 100644 --- a/web/package.json +++ b/web/package.json @@ -3,75 +3,73 @@ "version": "0.1.0", "private": true, "scripts": { - "dev": "portless cloud --app-port 3000 next dev", - "build": "next build", - "start": "next start", - "lint": "next lint", - "db:generate": "drizzle-kit generate", - "db:migrate": "drizzle-kit migrate", - "db:push": "drizzle-kit push", - "db:studio": "drizzle-kit studio" - }, - "dependencies": { - "@aws-sdk/client-s3": "^3.968.0", - "@base-ui/react": "^1.0.0", - "@bprogress/next": "^3.2.12", - "@react-email/components": "^1.0.4", - "@react-email/render": "^2.0.2", - "acme-client": "^5.4.0", - "better-auth": "^1.4.9", - "class-variance-authority": "^0.7.1", - "clsx": "^2.1.1", - "cmdk": "^1.1.1", - "cron-parser": "^5.4.0", - "cronstrue": "^3.9.0", - "drizzle-orm": "^0.45.1", - "inngest": "^4.3.0", - "ip-address": "^10.1.0", - "jose": "^6.1.3", - "lucide-react": "^0.562.0", - "next": "16.2.6", - "next-themes": "^0.4.6", - "nodemailer": "^7.0.12", - "nuqs": "^2.8.6", - "pg": "^8.16.3", - "react": "19.2.6", - "react-dom": "19.2.6", - "shadcn": "^3.6.2", - "sonner": "^2.0.7", - "swr": "^2.3.8", - "tailwind-merge": "^3.4.0", - "tw-animate-css": "^1.4.0", - "validator": "^13.15.26", - "yaml": "^2.8.2", - "zod": "^4.3.5" - }, - "devDependencies": { - "@biomejs/biome": "2.3.10", - "@eslint/eslintrc": "^3", - "@tailwindcss/postcss": "^4", - "@types/node": "^22", - "@types/nodemailer": "^7.0.5", - "@types/pg": "^8.16.0", - "@types/react": "19.2.14", - "@types/react-dom": "19.2.3", - "@types/validator": "^13.15.10", - "drizzle-kit": "^0.31.8", - "eslint": "^9", - "eslint-config-next": "16.2.6", - "portless": "^0.13.0", - "tailwindcss": "^4", - "tsx": "^4.19.2", - "typescript": "^5" - }, - "overrides": { - "@types/react": "19.2.14", - "@types/react-dom": "19.2.3" - }, - "pnpm": { - "overrides": { - "@types/react": "19.2.14", - "@types/react-dom": "19.2.3" - } - } + "dev": "portless cloud --app-port 3000 next dev", + "build": "next build", + "start": "next start", + "lint": "next lint", + "db:push": "drizzle-kit push", + "db:studio": "drizzle-kit studio" + }, + "dependencies": { + "@aws-sdk/client-s3": "^3.968.0", + "@base-ui/react": "^1.0.0", + "@bprogress/next": "^3.2.12", + "@react-email/components": "^1.0.4", + "@react-email/render": "^2.0.2", + "acme-client": "^5.4.0", + "better-auth": "^1.4.9", + "class-variance-authority": "^0.7.1", + "clsx": "^2.1.1", + "cmdk": "^1.1.1", + "cron-parser": "^5.4.0", + "cronstrue": "^3.9.0", + "drizzle-orm": "^0.45.1", + "inngest": "^4.3.0", + "ip-address": "^10.1.0", + "jose": "^6.1.3", + "lucide-react": "^0.562.0", + "next": "16.2.6", + "next-themes": "^0.4.6", + "nodemailer": "^7.0.12", + "nuqs": "^2.8.6", + "pg": "^8.16.3", + "react": "19.2.6", + "react-dom": "19.2.6", + "shadcn": "^3.6.2", + "sonner": "^2.0.7", + "swr": "^2.3.8", + "tailwind-merge": "^3.4.0", + "tw-animate-css": "^1.4.0", + "validator": "^13.15.26", + "yaml": "^2.8.2", + "zod": "^4.3.5" + }, + "devDependencies": { + "@biomejs/biome": "2.3.10", + "@eslint/eslintrc": "^3", + "@tailwindcss/postcss": "^4", + "@types/node": "^22", + "@types/nodemailer": "^7.0.5", + "@types/pg": "^8.16.0", + "@types/react": "19.2.14", + "@types/react-dom": "19.2.3", + "@types/validator": "^13.15.10", + "drizzle-kit": "^0.31.8", + "eslint": "^9", + "eslint-config-next": "16.2.6", + "portless": "^0.13.0", + "tailwindcss": "^4", + "tsx": "^4.19.2", + "typescript": "^5" + }, + "overrides": { + "@types/react": "19.2.14", + "@types/react-dom": "19.2.3" + }, + "pnpm": { + "overrides": { + "@types/react": "19.2.14", + "@types/react-dom": "19.2.3" + } + } }