feat(upgrade): Approach B - image-only upgrade mode

Add a "Quick Upgrade" path that pulls latest container images and recreates only the core app services (api, admin, media-api, nginx) without touching any tracked files. Tenant content (mkdocs/, configs/, scripts/) is implicitly preserved because the script never writes outside docker. Faster (~2 min vs ~4-5 min for full upgrade) and structurally safer for releases that don't change orchestration/templates. Pieces: - scripts/image-upgrade.sh: new ~380-line script. Phases: pre-flight + mkdocs snapshot, image pull, targeted recreate (broad up -d would cascade on misconfigured infra containers — proven on marcelle), light health checks, deferred ccp-agent restart. Writes the same progress.json + result.json schema as upgrade.sh so the CCP poll loop is unchanged. - agent/src/routes/upgrade.routes.ts: POST /instance/:slug/upgrade/start-image-only. Same lock + staleness guards as the existing /upgrade/start endpoint. - api/src/services/remote-driver.ts: RemoteDriver.startImageUpgrade(). - api/src/services/upgrade.service.ts: startImageUpgrade() entry point; reuses runRemoteUpgrade with mode='image-only' (only the initial agent call differs — result schema and polling are identical). - api/src/modules/instances/instances.routes.ts: POST /:id/upgrade-images + startImageUpgradeSchema. - admin/src/pages/InstanceDetailPage.tsx: secondary "Quick Upgrade" button next to "Upgrade Now" on the Updates tab. Tooltip explains when to use it. Tested locally on marcelle (v2.10.2 idempotent run): 1m 49s, mkdocs.yml md5 unchanged, file count unchanged, only api/admin/media-api/nginx touched. Subtle bug found and fixed: with `set -o pipefail`, `grep -q` exits on the first match, closing the pipe and SIGPIPE-ing the writer — the services list is now captured once and checked afterwards. Bunker Admin
2026-05-21 15:20:35 -06:00 · 2026-05-21 15:20:35 -06:00 · 4a3d9d7c41
commit 4a3d9d7c41
parent 731e70ee42
7 changed files with 666 additions and 25 deletions
--- a/changemaker-control-panel/admin/src/pages/InstanceDetailPage.tsx
+++ b/changemaker-control-panel/admin/src/pages/InstanceDetailPage.tsx
@ -39,6 +39,7 @@ import {
  CloudOutlined,
  DisconnectOutlined,
  UploadOutlined,
  ThunderboltOutlined,
  BellOutlined,
  CheckCircleOutlined,
  WarningOutlined,
@ -563,6 +564,24 @@ export default function InstanceDetailPage() {
    }
  };
  /**
   * Starts an image-only ("Quick") upgrade for the current instance.
   *
   * Posts to the instance's `upgrade-images` endpoint, stores the returned
   * upgrade record as the current upgrade, and reports the outcome through the
   * toast API. The "upgrading" flag is raised for the duration of the request
   * and always lowered afterwards, whether the call succeeded or failed.
   */
  const handleStartImageUpgrade = async () => {
    // Shape of the error payload this handler knows how to read a message from.
    type ApiFailure = { response?: { data?: { error?: { message?: string } } } };

    setUpgradingInstance(true);
    try {
      const response = await api.post(`/instances/${id}/upgrade-images`, {});
      setCurrentUpgrade(response.data.data);
      message.success('Image-only upgrade started');
    } catch (err: unknown) {
      const serverMessage = (err as ApiFailure)?.response?.data?.error?.message;
      message.error(serverMessage || 'Failed to start image-only upgrade');
    } finally {
      setUpgradingInstance(false);
    }
  };
  // Event handlers
  const handleAcknowledgeEvent = async (eventId: string) => {
    try {
@ -1632,25 +1651,41 @@ export default function InstanceDetailPage() {
                  closable
                />
              )}
-              <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center' }}>
+              <div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', gap: 16 }}>
-                <Typography.Text type="secondary">
+                <Typography.Text type="secondary" style={{ flex: 1 }}>
-                  Pulls latest code, runs migrations, and restarts services. CCP backup is recommended before upgrading.
+                  Full upgrade pulls the latest code, runs migrations, and restarts services. Quick upgrade only pulls images and recreates the core app — tenant content stays untouched and it&apos;s ~2 min faster. Use Quick when the release notes say no orchestration changes.
                </Typography.Text>
-                <Popconfirm
+                <Space>
-                  title="Start upgrade?"
+                  <Popconfirm
-                  description="This will pull the latest code, run database migrations, and restart all services. Brief downtime is expected."
+                    title="Start quick (image-only) upgrade?"
-                  onConfirm={handleStartUpgrade}
+                    description="Pulls new container images and recreates the API/Admin/Media/Nginx services. No filesystem changes — mkdocs and configs are not touched. Brief downtime is expected."
-                  disabled={instance.status !== 'RUNNING' && instance.status !== 'STOPPED'}
+                    onConfirm={handleStartImageUpgrade}
                >
                  <Button
                    type="primary"
                    icon={<UploadOutlined />}
                    loading={upgradingInstance}
                    disabled={instance.status !== 'RUNNING' && instance.status !== 'STOPPED'}
                  >
-                    Upgrade Now
+                    <Button
-                  </Button>
+                      icon={<ThunderboltOutlined />}
-                </Popconfirm>
+                      loading={upgradingInstance}
                      disabled={instance.status !== 'RUNNING' && instance.status !== 'STOPPED'}
                    >
                      Quick Upgrade
                    </Button>
                  </Popconfirm>
                  <Popconfirm
                    title="Start full upgrade?"
                    description="This will pull the latest code, run database migrations, and restart all services. Brief downtime is expected."
                    onConfirm={handleStartUpgrade}
                    disabled={instance.status !== 'RUNNING' && instance.status !== 'STOPPED'}
                  >
                    <Button
                      type="primary"
                      icon={<UploadOutlined />}
                      loading={upgradingInstance}
                      disabled={instance.status !== 'RUNNING' && instance.status !== 'STOPPED'}
                    >
                      Upgrade Now
                    </Button>
                  </Popconfirm>
                </Space>
              </div>
            </Space>
          )}
--- a/changemaker-control-panel/agent/src/routes/upgrade.routes.ts
+++ b/changemaker-control-panel/agent/src/routes/upgrade.routes.ts
@ -188,6 +188,85 @@ router.post('/instance/:slug/upgrade/start', async (req: Request, res: Response)
  res.status(202).json({ started: true });
 });
 // POST /instance/:slug/upgrade/start-image-only — Run image-upgrade.sh in background
 //
 // Image-only upgrade: pulls latest images + recreates services without touching
 // tracked files (no git pull, no tarball extract, no VERSION mutation). Tenant
 // content is implicitly safe because the script only writes its own bookkeeping
 // (data/upgrade, logs, the lock file, and a defensive mkdocs snapshot archive).
 // See scripts/image-upgrade.sh for full rationale.
 //
 // Schema-compatible with /upgrade/start: writes the same progress.json + result.json
 // so the CCP poll loop in runRemoteUpgrade() works unchanged.
 router.post('/instance/:slug/upgrade/start-image-only', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  const entry = await getSlugEntry(slug);
  const requestedTag = (req.body || {}).imageTag;

  // SECURITY: the tag ends up on a bash command line (--image-tag). Only a
  // conservative subset of docker tag characters is accepted; an absent or
  // falsy tag skips the check entirely.
  const tagIsAcceptable =
    !requestedTag || /^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,127}$/.test(String(requestedTag));
  if (!tagIsAcceptable) {
    res.status(400).json({ error: 'VALIDATION', message: 'Invalid imageTag' });
    return;
  }

  // The script must exist inside this install before anything else happens.
  const scriptPath = path.join(entry.basePath, 'scripts', 'image-upgrade.sh');
  const scriptExists = await fs.access(scriptPath).then(
    () => true,
    () => false
  );
  if (!scriptExists) {
    res.status(404).json({ error: 'NOT_FOUND', message: 'image-upgrade.sh not found' });
    return;
  }

  // Concurrency guards: an upgrade lock or an upgrade still running on disk
  // blocks the request, and so does an in-flight backup or restore.
  const upgradeBusy =
    isSlugLocked(slug, 'upgrade') || (await isUpgradeRunningOnDisk(entry.basePath));
  if (upgradeBusy) {
    res.status(409).json({ error: 'SLUG_BUSY', message: 'An upgrade is already in progress' });
    return;
  }
  const dataOpBusy = isSlugLocked(slug, 'backup') || isSlugLocked(slug, 'restore');
  if (dataOpBusy) {
    res.status(409).json({ error: 'SLUG_BUSY', message: 'A backup or restore is currently running' });
    return;
  }

  // Start from a clean slate: drop any progress/result left by an earlier run.
  const upgradeDir = path.join(entry.basePath, 'data', 'upgrade');
  await fs.mkdir(upgradeDir, { recursive: true });
  await fs.rm(path.join(upgradeDir, 'progress.json'), { force: true });
  await fs.rm(path.join(upgradeDir, 'result.json'), { force: true });

  const args: string[] = requestedTag
    ? [scriptPath, '--api-mode', '--image-tag', String(requestedTag)]
    : [scriptPath, '--api-mode'];

  // Runs the script to completion; rejects on spawn failure or non-zero exit.
  const runScript = (): Promise<void> =>
    new Promise<void>((resolve, reject) => {
      const child = spawn('bash', args, {
        cwd: entry.basePath,
        env: { ...process.env, COMPOSE_ANSI: 'never' },
        stdio: ['ignore', 'ignore', 'ignore'],
      });
      child.on('error', reject);
      child.on('close', (exitCode) => {
        if (exitCode !== 0) {
          reject(new Error(`image-upgrade.sh exited with code ${exitCode}`));
          return;
        }
        resolve();
      });
    });

  // Background execution under the slug's upgrade lock. The HTTP response
  // below is sent without waiting for this promise.
  void withSlugLock(slug, 'upgrade', async () => {
    logger.info(`[image-upgrade] ${slug}: spawning ${args.join(' ')} (cwd=${entry.basePath})`);
    try {
      await runScript();
      logger.info(`[image-upgrade] ${slug}: image-upgrade.sh completed`);
    } catch (err) {
      logger.error(`[image-upgrade] ${slug}: ${(err as Error).message}`);
    }
  }).catch((err) => {
    // A SlugBusyError is deliberately not logged here; anything else is.
    if (err instanceof SlugBusyError) {
      return;
    }
    logger.error(`[image-upgrade] ${slug}: lock or background error: ${(err as Error).message}`);
  });

  res.status(202).json({ started: true, mode: 'image-only' });
 });
 // GET /instance/:slug/upgrade/progress — Read progress.json
 router.get('/instance/:slug/upgrade/progress', async (req: Request, res: Response) => {
  const entry = await getSlugEntry(param(req, 'slug'));
--- a/changemaker-control-panel/api/src/modules/instances/instances.routes.ts
+++ b/changemaker-control-panel/api/src/modules/instances/instances.routes.ts
@ -4,7 +4,7 @@ import rateLimit from 'express-rate-limit';
 import { prisma } from '../../lib/prisma';
 import { authenticate, requireRole } from '../../middleware/auth';
 import { validate } from '../../middleware/validate';
-import { createInstanceSchema, updateInstanceSchema, registerInstanceSchema, reconfigureInstanceSchema, configureTunnelSchema, importInstancesSchema, startUpgradeSchema, setupRemoteTunnelSchema } from './instances.schemas';
+import { createInstanceSchema, updateInstanceSchema, registerInstanceSchema, reconfigureInstanceSchema, configureTunnelSchema, importInstancesSchema, startUpgradeSchema, startImageUpgradeSchema, setupRemoteTunnelSchema } from './instances.schemas';
 import * as instancesService from './instances.service';
 import * as healthService from '../../services/health.service';
 import * as backupService from '../../services/backup.service';
@ -362,6 +362,25 @@ router.post(
  }
 );
 // Image-only upgrade (Approach B). Faster + safer than full upgrade for
 // releases that don't change orchestration/templates. See upgrade.service.ts
 // startImageUpgrade for full rationale.
 router.post(
  '/:id/upgrade-images',
  requireRole('SUPER_ADMIN', 'OPERATOR'),
  validate(startImageUpgradeSchema),
  // Starts an image-only upgrade for the instance and answers 201 with the
  // upgrade record returned by the service.
  async (req: Request, res: Response) => {
    const body = req.body || {};
    const instanceId = req.params.id as string;
    const upgrade = await upgradeService.startImageUpgrade(instanceId, req.user!.id, req.ip, {
      imageTag: body.imageTag,
    });
    res.status(201).json({ data: upgrade });
  }
 );
 router.get(
  '/:id/upgrade-status',
  requireRole('SUPER_ADMIN', 'OPERATOR'),
--- a/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts
+++ b/changemaker-control-panel/api/src/modules/instances/instances.schemas.ts
@ -121,6 +121,17 @@ export const startUpgradeSchema = z.object({
    .optional(),
 });
 // Approach B: image-only upgrade. Pulls images + recreates core app services
 // without touching tracked files. imageTag is optional — if omitted, the
 // agent uses whatever IMAGE_TAG the install's .env / compose env defines
 // (typically `latest`). Tag must be a valid Docker tag.
 // Accepted tag shape: one alphanumeric character followed by up to 127
 // characters drawn from letters, digits, underscore, dot and hyphen.
 const imageTagPattern = /^[a-zA-Z0-9][a-zA-Z0-9_.-]{0,127}$/;

 /** Request body for the image-only upgrade endpoint; `imageTag` may be omitted. */
 export const startImageUpgradeSchema = z.object({
  imageTag: z.string().regex(imageTagPattern, 'Invalid imageTag').optional(),
 });
 export const setupRemoteTunnelSchema = z.object({
  // Empty string or omitted → resources use standard subdomains (app., api., etc.)
  // A value like "ck" → creates ck-app., ck-api., etc. for multi-tenant domains
--- a/changemaker-control-panel/api/src/services/remote-driver.ts
+++ b/changemaker-control-panel/api/src/services/remote-driver.ts
@ -82,6 +82,10 @@ export interface StartAgentUpgradeOptions {
  branch?: string;
 }
 /** Options sent as the request body when asking the agent to start an image-only upgrade. */
 export interface StartAgentImageUpgradeOptions {
  /** Optional image tag override passed through to the agent. */
  imageTag?: string;
 }
 interface AgentRequestOptions {
  method: 'GET' | 'POST' | 'DELETE';
  path: string;
@ -574,6 +578,21 @@ export class RemoteDriver implements ExecutionDriver {
    });
  }
  /**
   * Ask the remote agent to begin an image-only upgrade (Approach B) for
   * this driver's slug.
   *
   * Issues a POST to the agent's `start-image-only` endpoint with `options`
   * as the body and a 30 second request timeout. The call resolves once the
   * agent has answered; it does not wait for the upgrade itself, which is
   * observed through the same progress/result polling as startUpgrade.
   *
   * @param options Optional settings forwarded verbatim to the agent.
   */
  async startImageUpgrade(options: StartAgentImageUpgradeOptions = {}): Promise<void> {
    const endpoint = `/instance/${this.slug}/upgrade/start-image-only`;
    await this.request({ method: 'POST', path: endpoint, body: options, timeoutMs: 30_000 });
  }
  /**
   * Read the agent's data/upgrade/progress.json. Returns the default zero-state
   * if no progress has been written yet.
--- a/changemaker-control-panel/api/src/services/upgrade.service.ts
+++ b/changemaker-control-panel/api/src/services/upgrade.service.ts
@ -205,6 +205,10 @@ export interface StartUpgradeOptions {
  branch?: string;
 }
 /** Caller-supplied options for startImageUpgrade(). */
 export interface StartImageUpgradeOptions {
  /** Optional image tag; forwarded to the remote agent when the upgrade is triggered. */
  imageTag?: string;
 }
 /**
 * Start an upgrade for an instance. Returns the created InstanceUpgrade record.
 * The actual upgrade runs asynchronously (fire-and-forget).
@ -298,6 +302,86 @@ export async function startUpgrade(
  return upgrade;
 }
 /**
 * Begin an IMAGE-ONLY upgrade (Approach B) and return the new InstanceUpgrade
 * record. The upgrade itself continues asynchronously after this returns.
 *
 * Steps, in order:
 *   1. Load the instance; reject unknown, non-remote, or wrongly-stated ones
 *      (only RUNNING and STOPPED instances may be upgraded).
 *   2. Reject when any upgrade for the instance is still PENDING/IN_PROGRESS.
 *   3. Create the PENDING upgrade record and an audit log entry tagged
 *      `mode: 'image-only'`.
 *   4. Hand off to runRemoteUpgrade in 'image-only' mode without awaiting it;
 *      orchestration failures are logged, not thrown to the caller.
 *
 * Intended for releases that only change container code or schema. Releases
 * that change compose orchestration, nginx config, or other tracked files
 * should go through startUpgrade() instead.
 *
 * @param instanceId ID of the instance to upgrade.
 * @param userId     ID of the user triggering the upgrade (recorded for audit).
 * @param ipAddress  Optional caller IP stored on the audit log entry.
 * @param options    Optional image-only settings (e.g. an image tag).
 * @throws Error when the instance is missing, not remote, in a non-upgradable
 *         state, or already has an upgrade in progress.
 */
 export async function startImageUpgrade(
  instanceId: string,
  userId: string,
  ipAddress?: string,
  options?: StartImageUpgradeOptions
 ) {
  const instance = await prisma.instance.findUnique({ where: { id: instanceId } });
  if (!instance) {
    throw new Error('Instance not found');
  }
  if (!instance.isRemote) {
    throw new Error('Image-only upgrade is currently supported only for remote instances');
  }

  const inUpgradableState =
    instance.status === InstanceStatus.RUNNING || instance.status === InstanceStatus.STOPPED;
  if (!inUpgradableState) {
    throw new Error(`Cannot upgrade instance in ${instance.status} state`);
  }

  // One upgrade at a time per instance, regardless of upgrade type.
  const inFlight = await prisma.instanceUpgrade.findFirst({
    where: {
      instanceId,
      status: { in: [UpgradeStatus.PENDING, UpgradeStatus.IN_PROGRESS] },
    },
  });
  if (inFlight) {
    throw new Error('An upgrade is already in progress for this instance');
  }

  // The branch plays no role in an image-only run; it is stored so the record
  // looks the same as one created by a full upgrade.
  const upgrade = await prisma.instanceUpgrade.create({
    data: {
      instanceId,
      status: UpgradeStatus.PENDING,
      previousCommit: instance.gitCommit,
      branch: instance.gitBranch,
      triggeredById: userId,
    },
  });

  const auditDetails = {
    upgradeId: upgrade.id,
    previousCommit: instance.gitCommit,
    source: 'remote',
    mode: 'image-only',
    options: options || {},
  };
  await prisma.auditLog.create({
    data: {
      userId,
      instanceId,
      action: AuditAction.INSTANCE_UPGRADE,
      details: auditDetails as unknown as Prisma.InputJsonValue,
      ipAddress,
    },
  });

  // Not awaited on purpose: the runner polls the agent in the background.
  void runRemoteUpgrade(upgrade.id, instance, undefined, 'image-only', options).catch((err) => {
    logger.error(`[image-upgrade] Remote image upgrade orchestration failed for ${instance.slug}: ${err}`);
  });

  return upgrade;
 }
 /**
 * Async REMOTE upgrade runner.
 *
@ -316,7 +400,9 @@ export async function startUpgrade(
 async function runRemoteUpgrade(
  upgradeId: string,
  instance: Instance,
-  options?: StartUpgradeOptions
+  options?: StartUpgradeOptions,
  mode: 'full' | 'image-only' = 'full',
  imageOnlyOptions?: StartImageUpgradeOptions
 ) {
  const slug = instance.slug;
@ -333,18 +419,27 @@ async function runRemoteUpgrade(
      where: { id: upgradeId },
      data: {
        status: UpgradeStatus.IN_PROGRESS,
-        progressMessage: 'Starting remote upgrade...',
+        progressMessage: mode === 'image-only'
          ? 'Starting image-only upgrade...'
          : 'Starting remote upgrade...',
      },
    });
    // Tell the agent to start. The agent has its own mutex + stale-progress
    // check, so this can return 409 if a previous upgrade is still running.
-    logger.info(`[upgrade] ${slug}: triggering remote upgrade.sh start`);
+    if (mode === 'image-only') {
-    await driver.startUpgrade({
+      logger.info(`[upgrade] ${slug}: triggering remote image-upgrade.sh start`);
-      skipBackup: options?.skipBackup,
+      await driver.startImageUpgrade({
-      useRegistry: options?.useRegistry,
+        imageTag: imageOnlyOptions?.imageTag,
-      branch: options?.branch,
+      });
-    });
+    } else {
      logger.info(`[upgrade] ${slug}: triggering remote upgrade.sh start`);
      await driver.startUpgrade({
        skipBackup: options?.skipBackup,
        useRegistry: options?.useRegistry,
        branch: options?.branch,
      });
    }
    // Poll progress + result. We treat /result returning 200 as the signal
    // that upgrade.sh exited (successfully or with code != 0 — the script
--- a/scripts/image-upgrade.sh
+++ b/scripts/image-upgrade.sh
@ -0,0 +1,383 @@
 #!/usr/bin/env bash
 # image-upgrade.sh — Approach B: image-only upgrade
 #
 # Pulls latest images from the registry and recreates services WITHOUT touching
 # tracked files in the install tree (no git pull, no tarball extract, no VERSION
 # mutation). Tenant content (mkdocs/, configs/) is implicitly safe: besides the
 # docker daemon, this script only writes its own bookkeeping — data/upgrade/,
 # logs/ (or /tmp as a fallback), .upgrade.lock, and a defensive mkdocs snapshot.
 #
 # Used by CCP "Quick Upgrade" button. Pairs with scripts/upgrade.sh which
 # remains the full upgrade path for orchestration-changing releases.
 #
 # Schema parity: writes data/upgrade/progress.json + result.json with the same
 # fields upgrade.sh writes, so the CCP poll loop is unchanged.
 # Abort on command failure, on use of an unset variable, and on a failure
 # anywhere inside a pipeline.
 set -euo pipefail
 # Install root: the parent of the directory holding this script, resolved
 # through symlinks.
 PROJECT_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/.." && pwd)"
 SCRIPT_DIR="$PROJECT_DIR/scripts"
 # Directory holding progress.json / result.json.
 UPGRADE_DIR="$PROJECT_DIR/data/upgrade"
 LOG_DIR="$PROJECT_DIR/logs"
 # One timestamped log file per run.
 LOG_FILE="$LOG_DIR/image-upgrade-$(date +%Y%m%d_%H%M%S).log"
 # PID file used by acquire_lock/release_lock.
 LOCK_FILE="$PROJECT_DIR/.upgrade.lock"
 PROGRESS_FILE="$UPGRADE_DIR/progress.json"
 RESULT_FILE="$UPGRADE_DIR/result.json"
 # Baseline of bash's SECONDS counter, used to compute the run duration.
 START_TIME=$SECONDS
 # --- Detect install mode ---
 # "release" = a VERSION file is present and there is no .git directory;
 # every other layout is treated as a "source" install.
 INSTALL_MODE="source"
 if [[ -f "$PROJECT_DIR/VERSION" && ! -d "$PROJECT_DIR/.git" ]]; then
  INSTALL_MODE="release"
 fi
 # --- Defaults ---
 # Set by --api-mode: enables writing progress.json / result.json.
 API_MODE=false
 # Set by --dry-run: report actions instead of pulling or recreating.
 DRY_RUN=false
 # Set by --image-tag: when non-empty, exported as IMAGE_TAG to docker compose.
 IMAGE_TAG=""
 # Print the command-line help text to stdout. Callers redirect it to stderr
 # when reporting an unknown option.
 usage() {
  cat <<EOF
 Usage: $(basename "$0") [options]
 Image-only upgrade: pulls latest images from the configured registry and
 recreates services without touching the install tree.
 Options:
  --api-mode           Emit data/upgrade/{progress,result}.json (no TTY output)
  --dry-run            Print what would happen; do not pull or recreate
  --image-tag TAG      Override IMAGE_TAG (env var) for this run
  -h, --help           Show this help
 This script never modifies mkdocs/, configs/, scripts/, docker-compose.yml,
 or VERSION. It is the safest upgrade path for orchestration-stable releases.
 EOF
 }
 # Consume command-line options one at a time until none remain.
 until [[ $# -eq 0 ]]; do
  case "$1" in
    -h|--help)
      usage
      exit 0
      ;;
    --api-mode)
      API_MODE=true
      shift
      ;;
    --dry-run)
      DRY_RUN=true
      shift
      ;;
    --image-tag)
      # The tag is mandatory; a missing or empty value aborts with a message.
      IMAGE_TAG="${2:?--image-tag requires a value}"
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage >&2
      exit 1
      ;;
  esac
 done
 # --- Colors ---
 # Colors start out disabled and are switched on only when stdout is a
 # terminal and NO_COLOR is unset or empty.
 RED='' GREEN='' YELLOW='' CYAN='' BOLD='' NC=''
 if [[ -t 1 && -z "${NO_COLOR:-}" ]]; then
  RED='\033[0;31m'
  GREEN='\033[0;32m'
  YELLOW='\033[0;33m'
  CYAN='\033[0;36m'
  BOLD='\033[1m'
  NC='\033[0m'
 fi
 # Leveled log helpers. Each prints a colored tag followed by its arguments;
 # error() writes to stderr, the others to stdout.
 info()    { echo -e "${CYAN}[INFO]${NC} $*"; }
 success() { echo -e "${GREEN}[ OK ]${NC} $*"; }
 warn()    { echo -e "${YELLOW}[WARN]${NC} $*"; }
 error()   { echo -e "${RED}[ERR ]${NC} $*" >&2; }
 # phase NUM NAME — print a blank line and a "=== Phase NUM: NAME ===" heading.
 phase()   { echo ""; echo -e "${BOLD}${CYAN}=== Phase $1: $2 ===${NC}"; }
 # --- Logging: mirror stdout/stderr to LOG_FILE ---
 # logs/ may be root-owned on installs where upgrade.sh has run via ccp-agent.
 # Fall back to /tmp if we can't write, so bunker-admin manual invocations don't
 # crash with "Permission denied" on tee.
 mkdir -p "$UPGRADE_DIR"
 # Prefer the install's logs/ directory; when it cannot be created or the log
 # file cannot be touched, switch to a per-process file under /tmp.
 if ! { mkdir -p "$LOG_DIR" 2>/dev/null && touch "$LOG_FILE" 2>/dev/null; }; then
  LOG_FILE="/tmp/image-upgrade-$(date +%Y%m%d_%H%M%S)-$$.log"
  echo "[INFO] logs/ not writable; using $LOG_FILE" >&2
 fi
 # From here on everything written to stdout/stderr is also appended to LOG_FILE.
 exec > >(tee -a "$LOG_FILE") 2>&1
 # --- Capture previous version for result.json ---
 # Release installs report the first line of VERSION; source installs report
 # the short git commit. Either lookup falls back to "unknown" on failure.
 case "$INSTALL_MODE" in
  release)
    PRE_VERSION="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "unknown")"
    ;;
  *)
    PRE_VERSION="$(cd "$PROJECT_DIR" && git rev-parse --short HEAD 2>/dev/null || echo "unknown")"
    ;;
 esac
 # _iu_json_escape STRING — print STRING escaped for use inside a JSON string
 # literal (backslash, double quote, newline, carriage return, tab).
 _iu_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
 }
 # write_progress PHASE_NUM PHASE_NAME PERCENT MESSAGE
 #
 # In API mode, publish the current phase to PROGRESS_FILE as JSON with the
 # fields phase, phaseName, percentage, message and lastUpdate (UTC). Outside
 # API mode this is a no-op.
 #
 # The file is written to a temporary sibling and renamed into place, so a
 # reader polling PROGRESS_FILE never observes a half-written document. Text
 # fields are JSON-escaped so quotes, backslashes or control characters in a
 # message cannot produce invalid JSON.
 write_progress() {
  local phase_num="$1" phase_name="$2" pct="$3" msg="$4"
  if [[ "$API_MODE" != "true" ]]; then
    return 0
  fi
  mkdir -p "$UPGRADE_DIR"
  local tmp_file="${PROGRESS_FILE}.tmp.$$"
  printf '{\n  "phase": %s,\n  "phaseName": "%s",\n  "percentage": %s,\n  "message": "%s",\n  "lastUpdate": "%s"\n}\n' \
    "$phase_num" \
    "$(_iu_json_escape "$phase_name")" \
    "$pct" \
    "$(_iu_json_escape "$msg")" \
    "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$tmp_file"
  mv -f "$tmp_file" "$PROGRESS_FILE"
 }
 # _iu_json_escape STRING — print STRING escaped for use inside a JSON string
 # literal (backslash, double quote, newline, carriage return, tab).
 _iu_json_escape() {
  local s="$1"
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
 }
 # write_result SUCCESS MESSAGE [WARNINGS_JSON]
 #
 # In API mode, publish the final outcome to RESULT_FILE and remove
 # PROGRESS_FILE. Outside API mode this is a no-op.
 #
 #   SUCCESS        literal JSON boolean ("true" / "false")
 #   MESSAGE        human-readable summary (JSON-escaped here)
 #   WARNINGS_JSON  pre-built JSON array, defaults to []
 #
 # The version fields are re-read at call time and JSON-escaped: they come
 # from the VERSION file or git, so a stray carriage return or quote must not
 # corrupt the document. The file is written to a temporary sibling and
 # renamed into place so a poller never reads a partial result.
 write_result() {
  [[ "$API_MODE" != "true" ]] && return
  local success_val="$1" msg="$2"
  local warnings_json="${3:-[]}"
  local duration_secs=$((SECONDS - START_TIME))
  local new_version="$PRE_VERSION"
  if [[ "$INSTALL_MODE" == "release" ]]; then
    new_version="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "$PRE_VERSION")"
  else
    new_version="$(cd "$PROJECT_DIR" && git rev-parse --short HEAD 2>/dev/null || echo "$PRE_VERSION")"
  fi
  mkdir -p "$UPGRADE_DIR"
  local tmp_file="${RESULT_FILE}.tmp.$$"
  printf '{\n  "success": %s,\n  "message": "%s",\n  "previousCommit": "%s",\n  "newCommit": "%s",\n  "commitCount": 0,\n  "durationSeconds": %s,\n  "warnings": %s,\n  "completedAt": "%s",\n  "mode": "image-only"\n}\n' \
    "$success_val" \
    "$(_iu_json_escape "$msg")" \
    "$(_iu_json_escape "$PRE_VERSION")" \
    "$(_iu_json_escape "$new_version")" \
    "$duration_secs" \
    "$warnings_json" \
    "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$tmp_file"
  mv -f "$tmp_file" "$RESULT_FILE"
  rm -f "$PROGRESS_FILE"
 }
 # --- Lock + cleanup ---
 # Take the upgrade lock by recording this script's PID in LOCK_FILE.
 #
 # An existing lock whose PID is still alive means another upgrade is running:
 # the refusal is reported (including a failed result in API mode) and the
 # script exits with status 1. A lock with no PID or a dead PID is treated as
 # stale and replaced.
 acquire_lock() {
  if [[ -f "$LOCK_FILE" ]]; then
    local holder_pid
    holder_pid="$(cat "$LOCK_FILE" 2>/dev/null || echo "")"
    if [[ -z "$holder_pid" ]] || ! kill -0 "$holder_pid" 2>/dev/null; then
      warn "Stale lock file found; removing"
      rm -f "$LOCK_FILE"
    else
      error "Upgrade already running (pid $holder_pid). Refusing to start."
      write_result "false" "Another upgrade is already running (pid $holder_pid)"
      exit 1
    fi
  fi
  echo $$ > "$LOCK_FILE"
 }
 # Remove LOCK_FILE, but only when it records this script's own PID.
 #
 # This runs from the EXIT trap on every exit path, including the one where
 # acquire_lock refuses to start because another upgrade holds the lock.
 # Deleting the file unconditionally would drop that other process's lock and
 # let a third run start alongside it. Always returns 0.
 release_lock() {
  local owner_pid=""
  owner_pid="$(cat "$LOCK_FILE" 2>/dev/null || true)"
  if [[ "$owner_pid" == "$$" ]]; then
    rm -f "$LOCK_FILE" || true
  fi
  return 0
 }
 # ERR-trap handler: on_failure LINE_NO
 #
 # Reports the failing line and exit status, records a failed result (API
 # mode), drops the lock and exits with the status of the command that failed.
 on_failure() {
  # $? must be read before any other command overwrites it.
  local rc=$? failed_line=${1:-?}
  local where="line $failed_line (exit $rc)"
  error "image-upgrade.sh failed at $where"
  write_result "false" "Image upgrade failed at $where"
  release_lock
  exit "$rc"
 }
 # Any failing command routes through on_failure with the line number at the
 # time the trap fires.
 # NOTE(review): `set -E` (errtrace) is not enabled in this script, so the ERR
 # trap is presumably not inherited inside functions — confirm that is intended.
 trap 'on_failure $LINENO' ERR
 # The lock is released on every exit path, including explicit `exit` calls.
 trap 'release_lock' EXIT
 # --- Banner ---
 # Run header: banner, install facts, optional tag / dry-run notices.
 BANNER_RULE="${BOLD}${CYAN}================================================${NC}"
 echo ""
 echo -e "$BANNER_RULE"
 echo -e "${BOLD}  Image-Only Upgrade${NC}"
 echo -e "$BANNER_RULE"
 echo "Install mode: $INSTALL_MODE"
 echo "Project dir:  $PROJECT_DIR"
 echo "Pre-version:  $PRE_VERSION"
 if [[ -n "$IMAGE_TAG" ]]; then
  echo "Image tag:    $IMAGE_TAG"
 fi
 if [[ "$DRY_RUN" == "true" ]]; then
  echo "DRY RUN: no images will be pulled or services recreated"
 fi
 echo ""
 # Nothing below this line runs unless this process holds the upgrade lock.
 acquire_lock
 # =============================================================================
 # Phase 1: Pre-flight + mkdocs snapshot (defensive)
 # =============================================================================
 phase "1" "Pre-flight"
 write_progress 1 "Pre-flight" 10 "Snapshotting mkdocs (defensive)..."
 # Source mkdocs-snapshot.sh in a child shell and run snapshot_mkdocs. An
 # image-only upgrade does not mutate the filesystem, so the snapshot is cheap
 # insurance rather than a requirement; a failure is reported and ignored.
 #
 # The library path is handed to the child shell as a positional argument
 # instead of being spliced into the command string, so an install path that
 # contains spaces or shell metacharacters is sourced verbatim.
 if [[ -r "$SCRIPT_DIR/lib/mkdocs-snapshot.sh" ]]; then
  if [[ "$DRY_RUN" == "true" ]]; then
    info "[DRY RUN] Would snapshot mkdocs/"
  else
    # shellcheck disable=SC1091
    PROJECT_DIR="$PROJECT_DIR" bash -c '. "$1"; snapshot_mkdocs' _ "$SCRIPT_DIR/lib/mkdocs-snapshot.sh" \
      || warn "mkdocs snapshot failed (non-fatal; continuing)"
  fi
 else
  warn "scripts/lib/mkdocs-snapshot.sh not found; skipping snapshot"
 fi
 # Sanity-check docker
 # Without a working `docker compose` nothing else can succeed: record the
 # failure and stop.
 docker compose version &>/dev/null || {
  error "docker compose is not available"
  write_result "false" "docker compose not available"
  exit 1
 }
 success "Pre-flight checks passed"
 # =============================================================================
 # Phase 2: Pull images
 # =============================================================================
 phase "2" "Pull Images"
 write_progress 2 "Pull Images" 30 "Pulling images from registry..."
 # Environment overrides for the pull; empty unless --image-tag was given.
 PULL_ENV=()
 if [[ -n "$IMAGE_TAG" ]]; then
  PULL_ENV+=("IMAGE_TAG=$IMAGE_TAG")
 fi
 if [[ "$DRY_RUN" == "true" ]]; then
  info "[DRY RUN] Would run: ${PULL_ENV[*]:-} docker compose pull"
 else
  info "Pulling all images (this may take a few minutes)..."
  # Build the command once; `env` is prepended only when there is something
  # to override, so the empty array is never expanded.
  PULL_CMD=(docker compose pull)
  if (( ${#PULL_ENV[@]} > 0 )); then
    PULL_CMD=(env "${PULL_ENV[@]}" "${PULL_CMD[@]}")
  fi
  # A failed pull is tolerated: the images may already be present locally.
  "${PULL_CMD[@]}" \
    || warn "docker compose pull had errors (continuing — some images may be local)"
 fi
 success "Image pull complete"
 # =============================================================================
 # Phase 3: Recreate core app services (targeted, not broad)
 # =============================================================================
 phase "3" "Recreate Services"
 write_progress 3 "Recreate Services" 60 "Recreating core app services with new images..."
 # Targeted recreate: only the services whose IMAGES are released as part of
 # changemaker.lite (api, admin, media-api, nginx). Broader `up -d` is risky
 # because a single misconfigured mount in any service (e.g. mkdocs-site-server)
 # can cascade and leave dependent containers in "Created" state. Image-only
 # upgrade should only touch the actual code containers, not third-party
 # infrastructure that happens to live in the same compose file.
 #
 # Same Phase 6 pattern as upgrade.sh: drop ccp-agent from COMPOSE_PROFILES
 # during recreate so we don't suicide-restart the agent that spawned us.
 # Restart ccp-agent at the end via detached subshell.
 PROFILES_SAVED="${COMPOSE_PROFILES:-}"
 # `grep -v` exits 1 when it selects no lines, which is exactly what happens
 # when COMPOSE_PROFILES is just "ccp-agent". Under `set -eo pipefail` that
 # status would fail the whole assignment and abort the upgrade, so the grep
 # stage is allowed to come up empty.
 COMPOSE_PROFILES_WITHOUT_AGENT="$(echo "${PROFILES_SAVED}" \
  | tr ',' '\n' | { grep -vx 'ccp-agent' || true; } | paste -sd, -)"
 UP_ENV=("COMPOSE_PROFILES=${COMPOSE_PROFILES_WITHOUT_AGENT}")
 if [[ -n "$IMAGE_TAG" ]]; then
  UP_ENV+=("IMAGE_TAG=$IMAGE_TAG")
 fi
 # Core services that ship as v2 release images. nginx last so it doesn't
 # briefly proxy to an old api. media-api may not be enabled on all installs;
 # tolerate it being missing from compose.
 CORE_SERVICES=(api admin media-api nginx)
 EXISTING_SERVICES=()
 # The compose service list is captured once and then matched line-by-line
 # from a here-string, so no pipeline (and no pipefail/SIGPIPE interaction)
 # is involved in the membership test.
 COMPOSE_SERVICES_LIST="$(docker compose config --services 2>/dev/null || true)"
 for svc in "${CORE_SERVICES[@]}"; do
  if ! grep -qx -- "$svc" <<<"$COMPOSE_SERVICES_LIST"; then
    info "Skipping service '$svc' (not in compose file)"
    continue
  fi
  EXISTING_SERVICES+=("$svc")
 done
 if (( ${#EXISTING_SERVICES[@]} > 0 )); then
  if [[ "$DRY_RUN" == "true" ]]; then
    info "[DRY RUN] Would run: ${UP_ENV[*]} docker compose up -d ${EXISTING_SERVICES[*]}"
  else
    info "Recreating core services: ${EXISTING_SERVICES[*]}"
    env "${UP_ENV[@]}" docker compose up -d "${EXISTING_SERVICES[@]}"
  fi
 else
  warn "No core app services found in compose; skipping recreate"
 fi
 success "Services recreated"
 # Restart Pangolin tunnel connector if running (image may have changed)
 # Capture the container names first and match against the captured text.
 # Piping `docker ps` straight into `grep -q` under `set -o pipefail` can
 # report failure even on a match: grep exits at the first hit and the writer
 # may then die on SIGPIPE, which would silently skip the restart.
 RUNNING_CONTAINER_NAMES="$(docker ps --format '{{.Names}}' || true)"
 if grep -q 'newt' <<<"$RUNNING_CONTAINER_NAMES"; then
  if [[ "$DRY_RUN" == "true" ]]; then
    info "[DRY RUN] Would restart newt"
  else
    info "Restarting Pangolin tunnel connector..."
    docker compose restart newt 2>/dev/null || true
    success "Newt tunnel restarted"
  fi
 fi
 # =============================================================================
 # Phase 4: Verify (light health checks)
 # =============================================================================
 phase "4" "Verification"
 write_progress 4 "Verification" 85 "Running health checks..."
 # Flipped to "true" by verify_health when a check never succeeds.
 VERIFY_FAILED=false
 # JSON array handed to write_result; replaced below when a check failed.
 UPGRADE_WARNINGS="[]"
 # verify_health NAME CHECK_CMD [MAX_WAIT]
 #
 # Re-run CHECK_CMD (through eval, stderr discarded) every 3 seconds until it
 # succeeds or MAX_WAIT seconds (default 45) have been spent waiting. A check
 # that never succeeds produces a warning and sets VERIFY_FAILED=true; the
 # function itself always returns 0 so a failed check never aborts the run.
 verify_health() {
  local label="$1" probe="$2" timeout="${3:-45}"
  local elapsed
  for (( elapsed = 0; elapsed < timeout; elapsed += 3 )); do
    if eval "$probe" 2>/dev/null; then
      success "$label: healthy (${elapsed}s)"
      return 0
    fi
    sleep 3
  done
  warn "$label: not responding after ${timeout}s"
  VERIFY_FAILED=true
  return 0
 }
 # Health checks are skipped entirely on a dry run. Each check runs inside the
 # service's own container; failures only produce a warning in the result.
 if [[ "$DRY_RUN" != "true" ]]; then
  verify_health "API (port 4000)" \
    "docker compose exec -T api wget -q --spider http://localhost:4000/api/health" 60
  verify_health "Admin (port 3000)" \
    "docker compose exec -T admin wget -q --spider http://localhost:3000/" 90
  # Capture-then-match instead of `docker ps | grep -q`: with pipefail, grep's
  # early exit can SIGPIPE the writer and make a successful match look like a
  # failure, which would silently skip the Media API check.
  HEALTH_CONTAINER_NAMES="$(docker ps --format '{{.Names}}' || true)"
  if grep -q 'changemaker-media-api' <<<"$HEALTH_CONTAINER_NAMES"; then
    verify_health "Media API (port 4100)" \
      "docker compose exec -T media-api wget -q --spider http://127.0.0.1:4100/health" 30
  fi
  # Compare the flag as a string rather than executing its value as a command.
  if [[ "$VERIFY_FAILED" == "true" ]]; then
    UPGRADE_WARNINGS='["Some health checks failed after image-only upgrade — services may still be starting"]'
  fi
 fi
 # =============================================================================
 # Summary + deferred ccp-agent restart
 # =============================================================================
 # Wall-clock duration of the run, split into minutes and seconds for display.
 ELAPSED_TOTAL=$((SECONDS - START_TIME))
 ELAPSED_MIN=$((ELAPSED_TOTAL / 60))
 ELAPSED_SEC=$((ELAPSED_TOTAL % 60))
 SUMMARY_RULE="${BOLD}${GREEN}================================================${NC}"
 echo ""
 echo -e "$SUMMARY_RULE"
 echo -e "${BOLD}  Image-Only Upgrade Complete${NC}"
 echo -e "$SUMMARY_RULE"
 printf "  Previous:  %s\n" "$PRE_VERSION"
 printf "  Duration:  %dm %ds\n" "$ELAPSED_MIN" "$ELAPSED_SEC"
 printf "  Log:       %s\n" "$LOG_FILE"
 # Final progress tick, then the result document (which also removes the
 # progress file in API mode).
 write_progress 4 "Complete" 100 "Image-only upgrade complete"
 write_result "true" "Image-only upgrade complete (previous: ${PRE_VERSION})" "$UPGRADE_WARNINGS"
 # Deferred ccp-agent restart — see upgrade.sh for full rationale. Same
 # mechanism: nohup'd, disowned subshell that picks up the new image after
 # this script has cleanly exited.
 # Only installs that had ccp-agent among their comma-separated profiles get
 # the deferred restart. The membership test is a plain pattern match on the
 # comma-delimited list, so no pipeline is involved.
 if [[ ",${PROFILES_SAVED:-}," == *",ccp-agent,"* ]]; then
  if [[ "$DRY_RUN" == "true" ]]; then
    info "[DRY RUN] Would schedule deferred ccp-agent restart"
  else
    info "Scheduling deferred ccp-agent restart..."
    # The project directory is passed as an argument rather than pasted into
    # the command string, so a path containing quotes or spaces cannot break
    # or alter the generated command. The restart is skipped if the directory
    # cannot be entered, instead of running compose from the wrong place.
    nohup bash -c '
      sleep 3
      cd "$1" || exit 1
      COMPOSE_PROFILES="ccp-agent" docker compose --profile ccp-agent up -d ccp-agent
    ' _ "$PROJECT_DIR" >/dev/null 2>&1 < /dev/null &
    disown
    success "ccp-agent restart scheduled (will pick up new image)"
  fi
 fi
 # Clean exit: release the lock explicitly, then clear the EXIT trap so it
 # does not run a second time.
 release_lock
 trap - EXIT
 exit 0