Fresh-install + upgrade-path hardening bundle

Six independent fixes surfaced during the v2.9.1 → v2.9.2 admin-UI
upgrade validation today. Together they make a clean install on a new
box work end-to-end without in-session patching.

- Fix 1: scripts/validate-compose-parity.sh + build-release.sh hook —
  fail release builds when api/admin/media-api/nginx healthcheck
  blocks drift between docker-compose.yml and docker-compose.prod.yml.
  The previous boot-race fix had to be applied to both files manually.

- Fix 2: scripts/systemd/install.sh now chowns logs/ to the install
  user (the API container creates subdirs there as root, locking the
  host-side watcher out) and pre-creates logs/upgrade-watcher.log;
  changemaker-upgrade.service gains StartLimitIntervalSec=0 so a
  single transient failure can't wedge the .path unit permanently.

- Fix 3: /api/upgrade/status now returns a `watcher` sub-object that
  flags the host systemd watcher as stalled when trigger.json has
  been pending >30s. The admin SettingsPage SystemUpgradeTab renders
  a warning Alert with the systemctl recovery command when unhealthy
  (see the example payload just after this list).

- Fix 4: scripts/upgrade.sh write_result() — prefer head -1 VERSION
  over `git rev-parse HEAD` so release-mode upgrades report the new
  tag in result.json instead of "unknown".

- Fix 5: admin container healthcheck start_period 20s → 60s in both
  compose files, same class as the earlier api fix. Matches Gancio
  convention.

- Fix 6: /api/pangolin/sync now detects resources bound to a stale
  siteId (common after --pangolin-site new rotations), deletes and
  recreates them against the current site, and reports them under
  a new `reassigned` response field.
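
For reference, a rough sketch of the stalled-watcher status payload (host,
auth, and the sample timestamp/reason values are illustrative; the field
names come from the WatcherHealth interface added in this commit):

  # assumes an authenticated admin session; adjust host/auth for your install
  curl -s https://YOUR-DOMAIN/api/upgrade/status | jq '.watcher'
  # expected shape once trigger.json has been pending for more than 30s:
  # {
  #   "healthy": false,
  #   "reason": "Trigger file has been pending for 45s — host upgrade watcher may be stopped or failed",
  #   "pendingSince": "2026-04-15T17:40:00.000Z"
  # }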

Bunker Admin
bunker-admin 2026-04-15 11:57:50 -06:00
parent 5115c65691
commit 23df6a8b52
12 changed files with 223 additions and 39 deletions

View File

@ -62,7 +62,7 @@ import { api } from '@/lib/api';
import { useMobile } from '@/hooks/useMobile';
import { PageTour } from '@/components/tour/PageTour';
import type { AppOutletContext } from '@/components/AppLayout';
import type { SmtpTestResult, SmtpSendTestResult, UpgradeStatusResponse, UpgradeStatus, UpgradeProgress, UpgradeResult, UpgradeHistoryResponse } from '@/types/api';
import type { SmtpTestResult, SmtpSendTestResult, UpgradeStatusResponse, UpgradeStatus, UpgradeProgress, UpgradeResult, UpgradeHistoryResponse, WatcherHealth } from '@/types/api';
const { Text, Paragraph } = Typography;
@ -742,6 +742,7 @@ function SystemUpgradeTab() {
const [progress, setProgress] = useState<UpgradeProgress | null>(null);
const [result, setResult] = useState<UpgradeResult | null>(null);
const [running, setRunning] = useState(false);
const [watcher, setWatcher] = useState<WatcherHealth | null>(null);
const [checking, setChecking] = useState(false);
const [upgrading, setUpgrading] = useState(false);
const [apiOffline, setApiOffline] = useState(false);
@ -760,6 +761,7 @@ function SystemUpgradeTab() {
setProgress(data.progress);
setResult(data.result);
setRunning(data.running);
setWatcher(data.watcher ?? null);
setApiOffline(false);
return data;
} catch {
@ -996,6 +998,26 @@ function SystemUpgradeTab() {
/>
)}
{watcher && !watcher.healthy && (
<Alert
type="warning"
message="Upgrade watcher stalled"
description={
<>
<div>{watcher.reason || 'Host systemd watcher is not processing upgrade triggers.'}</div>
<div style={{ marginTop: 8 }}>
Recovery:{' '}
<Text code>
sudo systemctl reset-failed changemaker-upgrade.path changemaker-upgrade.service && sudo systemctl restart changemaker-upgrade.path
</Text>
</div>
</>
}
showIcon
style={{ marginBottom: 16 }}
/>
)}
{/* Actions */}
<Space style={{ marginBottom: 16 }}>
<Button

View File

@ -3179,11 +3179,18 @@ export interface UpgradeHistoryResponse {
history: UpgradeResult[];
}
export interface WatcherHealth {
healthy: boolean;
reason?: string;
pendingSince?: string;
}
export interface UpgradeStatusResponse {
status: UpgradeStatus | null;
progress: UpgradeProgress | null;
result: UpgradeResult | null;
running: boolean;
watcher?: WatcherHealth;
}
// --- Social Calendar Types ---

View File

@ -867,11 +867,43 @@ router.post('/sync', pangolinSetupLimiter, async (_req: Request, res: Response)
const existingByDomain = new Map(existing.map(r => [r.fullDomain || '', r]));
const created: string[] = [];
const reassigned: string[] = [];
const targetFixed: string[] = [];
const skipped: string[] = [];
const warnings: string[] = [];
const errors: string[] = [];
// Create resource + public access + target. Shared by "new" and "reassign"
// flows so `--pangolin-site new` installs can rebuild after a site rotation.
const createResourceForDef = async (def: ResourceDefinition, fullDomain: string) => {
const resource = await pangolinClient.createResource({
name: def.name,
domainId: matchingDomain.domainId,
...(def.subdomain ? { subdomain: def.subdomain } : {}),
http: true,
protocol: 'tcp',
});
try {
await pangolinClient.updateResource(resource.resourceId, { sso: false, blockAccess: false });
} catch {
logger.warn(`Created ${fullDomain} but failed to set public access`);
}
try {
await pangolinClient.createTarget(resource.resourceId, {
siteId,
ip: def.target_ip,
port: def.target_port,
method: 'http',
enabled: true,
});
} catch (targetErr) {
const msg = targetErr instanceof Error ? targetErr.message : 'Unknown error';
errors.push(`${fullDomain} (target): ${msg}`);
}
};
for (const def of resourceDefs) {
const fullDomain = def.subdomain ? `${def.subdomain}.${domain}` : domain;
@ -890,10 +922,30 @@ router.post('/sync', pangolinSetupLimiter, async (_req: Request, res: Response)
const existingResource = existingByDomain.get(fullDomain);
if (existingResource) {
// Resource exists — verify it has a target
// Resource exists — verify target points at the CURRENT site.
try {
const targets = await pangolinClient.listTargets(existingResource.resourceId);
if (targets.length === 0) {
const currentTargetSiteId = targets[0]?.siteId;
const siteMismatch =
targets.length > 0 && Number(currentTargetSiteId) !== Number(siteId);
if (siteMismatch) {
// Stale siteId from a previous `--pangolin-site new` install.
// Delete and recreate against the current site.
logger.warn(
`Resource ${fullDomain} bound to stale siteId ${currentTargetSiteId}, reassigning to ${siteId}`,
);
try {
await pangolinClient.deleteResource(existingResource.resourceId);
await createResourceForDef(def, fullDomain);
reassigned.push(fullDomain);
logger.info(`Reassigned ${fullDomain} to siteId ${siteId}`);
} catch (err) {
const msg = err instanceof Error ? err.message : 'Unknown error';
errors.push(`${fullDomain} (reassign): ${msg}`);
logger.error(`Failed to reassign resource ${fullDomain}:`, err);
}
} else if (targets.length === 0) {
// Missing target — create one
logger.info(`Resource ${fullDomain} has no target, creating one...`);
await pangolinClient.createTarget(existingResource.resourceId, {
@ -927,36 +979,7 @@ router.post('/sync', pangolinSetupLimiter, async (_req: Request, res: Response)
} else {
// Create new resource + target
try {
// Root domain: omit subdomain field entirely (Pangolin rejects empty string)
const resource = await pangolinClient.createResource({
name: def.name,
domainId: matchingDomain.domainId,
...(def.subdomain ? { subdomain: def.subdomain } : {}),
http: true,
protocol: 'tcp',
});
// Make publicly accessible (disable SSO auth + blockAccess)
try {
await pangolinClient.updateResource(resource.resourceId, { sso: false, blockAccess: false });
} catch {
logger.warn(`Created ${fullDomain} but failed to set public access`);
}
// Create target
try {
await pangolinClient.createTarget(resource.resourceId, {
siteId,
ip: def.target_ip,
port: def.target_port,
method: 'http',
enabled: true,
});
} catch (targetErr) {
const msg = targetErr instanceof Error ? targetErr.message : 'Unknown error';
errors.push(`${fullDomain} (target): ${msg}`);
}
await createResourceForDef(def, fullDomain);
created.push(fullDomain);
logger.info(`Created resource + target: ${fullDomain}`);
} catch (err) {
@ -970,11 +993,12 @@ router.post('/sync', pangolinSetupLimiter, async (_req: Request, res: Response)
res.json({
success: true,
created: created.length,
reassigned: reassigned.length,
targetFixed: targetFixed.length,
skipped: skipped.length,
warnings: warnings.length,
errors: errors.length,
details: { created, targetFixed, skipped, warnings, errors },
details: { created, reassigned, targetFixed, skipped, warnings, errors },
});
} catch (err) {
const msg = err instanceof Error ? err.message : 'Unknown error';
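
Illustrative sync response after a `--pangolin-site new` rotation (the counts
are made up; the top-level fields mirror the res.json block above):

  # POST as the authenticated admin; host/auth are placeholders
  curl -s -X POST https://YOUR-DOMAIN/api/pangolin/sync \
    | jq '{created, reassigned, targetFixed, skipped, errors}'
  # { "created": 0, "reassigned": 5, "targetFixed": 0, "skipped": 1, "errors": 0 }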

View File

@ -17,8 +17,9 @@ router.get('/status', (_req, res) => {
const progress = upgradeService.getProgress();
const result = upgradeService.getResult();
const running = upgradeService.isRunning();
const watcher = upgradeService.getWatcherHealth();
res.json({ status, progress: running ? progress : null, result, running });
res.json({ status, progress: running ? progress : null, result, running, watcher });
});
/**

View File

@ -60,6 +60,12 @@ export interface UpgradeResult {
triggeredBy?: string;
}
export interface WatcherHealth {
healthy: boolean;
reason?: string;
pendingSince?: string;
}
interface TriggerPayload {
action: 'check' | 'upgrade';
branch?: string;
@ -96,6 +102,31 @@ function getStatus(): UpgradeStatus | null {
return readJsonFile<UpgradeStatus>(STATUS_FILE);
}
/**
* Watcher liveness heuristic. The host-side systemd watcher consumes and
* DELETES trigger.json within ~1s of it appearing. If trigger.json exists
* and is older than the threshold, the `.path` unit is almost certainly
* wedged (e.g. StartLimitBurst latch) and the admin UI should surface it.
*/
const WATCHER_STALL_MS = 30 * 1000;
function getWatcherHealth(): WatcherHealth {
try {
if (!fs.existsSync(TRIGGER_FILE)) return { healthy: true };
const mtimeMs = fs.statSync(TRIGGER_FILE).mtimeMs;
const age = Date.now() - mtimeMs;
if (age <= WATCHER_STALL_MS) return { healthy: true };
return {
healthy: false,
reason: `Trigger file has been pending for ${Math.round(age / 1000)}s — host upgrade watcher may be stopped or failed`,
pendingSince: new Date(mtimeMs).toISOString(),
};
} catch (err) {
logger.warn('getWatcherHealth failed:', err);
return { healthy: true };
}
}
function getProgress(): UpgradeProgress | null {
return readJsonFile<UpgradeProgress>(PROGRESS_FILE);
}
@ -221,6 +252,7 @@ export const upgradeService = {
getStatus,
getProgress,
getResult,
getWatcherHealth,
isRunning,
triggerCheck,
triggerUpgrade,

View File

@ -236,7 +236,7 @@ services:
interval: 30s
timeout: 5s
retries: 3
start_period: 20s
start_period: 60s
environment:
- DOMAIN=${DOMAIN:-cmlite.org}
- NODE_ENV=${NODE_ENV:-production}

View File

@ -248,7 +248,7 @@ services:
interval: 30s
timeout: 5s
retries: 3
start_period: 20s
start_period: 60s
environment:
- DOMAIN=${DOMAIN:-cmlite.org}
- NODE_ENV=${NODE_ENV:-development}

View File

@ -87,6 +87,17 @@ if [[ ! -f "$PROJECT_DIR/docker-compose.prod.yml" ]]; then
error "docker-compose.prod.yml not found. Generate it first."
exit 1
fi
# Fail the release if dev and prod compose files have drifted on critical
# healthcheck blocks — catches cases where one file was patched without
# the other (bit us on the api start_period fix).
if [[ -x "$PROJECT_DIR/scripts/validate-compose-parity.sh" ]]; then
if ! bash "$PROJECT_DIR/scripts/validate-compose-parity.sh"; then
error "Compose parity check failed. Aborting release build."
exit 1
fi
fi
cp "$PROJECT_DIR/docker-compose.prod.yml" "$STAGE_DIR/docker-compose.yml"
info "docker-compose.yml (production)"

View File

@ -1,6 +1,7 @@
[Unit]
Description=Changemaker Lite upgrade dispatcher
Documentation=https://docs.cmlite.org/docs/admin/services/
StartLimitIntervalSec=0
[Service]
Type=oneshot

View File

@ -28,6 +28,14 @@ for unit in "${SCRIPT_DIR}"/changemaker-upgrade.*; do
echo " Installed ${filename}"
done
# Ensure logs/ is writable by the install user. The API container creates
# subdirs here as root, which locks out the host-side upgrade-watcher service.
mkdir -p "${PROJECT_DIR}/logs"
chown "${INSTALL_USER}:${INSTALL_USER}" "${PROJECT_DIR}/logs"
touch "${PROJECT_DIR}/logs/upgrade-watcher.log"
chown "${INSTALL_USER}:${INSTALL_USER}" "${PROJECT_DIR}/logs/upgrade-watcher.log"
echo " Prepared ${PROJECT_DIR}/logs (owned by ${INSTALL_USER})"
systemctl daemon-reload
systemctl enable --now changemaker-upgrade.path
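
Post-install sanity check (a sketch; the project path is a placeholder and the
expected owner is whichever user ran the installer):

  systemctl is-active changemaker-upgrade.path    # expect: active
  stat -c '%U' /path/to/changemaker-lite/logs     # expect: install user, not root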

View File

@ -118,7 +118,7 @@ write_result() {
"success": ${success},
"message": "$(echo "$msg" | sed 's/"/\\"/g')",
"previousCommit": "${PRE_UPGRADE_SHORT:-unknown}",
"newCommit": "$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
"newCommit": "$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
"commitCount": ${COMMIT_COUNT:-0},
"durationSeconds": ${duration_secs},
"warnings": ${warnings_json},
@ -1089,7 +1089,7 @@ fi
info "Starting API..."
if [[ "$NEEDS_VOLUME_REFRESH" == "true" ]]; then
info "Removing old API/admin containers (clearing stale node_modules volumes)..."
docker compose rm -sf api admin 2>/dev/null || true
docker compose rm -sfv api admin 2>/dev/null || true
fi
docker compose up -d api

View File

@ -0,0 +1,78 @@
#!/usr/bin/env bash
# =============================================================================
# Changemaker Lite — Compose Parity Validator
#
# The dev (docker-compose.yml) and prod (docker-compose.prod.yml) files share
# ~95% of their service definitions verbatim, but there is no tooling that
# ensures they stay in sync. A drift in healthcheck tolerances between them
# can cause release-tarball installs to silently fail where dev installs pass
# (or vice versa).
#
# This script compares the `healthcheck:` block for a fixed set of critical
# services between the two files and exits non-zero if any of them diverge.
#
# Run manually: bash scripts/validate-compose-parity.sh
# Also invoked by scripts/build-release.sh before packaging the tarball.
# =============================================================================
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
DEV_FILE="${PROJECT_DIR}/docker-compose.yml"
PROD_FILE="${PROJECT_DIR}/docker-compose.prod.yml"
# Services whose healthcheck must be identical across dev and prod.
CRITICAL_SERVICES=(api media-api admin nginx)
if [[ ! -f "$DEV_FILE" ]] || [[ ! -f "$PROD_FILE" ]]; then
echo "ERROR: Could not find both compose files (expected $DEV_FILE and $PROD_FILE)" >&2
exit 2
fi
# Extract the healthcheck block for a given service from a compose file.
# Uses awk to walk indentation: find the 2-space-indented `  <service>:` line,
# then within it the 4-space-indented `    healthcheck:` block, and print the
# healthcheck lines until a sibling key (same 4-space indent) or end of service.
extract_healthcheck() {
local file="$1" service="$2"
awk -v svc="$service" '
# Entering the target service definition (2-space indent)?
$0 ~ "^  "svc":[[:space:]]*$" { in_svc=1; next }
# Next top-level service — stop scanning
in_svc && /^  [a-zA-Z0-9_-]+:[[:space:]]*$/ { in_svc=0; in_hc=0 }
# Inside target service, watch for the healthcheck block (4-space indent)
in_svc && /^    healthcheck:[[:space:]]*$/ { in_hc=1; print; next }
# Inside healthcheck: print until we hit a sibling key at the same 4-space indent
in_hc {
if (/^    [a-zA-Z0-9_-]+:/) { in_hc=0 }
else { print }
}
' "$file"
}
FAIL=0
for svc in "${CRITICAL_SERVICES[@]}"; do
dev_hc="$(extract_healthcheck "$DEV_FILE" "$svc")"
prod_hc="$(extract_healthcheck "$PROD_FILE" "$svc")"
if [[ -z "$dev_hc" ]] && [[ -z "$prod_hc" ]]; then
continue # service not defined in either — fine (e.g. media-api optional)
fi
if [[ "$dev_hc" != "$prod_hc" ]]; then
echo "DRIFT: healthcheck block for service '${svc}' differs between dev and prod compose files" >&2
echo "--- $(basename "$DEV_FILE")" >&2
echo "$dev_hc" >&2
echo "--- $(basename "$PROD_FILE")" >&2
echo "$prod_hc" >&2
echo "" >&2
FAIL=1
fi
done
if [[ "$FAIL" -ne 0 ]]; then
echo "Compose parity check FAILED. Update both files before releasing." >&2
exit 1
fi
echo "Compose parity: OK (${#CRITICAL_SERVICES[@]} services checked)"