Fresh-install + upgrade-path hardening bundle
Six independent fixes surfaced during the v2.9.1 → v2.9.2 admin-UI upgrade validation today. Together they make a clean install on a new box work end-to-end without in-session patching.

- Fix 1: scripts/validate-compose-parity.sh + build-release.sh hook — fail release builds when api/admin/media-api/nginx healthcheck blocks drift between docker-compose.yml and docker-compose.prod.yml. The previous boot-race fix had to be applied to both files manually.
- Fix 2: scripts/systemd/install.sh chowns logs/ to the install user (the API container creates subdirs there as root, locking the host-side watcher out) and pre-creates logs/upgrade-watcher.log; changemaker-upgrade.service adds StartLimitIntervalSec=0 so a single transient failure can't wedge the .path unit permanently.
- Fix 3: /api/upgrade/status now returns a `watcher` sub-object that flags the host systemd watcher as stalled when trigger.json has been pending >30s. The admin SettingsPage SystemUpgradeTab renders a warning Alert with the systemctl recovery command when the watcher is unhealthy.
- Fix 4: scripts/upgrade.sh write_result() — prefer head -1 VERSION over `git rev-parse HEAD` so release-mode upgrades report the new tag in result.json instead of "unknown".
- Fix 5: admin container healthcheck start_period 20s → 60s in both compose files, the same class of problem as the earlier api fix. Matches the Gancio convention.
- Fix 6: /api/pangolin/sync now detects resources bound to a stale siteId (common after --pangolin-site new rotations), deletes and recreates them against the current site, and reports them under a new `reassigned` response field.

Bunker Admin
parent 5115c65691
commit 23df6a8b52
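For Fix 3, a minimal sketch of how the new watcher sub-object can be checked outside the admin UI. The /api/upgrade/status path and the WatcherHealth fields come from this change; the base URL, the bare fetch() call, and the checkWatcherHealth helper name are illustrative assumptions, not part of the commit.

// Illustrative sketch only — not shipped in this commit.
interface WatcherHealth {
  healthy: boolean;
  reason?: string;
  pendingSince?: string;
}

// Polls the upgrade status endpoint and prints the same recovery command the
// SystemUpgradeTab Alert shows when the host watcher looks stalled.
// `baseUrl` and an unauthenticated fetch() are assumptions for the example.
async function checkWatcherHealth(baseUrl: string): Promise<void> {
  const res = await fetch(`${baseUrl}/api/upgrade/status`);
  const data = (await res.json()) as { running: boolean; watcher?: WatcherHealth };
  if (data.watcher && !data.watcher.healthy) {
    console.warn(data.watcher.reason ?? 'Upgrade watcher stalled');
    console.warn(
      'Recovery: sudo systemctl reset-failed changemaker-upgrade.path changemaker-upgrade.service' +
        ' && sudo systemctl restart changemaker-upgrade.path',
    );
  } else {
    console.log('Upgrade watcher healthy');
  }
}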
@@ -62,7 +62,7 @@ import { api } from '@/lib/api';
 import { useMobile } from '@/hooks/useMobile';
 import { PageTour } from '@/components/tour/PageTour';
 import type { AppOutletContext } from '@/components/AppLayout';
-import type { SmtpTestResult, SmtpSendTestResult, UpgradeStatusResponse, UpgradeStatus, UpgradeProgress, UpgradeResult, UpgradeHistoryResponse } from '@/types/api';
+import type { SmtpTestResult, SmtpSendTestResult, UpgradeStatusResponse, UpgradeStatus, UpgradeProgress, UpgradeResult, UpgradeHistoryResponse, WatcherHealth } from '@/types/api';

 const { Text, Paragraph } = Typography;

@@ -742,6 +742,7 @@ function SystemUpgradeTab() {
   const [progress, setProgress] = useState<UpgradeProgress | null>(null);
   const [result, setResult] = useState<UpgradeResult | null>(null);
   const [running, setRunning] = useState(false);
+  const [watcher, setWatcher] = useState<WatcherHealth | null>(null);
   const [checking, setChecking] = useState(false);
   const [upgrading, setUpgrading] = useState(false);
   const [apiOffline, setApiOffline] = useState(false);
@@ -760,6 +761,7 @@ function SystemUpgradeTab() {
       setProgress(data.progress);
       setResult(data.result);
       setRunning(data.running);
+      setWatcher(data.watcher ?? null);
       setApiOffline(false);
       return data;
     } catch {
@@ -996,6 +998,26 @@ function SystemUpgradeTab() {
         />
       )}

+      {watcher && !watcher.healthy && (
+        <Alert
+          type="warning"
+          message="Upgrade watcher stalled"
+          description={
+            <>
+              <div>{watcher.reason || 'Host systemd watcher is not processing upgrade triggers.'}</div>
+              <div style={{ marginTop: 8 }}>
+                Recovery:{' '}
+                <Text code>
+                  sudo systemctl reset-failed changemaker-upgrade.path changemaker-upgrade.service && sudo systemctl restart changemaker-upgrade.path
+                </Text>
+              </div>
+            </>
+          }
+          showIcon
+          style={{ marginBottom: 16 }}
+        />
+      )}
+
       {/* Actions */}
       <Space style={{ marginBottom: 16 }}>
         <Button
@@ -3179,11 +3179,18 @@ export interface UpgradeHistoryResponse {
   history: UpgradeResult[];
 }

+export interface WatcherHealth {
+  healthy: boolean;
+  reason?: string;
+  pendingSince?: string;
+}
+
 export interface UpgradeStatusResponse {
   status: UpgradeStatus | null;
   progress: UpgradeProgress | null;
   result: UpgradeResult | null;
   running: boolean;
+  watcher?: WatcherHealth;
 }

 // --- Social Calendar Types ---
@@ -867,11 +867,43 @@ router.post('/sync', pangolinSetupLimiter, async (_req: Request, res: Response)
     const existingByDomain = new Map(existing.map(r => [r.fullDomain || '', r]));

     const created: string[] = [];
+    const reassigned: string[] = [];
     const targetFixed: string[] = [];
     const skipped: string[] = [];
     const warnings: string[] = [];
     const errors: string[] = [];

+    // Create resource + public access + target. Shared by "new" and "reassign"
+    // flows so `--pangolin-site new` installs can rebuild after a site rotation.
+    const createResourceForDef = async (def: ResourceDefinition, fullDomain: string) => {
+      const resource = await pangolinClient.createResource({
+        name: def.name,
+        domainId: matchingDomain.domainId,
+        ...(def.subdomain ? { subdomain: def.subdomain } : {}),
+        http: true,
+        protocol: 'tcp',
+      });
+
+      try {
+        await pangolinClient.updateResource(resource.resourceId, { sso: false, blockAccess: false });
+      } catch {
+        logger.warn(`Created ${fullDomain} but failed to set public access`);
+      }
+
+      try {
+        await pangolinClient.createTarget(resource.resourceId, {
+          siteId,
+          ip: def.target_ip,
+          port: def.target_port,
+          method: 'http',
+          enabled: true,
+        });
+      } catch (targetErr) {
+        const msg = targetErr instanceof Error ? targetErr.message : 'Unknown error';
+        errors.push(`${fullDomain} (target): ${msg}`);
+      }
+    };
+
     for (const def of resourceDefs) {
       const fullDomain = def.subdomain ? `${def.subdomain}.${domain}` : domain;

@@ -890,10 +922,30 @@ router.post('/sync', pangolinSetupLimiter, async (_req: Request, res: Response)
       const existingResource = existingByDomain.get(fullDomain);

       if (existingResource) {
-        // Resource exists — verify it has a target
+        // Resource exists — verify target points at the CURRENT site.
         try {
           const targets = await pangolinClient.listTargets(existingResource.resourceId);
-          if (targets.length === 0) {
+          const currentTargetSiteId = targets[0]?.siteId;
+          const siteMismatch =
+            targets.length > 0 && Number(currentTargetSiteId) !== Number(siteId);
+
+          if (siteMismatch) {
+            // Stale siteId from a previous `--pangolin-site new` install.
+            // Delete and recreate against the current site.
+            logger.warn(
+              `Resource ${fullDomain} bound to stale siteId ${currentTargetSiteId}, reassigning to ${siteId}`,
+            );
+            try {
+              await pangolinClient.deleteResource(existingResource.resourceId);
+              await createResourceForDef(def, fullDomain);
+              reassigned.push(fullDomain);
+              logger.info(`Reassigned ${fullDomain} to siteId ${siteId}`);
+            } catch (err) {
+              const msg = err instanceof Error ? err.message : 'Unknown error';
+              errors.push(`${fullDomain} (reassign): ${msg}`);
+              logger.error(`Failed to reassign resource ${fullDomain}:`, err);
+            }
+          } else if (targets.length === 0) {
             // Missing target — create one
             logger.info(`Resource ${fullDomain} has no target, creating one...`);
             await pangolinClient.createTarget(existingResource.resourceId, {
@@ -927,36 +979,7 @@ router.post('/sync', pangolinSetupLimiter, async (_req: Request, res: Response)
       } else {
         // Create new resource + target
         try {
-          // Root domain: omit subdomain field entirely (Pangolin rejects empty string)
-          const resource = await pangolinClient.createResource({
-            name: def.name,
-            domainId: matchingDomain.domainId,
-            ...(def.subdomain ? { subdomain: def.subdomain } : {}),
-            http: true,
-            protocol: 'tcp',
-          });
-
-          // Make publicly accessible (disable SSO auth + blockAccess)
-          try {
-            await pangolinClient.updateResource(resource.resourceId, { sso: false, blockAccess: false });
-          } catch {
-            logger.warn(`Created ${fullDomain} but failed to set public access`);
-          }
-
-          // Create target
-          try {
-            await pangolinClient.createTarget(resource.resourceId, {
-              siteId,
-              ip: def.target_ip,
-              port: def.target_port,
-              method: 'http',
-              enabled: true,
-            });
-          } catch (targetErr) {
-            const msg = targetErr instanceof Error ? targetErr.message : 'Unknown error';
-            errors.push(`${fullDomain} (target): ${msg}`);
-          }
-
+          await createResourceForDef(def, fullDomain);
           created.push(fullDomain);
           logger.info(`Created resource + target: ${fullDomain}`);
         } catch (err) {
@@ -970,11 +993,12 @@ router.post('/sync', pangolinSetupLimiter, async (_req: Request, res: Response)
     res.json({
       success: true,
       created: created.length,
+      reassigned: reassigned.length,
       targetFixed: targetFixed.length,
       skipped: skipped.length,
       warnings: warnings.length,
       errors: errors.length,
-      details: { created, targetFixed, skipped, warnings, errors },
+      details: { created, reassigned, targetFixed, skipped, warnings, errors },
     });
   } catch (err) {
     const msg = err instanceof Error ? err.message : 'Unknown error';
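One point worth noting about the response shape in the hunk above: the top-level fields stay numeric counts while the per-domain lists live under details. A hypothetical payload (domains and counts invented for illustration):

// Hypothetical /api/pangolin/sync response after a `--pangolin-site new` rotation.
const exampleSyncResponse = {
  success: true,
  created: 1,
  reassigned: 3,
  targetFixed: 0,
  skipped: 2,
  warnings: 0,
  errors: 0,
  details: {
    created: ['media.example.org'],
    reassigned: ['example.org', 'admin.example.org', 'api.example.org'],
    targetFixed: [],
    skipped: ['docs.example.org', 'chat.example.org'],
    warnings: [],
    errors: [],
  },
};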
@@ -17,8 +17,9 @@ router.get('/status', (_req, res) => {
   const progress = upgradeService.getProgress();
   const result = upgradeService.getResult();
   const running = upgradeService.isRunning();
+  const watcher = upgradeService.getWatcherHealth();

-  res.json({ status, progress: running ? progress : null, result, running });
+  res.json({ status, progress: running ? progress : null, result, running, watcher });
 });

 /**
@@ -60,6 +60,12 @@ export interface UpgradeResult {
   triggeredBy?: string;
 }

+export interface WatcherHealth {
+  healthy: boolean;
+  reason?: string;
+  pendingSince?: string;
+}
+
 interface TriggerPayload {
   action: 'check' | 'upgrade';
   branch?: string;
@@ -96,6 +102,31 @@ function getStatus(): UpgradeStatus | null {
   return readJsonFile<UpgradeStatus>(STATUS_FILE);
 }

+/**
+ * Watcher liveness heuristic. The host-side systemd watcher consumes and
+ * DELETES trigger.json within ~1s of it appearing. If trigger.json exists
+ * and is older than the threshold, the `.path` unit is almost certainly
+ * wedged (e.g. StartLimitBurst latch) and the admin UI should surface it.
+ */
+const WATCHER_STALL_MS = 30 * 1000;
+
+function getWatcherHealth(): WatcherHealth {
+  try {
+    if (!fs.existsSync(TRIGGER_FILE)) return { healthy: true };
+    const mtimeMs = fs.statSync(TRIGGER_FILE).mtimeMs;
+    const age = Date.now() - mtimeMs;
+    if (age <= WATCHER_STALL_MS) return { healthy: true };
+    return {
+      healthy: false,
+      reason: `Trigger file has been pending for ${Math.round(age / 1000)}s — host upgrade watcher may be stopped or failed`,
+      pendingSince: new Date(mtimeMs).toISOString(),
+    };
+  } catch (err) {
+    logger.warn('getWatcherHealth failed:', err);
+    return { healthy: true };
+  }
+}
+
 function getProgress(): UpgradeProgress | null {
   return readJsonFile<UpgradeProgress>(PROGRESS_FILE);
 }
@@ -221,6 +252,7 @@ export const upgradeService = {
   getStatus,
   getProgress,
   getResult,
+  getWatcherHealth,
   isRunning,
   triggerCheck,
   triggerUpgrade,
@@ -236,7 +236,7 @@ services:
       interval: 30s
       timeout: 5s
       retries: 3
-      start_period: 20s
+      start_period: 60s
     environment:
       - DOMAIN=${DOMAIN:-cmlite.org}
       - NODE_ENV=${NODE_ENV:-production}
@@ -248,7 +248,7 @@ services:
       interval: 30s
       timeout: 5s
       retries: 3
-      start_period: 20s
+      start_period: 60s
     environment:
       - DOMAIN=${DOMAIN:-cmlite.org}
      - NODE_ENV=${NODE_ENV:-development}
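Rough timing note on the start_period bump, assuming stock Docker healthcheck semantics (probes that fail inside start_period do not count against retries): a container that never passes a probe now gets roughly start_period + retries × interval = 60s + 3 × 30s ≈ 150s before being flagged unhealthy, versus ≈ 110s with the old 20s start_period — extra headroom for the slow-boot race the earlier api fix addressed.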
@@ -87,6 +87,17 @@ if [[ ! -f "$PROJECT_DIR/docker-compose.prod.yml" ]]; then
   error "docker-compose.prod.yml not found. Generate it first."
   exit 1
 fi

+# Fail the release if dev and prod compose files have drifted on critical
+# healthcheck blocks — catches cases where one file was patched without
+# the other (bit us on the api start_period fix).
+if [[ -x "$PROJECT_DIR/scripts/validate-compose-parity.sh" ]]; then
+  if ! bash "$PROJECT_DIR/scripts/validate-compose-parity.sh"; then
+    error "Compose parity check failed. Aborting release build."
+    exit 1
+  fi
+fi
+
 cp "$PROJECT_DIR/docker-compose.prod.yml" "$STAGE_DIR/docker-compose.yml"
 info "docker-compose.yml (production)"

@@ -1,6 +1,7 @@
 [Unit]
 Description=Changemaker Lite upgrade dispatcher
 Documentation=https://docs.cmlite.org/docs/admin/services/
+StartLimitIntervalSec=0

 [Service]
 Type=oneshot
@@ -28,6 +28,14 @@ for unit in "${SCRIPT_DIR}"/changemaker-upgrade.*; do
   echo "  Installed ${filename}"
 done

+# Ensure logs/ is writable by the install user. The API container creates
+# subdirs here as root, which locks out the host-side upgrade-watcher service.
+mkdir -p "${PROJECT_DIR}/logs"
+chown "${INSTALL_USER}:${INSTALL_USER}" "${PROJECT_DIR}/logs"
+touch "${PROJECT_DIR}/logs/upgrade-watcher.log"
+chown "${INSTALL_USER}:${INSTALL_USER}" "${PROJECT_DIR}/logs/upgrade-watcher.log"
+echo "  Prepared ${PROJECT_DIR}/logs (owned by ${INSTALL_USER})"
+
 systemctl daemon-reload
 systemctl enable --now changemaker-upgrade.path

@@ -118,7 +118,7 @@ write_result() {
   "success": ${success},
   "message": "$(echo "$msg" | sed 's/"/\\"/g')",
   "previousCommit": "${PRE_UPGRADE_SHORT:-unknown}",
-  "newCommit": "$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
+  "newCommit": "$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || git rev-parse --short HEAD 2>/dev/null || echo "unknown")",
   "commitCount": ${COMMIT_COUNT:-0},
   "durationSeconds": ${duration_secs},
   "warnings": ${warnings_json},
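With this change, a release-mode upgrade's result.json carries the tag from VERSION instead of "unknown". A hypothetical, abridged example (values invented; only the fields visible in this hunk are shown), written as a TypeScript literal for readability:

// Hypothetical result.json after a release-tarball upgrade (abridged).
const exampleUpgradeResult = {
  success: true,
  message: 'Upgrade completed',
  previousCommit: 'a1b2c3d',
  newCommit: 'v2.9.2',      // read from VERSION now; previously "unknown" in release mode
  commitCount: 0,
  durationSeconds: 184,
  warnings: [],
  // ...remaining fields omitted
};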
@@ -1089,7 +1089,7 @@ fi
 info "Starting API..."
 if [[ "$NEEDS_VOLUME_REFRESH" == "true" ]]; then
   info "Removing old API/admin containers (clearing stale node_modules volumes)..."
-  docker compose rm -sf api admin 2>/dev/null || true
+  docker compose rm -sfv api admin 2>/dev/null || true
 fi
 docker compose up -d api

scripts/validate-compose-parity.sh (new executable file, 78 lines)
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# =============================================================================
+# Changemaker Lite — Compose Parity Validator
+#
+# The dev (docker-compose.yml) and prod (docker-compose.prod.yml) files share
+# ~95% of their service definitions verbatim, but there is no tooling that
+# ensures they stay in sync. A drift in healthcheck tolerances between them
+# can cause release-tarball installs to silently fail where dev installs pass
+# (or vice versa).
+#
+# This script compares the `healthcheck:` block for a fixed set of critical
+# services between the two files and exits non-zero if any of them diverge.
+#
+# Run manually: bash scripts/validate-compose-parity.sh
+# Also invoked by scripts/build-release.sh before packaging the tarball.
+# =============================================================================
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
+DEV_FILE="${PROJECT_DIR}/docker-compose.yml"
+PROD_FILE="${PROJECT_DIR}/docker-compose.prod.yml"
+
+# Services whose healthcheck must be identical across dev and prod.
+CRITICAL_SERVICES=(api media-api admin nginx)
+
+if [[ ! -f "$DEV_FILE" ]] || [[ ! -f "$PROD_FILE" ]]; then
+  echo "ERROR: Could not find both compose files (expected $DEV_FILE and $PROD_FILE)" >&2
+  exit 2
+fi
+
+# Extract the healthcheck block for a given service from a compose file.
+# Uses awk to walk indentation: find `^  <service>:`, then within it the
+# `    healthcheck:` block, and print the healthcheck lines until a sibling
+# key (same 4-space indent) or end of service.
+extract_healthcheck() {
+  local file="$1" service="$2"
+  awk -v svc="$service" '
+    # Entering the target service definition?
+    $0 ~ "^  "svc":[[:space:]]*$" { in_svc=1; next }
+    # Next top-level service — stop scanning
+    in_svc && /^  [a-zA-Z0-9_-]+:[[:space:]]*$/ { in_svc=0 }
+    # Inside target service, watch for healthcheck block
+    in_svc && /^    healthcheck:[[:space:]]*$/ { in_hc=1; print; next }
+    # Inside healthcheck: print until we hit a sibling key at same indent
+    in_hc {
+      if (/^    [a-zA-Z0-9_-]+:/) { in_hc=0 }
+      else { print }
+    }
+  ' "$file"
+}
+
+FAIL=0
+for svc in "${CRITICAL_SERVICES[@]}"; do
+  dev_hc="$(extract_healthcheck "$DEV_FILE" "$svc")"
+  prod_hc="$(extract_healthcheck "$PROD_FILE" "$svc")"
+
+  if [[ -z "$dev_hc" ]] && [[ -z "$prod_hc" ]]; then
+    continue # service not defined in either — fine (e.g. media-api optional)
+  fi
+
+  if [[ "$dev_hc" != "$prod_hc" ]]; then
+    echo "DRIFT: healthcheck block for service '${svc}' differs between dev and prod compose files" >&2
+    echo "--- $(basename "$DEV_FILE")" >&2
+    echo "$dev_hc" >&2
+    echo "--- $(basename "$PROD_FILE")" >&2
+    echo "$prod_hc" >&2
+    echo "" >&2
+    FAIL=1
+  fi
+done
+
+if [[ "$FAIL" -ne 0 ]]; then
+  echo "Compose parity check FAILED. Update both files before releasing." >&2
+  exit 1
+fi
+
+echo "Compose parity: OK (${#CRITICAL_SERVICES[@]} services checked)"