CCP restore/tunnel/upgrade + upgrade.sh release-mode fixes + volunteer dashboard polish

- Add instance restore model, routes, and agent backup/restore endpoints
- Add Pangolin tunnel service (subdomain prefix, teardown action, CCP client)
- Add slug mutex for concurrent operation safety in agent
- Expand upgrade service with remote driver orchestration
- Fix upgrade.sh to properly handle release-mode installs (no git operations)
- Add CCP registration flags to config.sh (--ccp-url, --ccp-invite-code, --ccp-agent-url)
- Auto-detect JVB advertise IP in non-interactive mode
- Polish volunteer dashboard ActionStepsList with highlighted step component
- Add ticketed event description field + volunteer dashboard query refinements

Bunker Admin
This commit is contained in:
bunker-admin 2026-04-12 11:09:46 -06:00
parent 29d1f3998a
commit 26ec925d9b
35 changed files with 4191 additions and 329 deletions

View File

@ -10,6 +10,8 @@ import {
LinkOutlined,
CheckSquareOutlined,
CheckCircleFilled,
RightOutlined,
ThunderboltOutlined,
} from '@ant-design/icons';
import { useNavigate } from 'react-router-dom';
import { api } from '@/lib/api';
@ -66,6 +68,97 @@ function resolveStepLink(step: DashboardActionStep): { to: string; external: boo
}
}
/**
 * Prominent "Next Up" card for the first incomplete action step.
 * Self-reportable steps (CUSTOM / VISIT_LINK) get a "Mark as done" button,
 * optionally alongside "Open" when the step resolves to a link; every other
 * kind gets a single "Take Action" navigation button.
 */
function HighlightedStep({
  step,
  onNavigate,
  onSelfReport,
  loading,
}: {
  step: DashboardActionStep;
  onNavigate: (step: DashboardActionStep) => void;
  onSelfReport: (step: DashboardActionStep) => void;
  loading: boolean;
}) {
  // CUSTOM and VISIT_LINK steps are completed by the volunteer self-reporting.
  const selfReportable = step.kind === 'CUSTOM' || step.kind === 'VISIT_LINK';
  const hasLink = resolveStepLink(step) !== null;

  // Action row, computed up front so the JSX below stays flat.
  const actions = selfReportable ? (
    <>
      {hasLink && (
        <Button size="middle" onClick={() => onNavigate(step)} icon={<RightOutlined />}>
          Open
        </Button>
      )}
      <Button
        type="primary"
        size="middle"
        loading={loading}
        onClick={() => onSelfReport(step)}
      >
        Mark as done
      </Button>
    </>
  ) : (
    <Button
      type="primary"
      size="middle"
      icon={<RightOutlined />}
      onClick={() => onNavigate(step)}
      disabled={!hasLink}
    >
      Take Action
    </Button>
  );

  return (
    <div
      style={{
        background: 'linear-gradient(135deg, rgba(52,152,219,0.25) 0%, rgba(41,128,185,0.15) 100%)',
        border: '1px solid rgba(52,152,219,0.3)',
        borderRadius: 8,
        padding: '16px 20px',
        margin: '0 0 2px',
      }}
    >
      {/* "Next Up" banner */}
      <div style={{ display: 'flex', alignItems: 'center', gap: 6, marginBottom: 8 }}>
        <ThunderboltOutlined style={{ fontSize: 12, color: '#3498db' }} />
        <Typography.Text strong style={{ fontSize: 12, color: '#3498db', textTransform: 'uppercase', letterSpacing: 0.5 }}>
          Next Up
        </Typography.Text>
      </div>
      {/* Step icon + label + optional description */}
      <div style={{ display: 'flex', alignItems: 'center', gap: 10, marginBottom: 8 }}>
        <div
          style={{
            width: 32,
            height: 32,
            borderRadius: '50%',
            background: 'rgba(52,152,219,0.25)',
            display: 'flex',
            alignItems: 'center',
            justifyContent: 'center',
            fontSize: 16,
            color: '#3498db',
            flexShrink: 0,
          }}
        >
          {KIND_ICONS[step.kind]}
        </div>
        <div style={{ flex: 1, minWidth: 0 }}>
          <Typography.Text strong style={{ fontSize: 15, display: 'block' }}>
            {step.label}
          </Typography.Text>
          {step.description && (
            <Typography.Text type="secondary" style={{ fontSize: 12, display: 'block', marginTop: 2 }}>
              {step.description}
            </Typography.Text>
          )}
        </div>
      </div>
      <div style={{ display: 'flex', gap: 8, marginTop: 4 }}>{actions}</div>
    </div>
  );
}
export default function ActionStepsList({ campaign, onRefresh }: ActionStepsListProps) {
const navigate = useNavigate();
const { message } = App.useApp();
@ -95,6 +188,8 @@ export default function ActionStepsList({ campaign, onRefresh }: ActionStepsList
};
const sortedSteps = [...campaign.steps].sort((a, b) => a.order - b.order);
const highlightedStep = sortedSteps.find((s) => !s.completed);
const remainingSteps = sortedSteps.filter((s) => s.id !== highlightedStep?.id);
return (
<Card
@ -108,7 +203,18 @@ export default function ActionStepsList({ campaign, onRefresh }: ActionStepsList
</Typography.Text>
}
>
{sortedSteps.map((step, i) => {
{highlightedStep && (
<div style={{ padding: '12px 12px 0' }}>
<HighlightedStep
step={highlightedStep}
onNavigate={handleNavigate}
onSelfReport={handleSelfReport}
loading={completingStepId === highlightedStep.id}
/>
</div>
)}
{remainingSteps.map((step, i) => {
const isSelfReport = step.kind === 'CUSTOM' || step.kind === 'VISIT_LINK';
const canNavigate = resolveStepLink(step) !== null;
@ -119,8 +225,8 @@ export default function ActionStepsList({ campaign, onRefresh }: ActionStepsList
display: 'flex',
alignItems: 'center',
justifyContent: 'space-between',
padding: '12px 20px',
borderTop: i > 0 ? '1px solid rgba(255,255,255,0.04)' : undefined,
padding: '10px 20px',
borderTop: (highlightedStep || i > 0) ? '1px solid rgba(255,255,255,0.04)' : undefined,
opacity: step.completed ? 0.55 : 1,
gap: 12,
}}
@ -128,22 +234,22 @@ export default function ActionStepsList({ campaign, onRefresh }: ActionStepsList
<div style={{ display: 'flex', alignItems: 'center', gap: 10, flex: 1, minWidth: 0 }}>
<div
style={{
width: 26,
height: 26,
width: 24,
height: 24,
borderRadius: '50%',
background: step.completed ? '#52c41a' : 'rgba(52,152,219,0.15)',
background: step.completed ? '#52c41a' : 'rgba(255,255,255,0.06)',
display: 'flex',
alignItems: 'center',
justifyContent: 'center',
fontSize: 13,
fontSize: 12,
flexShrink: 0,
color: step.completed ? '#fff' : 'rgba(255,255,255,0.7)',
color: step.completed ? '#fff' : 'rgba(255,255,255,0.5)',
}}
>
{step.completed ? <CheckCircleFilled /> : KIND_ICONS[step.kind]}
</div>
<div style={{ minWidth: 0 }}>
<Typography.Text strong style={{ fontSize: 12, color: 'rgba(255,255,255,0.45)', display: 'block' }}>
<Typography.Text strong style={{ fontSize: 11, color: 'rgba(255,255,255,0.35)', display: 'block', lineHeight: 1 }}>
{KIND_LABELS[step.kind]}
</Typography.Text>
<Typography.Text
@ -163,7 +269,7 @@ export default function ActionStepsList({ campaign, onRefresh }: ActionStepsList
<div style={{ flexShrink: 0 }}>
{step.completed ? (
<Tag color="success" style={{ margin: 0 }}>Done</Tag>
<Tag color="success" style={{ margin: 0, fontSize: 11 }}>Done</Tag>
) : isSelfReport ? (
<Space size={4}>
{canNavigate && (

View File

@ -7,7 +7,7 @@ import {
import {
PlusOutlined, SearchOutlined, EditOutlined, EyeOutlined, DeleteOutlined,
CheckCircleOutlined, CloseCircleOutlined, CopyOutlined, ScanOutlined,
TagOutlined, VideoCameraOutlined, EnvironmentOutlined,
TagOutlined, VideoCameraOutlined, EnvironmentOutlined, StarOutlined, StarFilled,
} from '@ant-design/icons';
import { api } from '@/lib/api';
import dayjs from 'dayjs';
@ -45,6 +45,7 @@ interface TicketedEvent {
currentAttendees: number;
coverImageUrl: string | null;
organizerName: string | null;
featured: boolean;
ticketTiers: TicketTier[];
_count: { tickets: number; checkIns: number };
createdAt: string;
@ -198,18 +199,55 @@ export default function TicketedEventsPage() {
}
};
// Toggle the "featured on volunteer dashboard" flag for one event.
// Featuring is exclusive: before featuring this event, the flag is cleared
// on every other currently-featured event.
const handleFeature = async (id: string, featured: boolean) => {
  try {
    if (featured) {
      // Clear the flag on all other featured events in parallel first.
      await Promise.all(
        events
          .filter((e) => e.featured && e.id !== id)
          .map((e) => api.put(`/api/ticketed-events/admin/${e.id}`, { featured: false }))
      );
    }
    await api.put(`/api/ticketed-events/admin/${id}`, { featured });
    message.success(featured ? 'Event featured on volunteer dashboard' : 'Event unfeatured');
    fetchEvents();
  } catch {
    message.error('Failed to update featured status');
  }
};
// Copy the public event URL for the given slug to the clipboard.
const copyLink = (slug: string) => {
  const url = `${window.location.origin}/event/${slug}`;
  navigator.clipboard.writeText(url);
  message.success('Link copied');
};
const columns = [
{
title: '',
key: 'featured',
width: 36,
render: (_: unknown, record: TicketedEvent) => (
<Tooltip title={record.featured ? 'Remove from volunteer dashboard' : 'Feature on volunteer dashboard'}>
<Button
type="text"
size="small"
icon={record.featured
? <StarFilled style={{ color: '#faad14' }} />
: <StarOutlined style={{ color: 'rgba(255,255,255,0.25)' }} />}
onClick={(e) => { e.stopPropagation(); handleFeature(record.id, !record.featured); }}
/>
</Tooltip>
),
},
{
title: 'Title',
dataIndex: 'title',
key: 'title',
render: (text: string, record: TicketedEvent) => (
<a onClick={() => navigate(`/app/events/${record.id}`)}>{text}</a>
<Space>
<a onClick={() => navigate(`/app/events/${record.id}`)}>{text}</a>
{record.featured && <Tag color="gold" style={{ fontSize: 11 }}>Featured</Tag>}
</Space>
),
},
{

View File

@ -50,6 +50,7 @@ export const updateEventSchema = z.object({
maxAttendees: z.number().int().positive().nullable().optional(),
organizerName: z.string().max(200).nullable().optional(),
organizerEmail: z.string().email().nullable().optional(),
featured: z.boolean().optional(),
});
export const createTierSchema = z.object({

View File

@ -114,24 +114,31 @@ async function getReferral(userId: string): Promise<DashboardReferral> {
async function getFeaturedEvent(): Promise<DashboardFeaturedEvent | null> {
const today = new Date();
today.setHours(0, 0, 0, 0);
const event = await prisma.ticketedEvent.findFirst({
where: {
featured: true,
status: TicketedEventStatus.PUBLISHED,
date: { gte: today },
},
orderBy: { date: 'asc' },
select: {
slug: true,
title: true,
date: true,
startTime: true,
venueName: true,
coverImageUrl: true,
currentAttendees: true,
maxAttendees: true,
},
});
const eventSelect = {
slug: true,
title: true,
date: true,
startTime: true,
venueName: true,
coverImageUrl: true,
currentAttendees: true,
maxAttendees: true,
} as const;
const baseWhere = { status: TicketedEventStatus.PUBLISHED, date: { gte: today } };
// Prefer admin-featured event; fall back to next upcoming published event
const event =
await prisma.ticketedEvent.findFirst({
where: { ...baseWhere, featured: true },
orderBy: { date: 'asc' },
select: eventSelect,
}) ??
await prisma.ticketedEvent.findFirst({
where: baseWhere,
orderBy: { date: 'asc' },
select: eventSelect,
});
if (!event) return null;
return {
slug: event.slug,

View File

@ -14,7 +14,7 @@ export default function AgentRegistrationsPage() {
const fetchRegistrations = useCallback(async () => {
try {
setLoading(true);
const { data } = await api.get('/api/agents/registrations');
const { data } = await api.get('/agents/registrations');
setRegistrations(data);
} catch {
message.error('Failed to load registrations');
@ -27,7 +27,7 @@ export default function AgentRegistrationsPage() {
const handleApprove = async (id: string) => {
try {
await api.post(`/api/agents/registrations/${id}/approve`);
await api.post(`/agents/registrations/${id}/approve`);
message.success('Registration approved — agent will receive certificates on next poll');
fetchRegistrations();
setDetailModal(null);
@ -39,7 +39,7 @@ export default function AgentRegistrationsPage() {
const handleReject = async (id: string) => {
try {
await api.post(`/api/agents/registrations/${id}/reject`);
await api.post(`/agents/registrations/${id}/reject`);
message.success('Registration rejected');
fetchRegistrations();
setDetailModal(null);

View File

@ -203,8 +203,16 @@ export default function BackupsPage() {
{
title: 'Instance',
dataIndex: 'instance',
width: 160,
render: (inst: BackupRow['instance']) => inst?.name || '-',
width: 180,
render: (inst: BackupRow['instance'], record: BackupRow) => {
const isRemote = record.manifest?.source === 'remote';
return (
<Space size={4}>
<span>{inst?.name || '-'}</span>
{isRemote && <Tag color="blue">remote</Tag>}
</Space>
);
},
},
{
title: 'Status',

View File

@ -44,6 +44,7 @@ import {
WarningOutlined,
CloseCircleOutlined,
InfoCircleOutlined,
UndoOutlined,
} from '@ant-design/icons';
import dayjs from 'dayjs';
import { useNavigate, useParams } from 'react-router-dom';
@ -89,6 +90,16 @@ export default function InstanceDetailPage() {
const [backupsLoading, setBackupsLoading] = useState(false);
const [creatingBackup, setCreatingBackup] = useState(false);
// Restore state
const [restoreModal, setRestoreModal] = useState<{ backup: Backup; typedSlug: string } | null>(null);
const [restoring, setRestoring] = useState(false);
const [activeRestoreId, setActiveRestoreId] = useState<string | null>(null);
const [activeRestoreState, setActiveRestoreState] = useState<{
status: string;
logTail?: string | null;
errorMessage?: string | null;
} | null>(null);
// Feature reconfiguration state
const [featureFlags, setFeatureFlags] = useState<Record<string, boolean>>({});
const [reconfiguring, setReconfiguring] = useState(false);
@ -109,6 +120,18 @@ export default function InstanceDetailPage() {
const [tunnelSaving, setTunnelSaving] = useState(false);
const [tunnelRemoving, setTunnelRemoving] = useState(false);
// Remote tunnel state (Pangolin API managed by CCP)
const [tunnelStatus, setTunnelStatus] = useState<{
configured: boolean;
online?: boolean;
siteId?: string;
endpoint?: string;
resources?: Array<{ subdomain: string; name: string; resourceId: string; hasTarget: boolean; targetIp?: string; targetPort?: number }>;
} | null>(null);
const [tunnelStatusLoading, setTunnelStatusLoading] = useState(false);
const [tunnelSetupRunning, setTunnelSetupRunning] = useState(false);
const [tunnelSyncing, setTunnelSyncing] = useState(false);
// Upgrade state
const [updateStatus, setUpdateStatus] = useState<UpdateStatus | null>(null);
const [checkingUpdate, setCheckingUpdate] = useState(false);
@ -390,6 +413,64 @@ export default function InstanceDetailPage() {
window.open(`/api/backups/${backupId}/download`, '_blank');
};
// Kick off a restore after the user has retyped the instance slug.
// On success we switch to polling mode (activeRestoreId drives the effect below).
const handleRestoreConfirm = async () => {
  if (!restoreModal) return;
  // Destructive-action guard: the typed slug must match exactly.
  if (restoreModal.typedSlug !== instance?.slug) {
    message.error('Typed slug does not match — restore cancelled');
    return;
  }
  setRestoring(true);
  try {
    const payload = { backupId: restoreModal.backup.id };
    const { data } = await api.post(`/instances/${id}/restore`, payload);
    setActiveRestoreId(data.data.id as string);
    setActiveRestoreState({ status: 'PENDING' });
    setRestoreModal(null);
    message.success('Restore started — polling for progress');
  } catch (err: unknown) {
    const e = err as { response?: { data?: { error?: { message?: string } } } };
    message.error(e?.response?.data?.error?.message || 'Failed to start restore');
  } finally {
    setRestoring(false);
  }
};
// Poll the active restore's status every 3s until it completes or fails
useEffect(() => {
  if (!activeRestoreId) return;
  // Drop responses that land after cleanup (unmount or a new restore id).
  let cancelled = false;
  const poll = async () => {
    try {
      const { data } = await api.get(`/instances/${id}/restores/${activeRestoreId}`);
      if (cancelled) return;
      const row = data.data;
      setActiveRestoreState({
        status: row.status,
        logTail: row.logTail,
        errorMessage: row.errorMessage,
      });
      if (row.status === 'COMPLETED') {
        message.success('Restore completed successfully');
        // Clearing the id stops polling via this effect's cleanup on re-run.
        setActiveRestoreId(null);
        fetchBackups();
      } else if (row.status === 'FAILED') {
        message.error(`Restore failed: ${row.errorMessage || 'unknown error'}`);
        setActiveRestoreId(null);
      }
    } catch {
      // keep trying; transient errors are expected during remote restart
    }
  };
  // Fire once immediately, then on a fixed 3-second interval.
  poll();
  const handle = setInterval(poll, 3000);
  return () => {
    cancelled = true;
    clearInterval(handle);
  };
}, [activeRestoreId, id, fetchBackups]);
// Initialize feature flags and tunnel form when instance loads
useEffect(() => {
if (instance) {
@ -508,6 +589,11 @@ export default function InstanceDetailPage() {
const ports = instance.portConfig as Record<string, number>;
const isProvisioning = instance.status === 'PROVISIONING';
const isRegistered = instance.isRegistered;
const isRemote = instance.isRemote;
// A "managed" instance is one CCP can run backup/restore/upgrade on.
// Local CCP-managed and remote (agent-backed) both qualify; only locally-
// adopted registered instances (isRegistered && !isRemote) are unmanaged.
const isManaged = !isRegistered || isRemote;
const canStart = instance.status === 'STOPPED' || instance.status === 'ERROR';
const canStop = instance.status === 'RUNNING' || instance.status === 'ERROR';
const canRestart = instance.status === 'RUNNING';
@ -731,7 +817,7 @@ export default function InstanceDetailPage() {
const backupsTab = (
<div>
{isRegistered && (
{!isManaged && (
<Alert
message="Backups not managed by CCP"
description="This instance was deployed outside the control panel. Use its own backup tools to manage backups."
@ -740,6 +826,15 @@ export default function InstanceDetailPage() {
style={{ marginBottom: 16 }}
/>
)}
{isRemote && (
<Alert
message="Remote instance"
description="Backups and restores run via the remote agent over mTLS. Create Backup triggers scripts/backup.sh on the remote host and streams the archive back to the control panel."
type="info"
showIcon
style={{ marginBottom: 16 }}
/>
)}
<div style={{ marginBottom: 12, display: 'flex', justifyContent: 'space-between' }}>
<Typography.Text type="secondary">
{backups.length} backup{backups.length !== 1 ? 's' : ''}
@ -749,7 +844,7 @@ export default function InstanceDetailPage() {
type="primary"
onClick={handleCreateBackup}
loading={creatingBackup}
disabled={instance.status !== 'RUNNING' || isRegistered}
disabled={instance.status !== 'RUNNING' || !isManaged}
>
Create Backup
</Button>
@ -784,20 +879,36 @@ export default function InstanceDetailPage() {
{
title: 'Size',
dataIndex: 'sizeBytes',
render: (b: number | null) => (b ? `${(b / 1024 / 1024).toFixed(1)} MB` : '-'),
render: (b: number | string | null) => {
if (b == null) return '-';
const n = typeof b === 'string' ? parseInt(b, 10) : b;
return `${(n / 1024 / 1024).toFixed(1)} MB`;
},
},
{
title: 'Actions',
width: 120,
width: 160,
render: (_: unknown, record: Backup) => (
<Space size="small">
{record.status === 'COMPLETED' && (
<Button
icon={<CloudDownloadOutlined />}
size="small"
type="text"
onClick={() => handleDownloadBackup(record.id)}
/>
<>
<Button
icon={<CloudDownloadOutlined />}
size="small"
type="text"
title="Download archive"
onClick={() => handleDownloadBackup(record.id)}
/>
{isManaged && (
<Button
icon={<UndoOutlined />}
size="small"
type="text"
title="Restore this backup (destructive)"
onClick={() => setRestoreModal({ backup: record, typedSlug: '' })}
/>
)}
</>
)}
<Popconfirm
title="Delete this backup?"
@ -1049,7 +1160,73 @@ export default function InstanceDetailPage() {
);
const tunnelConfigured = !!(instance.pangolinEndpoint && instance.pangolinNewtId);
const canConfigureTunnel = !isRegistered && (instance.status === 'RUNNING' || instance.status === 'STOPPED');
const canConfigureTunnel = isManaged && (instance.status === 'RUNNING' || instance.status === 'STOPPED');
// Fetch tunnel status for remote instances
const fetchTunnelStatus = useCallback(async () => {
  // Only remote (agent-backed) instances expose a tunnel status endpoint.
  if (!isRemote) return;
  setTunnelStatusLoading(true);
  try {
    const res = await api.get(`/instances/${id}/tunnel/status`);
    setTunnelStatus(res.data.data);
  } catch {
    // Any failure is treated as "no status available" rather than an error.
    setTunnelStatus(null);
  } finally {
    setTunnelStatusLoading(false);
  }
}, [id, isRemote]);
// Refresh remote tunnel status whenever the Tunnel tab becomes active.
useEffect(() => {
  if (activeTab !== 'tunnel' || !isRemote) return;
  fetchTunnelStatus();
}, [activeTab, isRemote, fetchTunnelStatus]);
// Ask the CCP to create the Pangolin site/resources and push Newt credentials
// to the remote instance. The subdomain prefix defaults to the instance slug.
const handleRemoteTunnelSetup = async (values: { subdomainPrefix?: string }) => {
  setTunnelSetupRunning(true);
  try {
    const subdomainPrefix = values.subdomainPrefix || instance.slug;
    await api.post(`/instances/${id}/tunnel/setup`, { subdomainPrefix });
    message.success('Tunnel setup complete — Newt credentials pushed to remote instance');
    // Re-read both the instance record and the live tunnel status.
    fetchInstance();
    fetchTunnelStatus();
  } catch (err: unknown) {
    const response = (err as { response?: { data?: { error?: { message?: string } } } }).response;
    message.error(response?.data?.error?.message || 'Tunnel setup failed');
  } finally {
    setTunnelSetupRunning(false);
  }
};
// Reconcile Pangolin resources with the instance's expected set; reports how
// many new resources the sync created, then refreshes the status table.
const handleTunnelSync = async () => {
  setTunnelSyncing(true);
  try {
    const res = await api.post(`/instances/${id}/tunnel/sync`);
    message.success(`Sync complete — ${res.data.data.created} new resource(s) created`);
    fetchTunnelStatus();
  } catch (err: unknown) {
    const response = (err as { response?: { data?: { error?: { message?: string } } } }).response;
    message.error(response?.data?.error?.message || 'Sync failed');
  } finally {
    setTunnelSyncing(false);
  }
};
// Delete the Pangolin site (and its resources) for this instance, then clear
// the locally-cached tunnel status and re-read the instance record.
const handleRemoteTunnelTeardown = async () => {
  setTunnelRemoving(true);
  try {
    await api.delete(`/instances/${id}/tunnel`);
    message.success('Tunnel torn down — Pangolin site deleted');
    fetchInstance();
    setTunnelStatus(null);
  } catch (err: unknown) {
    const response = (err as { response?: { data?: { error?: { message?: string } } } }).response;
    message.error(response?.data?.error?.message || 'Teardown failed');
  } finally {
    setTunnelRemoving(false);
  }
};
const handleConfigureTunnel = async (values: { pangolinEndpoint: string; pangolinNewtId: string; pangolinNewtSecret?: string }) => {
setTunnelSaving(true);
@ -1088,9 +1265,111 @@ export default function InstanceDetailPage() {
}
};
const tunnelTab = (
// Tunnel tab for remote (agent-backed) instances: the CCP drives the Pangolin
// API on the instance's behalf. Two states: configured (status + resources +
// teardown) or unconfigured (setup form).
const remoteTunnelTab = (
  <Space direction="vertical" size="large" style={{ width: '100%' }}>
    {tunnelStatus?.configured ? (
      <>
        {/* Live status banner — warning styling when the tunnel is offline */}
        <Alert
          message={`Tunnel active — ${tunnelStatus.online ? 'online' : 'offline'}`}
          description={`Connected to ${tunnelStatus.endpoint || instance.pangolinEndpoint} (site: ${tunnelStatus.siteId})`}
          type={tunnelStatus.online ? 'success' : 'warning'}
          showIcon
          icon={<CloudOutlined />}
        />
        {/* Status fields fall back to the persisted instance record when the
            live status response omits them */}
        <Card title="Current Configuration" size="small">
          <Descriptions bordered column={1}>
            <Descriptions.Item label="Endpoint">
              <Typography.Text copyable>{tunnelStatus.endpoint || instance.pangolinEndpoint}</Typography.Text>
            </Descriptions.Item>
            <Descriptions.Item label="Site ID">
              <Typography.Text copyable>{tunnelStatus.siteId || instance.pangolinSiteId}</Typography.Text>
            </Descriptions.Item>
            <Descriptions.Item label="Newt ID">
              <Typography.Text copyable>{instance.pangolinNewtId}</Typography.Text>
            </Descriptions.Item>
            <Descriptions.Item label="Status">
              <Tag color={tunnelStatus.online ? 'green' : 'orange'}>{tunnelStatus.online ? 'Online' : 'Offline'}</Tag>
            </Descriptions.Item>
          </Descriptions>
        </Card>
        {/* Resource list is only rendered when the site has resources */}
        {tunnelStatus.resources && tunnelStatus.resources.length > 0 && (
          <Card
            title="Resources"
            size="small"
            extra={
              <Space>
                <Button icon={<SyncOutlined />} size="small" onClick={handleTunnelSync} loading={tunnelSyncing}>
                  Sync
                </Button>
                <Button icon={<ReloadOutlined />} size="small" onClick={fetchTunnelStatus} loading={tunnelStatusLoading}>
                  Refresh
                </Button>
              </Space>
            }
          >
            <Table
              dataSource={tunnelStatus.resources}
              rowKey="resourceId"
              size="small"
              pagination={false}
              columns={[
                { title: 'Subdomain', dataIndex: 'subdomain', render: (s: string) => s || '(root)' },
                { title: 'Name', dataIndex: 'name' },
                { title: 'Target', render: (_: unknown, r: { hasTarget: boolean; targetIp?: string; targetPort?: number }) =>
                  r.hasTarget ? `${r.targetIp}:${r.targetPort}` : <Tag color="red">No target</Tag>
                },
              ]}
            />
          </Card>
        )}
        {/* Destructive: deletes the Pangolin site and stops the Newt container */}
        <Popconfirm
          title="Tear down tunnel?"
          description="This will delete the Pangolin site and all resources. The Newt container will be stopped."
          onConfirm={handleRemoteTunnelTeardown}
        >
          <Button danger icon={<DisconnectOutlined />} loading={tunnelRemoving}>
            Teardown Tunnel
          </Button>
        </Popconfirm>
      </>
    ) : (
      <>
        <Alert
          message="No tunnel configured"
          description="The CCP will create a Pangolin site and resources for this instance, push Newt credentials to its .env, and start the tunnel container."
          type="info"
          showIcon
        />
        {/* Setup form — subdomain prefix defaults to the instance slug */}
        <Card title="Setup Tunnel" size="small">
          <Form layout="vertical" onFinish={handleRemoteTunnelSetup}>
            <Form.Item
              name="subdomainPrefix"
              label="Subdomain Prefix"
              initialValue={instance.slug}
              extra={`Resources will be created as <prefix>-app.${instance.domain}, <prefix>-api.${instance.domain}, etc.`}
              rules={[{ required: true }, { pattern: /^[a-z0-9-]+$/, message: 'Lowercase alphanumeric + hyphens only' }]}
            >
              <Input placeholder={instance.slug} />
            </Form.Item>
            <Form.Item style={{ marginBottom: 0 }}>
              <Button type="primary" htmlType="submit" icon={<CloudOutlined />} loading={tunnelSetupRunning}>
                Setup Tunnel
              </Button>
            </Form.Item>
          </Form>
        </Card>
      </>
    )}
  </Space>
);
const localTunnelTab = (
<Space direction="vertical" size="large" style={{ width: '100%' }}>
{!isManaged && (
<Alert
message="Tunnel management is not available for external instances"
description="This instance was deployed outside the control panel. Manage its tunnel configuration directly."
@ -1099,7 +1378,7 @@ export default function InstanceDetailPage() {
/>
)}
{!isRegistered && tunnelConfigured && (
{isManaged && tunnelConfigured && (
<Alert
message={`Tunnel active — connected to ${instance.pangolinEndpoint}`}
type="success"
@ -1108,7 +1387,7 @@ export default function InstanceDetailPage() {
/>
)}
{!isRegistered && !tunnelConfigured && (
{isManaged && !tunnelConfigured && (
<Alert
message="No tunnel configured"
description="Enter your Pangolin Newt credentials below to enable tunnel access for this instance. You can get these from your Pangolin dashboard."
@ -1133,7 +1412,7 @@ export default function InstanceDetailPage() {
</Card>
)}
{canConfigureTunnel && (
{canConfigureTunnel && !isRemote && (
<Card title={tunnelConfigured ? 'Update Tunnel' : 'Enable Tunnel'} size="small">
<Form
form={tunnelForm}
@ -1200,6 +1479,8 @@ export default function InstanceDetailPage() {
</Space>
);
const tunnelTab = isRemote ? remoteTunnelTab : localTunnelTab;
// ─── Updates Tab ──────────────────────────────────────────────
const isUpgrading = currentUpgrade?.status === 'IN_PROGRESS' || currentUpgrade?.status === 'PENDING';
@ -1278,7 +1559,7 @@ export default function InstanceDetailPage() {
)}
{/* Upgrade Action */}
{!isRegistered && (
{isManaged && (
<Card title="Upgrade" size="small">
{isUpgrading && currentUpgrade ? (
<Space direction="vertical" style={{ width: '100%' }}>
@ -1340,7 +1621,7 @@ export default function InstanceDetailPage() {
</Card>
)}
{isRegistered && (
{!isManaged && (
<Alert
message="Upgrades are not managed by CCP for external instances"
description="Run the upgrade script directly on the instance or use its own upgrade mechanism."
@ -1348,6 +1629,14 @@ export default function InstanceDetailPage() {
showIcon
/>
)}
{isRemote && (
<Alert
message="Remote instance"
description="Upgrades run via the remote agent over mTLS. The agent shells out to scripts/upgrade.sh --api-mode and the control panel polls progress every 2s."
type="info"
showIcon
/>
)}
{/* Upgrade History */}
<Card title="Upgrade History" size="small">
@ -1794,6 +2083,108 @@ export default function InstanceDetailPage() {
{ key: 'tunnel', label: 'Tunnel', children: tunnelTab },
]}
/>
{/* Restore confirmation modal (destructive action guard) */}
<Modal
title="Restore backup — destructive"
open={!!restoreModal}
onCancel={() => setRestoreModal(null)}
onOk={handleRestoreConfirm}
okText="Restore"
okButtonProps={{
danger: true,
loading: restoring,
disabled: restoreModal?.typedSlug !== instance.slug,
}}
cancelButtonProps={{ disabled: restoring }}
width={560}
>
<Alert
type="error"
showIcon
message="This will OVERWRITE the instance's databases and uploads"
description="The agent will stop application containers, drop databases, and restore from the selected backup. This cannot be undone without another backup."
style={{ marginBottom: 16 }}
/>
{restoreModal && (
<Descriptions column={1} size="small" bordered style={{ marginBottom: 16 }}>
<Descriptions.Item label="Backup ID">
<code>{restoreModal.backup.id.substring(0, 8)}</code>
</Descriptions.Item>
<Descriptions.Item label="Archive size">
{restoreModal.backup.sizeBytes
? `${(Number(restoreModal.backup.sizeBytes) / 1024 / 1024).toFixed(1)} MB`
: '-'}
</Descriptions.Item>
<Descriptions.Item label="Created">
{restoreModal.backup.completedAt
? dayjs(restoreModal.backup.completedAt).format('YYYY-MM-DD HH:mm')
: '-'}
</Descriptions.Item>
</Descriptions>
)}
<Typography.Paragraph>
Type the instance slug <strong><code>{instance.slug}</code></strong> to confirm:
</Typography.Paragraph>
<Input
value={restoreModal?.typedSlug || ''}
onChange={(e) =>
setRestoreModal((cur) => (cur ? { ...cur, typedSlug: e.target.value } : cur))
}
placeholder={instance.slug}
autoFocus
/>
</Modal>
{/* Active restore progress banner */}
{activeRestoreId && activeRestoreState && (
<Modal
title="Restore in progress"
open
closable={false}
footer={null}
width={640}
>
<Space direction="vertical" style={{ width: '100%' }}>
<div>
<Tag
color={
activeRestoreState.status === 'COMPLETED'
? 'green'
: activeRestoreState.status === 'FAILED'
? 'red'
: 'processing'
}
>
{activeRestoreState.status}
</Tag>
{activeRestoreState.status === 'RUNNING' && (
<Typography.Text type="secondary" style={{ marginLeft: 8 }}>
Agent is running scripts/restore.sh this can take several minutes
</Typography.Text>
)}
</div>
{activeRestoreState.errorMessage && (
<Alert type="error" message={activeRestoreState.errorMessage} showIcon />
)}
{activeRestoreState.logTail && (
<pre
style={{
background: '#1e1e1e',
color: '#d4d4d4',
padding: 12,
maxHeight: 300,
overflow: 'auto',
fontSize: 12,
borderRadius: 4,
}}
>
{activeRestoreState.logTail}
</pre>
)}
</Space>
</Modal>
)}
</div>
);
}

View File

@ -14,7 +14,7 @@ export default function InviteCodesPage() {
const fetchCodes = useCallback(async () => {
try {
setLoading(true);
const { data } = await api.get('/api/invite-codes');
const { data } = await api.get('/invite-codes');
setCodes(data.data || []);
} catch {
message.error('Failed to load invite codes');
@ -28,7 +28,7 @@ export default function InviteCodesPage() {
const handleCreate = async () => {
try {
setCreating(true);
const { data } = await api.post('/api/invite-codes');
const { data } = await api.post('/invite-codes');
message.success(`Invite code created: ${data.code}`);
fetchCodes();
} catch {
@ -40,7 +40,7 @@ export default function InviteCodesPage() {
const handleRevoke = async (id: string) => {
try {
await api.delete(`/api/invite-codes/${id}`);
await api.delete(`/invite-codes/${id}`);
message.success('Invite code revoked');
fetchCodes();
} catch {

View File

@ -26,6 +26,7 @@ const envSchema = z.object({
INSTANCE_SLUG: z.string().default(''),
INSTANCE_DOMAIN: z.string().default(''),
INSTANCE_BASE_PATH: z.string().default(''),
COMPOSE_PROJECT: z.string().default(''),
});
function validateEnv() {

View File

@ -1,105 +1,623 @@
import { Router, Request, Response } from 'express';
import { param } from '../utils/params';
import fs from 'fs/promises';
import path from 'path';
import { exec as execCb } from 'child_process';
import { createReadStream, createWriteStream } from 'fs';
import { pipeline as pipelineCb, Transform } from 'stream';
import { promisify } from 'util';
import * as docker from '../services/docker.service';
import path from 'path';
import crypto from 'crypto';
import { spawn } from 'child_process';
import { getSlugEntry } from '../services/registry.service';
import { env } from '../config/env';
import { logger } from '../utils/logger';
import { withSlugLock, SlugBusyError, isSlugLocked } from '../services/slug-mutex';
import { AgentError } from '../middleware/error-handler';
const pipeline = promisify(pipelineCb);
const exec = promisify(execCb);
const router = Router();
// POST /instance/:slug/backup — Run pg_dump + tar uploads → return backup info
router.post('/instance/:slug/backup', async (req: Request, res: Response) => {
const entry = await getSlugEntry(param(req, 'slug'));
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
const backupDir = path.join(env.AGENT_DATA_DIR, 'backups', param(req, 'slug'), timestamp);
await fs.mkdir(backupDir, { recursive: true });
// ─── Helpers ──────────────────────────────────────────────────────────
const { pgPassword } = req.body;
// Path-safe identifier pattern — presumably used to validate slug/backup-id
// route params before they touch the filesystem; confirm at the call sites.
const ID_REGEX = /^[a-zA-Z0-9_-]+$/;
// Archive naming scheme shared with scripts/backup.sh:
// changemaker-v2-backup-<timestamp>.tar.gz
const ARCHIVE_PREFIX = 'changemaker-v2-backup-';
const ARCHIVE_SUFFIX = '.tar.gz';
/** Directory under AGENT_DATA_DIR that holds all backup archives for a slug. */
function backupsDirFor(slug: string): string {
  const backupsRoot = path.join(env.AGENT_DATA_DIR, 'backups');
  return path.join(backupsRoot, slug);
}
/** Absolute path of the archive for backup `id` inside the slug's backup dir. */
function archivePathFor(slug: string, id: string): string {
  const filename = ARCHIVE_PREFIX + id + ARCHIVE_SUFFIX;
  return path.join(backupsDirFor(slug), filename);
}
/**
 * Stream a file through SHA-256 and resolve with its hex digest.
 * Streaming keeps memory flat regardless of archive size; rejects on read errors.
 */
async function sha256File(filePath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const digest = crypto.createHash('sha256');
    createReadStream(filePath)
      .on('error', reject)
      .on('data', (chunk) => digest.update(chunk))
      .on('end', () => resolve(digest.digest('hex')));
  });
}
/**
 * Read the manifest.json out of a backup archive without extracting it.
 * backup.sh stores it at <archive>/changemaker-v2-backup-<ts>/manifest.json.
 * Resolves null on any failure (tar missing, bad archive, unparseable JSON) —
 * callers treat the manifest as best-effort metadata.
 * NOTE(review): `--wildcards` is GNU-tar syntax; assumes the agent host runs
 * GNU tar — confirm for non-Linux hosts.
 */
async function readManifestFromArchive(archivePath: string): Promise<unknown | null> {
  return new Promise((resolve) => {
    const tar = spawn('tar', ['-xzOf', archivePath, '--wildcards', '*/manifest.json'], {
      stdio: ['ignore', 'pipe', 'ignore'],
    });
    const chunks: string[] = [];
    tar.stdout.on('data', (chunk) => chunks.push(chunk.toString('utf-8')));
    // Spawn failure (e.g. tar not installed) degrades to "no manifest".
    tar.on('error', () => resolve(null));
    tar.on('close', (code) => {
      const text = chunks.join('');
      if (code !== 0 || !text.trim()) {
        resolve(null);
        return;
      }
      try {
        resolve(JSON.parse(text));
      } catch {
        resolve(null);
      }
    });
  });
}
/**
 * Derive the backup id from an archive filename, e.g.
 * "changemaker-v2-backup-20260409_143000.tar.gz" → "20260409_143000".
 * Returns null for filenames that don't follow the naming convention.
 */
function idFromFilename(filename: string): string | null {
  const wellFormed = filename.startsWith(ARCHIVE_PREFIX) && filename.endsWith(ARCHIVE_SUFFIX);
  return wellFormed
    ? filename.slice(ARCHIVE_PREFIX.length, filename.length - ARCHIVE_SUFFIX.length)
    : null;
}
// ─── Routes ───────────────────────────────────────────────────────────
/**
* POST /instance/:slug/backup
* Shells out to the remote CML's scripts/backup.sh. Returns archive metadata
* so the CCP can immediately stream it down via the /download endpoint.
*/
router.post('/instance/:slug/backup', async (req: Request, res: Response) => {
const slug = param(req, 'slug');
const entry = await getSlugEntry(slug);
try {
// 1. pg_dump
const dumpFile = path.join(backupDir, 'database.sql');
const dump = await docker.composeExec(
entry.basePath, entry.composeProject,
'v2-postgres',
'pg_dump -U changemaker -d changemaker',
300_000,
pgPassword ? { PGPASSWORD: pgPassword } : undefined
);
await fs.writeFile(dumpFile, dump, 'utf-8');
const result = await withSlugLock(slug, 'backup', async () => {
const backupsDir = backupsDirFor(slug);
await fs.mkdir(backupsDir, { recursive: true });
// Gzip the dump
await exec(`gzip '${dumpFile}'`, { timeout: 120_000 });
// Verify scripts/backup.sh exists
const scriptPath = path.join(entry.basePath, 'scripts', 'backup.sh');
try {
await fs.access(scriptPath);
} catch {
throw new AgentError(500, `scripts/backup.sh not found at ${scriptPath}`, 'BACKUP_SCRIPT_MISSING');
}
// 2. Tar uploads if exists
const uploadsDir = path.join(entry.basePath, 'uploads');
let hasUploads = false;
try {
await fs.access(uploadsDir);
hasUploads = true;
} catch { /* no uploads dir */ }
if (hasUploads) {
await exec(
`tar -czf '${path.join(backupDir, 'uploads.tar.gz')}' -C '${entry.basePath}' uploads`,
{ timeout: 300_000 }
// Snapshot existing archive filenames so we can identify the new one
const existingFiles = new Set(
(await fs.readdir(backupsDir)).filter((f) => f.startsWith(ARCHIVE_PREFIX) && f.endsWith(ARCHIVE_SUFFIX))
);
}
// 3. Create final archive
const archiveName = `backup-${param(req, 'slug')}-${timestamp}.tar.gz`;
const archivePath = path.join(env.AGENT_DATA_DIR, 'backups', archiveName);
await exec(
`tar -czf '${archivePath}' -C '${path.dirname(backupDir)}' '${timestamp}'`,
{ timeout: 300_000 }
);
const logPath = path.join(backupsDir, `backup-${Date.now()}.log`);
const logFd = await fs.open(logPath, 'w');
// Clean up temp dir
await fs.rm(backupDir, { recursive: true, force: true });
// Spawn backup.sh with cwd=basePath so its .env detection works.
// Retention is effectively disabled here — CCP manages retention of
// the streamed-down archives, not the agent's transient copies.
//
// Container names: backup.sh defaults to `changemaker-v2-postgres` and
// `listmonk-db`, which match the main CML's `container_name:` overrides.
// If a deployment has custom naming, the operator can set PG_CONTAINER /
// LISTMONK_PG_CONTAINER in the instance's own .env (backup.sh loads it).
const spawnEnv: NodeJS.ProcessEnv = {
...process.env,
BACKUP_DIR: backupsDir,
RETENTION_DAYS: '36500', // ~100 years; CCP controls retention
};
const stats = await fs.stat(archivePath);
const backupId = timestamp;
logger.info(`[backup] Running scripts/backup.sh for ${slug} (basePath=${entry.basePath})`);
logger.info(`[backup] Created backup for ${param(req, 'slug')}: ${archivePath} (${stats.size} bytes)`);
const exitCode: number = await new Promise((resolve, reject) => {
const proc = spawn('bash', ['scripts/backup.sh'], {
cwd: entry.basePath,
env: spawnEnv,
stdio: ['ignore', 'pipe', 'pipe'],
});
proc.stdout.on('data', (chunk) => logFd.write(chunk).catch(() => {}));
proc.stderr.on('data', (chunk) => logFd.write(chunk).catch(() => {}));
proc.on('error', reject);
proc.on('close', (code) => resolve(code ?? 1));
});
res.json({
backupId,
archivePath,
sizeBytes: stats.size,
timestamp,
await logFd.close();
if (exitCode !== 0) {
// Return the tail of the log so the CCP can display it
let logTail = '';
try {
const fullLog = await fs.readFile(logPath, 'utf-8');
logTail = fullLog.split('\n').slice(-40).join('\n');
} catch { /* ignore */ }
throw new AgentError(500, `backup.sh exited with code ${exitCode}\n${logTail}`, 'BACKUP_FAILED');
}
// Find the new archive
const afterFiles = (await fs.readdir(backupsDir)).filter(
(f) => f.startsWith(ARCHIVE_PREFIX) && f.endsWith(ARCHIVE_SUFFIX)
);
const newFiles = afterFiles.filter((f) => !existingFiles.has(f));
if (newFiles.length === 0) {
throw new AgentError(500, 'backup.sh succeeded but no new archive was created', 'BACKUP_NO_OUTPUT');
}
// Pick the most recently modified (in case of oddities)
newFiles.sort();
const newest = newFiles[newFiles.length - 1] as string;
const archivePath = path.join(backupsDir, newest);
const backupId = idFromFilename(newest);
if (!backupId || !ID_REGEX.test(backupId)) {
throw new AgentError(500, `Unexpected archive filename: ${newest}`, 'BACKUP_NAME_INVALID');
}
const stats = await fs.stat(archivePath);
const sha256 = await sha256File(archivePath);
const manifest = await readManifestFromArchive(archivePath);
// Delete the log file once we know the backup succeeded
try { await fs.unlink(logPath); } catch { /* ignore */ }
logger.info(`[backup] ${slug}: created ${newest} (${stats.size} bytes, sha256=${sha256.substring(0, 16)}...)`);
return {
backupId,
filename: newest,
sizeBytes: stats.size,
sha256,
manifest,
createdAt: stats.mtime.toISOString(),
};
});
res.json(result);
} catch (err) {
// Clean up on failure
try { await fs.rm(backupDir, { recursive: true, force: true }); } catch { /* ignore */ }
if (err instanceof SlugBusyError) {
res.status(409).json({ error: 'SLUG_BUSY', message: err.message });
return;
}
throw err;
}
});
// GET /instance/:slug/backup/:id/download — Stream backup archive
router.get('/instance/:slug/backup/:id/download', async (req: Request, res: Response) => {
const archiveName = `backup-${param(req, 'slug')}-${param(req, 'id')}.tar.gz`;
const archivePath = path.join(env.AGENT_DATA_DIR, 'backups', archiveName);
/**
 * GET /instance/:slug/backups
 * Lists backup archives currently held on the agent for this slug.
 *
 * FIX(review): removed a stray merge-residue line
 * (`await fs.access(archivePath);`) that referenced an undefined variable
 * from the old download route and would not compile.
 */
router.get('/instance/:slug/backups', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  await getSlugEntry(slug); // validate slug is registered
  const backupsDir = backupsDirFor(slug);
  let entries: string[] = [];
  try {
    entries = await fs.readdir(backupsDir);
  } catch {
    // No backups directory yet — report an empty list rather than erroring.
    res.json({ data: [] });
    return;
  }
  const results = [];
  for (const filename of entries) {
    const id = idFromFilename(filename);
    // Skip logs / partial files that don't match the archive naming convention
    if (!id) continue;
    try {
      const stats = await fs.stat(path.join(backupsDir, filename));
      results.push({
        backupId: id,
        filename,
        sizeBytes: stats.size,
        createdAt: stats.mtime.toISOString(),
      });
    } catch { /* skip files that vanish between readdir and stat */ }
  }
  // Newest first
  results.sort((a, b) => (a.createdAt < b.createdAt ? 1 : -1));
  res.json({ data: results });
});
/**
 * GET /instance/:slug/backup/:id/download
 * Streams the backup archive (supports Content-Length so the CCP can verify size).
 */
router.get('/instance/:slug/backup/:id/download', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  const id = param(req, 'id');
  // Reject ids containing path separators etc. before touching the filesystem.
  if (!ID_REGEX.test(id)) {
    res.status(400).json({ error: 'INVALID_ID', message: 'Invalid backup id' });
    return;
  }
  await getSlugEntry(slug);
  const archivePath = archivePathFor(slug, id);
  try {
    const stats = await fs.stat(archivePath);
    res.setHeader('Content-Type', 'application/gzip');
    res.setHeader('Content-Length', String(stats.size));
    res.setHeader('Content-Disposition', `attachment; filename="${path.basename(archivePath)}"`);
    const stream = createReadStream(archivePath);
    stream.on('error', (err) => {
      logger.error(`[backup] stream error for ${archivePath}: ${err.message}`);
      // Headers not sent yet → report a clean 500; otherwise abort the socket
      // so the client sees a truncated transfer instead of a silent success.
      if (!res.headersSent) res.status(500).end();
      else res.destroy(err);
    });
    stream.pipe(res);
  } catch {
    // stat failed → archive does not exist (or is unreadable)
    res.status(404).json({ error: 'NOT_FOUND', message: 'Backup archive not found' });
    return;
  }
});
const stats = await fs.stat(archivePath);
res.setHeader('Content-Type', 'application/gzip');
res.setHeader('Content-Length', stats.size);
res.setHeader('Content-Disposition', `attachment; filename="${archiveName}"`);
/**
 * DELETE /instance/:slug/backup/:id
 * Deletes the archive from the agent's disk. The CCP calls this after it has
 * successfully streamed the archive to its own storage.
 *
 * FIX(review): removed merge-residue lines (a dynamic `import('fs')` plus
 * old download-route stream code) that referenced `archivePath` before its
 * declaration and would not compile.
 */
router.delete('/instance/:slug/backup/:id', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  const id = param(req, 'id');
  if (!ID_REGEX.test(id)) {
    res.status(400).json({ error: 'INVALID_ID', message: 'Invalid backup id' });
    return;
  }
  await getSlugEntry(slug);
  const archivePath = archivePathFor(slug, id);
  // Path traversal defense: ensure the resolved path is still inside the slug's backups dir
  const resolved = path.resolve(archivePath);
  const boundary = path.resolve(backupsDirFor(slug));
  if (!resolved.startsWith(boundary + path.sep)) {
    res.status(400).json({ error: 'INVALID_ID', message: 'Invalid backup id' });
    return;
  }
  try {
    await fs.unlink(archivePath);
    logger.info(`[backup] ${slug}: deleted ${path.basename(archivePath)}`);
    res.json({ deleted: true });
  } catch (err) {
    const code = (err as NodeJS.ErrnoException).code;
    if (code === 'ENOENT') {
      res.status(404).json({ error: 'NOT_FOUND', message: 'Backup archive not found' });
      return;
    }
    throw err;
  }
});
// ─── Restore ──────────────────────────────────────────────────────────
// Hard cap on a single restore upload. The CCP is trusted, but a buggy or
// compromised CCP shouldn't be able to fill the agent's disk in one request.
// 20 GB is well above any realistic Changemaker Lite backup size.
const MAX_RESTORE_UPLOAD_BYTES = 20 * 1024 * 1024 * 1024;

/** Directory under AGENT_DATA_DIR holding restore uploads for a slug. */
function restoresDirFor(slug: string): string {
  return path.join(env.AGENT_DATA_DIR, 'restores', slug);
}

/** Working directory for one upload (archive, restore.log, and state file live here). */
function restoreUploadDir(slug: string, uploadId: string): string {
  return path.join(restoresDirFor(slug), uploadId);
}
/**
 * On-disk state machine for a restore, persisted as restore-state.json in the
 * upload directory so progress reporting survives agent restarts.
 */
interface RestoreState {
  // Lifecycle: UPLOADED → RUNNING → COMPLETED | FAILED
  status: 'UPLOADED' | 'RUNNING' | 'COMPLETED' | 'FAILED';
  uploadId: string;
  startedAt: string;                  // ISO timestamp
  completedAt?: string;               // ISO timestamp; set when COMPLETED/FAILED
  exitCode?: number;                  // restore.sh exit code
  logTail?: string;                   // last lines of restore.log for CCP display
  errorMessage?: string;              // present when status === 'FAILED'
  options?: Record<string, unknown>;  // skipDb/skipUploads/skipListmonk/dryRun flags
}
/** Load restore-state.json for an upload; null if missing or unparsable. */
async function readRestoreState(slug: string, uploadId: string): Promise<RestoreState | null> {
  try {
    const raw = await fs.readFile(
      path.join(restoreUploadDir(slug, uploadId), 'restore-state.json'),
      'utf-8'
    );
    return JSON.parse(raw) as RestoreState;
  } catch {
    return null;
  }
}
/** Persist restore-state.json for an upload (pretty-printed for easier debugging). */
async function writeRestoreState(slug: string, uploadId: string, state: RestoreState): Promise<void> {
  const target = path.join(restoreUploadDir(slug, uploadId), 'restore-state.json');
  const payload = JSON.stringify(state, null, 2);
  await fs.writeFile(target, payload, 'utf-8');
}
/**
 * POST /instance/:slug/restore/upload?sha256=<hex>
 * Accepts an application/octet-stream upload of a backup archive and writes
 * it to the agent's restores directory. Verifies SHA256 as it streams — if
 * the hash doesn't match, the partial file is deleted and we return 400.
 *
 * Returns `{ uploadId, sizeBytes, sha256 }`.
 */
router.post('/instance/:slug/restore/upload', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  await getSlugEntry(slug);
  // Refuse while a restore or backup is already running for this slug.
  if (isSlugLocked(slug, 'restore')) {
    res.status(409).json({ error: 'SLUG_BUSY', message: 'A restore is already in progress for this slug' });
    return;
  }
  if (isSlugLocked(slug, 'backup')) {
    res.status(409).json({ error: 'SLUG_BUSY', message: 'A backup is in progress for this slug' });
    return;
  }
  // Caller must declare the expected digest up-front so we can verify inline.
  const expectedSha256 = typeof req.query.sha256 === 'string' ? req.query.sha256.toLowerCase() : undefined;
  if (!expectedSha256 || !/^[a-f0-9]{64}$/.test(expectedSha256)) {
    res.status(400).json({ error: 'VALIDATION', message: 'sha256 query parameter required (64 hex chars)' });
    return;
  }
  // Random upload id — unguessable and filesystem-safe.
  const uploadId = crypto.randomBytes(16).toString('hex');
  const uploadDir = restoreUploadDir(slug, uploadId);
  await fs.mkdir(uploadDir, { recursive: true });
  const archivePath = path.join(uploadDir, 'archive.tar.gz');
  const hash = crypto.createHash('sha256');
  let bytesWritten = 0;
  // Pass-through transform that hashes and size-caps the body as it streams.
  const hashTransform = new Transform({
    transform(chunk: Buffer, _enc, cb) {
      bytesWritten += chunk.length;
      if (bytesWritten > MAX_RESTORE_UPLOAD_BYTES) {
        // Abort the stream — pipeline() will reject and the catch block below
        // will remove the partial upload directory.
        cb(new AgentError(
          413,
          `Upload exceeds maximum allowed size of ${MAX_RESTORE_UPLOAD_BYTES} bytes`,
          'UPLOAD_TOO_LARGE'
        ));
        return;
      }
      hash.update(chunk);
      cb(null, chunk);
    },
  });
  try {
    const writeStream = createWriteStream(archivePath);
    await pipeline(req, hashTransform, writeStream);
    const sha256 = hash.digest('hex');
    if (sha256 !== expectedSha256) {
      // Integrity failure — nuke the upload
      await fs.rm(uploadDir, { recursive: true, force: true });
      res.status(400).json({
        error: 'SHA256_MISMATCH',
        message: `Expected sha256 ${expectedSha256}, got ${sha256}`,
      });
      return;
    }
    const stats = await fs.stat(archivePath);
    // Persist initial state so the progress endpoint works even before apply
    await writeRestoreState(slug, uploadId, {
      status: 'UPLOADED',
      uploadId,
      startedAt: new Date().toISOString(),
    });
    logger.info(`[restore] ${slug}: uploaded ${bytesWritten} bytes (sha256=${sha256.substring(0, 16)}...) upload_id=${uploadId}`);
    res.json({
      uploadId,
      sizeBytes: stats.size,
      sha256,
    });
  } catch (err) {
    // Stream error or write error — clean up
    try { await fs.rm(uploadDir, { recursive: true, force: true }); } catch { /* ignore */ }
    throw err;
  }
});
/**
 * POST /instance/:slug/restore/:uploadId/apply
 * Body: { confirm: true, skipDb?, skipUploads?, skipListmonk?, dryRun? }
 *
 * Fires off `scripts/restore.sh --archive <path> --force` in the background
 * and writes progress to restore-state.json. The CCP polls the progress
 * endpoint for updates. Mutex prevents concurrent restores/backups.
 */
router.post('/instance/:slug/restore/:uploadId/apply', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  const uploadId = param(req, 'uploadId');
  if (!ID_REGEX.test(uploadId)) {
    res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' });
    return;
  }
  const entry = await getSlugEntry(slug);
  const { confirm, skipDb, skipUploads, skipListmonk, dryRun } = req.body ?? {};
  // Destructive operation — require an explicit confirmation flag.
  if (confirm !== true) {
    res.status(400).json({ error: 'CONFIRMATION_REQUIRED', message: 'Body must include { confirm: true }' });
    return;
  }
  const uploadDir = restoreUploadDir(slug, uploadId);
  // Path traversal defense
  const resolvedDir = path.resolve(uploadDir);
  const boundary = path.resolve(restoresDirFor(slug));
  if (!resolvedDir.startsWith(boundary + path.sep)) {
    res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' });
    return;
  }
  const archivePath = path.join(uploadDir, 'archive.tar.gz');
  try {
    await fs.access(archivePath);
  } catch {
    res.status(404).json({ error: 'NOT_FOUND', message: 'Upload not found or already applied' });
    return;
  }
  // Verify scripts/restore.sh exists
  const scriptPath = path.join(entry.basePath, 'scripts', 'restore.sh');
  try {
    await fs.access(scriptPath);
  } catch {
    res.status(500).json({ error: 'RESTORE_SCRIPT_MISSING', message: `scripts/restore.sh not found at ${scriptPath}` });
    return;
  }
  // Check mutex state (don't block — tell caller it's busy)
  if (isSlugLocked(slug, 'restore') || isSlugLocked(slug, 'backup')) {
    res.status(409).json({ error: 'SLUG_BUSY', message: 'Slug is busy with backup or restore' });
    return;
  }
  // Fire-and-forget: acquire lock and run in background. Return immediately
  // so CCP can start polling /progress.
  const options = {
    skipDb: !!skipDb,
    skipUploads: !!skipUploads,
    skipListmonk: !!skipListmonk,
    dryRun: !!dryRun,
  };
  await writeRestoreState(slug, uploadId, {
    status: 'RUNNING',
    uploadId,
    startedAt: new Date().toISOString(),
    options,
  });
  // Build restore.sh args (all flags, no user input interpolated into a shell string)
  const args = ['scripts/restore.sh', '--archive', archivePath, '--force'];
  if (options.skipDb) args.push('--skip-db');
  if (options.skipUploads) args.push('--skip-uploads');
  if (options.skipListmonk) args.push('--skip-listmonk');
  if (options.dryRun) args.push('--dry-run');
  const logPath = path.join(uploadDir, 'restore.log');
  // Schedule the background task — don't await inside the handler
  void withSlugLock(slug, 'restore', async () => {
    const logFd = await fs.open(logPath, 'w');
    logger.info(`[restore] ${slug}: running ${args.join(' ')} (cwd=${entry.basePath})`);
    const exitCode: number = await new Promise((resolve, reject) => {
      const proc = spawn('bash', args, {
        cwd: entry.basePath,
        env: { ...process.env },
        stdio: ['ignore', 'pipe', 'pipe'],
      });
      // Tee script output into restore.log; log-write failures are non-fatal
      proc.stdout.on('data', (chunk) => logFd.write(chunk).catch(() => {}));
      proc.stderr.on('data', (chunk) => logFd.write(chunk).catch(() => {}));
      proc.on('error', reject);
      proc.on('close', (code) => resolve(code ?? 1));
    });
    await logFd.close();
    // Read the tail of the log for the state file
    let logTail = '';
    try {
      const fullLog = await fs.readFile(logPath, 'utf-8');
      logTail = fullLog.split('\n').slice(-80).join('\n');
    } catch { /* ignore */ }
    const state: RestoreState = {
      status: exitCode === 0 ? 'COMPLETED' : 'FAILED',
      uploadId,
      // Preserve the original startedAt written by the upload/apply steps
      startedAt: (await readRestoreState(slug, uploadId))?.startedAt || new Date().toISOString(),
      completedAt: new Date().toISOString(),
      exitCode,
      logTail,
      options,
      ...(exitCode !== 0 ? { errorMessage: `restore.sh exited with code ${exitCode}` } : {}),
    };
    await writeRestoreState(slug, uploadId, state);
    logger.info(`[restore] ${slug}: restore.sh finished with exit ${exitCode}`);
  }).catch(async (err) => {
    logger.error(`[restore] ${slug}: background restore failed: ${(err as Error).message}`);
    // If the mutex was the issue, state is already written. Otherwise, mark failed.
    if (!(err instanceof SlugBusyError)) {
      try {
        await writeRestoreState(slug, uploadId, {
          status: 'FAILED',
          uploadId,
          startedAt: new Date().toISOString(),
          completedAt: new Date().toISOString(),
          errorMessage: (err as Error).message,
          options,
        });
      } catch { /* ignore */ }
    }
  });
  // 202: accepted for background processing — CCP polls /progress for outcome.
  res.status(202).json({ applied: true, uploadId, options });
});
/**
 * GET /instance/:slug/restore/:uploadId/progress
 * Returns the current state of a running or completed restore.
 */
router.get('/instance/:slug/restore/:uploadId/progress', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  const uploadId = param(req, 'uploadId');
  // Same id hygiene as the other restore endpoints.
  if (!ID_REGEX.test(uploadId)) {
    res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' });
    return;
  }
  await getSlugEntry(slug);
  const state = await readRestoreState(slug, uploadId);
  if (state) {
    res.json(state);
  } else {
    res.status(404).json({ error: 'NOT_FOUND', message: 'Restore not found' });
  }
});
/**
 * DELETE /instance/:slug/restore/:uploadId
 * Removes a restore upload directory. Refuses if a restore is currently running.
 *
 * FIX(review): removed a no-op `try { … } catch (err) { throw err; }` wrapper —
 * rethrowing unchanged is identical to not catching at all.
 */
router.delete('/instance/:slug/restore/:uploadId', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  const uploadId = param(req, 'uploadId');
  if (!ID_REGEX.test(uploadId)) {
    res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' });
    return;
  }
  await getSlugEntry(slug);
  const uploadDir = restoreUploadDir(slug, uploadId);
  // Path traversal defense (mirrors the backup delete route)
  const resolvedDir = path.resolve(uploadDir);
  const boundary = path.resolve(restoresDirFor(slug));
  if (!resolvedDir.startsWith(boundary + path.sep)) {
    res.status(400).json({ error: 'INVALID_ID', message: 'Invalid upload id' });
    return;
  }
  const state = await readRestoreState(slug, uploadId);
  if (state?.status === 'RUNNING') {
    res.status(409).json({ error: 'RESTORE_RUNNING', message: 'Cannot delete a running restore' });
    return;
  }
  // `force: true` makes a missing directory a no-op, so deletion is idempotent.
  await fs.rm(uploadDir, { recursive: true, force: true });
  res.json({ deleted: true });
});
export default router;

View File

@ -4,6 +4,13 @@ import { registerSlug, unregisterSlug, listSlugs } from '../services/registry.se
const router = Router();
// SECURITY: defense-in-depth slug validation. The CCP enforces ^[a-z0-9-]+$
// upstream via Zod, but the registry slug is later interpolated into
// filesystem paths (backupsDirFor, etc.), so we validate independently here.
// A poisoned registry entry could otherwise let a compromised or buggy CCP
// escape AGENT_DATA_DIR.
const SLUG_RE = /^[a-z0-9-]{2,50}$/;
// POST /instances/register — Register a slug→basePath mapping
router.post('/instances/register', async (req: Request, res: Response) => {
const { slug, basePath, composeProject } = req.body;
@ -11,14 +18,23 @@ router.post('/instances/register', async (req: Request, res: Response) => {
res.status(400).json({ error: 'VALIDATION', message: 'slug, basePath, and composeProject required' });
return;
}
if (typeof slug !== 'string' || !SLUG_RE.test(slug)) {
res.status(400).json({ error: 'VALIDATION', message: 'Invalid slug format (expected ^[a-z0-9-]{2,50}$)' });
return;
}
await registerSlug(slug, basePath, composeProject);
res.json({ registered: slug });
});
// DELETE /instances/:slug — Unregister slug
// FIX(review): the pre-change handler lines (unregister without slug-format
// validation) were interleaved with the new version; residue removed so only
// the validated path remains.
router.delete('/instances/:slug', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  // Defense-in-depth: same slug format check as registration.
  if (!SLUG_RE.test(slug)) {
    res.status(400).json({ error: 'VALIDATION', message: 'Invalid slug format' });
    return;
  }
  await unregisterSlug(slug);
  res.json({ unregistered: slug });
});
// GET /instances — List all managed slugs

View File

@ -1,11 +1,12 @@
import { Router, Request, Response } from 'express';
import { param } from '../utils/params';
import { execFile } from 'child_process';
import { execFile, spawn } from 'child_process';
import { promisify } from 'util';
import fs from 'fs/promises';
import path from 'path';
import { getSlugEntry } from '../services/registry.service';
import { logger } from '../utils/logger';
import { withSlugLock, SlugBusyError, isSlugLocked } from '../services/slug-mutex';
const execFileAsync = promisify(execFile);
const router = Router();
@ -13,9 +14,108 @@ const router = Router();
/** Validate a git branch name — prevent shell injection. */
const SAFE_BRANCH = /^[a-zA-Z0-9][a-zA-Z0-9_.\/-]{0,99}$/;
// POST /instance/:slug/upgrade/start — Run upgrade.sh
/**
* Max age of an in-progress upgrade (by progress.json mtime) before we
* consider a previous attempt dead and allow a new one through.
*
* SECURITY NOTE: this must be LONGER than the CCP's REMOTE_UPGRADE_TIMEOUT
* AND longer than any realistic legitimate upgrade duration. The concern is
* a concurrent-upgrade scenario:
* - upgrade.sh is running and legitimately slow (large image pull + DB
* migration)
* - at 15 min the CCP side times out and marks the row FAILED
* - admin clicks "Upgrade" again CCP's DB check sees no active row
* - if this staleness window is <= realistic upgrade time, the second
* /upgrade/start call would ALSO pass this check, spawning a second
* upgrade.sh process racing against the still-running first one
*
* 45 min gives headroom over the 15-min CCP timeout and covers realistic
* upgrade durations. For a truly bulletproof guard, switch to a PID lock
* file that verifies the process is still alive.
*/
const STALE_UPGRADE_MTIME_MS = 45 * 60 * 1000;

/**
 * Returns true if there's an in-progress upgrade visible on disk.
 *
 * Second-line guard in case the in-memory mutex was lost to an agent restart
 * mid-upgrade: a progress.json that is newer than any result.json and younger
 * than the staleness window means "started but not finished". Anything older
 * is assumed to be a dead attempt and a new upgrade is allowed through.
 */
async function isUpgradeRunningOnDisk(basePath: string): Promise<boolean> {
  const upgradeDir = path.join(basePath, 'data', 'upgrade');

  let progressMtimeMs: number;
  try {
    progressMtimeMs = (await fs.stat(path.join(upgradeDir, 'progress.json'))).mtimeMs;
  } catch {
    // No progress file → nothing was ever started.
    return false;
  }

  try {
    const resultStat = await fs.stat(path.join(upgradeDir, 'result.json'));
    // A result at least as new as the progress file means the run finished.
    if (resultStat.mtimeMs >= progressMtimeMs) return false;
  } catch { /* no result file yet */ }

  // Unfinished and within the staleness window → treat as still running.
  return Date.now() - progressMtimeMs <= STALE_UPGRADE_MTIME_MS;
}
// POST /instance/:slug/upgrade/check — Run upgrade-check.sh and return status.json
router.post('/instance/:slug/upgrade/check', async (req: Request, res: Response) => {
  const slug = param(req, 'slug');
  const entry = await getSlugEntry(slug);
  // Refuse during a running upgrade — check writes status.json which could
  // race with upgrade.sh writing other files in data/upgrade/
  if (isSlugLocked(slug, 'upgrade') || await isUpgradeRunningOnDisk(entry.basePath)) {
    res.status(409).json({ error: 'SLUG_BUSY', message: 'An upgrade is currently running' });
    return;
  }
  const scriptPath = path.join(entry.basePath, 'scripts', 'upgrade-check.sh');
  try {
    await fs.access(scriptPath);
  } catch {
    res.status(404).json({ error: 'SCRIPT_NOT_FOUND', message: `upgrade-check.sh not found at ${scriptPath}` });
    return;
  }
  // Run upgrade-check.sh — it writes data/upgrade/status.json. Use execFile
  // (no shell) and a 60s timeout. Failures are non-fatal: the script may
  // still have written status.json before erroring out, so we always try
  // to read it afterwards.
  try {
    await execFileAsync('bash', [scriptPath], {
      cwd: entry.basePath,
      timeout: 60_000,
      maxBuffer: 4 * 1024 * 1024,
      env: { ...process.env, COMPOSE_ANSI: 'never' },
    });
  } catch (err) {
    logger.warn(`[upgrade] ${slug}: upgrade-check.sh failed: ${(err as Error).message}`);
    // continue — try to read status.json anyway
  }
  const statusPath = path.join(entry.basePath, 'data', 'upgrade', 'status.json');
  try {
    // status.json is written by our own tooling on this host, so it is
    // returned verbatim to the CCP.
    const content = await fs.readFile(statusPath, 'utf-8');
    res.json(JSON.parse(content));
  } catch {
    res.status(500).json({ error: 'STATUS_NOT_AVAILABLE', message: 'upgrade-check.sh did not produce status.json' });
  }
});
// POST /instance/:slug/upgrade/start — Run upgrade.sh in the background
router.post('/instance/:slug/upgrade/start', async (req: Request, res: Response) => {
const entry = await getSlugEntry(param(req, 'slug'));
const slug = param(req, 'slug');
const entry = await getSlugEntry(slug);
const { skipBackup, useRegistry, branch } = req.body || {};
// SECURITY: Validate branch name to prevent injection
@ -28,26 +128,64 @@ router.post('/instance/:slug/upgrade/start', async (req: Request, res: Response)
try {
await fs.access(scriptPath);
} catch {
res.status(400).json({ error: 'NOT_FOUND', message: 'upgrade.sh not found' });
res.status(404).json({ error: 'NOT_FOUND', message: 'upgrade.sh not found' });
return;
}
// SECURITY: Use execFile with args array — no shell interpolation
const args = ['--api-mode', '--force'];
// Refuse if an upgrade is already running (in-memory or on-disk indicators)
if (isSlugLocked(slug, 'upgrade') || await isUpgradeRunningOnDisk(entry.basePath)) {
res.status(409).json({ error: 'SLUG_BUSY', message: 'An upgrade is already in progress' });
return;
}
// Backup or restore concurrency: refuse to start an upgrade while either is running
if (isSlugLocked(slug, 'backup') || isSlugLocked(slug, 'restore')) {
res.status(409).json({ error: 'SLUG_BUSY', message: 'A backup or restore is currently running' });
return;
}
// Clear stale progress/result files before starting so the on-disk staleness
// check doesn't think a brand-new upgrade is still finishing.
const progressPath = path.join(entry.basePath, 'data', 'upgrade', 'progress.json');
const resultPath = path.join(entry.basePath, 'data', 'upgrade', 'result.json');
await fs.mkdir(path.dirname(progressPath), { recursive: true });
await fs.rm(progressPath, { force: true });
await fs.rm(resultPath, { force: true });
// SECURITY: Use spawn with args array — no shell interpolation
const args: string[] = [scriptPath, '--api-mode', '--force'];
if (skipBackup) args.push('--skip-backup');
if (useRegistry) args.push('--use-registry');
if (branch) args.push('--branch', branch);
// Fire-and-forget — CCP polls progress
execFileAsync('bash', [scriptPath, ...args], {
cwd: entry.basePath,
timeout: 600_000,
maxBuffer: 10 * 1024 * 1024,
// Schedule the background task under the slug lock. Use void so the
// promise doesn't block the response. Errors are caught and logged; the
// CCP detects them via the absence of a result file or via the timeout.
void withSlugLock(slug, 'upgrade', async () => {
logger.info(`[upgrade] ${slug}: spawning ${args.join(' ')} (cwd=${entry.basePath})`);
try {
await new Promise<void>((resolve, reject) => {
const proc = spawn('bash', args, {
cwd: entry.basePath,
env: { ...process.env, COMPOSE_ANSI: 'never' },
stdio: ['ignore', 'ignore', 'ignore'], // upgrade.sh writes its own logs
});
proc.on('error', reject);
proc.on('close', (code) => {
if (code === 0) resolve();
else reject(new Error(`upgrade.sh exited with code ${code}`));
});
});
logger.info(`[upgrade] ${slug}: upgrade.sh completed`);
} catch (err) {
logger.error(`[upgrade] ${slug}: ${(err as Error).message}`);
}
}).catch((err) => {
logger.error(`[upgrade] ${param(req, 'slug')} failed: ${(err as Error).message}`);
if (!(err instanceof SlugBusyError)) {
logger.error(`[upgrade] ${slug}: lock or background error: ${(err as Error).message}`);
}
});
res.json({ started: true });
res.status(202).json({ started: true });
});
// GET /instance/:slug/upgrade/progress — Read progress.json

View File

@ -53,8 +53,24 @@ if (hasCerts()) {
app.use(errorHandler);
const server = https.createServer(tlsOptions, app);
server.listen(env.AGENT_PORT, () => {
server.listen(env.AGENT_PORT, async () => {
logger.info(`CCP Agent (mTLS) listening on port ${env.AGENT_PORT}`);
// Auto-register this instance's slug if configured
if (env.INSTANCE_SLUG && env.INSTANCE_BASE_PATH) {
const { registerSlug, getSlugEntry } = await import('./services/registry.service');
try {
await getSlugEntry(env.INSTANCE_SLUG);
logger.debug(`[registry] Slug ${env.INSTANCE_SLUG} already registered`);
} catch {
// Detect compose project name: use env override, or derive from basePath directory name
// (Docker Compose default: directory name with special chars stripped)
const pathMod = await import('path');
const composeProject = env.COMPOSE_PROJECT
|| pathMod.basename(env.INSTANCE_BASE_PATH).replace(/[^a-zA-Z0-9]/g, '').toLowerCase();
await registerSlug(env.INSTANCE_SLUG, env.INSTANCE_BASE_PATH, composeProject);
}
}
});
} else {
// Pre-approval mode — start HTTP, only health + phone-home polling

View File

@ -0,0 +1,65 @@
/**
* Per-slug single-flight mutex.
*
* Guards long-running, mutating operations (backup, restore, upgrade) so that
* two concurrent CCP calls for the same slug can't trample each other.
*
* Usage:
* await withSlugLock(slug, 'backup', async () => { ... });
*
* If a lock is already held for (slug, op), throws SlugBusyError which the
* route handler should convert to HTTP 409.
*/
/** Thrown when a (slug, op) lock is already held; routes map this to HTTP 409. */
export class SlugBusyError extends Error {
  constructor(public slug: string, public op: string) {
    super(`Slug ${slug} is busy: ${op} already in progress`);
    this.name = 'SlugBusyError';
  }
}

type LockKey = string;

/** Active single-flight locks, keyed by "<slug>::<op>". */
const locks = new Map<LockKey, { op: string; startedAt: number }>();

/** Compose the lock-table key for a (slug, op) pair. */
function key(slug: string, op: string): LockKey {
  return [slug, op].join('::');
}

/**
 * Run `fn` while holding the single-flight lock on (slug, op).
 * Rejects with SlugBusyError immediately when another call is already in
 * flight; otherwise the lock is always released once `fn` settles.
 */
export async function withSlugLock<T>(
  slug: string,
  op: string,
  fn: () => Promise<T>
): Promise<T> {
  const lockKey = key(slug, op);
  if (locks.has(lockKey)) throw new SlugBusyError(slug, op);
  locks.set(lockKey, { op, startedAt: Date.now() });
  try {
    return await fn();
  } finally {
    locks.delete(lockKey);
  }
}

/** Whether a lock is currently held for (slug, op). */
export function isSlugLocked(slug: string, op: string): boolean {
  return locks.has(key(slug, op));
}

/** Debug snapshot of all active locks with their ages. */
export function listActiveLocks(): Array<{ slug: string; op: string; ageMs: number }> {
  const now = Date.now();
  const snapshot: Array<{ slug: string; op: string; ageMs: number }> = [];
  for (const [lockKey, info] of locks) {
    snapshot.push({ slug: lockKey.split('::')[0] ?? '', op: info.op, ageMs: now - info.startedAt });
  }
  return snapshot;
}

View File

@ -0,0 +1,34 @@
-- Migration: restore tracking.
-- Adds the RestoreStatus enum, the BACKUP_RESTORE audit action, and the
-- instance_restores table backing the InstanceRestore Prisma model.

-- CreateEnum
CREATE TYPE "RestoreStatus" AS ENUM ('PENDING', 'UPLOADING', 'RUNNING', 'COMPLETED', 'FAILED');

-- AlterEnum
ALTER TYPE "AuditAction" ADD VALUE 'BACKUP_RESTORE';

-- CreateTable
CREATE TABLE "instance_restores" (
    "id" TEXT NOT NULL,
    "instance_id" TEXT NOT NULL,
    "backup_id" TEXT NOT NULL,
    "status" "RestoreStatus" NOT NULL DEFAULT 'PENDING',
    "upload_id" TEXT,
    "progress_json" JSONB,
    "log_tail" TEXT,
    "error_message" TEXT,
    "triggered_by_id" TEXT,
    "started_at" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP,
    "completed_at" TIMESTAMP(3),

    CONSTRAINT "instance_restores_pkey" PRIMARY KEY ("id")
);

-- CreateIndex (list restores per instance, newest first)
CREATE INDEX "instance_restores_instance_id_started_at_idx" ON "instance_restores"("instance_id", "started_at");

-- CreateIndex
CREATE INDEX "instance_restores_backup_id_idx" ON "instance_restores"("backup_id");

-- AddForeignKey (restore rows are deleted with their instance)
ALTER TABLE "instance_restores" ADD CONSTRAINT "instance_restores_instance_id_fkey" FOREIGN KEY ("instance_id") REFERENCES "instances"("id") ON DELETE CASCADE ON UPDATE CASCADE;

-- AddForeignKey (and with their backup)
ALTER TABLE "instance_restores" ADD CONSTRAINT "instance_restores_backup_id_fkey" FOREIGN KEY ("backup_id") REFERENCES "backups"("id") ON DELETE CASCADE ON UPDATE CASCADE;

View File

@ -0,0 +1,2 @@
-- AlterTable: per-instance override for the Pangolin tunnel subdomain label
-- (nullable — absent means the default naming is used).
ALTER TABLE "instances" ADD COLUMN "pangolin_subdomain_prefix" TEXT;

View File

@ -0,0 +1,2 @@
-- AlterEnum: audit action recorded when an instance's Pangolin tunnel is torn down.
ALTER TYPE "AuditAction" ADD VALUE 'PANGOLIN_TEARDOWN';

View File

@ -109,6 +109,7 @@ model Instance {
pangolinSiteId String? @map("pangolin_site_id")
pangolinNewtId String? @map("pangolin_newt_id")
pangolinNewtSecret String? @map("pangolin_newt_secret")
pangolinSubdomainPrefix String? @map("pangolin_subdomain_prefix")
// SMTP
smtpHost String? @map("smtp_host")
@ -125,6 +126,7 @@ model Instance {
portAllocations PortAllocation[]
healthChecks HealthCheck[]
backups Backup[]
restores InstanceRestore[]
auditLogs AuditLog[]
upgrades InstanceUpgrade[]
events InstanceEvent[]
@ -196,12 +198,44 @@ model Backup {
s3Uploaded Boolean @default(false) @map("s3_uploaded")
s3Key String? @map("s3_key")
instance Instance @relation(fields: [instanceId], references: [id], onDelete: Cascade)
instance Instance @relation(fields: [instanceId], references: [id], onDelete: Cascade)
restores InstanceRestore[]
@@index([instanceId, startedAt])
@@map("backups")
}
// ─── Restore ───────────────────────────────────────────────

// Lifecycle of a restore run; terminal states are COMPLETED and FAILED.
enum RestoreStatus {
  PENDING
  UPLOADING
  RUNNING
  COMPLETED
  FAILED
}

// One attempt to restore a Backup onto its Instance. Rows survive the run
// (cascade-deleted only with the instance or the backup) so operators can
// audit past restores via progressJson/logTail/errorMessage.
model InstanceRestore {
  id            String        @id @default(uuid())
  instanceId    String        @map("instance_id")
  backupId      String        @map("backup_id")
  status        RestoreStatus @default(PENDING)
  // NOTE(review): presumably an archive-upload session id — confirm in restore service
  uploadId      String?       @map("upload_id")
  progressJson  Json?         @map("progress_json")
  logTail       String?       @map("log_tail")
  errorMessage  String?       @map("error_message")
  triggeredById String?       @map("triggered_by_id")
  startedAt     DateTime      @default(now()) @map("started_at")
  completedAt   DateTime?     @map("completed_at")

  instance Instance @relation(fields: [instanceId], references: [id], onDelete: Cascade)
  backup   Backup   @relation(fields: [backupId], references: [id], onDelete: Cascade)

  @@index([instanceId, startedAt])
  @@index([backupId])
  @@map("instance_restores")
}
// ─── Audit Log ─────────────────────────────────────────────
enum AuditAction {
@ -215,7 +249,9 @@ enum AuditAction {
SECRETS_VIEWED
BACKUP_CREATE
BACKUP_DELETE
BACKUP_RESTORE
PANGOLIN_SETUP
PANGOLIN_TEARDOWN
PANGOLIN_SYNC
AGENT_CONNECT
AGENT_REGISTER

View File

@ -54,10 +54,11 @@ const envSchema = z.object({
USE_REGISTRY_IMAGES: z.enum(['true', 'false']).default('true').transform((v) => v === 'true'),
IMAGE_TAG: z.string().default('latest'),
// Pangolin (optional)
// Pangolin (optional — for remote tunnel management)
PANGOLIN_API_URL: z.string().default(''),
PANGOLIN_API_KEY: z.string().default(''),
PANGOLIN_ORG_ID: z.string().default(''),
PANGOLIN_ENDPOINT: z.string().default(''), // Newt WebSocket URL (may differ from API URL)
// Health checks
HEALTH_CHECK_INTERVAL_MS: z.coerce.number().default(300_000), // 5 min (0 to disable)

View File

@ -169,7 +169,7 @@ router.post('/registrations/:id/approve', authenticate, requireRole('SUPER_ADMIN
});
// Issue mTLS certificates
const certMaterials = await issueAgentCert(instance.id, registration.slug);
const certMaterials = await issueAgentCert(instance.id, registration.slug, registration.agentUrl);
// Mark invite code as used
const invite = await prisma.agentInviteCode.findUnique({ where: { id: registration.inviteCodeId } });
@ -189,7 +189,7 @@ router.post('/registrations/:id/approve', authenticate, requireRole('SUPER_ADMIN
caCertPem: certMaterials.caCertPem,
agentCertPem: certMaterials.agentCertPem,
agentKeyPem: certMaterials.agentKeyPem,
ccpFingerprint: certMaterials.caFingerprint,
ccpFingerprint: certMaterials.fingerprint,
},
},
});

View File

@ -4,11 +4,13 @@ import rateLimit from 'express-rate-limit';
import { prisma } from '../../lib/prisma';
import { authenticate, requireRole } from '../../middleware/auth';
import { validate } from '../../middleware/validate';
import { createInstanceSchema, updateInstanceSchema, registerInstanceSchema, reconfigureInstanceSchema, configureTunnelSchema, importInstancesSchema } from './instances.schemas';
import { createInstanceSchema, updateInstanceSchema, registerInstanceSchema, reconfigureInstanceSchema, configureTunnelSchema, importInstancesSchema, startUpgradeSchema, setupRemoteTunnelSchema } from './instances.schemas';
import * as instancesService from './instances.service';
import * as healthService from '../../services/health.service';
import * as backupService from '../../services/backup.service';
import * as restoreService from '../../services/restore.service';
import * as upgradeService from '../../services/upgrade.service';
import * as tunnelService from '../../services/tunnel.service';
import { discoverInstances } from '../../services/discovery.service';
const secretsLimiter = rateLimit({
@ -186,6 +188,18 @@ router.delete(
'/:id/tunnel',
requireRole('SUPER_ADMIN', 'OPERATOR'),
async (req: Request, res: Response) => {
// Branch: remote instances use the CCP's Pangolin API to teardown;
// local instances use the existing manual removal logic.
const instance = await prisma.instance.findUnique({ where: { id: req.params.id as string } });
if (instance?.isRemote && instance.pangolinSiteId) {
const result = await tunnelService.teardownTunnel(
req.params.id as string,
req.user!.id,
req.ip
);
res.json({ data: result });
return;
}
const result = await instancesService.removeTunnel(
req.params.id as string,
req.user!.id,
@ -195,6 +209,47 @@ router.delete(
}
);
// Remote tunnel setup via CCP's Pangolin API credentials
router.post(
'/:id/tunnel/setup',
requireRole('SUPER_ADMIN'),
validate(setupRemoteTunnelSchema),
async (req: Request, res: Response) => {
const { subdomainPrefix } = req.body || {};
const result = await tunnelService.setupTunnel(
req.params.id as string,
{ subdomainPrefix },
req.user!.id,
req.ip
);
res.status(201).json({ data: result });
}
);
// Get tunnel status (resource matrix) — works for both local and remote.
router.get(
  '/:id/tunnel/status',
  requireRole('SUPER_ADMIN', 'OPERATOR'),
  async (req: Request, res: Response) => {
    const instanceId = req.params.id as string;
    const status = await tunnelService.getTunnelStatus(instanceId);
    res.json({ data: status });
  }
);
// Re-sync resources (idempotent — creates missing, leaves existing).
router.post(
  '/:id/tunnel/sync',
  requireRole('SUPER_ADMIN'),
  async (req: Request, res: Response) => {
    const instanceId = req.params.id as string;
    const result = await tunnelService.syncResources(instanceId, req.user!.id, req.ip);
    res.json({ data: result });
  }
);
// ─── Lifecycle Endpoints ─────────────────────────────────────────────
router.post(
@ -280,6 +335,7 @@ router.post(
router.post(
'/:id/upgrade',
requireRole('SUPER_ADMIN', 'OPERATOR'),
validate(startUpgradeSchema),
async (req: Request, res: Response) => {
const { skipBackup, useRegistry, branch } = req.body || {};
const upgrade = await upgradeService.startUpgrade(
@ -356,4 +412,76 @@ router.get(
}
);
// ─── Restores ──────────────────────────────────────────────────────
/**
* POST /:id/restore
* Body: { backupId, options? }
* Starts a restore of the given backup onto this instance. Returns the
* InstanceRestore row immediately; caller polls GET /:id/restores or
* GET /:id/restores/:restoreId for status.
*
* DESTRUCTIVE: overwrites databases and uploads. Requires SUPER_ADMIN.
*/
router.post(
'/:id/restore',
requireRole('SUPER_ADMIN'),
async (req: Request, res: Response) => {
const instanceId = req.params.id as string;
const { backupId, options } = req.body ?? {};
if (!backupId || typeof backupId !== 'string') {
res.status(400).json({ error: { message: 'backupId (string) is required', code: 'VALIDATION' } });
return;
}
// Defensive: ensure the backup belongs to this instance
const backup = await prisma.backup.findUnique({ where: { id: backupId } });
if (!backup) {
res.status(404).json({ error: { message: 'Backup not found', code: 'NOT_FOUND' } });
return;
}
if (backup.instanceId !== instanceId) {
res.status(400).json({
error: {
message: 'Backup does not belong to this instance (cross-instance restore is not supported)',
code: 'CROSS_INSTANCE_RESTORE',
},
});
return;
}
const restore = await restoreService.createRestore({
backupId,
triggeredById: req.user!.id,
ipAddress: req.ip,
options,
});
res.status(201).json({ data: restore });
}
);
// List restores for an instance, paginated (page >= 1, 1 <= limit <= 100).
router.get(
  '/:id/restores',
  requireRole('SUPER_ADMIN', 'OPERATOR'),
  async (req: Request, res: Response) => {
    const rawPage = parseInt(req.query.page as string, 10) || 1;
    const rawLimit = parseInt(req.query.limit as string, 10) || 50;
    const page = rawPage < 1 ? 1 : rawPage;
    const limit = rawLimit < 1 ? 1 : rawLimit > 100 ? 100 : rawLimit;
    const result = await restoreService.listRestores(req.params.id as string, page, limit);
    res.json(result);
  }
);
// Fetch a single restore; restores belonging to a different instance are
// reported as 404 rather than leaking their existence.
router.get(
  '/:id/restores/:restoreId',
  requireRole('SUPER_ADMIN', 'OPERATOR'),
  async (req: Request, res: Response) => {
    const restore = await restoreService.getRestore(req.params.restoreId as string);
    if (restore.instanceId === req.params.id) {
      res.json({ data: restore });
      return;
    }
    res.status(404).json({ error: { message: 'Restore not found', code: 'NOT_FOUND' } });
  }
);
export default router;

View File

// Bulk import: between 1 and 50 instance registrations per request.
export const importInstancesSchema = z.object({
  instances: z.array(registerInstanceSchema).min(1).max(50),
});
// SECURITY: branch name is interpolated into a shell command string in the
// local `runUpgrade` path (exec, not spawn), so we must enforce the same
// strict allow-list the agent uses on its own end. This blocks names starting
// with `-` (avoiding flag confusion), shell metachars, and anything exotic.
const BRANCH_NAME_RE = /^[a-zA-Z0-9][a-zA-Z0-9_.\/-]{0,99}$/;

export const startUpgradeSchema = z.object({
  skipBackup: z.boolean().optional(),
  useRegistry: z.boolean().optional(),
  branch: z.string().regex(BRANCH_NAME_RE, 'Invalid branch name').optional(),
});
// Optional subdomain label for the instance's tunnel resources:
// 1-50 chars, lowercase alphanumeric plus hyphens.
export const setupRemoteTunnelSchema = z.object({
  subdomainPrefix: z.string().min(1).max(50)
    .regex(/^[a-z0-9-]+$/, 'Prefix must be lowercase alphanumeric with hyphens')
    .optional(),
});
// Inferred request-body types, one per schema above.
export type CreateInstanceInput = z.infer<typeof createInstanceSchema>;
export type UpdateInstanceInput = z.infer<typeof updateInstanceSchema>;
export type RegisterInstanceInput = z.infer<typeof registerInstanceSchema>;
export type ReconfigureInstanceInput = z.infer<typeof reconfigureInstanceSchema>;
export type ConfigureTunnelInput = z.infer<typeof configureTunnelSchema>;
export type ImportInstancesInput = z.infer<typeof importInstancesSchema>;
export type StartUpgradeInput = z.infer<typeof startUpgradeSchema>;

View File

@ -8,6 +8,12 @@ import { env } from './config/env';
import { logger } from './utils/logger';
import { errorHandler } from './middleware/error-handler';
// BigInt JSON serialization. Prisma's BigInt columns (e.g. Backup.sizeBytes)
// don't have a toJSON method by default, so res.json() throws. Stringify them.
// Use defineProperty instead of plain assignment: the patch stays
// non-enumerable (an assigned property would show up in for..in / spread over
// the prototype) and we avoid casting BigInt.prototype through `unknown`.
Object.defineProperty(BigInt.prototype, 'toJSON', {
  value: function (this: bigint): string {
    return this.toString();
  },
  writable: true,
  configurable: true,
  enumerable: false,
});
// Route imports
import authRoutes from './modules/auth/auth.routes';
import instanceRoutes from './modules/instances/instances.routes';

View File

@ -1,5 +1,6 @@
import { Prisma, BackupStatus, AuditAction, InstanceStatus } from '@prisma/client';
import fs from 'fs/promises';
import { createReadStream } from 'fs';
import path from 'path';
import crypto from 'crypto';
import { execFile as execFileCb } from 'child_process';
@ -10,6 +11,7 @@ import { AppError } from '../middleware/error-handler';
import { decryptJson } from '../utils/encryption';
import * as docker from './docker.service';
import { logger } from '../utils/logger';
import { getRemoteDriverForInstance } from './execution-driver';
const execFile = promisify(execFileCb);
/**
@ -24,11 +26,16 @@ function assertPathWithinBoundary(filePath: string, boundary: string, label: str
}
/**
 * Compute the SHA-256 hash (lowercase hex) of a file by streaming its
 * contents, so large backup archives never have to fit in memory.
 *
 * Note: the rendered diff left both the old buffered body and the new
 * streaming body in this span; only the streaming implementation is kept.
 *
 * @param filePath absolute path of the file to hash
 * @returns hex digest
 * @throws propagates any stream error (e.g. ENOENT) as a rejection
 */
async function fileHash(filePath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const hash = crypto.createHash('sha256');
    const stream = createReadStream(filePath);
    stream.on('data', (chunk) => hash.update(chunk));
    stream.on('end', () => resolve(hash.digest('hex')));
    stream.on('error', reject);
  });
}
/**
@ -52,7 +59,11 @@ export async function createBackup(instanceId: string, userId?: string, ipAddres
throw new AppError(400, `Cannot backup instance in ${instance.status} state`, 'INVALID_STATE');
}
if ((instance as { isRegistered?: boolean }).isRegistered) {
// `isRegistered` + `isRemote` = a remote CCP-managed instance (agent on the
// far side). `isRegistered` alone (without `isRemote`) would mean a local
// host-managed instance that CCP doesn't own the compose files for — that
// case we still can't back up.
if (instance.isRegistered && !instance.isRemote) {
throw new AppError(400, 'Backups not managed by CCP for registered instances', 'NOT_MANAGED');
}
@ -72,9 +83,31 @@ export async function createBackup(instanceId: string, userId?: string, ipAddres
return backup;
}
// Minimal projection of an Instance row needed by the backup workers.
// `isRemote` selects the worker: true → backup runs on the agent at
// `agentUrl` and is streamed back; false → local compose-based backup.
type BackupInstance = {
  id: string;
  slug: string;
  basePath: string;
  composeProject: string;
  encryptedSecrets: string | null;
  isRemote: boolean;
  agentUrl: string | null;
};
/**
 * Dispatch a backup job to the right worker: remote instances back up via
 * their agent (archive streamed to CCP storage), local instances via the
 * host's compose project.
 *
 * Fix: the rendered diff left a duplicated `instance` parameter line (the
 * pre-refactor inline type alongside the new BackupInstance); only the
 * BackupInstance signature is kept.
 */
async function performBackup(
  backupId: string,
  instance: BackupInstance,
  userId?: string,
  ipAddress?: string
) {
  if (instance.isRemote) {
    return performRemoteBackup(backupId, instance, userId, ipAddress);
  }
  return performLocalBackup(backupId, instance, userId, ipAddress);
}
async function performLocalBackup(
backupId: string,
instance: BackupInstance,
userId?: string,
ipAddress?: string
) {
@ -221,6 +254,168 @@ async function performBackup(
}
}
/**
 * Run a backup on a remote agent and stream the resulting archive to CCP storage.
 *
 * Flow:
 * 1. Tell agent to run scripts/backup.sh → { backupId, sizeBytes, sha256, manifest }
 * 2. Stream archive from agent $BACKUP_STORAGE_PATH/{slug}/backup-{slug}-{backupId}.tar.gz
 * 3. Verify local SHA256 matches what the agent reported (defense in depth)
 * 4. Tell agent to delete its local copy (reclaim remote disk)
 * 5. Update Backup row as COMPLETED
 *
 * On failure at any step after the remote backup was created, we leave the
 * agent-side archive in place so the operator can retry the download.
 *
 * @param backupId  id of the pre-created Backup row driven to COMPLETED/FAILED
 * @param instance  projection of the remote instance (isRemote + agentUrl set)
 * @param userId    optional actor — audit log entry is written only when present
 * @param ipAddress optional actor IP recorded in the audit log
 * @throws rethrows any failure after marking the Backup row FAILED
 */
async function performRemoteBackup(
  backupId: string,
  instance: BackupInstance,
  userId?: string,
  ipAddress?: string
) {
  let archivePath: string | null = null;
  let agentBackupId: string | null = null;
  try {
    await prisma.backup.update({
      where: { id: backupId },
      data: { status: BackupStatus.IN_PROGRESS },
    });

    const driver = await getRemoteDriverForInstance({
      id: instance.id,
      slug: instance.slug,
      isRemote: instance.isRemote,
      agentUrl: instance.agentUrl,
    });

    // 1. Trigger the backup on the agent (this blocks until backup.sh completes)
    logger.info(`[backup] ${instance.slug}: triggering remote backup via agent`);
    const result = await driver.createBackup();
    agentBackupId = result.backupId;
    logger.info(
      `[backup] ${instance.slug}: agent backup complete — ${result.filename} ` +
      `(${(result.sizeBytes / 1024 / 1024).toFixed(1)} MB, sha256=${result.sha256.substring(0, 16)}...)`
    );

    // 2. Resolve the destination archive path on CCP storage
    const archiveName = `backup-${instance.slug}-${result.backupId}.tar.gz`;
    archivePath = path.join(env.BACKUP_STORAGE_PATH, instance.slug, archiveName);
    // Path traversal guard (slug should be safe but better to assert)
    assertPathWithinBoundary(archivePath, env.BACKUP_STORAGE_PATH, 'Backup archive');
    await fs.mkdir(path.dirname(archivePath), { recursive: true });

    // 3. Stream the archive from the agent to CCP storage
    logger.info(`[backup] ${instance.slug}: streaming archive to ${archivePath}`);
    const { bytesWritten } = await driver.downloadBackup(result.backupId, archivePath);
    if (bytesWritten !== result.sizeBytes) {
      throw new Error(
        `Downloaded size ${bytesWritten} does not match agent-reported size ${result.sizeBytes}`
      );
    }

    // 4. Re-hash the downloaded file and compare to the agent-reported hash.
    //
    // SECURITY NOTE: this check authenticates *transmission integrity* only,
    // not content integrity against a malicious agent. Both the file bytes
    // and the expected hash are supplied by the (semi-trusted) agent, so a
    // compromised agent can trivially make this check pass while delivering
    // arbitrary content. The check still catches accidental corruption (bit
    // flips, truncation) and is essentially free.
    //
    // The mTLS channel guarantees that the bytes weren't modified in transit
    // by an outside attacker. The remaining trust gap — "what if the agent
    // itself is compromised?" — must be addressed before Phase B (restore)
    // ships, since restore feeds the archive into pg_restore. Either:
    //   (a) HMAC-sign the hash on the agent with its mTLS private key and
    //       verify on the CCP using the agent cert public key, or
    //   (b) limit restore operations to require an additional out-of-band
    //       admin confirmation step.
    const localSha256 = await fileHash(archivePath);
    if (localSha256 !== result.sha256) {
      throw new Error(
        `SHA256 mismatch: agent reported ${result.sha256}, local file hashed ${localSha256}`
      );
    }

    // 5. Reclaim disk on the remote agent
    try {
      await driver.deleteBackup(result.backupId);
    } catch (err) {
      logger.warn(
        `[backup] ${instance.slug}: failed to delete remote backup ${result.backupId}: ${(err as Error).message}`
      );
      // Non-fatal — CCP has the archive, remote copy will age out next retention sweep
    }

    // 6. Persist the result. Store sha256 and agentBackupId inside the manifest
    //    since we don't have dedicated columns.
    const mergedManifest = {
      ...(result.manifest as Record<string, unknown> | null ?? {}),
      source: 'remote',
      agentBackupId: result.backupId,
      sha256: result.sha256,
      createdAt: result.createdAt,
    };
    await prisma.backup.update({
      where: { id: backupId },
      data: {
        status: BackupStatus.COMPLETED,
        archivePath,
        sizeBytes: BigInt(bytesWritten),
        manifest: mergedManifest as unknown as Prisma.InputJsonValue,
        completedAt: new Date(),
      },
    });

    // Audit only when a human actor triggered the backup.
    if (userId) {
      await prisma.auditLog.create({
        data: {
          userId,
          instanceId: instance.id,
          action: AuditAction.BACKUP_CREATE,
          details: {
            backupId,
            archiveName,
            sizeBytes: bytesWritten,
            source: 'remote',
            agentBackupId: result.backupId,
          },
          ipAddress,
        },
      });
    }

    logger.info(
      `[backup] ${instance.slug}: remote backup stored at ${archivePath} ` +
      `(${(bytesWritten / 1024 / 1024).toFixed(1)} MB)`
    );
  } catch (err) {
    await prisma.backup.update({
      where: { id: backupId },
      data: {
        status: BackupStatus.FAILED,
        errorMessage: (err as Error).message,
        completedAt: new Date(),
      },
    });
    // Clean up any partial local file; leave the remote copy so retry is possible
    if (archivePath) {
      try { await fs.unlink(archivePath); } catch { /* ignore */ }
    }
    if (agentBackupId) {
      logger.warn(
        `[backup] ${instance.slug}: leaving agent-side backup ${agentBackupId} in place for retry`
      );
    }
    throw err;
  }
}
/**
* Delete a backup (file + DB record).
*/

View File

@ -0,0 +1,368 @@
/**
* Pangolin Integration API client for the CCP.
*
* Ported from the main CML's pangolin.client.ts. Adapted to:
* - Accept credentials via constructor (not env singleton)
* - Be instantiable per-call so the CCP can use its own API token
* to manage tunnels for multiple remote instances
*
* The CCP never exposes its Pangolin API key to remote instances — it
* only pushes the resulting Newt credentials via the agent's writeFiles.
*/
import { logger } from '../utils/logger';
// ─── Types ─────────────────────────────────────────────────────────

/** A Pangolin site — one tunnel endpoint (Newt) registered in an org. */
export interface PangolinSite {
  siteId: string;
  name: string;
  orgId: string;
  niceId: string;
  pubKey?: string;
  subnet?: string;
  megabytesIn?: number;
  megabytesOut?: number;
  lastSeen?: string;
  online?: boolean;
  type?: string;
  address?: string;
}

/** An exit node a site can attach to. */
export interface PangolinExitNode {
  exitNodeId: string;
  name: string;
  location?: string;
  region?: string;
  online: boolean;
  capacity?: number;
  latency?: number;
}

/** An exposed resource (typically an HTTP service behind a subdomain). */
export interface PangolinResource {
  resourceId: string;
  name: string;
  subdomain?: string;
  fullDomain?: string;
  ssl?: boolean;
  blockAccess?: boolean;
  active?: boolean;
  proxyPort?: number;
  protocol?: string;
  domainBindings?: string[];
  http?: boolean;
  targets?: PangolinTarget[];
}

/** A backend target (ip:port on a site) that a resource proxies to. */
export interface PangolinTarget {
  targetId: string;
  resourceId: string;
  siteId: string;
  ip: string;
  port: number;
  method: string;
  enabled?: boolean;
}

/** Newt (tunnel client) credentials bound to a site. */
export interface PangolinNewt {
  newtId: string;
  secret: string;
  siteId: string;
}

/** Defaults returned by pick-site-defaults for creating a new site. */
export interface PangolinSiteDefaults {
  newtId: string;
  newtSecret: string;
  address: string;
}

/** Payload for creating a site. */
export interface CreateSitePayload {
  name: string;
  type?: string;
  subnet?: string;
  exitNodeId?: string;
  newtId?: string;
  secret?: string;
  address?: string;
}

/** Payload for creating an HTTP resource (http/tcp are fixed by the API). */
export interface CreateHttpResourcePayload {
  name: string;
  domainId: string;
  subdomain?: string;
  http: true;
  protocol: 'tcp';
}

/** Payload for attaching a backend target to a resource. */
export interface CreateTargetPayload {
  siteId: string | number;
  ip: string;
  port: number;
  method: 'http' | 'https';
  enabled?: boolean;
}

/** A domain registered with the org. */
export interface PangolinDomain {
  domainId: string;
  baseDomain: string;
  verified: boolean;
  type?: string;
  failed?: boolean;
  configManaged?: boolean;
}

/** Partial update payload for a resource; only set fields are changed. */
export interface UpdateResourcePayload {
  name?: string;
  subdomain?: string;
  fullDomain?: string;
  ssl?: boolean;
  sso?: boolean;
  active?: boolean;
  blockAccess?: boolean;
  proxyPort?: number;
  protocol?: string;
  domainBindings?: string[];
}

/** Partial update payload for a certificate. */
export interface UpdateCertificatePayload {
  autoRenew?: boolean;
}

/** A TLS certificate tracked by Pangolin for a domain. */
export interface PangolinCertificate {
  certId: string;
  domainId: string;
  domain: string;
  status: 'PENDING' | 'ACTIVE' | 'EXPIRED' | 'FAILED';
  issuedAt?: string;
  expiresAt?: string;
  autoRenew?: boolean;
  issuer?: string;
}

/** A client connection observed on a resource. */
export interface PangolinConnectedClient {
  clientId: string;
  resourceId: string;
  ipAddress: string;
  connectedAt: string;
  lastSeen: string;
  bytesIn: number;
  bytesOut: number;
  online: boolean;
}
// ─── Helpers ───────────────────────────────────────────────────────

/** Redact credential fields from Pangolin API request bodies before logging. */
function redactSecrets(body: unknown): unknown {
  // Primitives and null pass through untouched.
  if (!body || typeof body !== 'object') return body;
  const clone: Record<string, unknown> = { ...(body as Record<string, unknown>) };
  for (const field of ['secret', 'newtSecret']) {
    if (field in clone) clone[field] = '[REDACTED]';
  }
  return clone;
}
// ─── Client ────────────────────────────────────────────────────────

/**
 * REST client for the Pangolin integration API, scoped to one org.
 * Credentials are passed per instance (no env singleton) so the CCP can
 * manage tunnels for multiple remote instances with its own API token.
 */
export class CcpPangolinClient {
  constructor(
    private baseUrl: string,
    private apiKey: string,
    private orgId: string
  ) {}

  /** True when all three credentials are set; request() refuses to run otherwise. */
  get configured(): boolean {
    return !!(this.baseUrl && this.apiKey && this.orgId);
  }

  /**
   * Perform one HTTP call against the Pangolin API.
   * - 15 s abort timeout
   * - non-2xx → Error carrying status + response text
   * - JSON responses are unwrapped from the { success, data } envelope
   * - non-JSON responses yield an empty object
   */
  private async request<T>(method: string, path: string, body?: unknown): Promise<T> {
    if (!this.configured) {
      throw new Error('Pangolin API not configured. Set PANGOLIN_API_URL, PANGOLIN_API_KEY, PANGOLIN_ORG_ID in CCP .env');
    }
    const url = `${this.baseUrl}${path}`;
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 15000);
    try {
      // Bodies are redacted before logging so Newt secrets never hit the logs.
      logger.debug(`[pangolin] ${method} ${path}${body ? ` body=${JSON.stringify(redactSecrets(body))}` : ''}`);
      const res = await fetch(url, {
        method,
        headers: {
          'Authorization': `Bearer ${this.apiKey}`,
          'Content-Type': 'application/json',
        },
        body: body ? JSON.stringify(body) : undefined,
        signal: controller.signal,
      });
      if (!res.ok) {
        const text = await res.text().catch(() => '');
        throw new Error(`Pangolin API ${method} ${path} returned ${res.status}: ${text}`);
      }
      const contentType = res.headers.get('content-type') || '';
      if (contentType.includes('application/json')) {
        const json = await res.json();
        return this.unwrapResponse<T>(json);
      }
      return {} as T;
    } finally {
      clearTimeout(timeout);
    }
  }

  /** Pangolin wraps payloads as { success, data }; return `data` when present. */
  private unwrapResponse<T>(json: unknown): T {
    if (json && typeof json === 'object' && !Array.isArray(json)) {
      const obj = json as Record<string, unknown>;
      if ('data' in obj && 'success' in obj) {
        return obj.data as T;
      }
    }
    return json as T;
  }

  // ─── Health ───────────────────────────────────────────────────

  /** Lightweight reachability probe (5 s timeout); returns false instead of throwing. */
  async healthCheck(): Promise<boolean> {
    try {
      const controller = new AbortController();
      const timeout = setTimeout(() => controller.abort(), 5000);
      try {
        const res = await fetch(`${this.baseUrl}/`, {
          headers: { 'Authorization': `Bearer ${this.apiKey}` },
          signal: controller.signal,
        });
        return res.ok;
      } finally {
        clearTimeout(timeout);
      }
    } catch {
      return false;
    }
  }

  // ─── Site Defaults ────────────────────────────────────────────

  /**
   * Fetch fresh Newt credentials + address for a new site. Field names vary
   * across Pangolin versions, hence the fallbacks (secret/newtSecret, etc.).
   */
  async pickSiteDefaults(): Promise<PangolinSiteDefaults> {
    const res = await this.request<unknown>('GET', `/org/${this.orgId}/pick-site-defaults`);
    const obj = res as Record<string, unknown>;
    const newtId = obj.newtId as string || '';
    const newtSecret = obj.newtSecret as string || obj.secret as string || '';
    const address = obj.clientAddress as string || obj.address as string || '';
    if (!newtId || !newtSecret) {
      throw new Error('Pangolin did not return Newt credentials from pick-site-defaults');
    }
    return { newtId, newtSecret, address };
  }

  // ─── Sites ────────────────────────────────────────────────────

  async listSites(): Promise<PangolinSite[]> {
    const res = await this.request<unknown>('GET', `/org/${this.orgId}/sites`);
    return this.extractArray(res, 'sites', 'listSites');
  }

  async getSite(siteId: string): Promise<PangolinSite> {
    return this.request<PangolinSite>('GET', `/site/${siteId}`);
  }

  // PUT (not POST) is Pangolin's create verb for sites/resources/targets.
  async createSite(data: CreateSitePayload): Promise<PangolinSite & { newt?: PangolinNewt }> {
    return this.request<PangolinSite & { newt?: PangolinNewt }>('PUT', `/org/${this.orgId}/site`, data);
  }

  async deleteSite(siteId: string): Promise<void> {
    await this.request<void>('DELETE', `/site/${siteId}`);
  }

  // Best-effort: returns [] when the endpoint is missing or errors.
  async listExitNodes(): Promise<PangolinExitNode[]> {
    try {
      const res = await this.request<unknown>('GET', `/org/${this.orgId}/exit-nodes`);
      return this.extractArray(res, 'exitNodes', 'listExitNodes');
    } catch {
      return [];
    }
  }

  // ─── Resources ────────────────────────────────────────────────

  async listResources(): Promise<PangolinResource[]> {
    const res = await this.request<unknown>('GET', `/org/${this.orgId}/resources`);
    return this.extractArray(res, 'resources', 'listResources');
  }

  async getResource(resourceId: string): Promise<PangolinResource> {
    return this.request<PangolinResource>('GET', `/resource/${resourceId}`);
  }

  async createResource(data: CreateHttpResourcePayload): Promise<PangolinResource> {
    logger.info(`[pangolin] createResource: ${data.name} (subdomain: ${data.subdomain || '(root)'})`);
    return this.request<PangolinResource>('PUT', `/org/${this.orgId}/resource`, data);
  }

  async updateResource(resourceId: string, data: UpdateResourcePayload): Promise<PangolinResource> {
    return this.request<PangolinResource>('POST', `/resource/${resourceId}`, data);
  }

  async deleteResource(resourceId: string): Promise<void> {
    await this.request<void>('DELETE', `/resource/${resourceId}`);
  }

  // ─── Targets ──────────────────────────────────────────────────

  // siteId is coerced to a number — the target endpoint rejects string ids.
  async createTarget(resourceId: string, data: CreateTargetPayload): Promise<PangolinTarget> {
    logger.info(`[pangolin] createTarget: resource=${resourceId}, ip=${data.ip}:${data.port}`);
    const payload = { ...data, siteId: Number(data.siteId) };
    return this.request<PangolinTarget>('PUT', `/resource/${resourceId}/target`, payload);
  }

  async listTargets(resourceId: string): Promise<PangolinTarget[]> {
    const res = await this.request<unknown>('GET', `/resource/${resourceId}/targets`);
    return this.extractArray(res, 'targets', 'listTargets');
  }

  async deleteTarget(targetId: string): Promise<void> {
    await this.request<void>('DELETE', `/target/${targetId}`);
  }

  // ─── Domains ──────────────────────────────────────────────────

  async listDomains(): Promise<PangolinDomain[]> {
    const res = await this.request<unknown>('GET', `/org/${this.orgId}/domains`);
    return this.extractArray(res, 'domains', 'listDomains');
  }

  // ─── Certificates ─────────────────────────────────────────────

  async getCertificate(domainId: string, domain: string): Promise<PangolinCertificate> {
    return this.request<PangolinCertificate>('GET', `/org/${this.orgId}/certificate/${domainId}/${domain}`);
  }

  async updateCertificate(certId: string, data: UpdateCertificatePayload): Promise<PangolinCertificate> {
    return this.request<PangolinCertificate>('POST', `/certificate/${certId}`, data);
  }

  // ─── Clients ──────────────────────────────────────────────────

  async listClients(resourceId: string): Promise<PangolinConnectedClient[]> {
    const res = await this.request<unknown>('GET', `/resource/${resourceId}/clients`);
    return this.extractArray(res, 'clients', 'listClients');
  }

  // ─── Helpers ──────────────────────────────────────────────────

  /**
   * Normalize list responses to an array. Depending on endpoint/version the
   * payload may be a bare array, { [key]: [...] }, { data: { [key]: [...] } },
   * or { data: [...] }; anything else logs a warning and yields [].
   */
  private extractArray<T>(res: unknown, key: string, context: string): T[] {
    if (Array.isArray(res)) return res as T[];
    if (res && typeof res === 'object') {
      const obj = res as Record<string, unknown>;
      if (Array.isArray(obj[key])) return obj[key] as T[];
      if (obj.data && typeof obj.data === 'object') {
        const dataObj = obj.data as Record<string, unknown>;
        if (Array.isArray(dataObj[key])) return dataObj[key] as T[];
      }
      if (Array.isArray(obj.data)) return obj.data as T[];
    }
    logger.warn(`[pangolin] ${context}: could not extract array from response`);
    return [];
  }
}

View File

@ -90,7 +90,7 @@ export async function ensureCA() {
* Issue a certificate for a remote agent, signed by the CA.
* Returns the certificate materials (plaintext) for one-time display.
*/
export async function issueAgentCert(instanceId: string, slug: string) {
export async function issueAgentCert(instanceId: string, slug: string, agentUrl?: string) {
const ca = await ensureCA();
const caKeyPem = decrypt(ca.encryptedKey);
@ -110,12 +110,29 @@ export async function issueAgentCert(instanceId: string, slug: string) {
await fs.writeFile(caCertFile, ca.certPem);
await fs.writeFile(serialFile, crypto.randomBytes(16).toString('hex'));
// Extensions for server+client auth
await fs.writeFile(extFile, [
// Build SAN entries from the agent URL hostname
const sanEntries: string[] = [];
if (agentUrl) {
try {
const hostname = new URL(agentUrl).hostname;
// Detect IP vs DNS name
if (/^\d{1,3}(\.\d{1,3}){3}$/.test(hostname) || hostname.includes(':')) {
sanEntries.push(`IP:${hostname}`);
} else {
sanEntries.push(`DNS:${hostname}`);
}
} catch { /* ignore invalid URL */ }
}
sanEntries.push(`DNS:${commonName}`);
// Extensions for server+client auth with SANs
const extLines = [
'basicConstraints=CA:FALSE',
'keyUsage=digitalSignature,keyEncipherment',
'extendedKeyUsage=serverAuth,clientAuth',
].join('\n'));
`subjectAltName=${sanEntries.join(',')}`,
];
await fs.writeFile(extFile, extLines.join('\n'));
// Generate agent key
await exec(

View File

@ -60,7 +60,20 @@ export async function getDriverForInstance(instance: DriverInstance): Promise<Ex
const { getLocalDriver } = await import('./local-driver');
return getLocalDriver();
}
return getRemoteDriverForInstance(instance);
}
/**
* Resolve a RemoteDriver for a remote instance. Throws if the instance is
* local, missing an agent URL, or has no valid mTLS certificate.
*
* Use this when you need to call RemoteDriver-specific methods like
* createBackup() that don't exist on the ExecutionDriver interface.
*/
export async function getRemoteDriverForInstance(instance: DriverInstance) {
if (!instance.isRemote) {
throw new Error(`Instance ${instance.slug} is not remote`);
}
if (!instance.agentUrl) {
throw new Error(`Remote instance ${instance.slug} has no agent URL configured`);
}

View File

@ -1,10 +1,87 @@
import https from 'https';
import fs from 'fs';
import { pipeline } from 'stream/promises';
import { env } from '../config/env';
import type { ExecutionDriver } from './execution-driver';
import { AgentUnreachableError } from './execution-driver';
import type { ContainerInfo } from './docker.service';
import { logger } from '../utils/logger';
/** Metadata the agent returns after a successful backup run. */
export interface AgentBackupResult {
  backupId: string;         // Agent-side identifier used for download/delete
  filename: string;         // Archive filename on the agent's disk
  sizeBytes: number;        // Archive size in bytes
  sha256: string;           // Hex digest of the archive
  manifest: unknown | null; // Parsed backup manifest, when the agent produced one
  createdAt: string;        // Timestamp string as reported by the agent
}

/** Summary row for one backup archive currently held on the agent. */
export interface AgentBackupListEntry {
  backupId: string;
  filename: string;
  sizeBytes: number;
  createdAt: string;
}

/** Response of the agent's restore upload endpoint. */
export interface AgentRestoreUploadResult {
  uploadId: string;  // Identifier to pass to applyRestore()/getRestoreProgress()
  sizeBytes: number; // Bytes the agent ingested
  sha256: string;    // Digest the agent computed for the uploaded stream
}

/** Flags forwarded to the agent's restore apply call. */
export interface AgentRestoreOptions {
  skipDb?: boolean;
  skipUploads?: boolean;
  skipListmonk?: boolean;
  dryRun?: boolean;
}

/** Progress / terminal state of a restore as reported by the agent. */
export interface AgentRestoreState {
  status: 'UPLOADED' | 'RUNNING' | 'COMPLETED' | 'FAILED';
  uploadId: string;
  startedAt: string;
  completedAt?: string;  // Present once the restore reached a terminal state
  exitCode?: number;     // Exit code of the restore script, when finished
  logTail?: string;      // Last lines of the agent-side restore log
  errorMessage?: string; // Populated when status is FAILED
  options?: AgentRestoreOptions;
}

/** Parsed status.json produced by the agent's upgrade check. */
export interface AgentUpdateStatus {
  branch: string;
  currentCommit: string;
  currentMessage?: string;
  remoteCommit: string | null;
  commitsBehind: number;
  changelog: Array<{ hash: string; message: string; date: string; author: string }>;
  checkedAt: string;
  error: string | null;
}

/** Shape of the agent's data/upgrade/progress.json (all fields optional). */
export interface AgentUpgradeProgress {
  phase?: number;
  phaseName?: string;
  percentage?: number;
  message?: string;
  timestamp?: string;
}

/** Shape of the agent's data/upgrade/result.json. */
export interface AgentUpgradeResult {
  success: boolean;
  message?: string;
  previousCommit?: string;
  newCommit?: string;
  commitCount?: number;
  durationSeconds?: number;
  warnings?: string[];
}

/** Options accepted by startUpgrade(); passed through to the agent verbatim. */
export interface StartAgentUpgradeOptions {
  skipBackup?: boolean;
  useRegistry?: boolean;
  branch?: string;
}
interface AgentRequestOptions {
method: 'GET' | 'POST' | 'DELETE';
path: string;
@ -261,4 +338,261 @@ export class RemoteDriver implements ExecutionDriver {
timeoutMs: env.AGENT_LONG_OP_TIMEOUT_MS,
});
}
// ─── Backup Operations ──────────────────────────────────────
/**
 * Trigger a backup on the remote agent. The agent shells out to
 * scripts/backup.sh and returns metadata for the resulting archive. The
 * archive stays on the agent's disk until downloadBackup() + deleteBackup()
 * are called.
 */
async createBackup(): Promise<AgentBackupResult> {
  const route = `/instance/${this.slug}/backup`;
  return this.request<AgentBackupResult>({
    method: 'POST',
    path: route,
    timeoutMs: env.AGENT_LONG_OP_TIMEOUT_MS,
  });
}
/** List the backup archives currently held on the agent for this slug. */
async listAgentBackups(): Promise<AgentBackupListEntry[]> {
  const { data } = await this.request<{ data: AgentBackupListEntry[] }>({
    method: 'GET',
    path: `/instance/${this.slug}/backups`,
  });
  return data;
}
/** Delete an archive from the agent's disk, after a successful download. */
async deleteBackup(backupId: string): Promise<void> {
  const encodedId = encodeURIComponent(backupId);
  await this.request({
    method: 'DELETE',
    path: `/instance/${this.slug}/backup/${encodedId}`,
  });
}
/**
 * Stream a backup archive from the agent to a local file path.
 * Verifies the Content-Length header matches the bytes written.
 *
 * @param backupId agent-side backup identifier (URL-encoded into the path)
 * @param destPath local filesystem path the archive is written to
 * @returns the number of bytes written to destPath
 * @throws AgentUnreachableError on socket error or timeout
 * @throws Error on HTTP >= 400 or on a Content-Length mismatch
 */
async downloadBackup(backupId: string, destPath: string): Promise<{ bytesWritten: number }> {
  const url = new URL(
    `/instance/${this.slug}/backup/${encodeURIComponent(backupId)}/download`,
    this.agentUrl
  );
  const timeoutMs = env.AGENT_LONG_OP_TIMEOUT_MS;
  return new Promise((resolve, reject) => {
    const req = https.request(
      {
        hostname: url.hostname,
        port: url.port || 7443, // default agent port when the URL carries none
        path: url.pathname + url.search,
        method: 'GET',
        headers: { Accept: 'application/gzip' },
        // mTLS: present our client cert/key and pin the agent CA
        cert: this.clientCert,
        key: this.clientKey,
        ca: this.caCert,
        rejectUnauthorized: true,
        timeout: timeoutMs,
      },
      async (res) => {
        if (res.statusCode && res.statusCode >= 400) {
          // Drain the error body so its text can be included in the rejection
          let body = '';
          res.on('data', (c) => (body += c));
          res.on('end', () => reject(new Error(`Agent returned ${res.statusCode}: ${body.substring(0, 500)}`)));
          return;
        }
        const expectedSize = res.headers['content-length']
          ? parseInt(res.headers['content-length'] as string, 10)
          : null;
        try {
          const out = fs.createWriteStream(destPath);
          await pipeline(res, out);
          const stats = await fs.promises.stat(destPath);
          if (expectedSize !== null && stats.size !== expectedSize) {
            // NOTE(review): the partially-written file is left at destPath on
            // mismatch — callers should clean up or overwrite it.
            reject(new Error(`Downloaded size ${stats.size} does not match Content-Length ${expectedSize}`));
            return;
          }
          resolve({ bytesWritten: stats.size });
        } catch (err) {
          reject(err);
        }
      }
    );
    req.on('error', (err) => {
      reject(new AgentUnreachableError(this.agentUrl, err));
    });
    req.on('timeout', () => {
      req.destroy();
      reject(new AgentUnreachableError(this.agentUrl, new Error(`Timed out after ${timeoutMs}ms`)));
    });
    req.end();
  });
}
// ─── Restore Operations ─────────────────────────────────────
/**
 * Stream a backup archive from a local path to the agent's upload endpoint.
 * The expected SHA256 is passed as a query parameter and the agent verifies
 * it during ingestion — if it mismatches, the upload is rejected with 400.
 *
 * @param archivePath local archive, streamed from disk (never buffered whole)
 * @param expectedSha256 hex digest the agent must re-compute and match
 * @returns the agent's upload descriptor (uploadId for applyRestore())
 * @throws AgentUnreachableError on socket error or timeout
 * @throws Error when the agent responds with HTTP >= 400
 */
async uploadRestore(
  archivePath: string,
  expectedSha256: string
): Promise<AgentRestoreUploadResult> {
  // stat() up front so an exact Content-Length can be sent
  const stats = await fs.promises.stat(archivePath);
  const url = new URL(
    `/instance/${this.slug}/restore/upload?sha256=${encodeURIComponent(expectedSha256)}`,
    this.agentUrl
  );
  const timeoutMs = env.AGENT_LONG_OP_TIMEOUT_MS;
  return new Promise((resolve, reject) => {
    const req = https.request(
      {
        hostname: url.hostname,
        port: url.port || 7443, // default agent port when the URL carries none
        path: url.pathname + url.search,
        method: 'POST',
        headers: {
          'Content-Type': 'application/octet-stream',
          'Content-Length': String(stats.size),
        },
        // mTLS: present our client cert/key and pin the agent CA
        cert: this.clientCert,
        key: this.clientKey,
        ca: this.caCert,
        rejectUnauthorized: true,
        timeout: timeoutMs,
      },
      (res) => {
        let body = '';
        res.on('data', (c) => (body += c));
        res.on('end', () => {
          if (res.statusCode && res.statusCode >= 400) {
            // Prefer the agent's structured message when the body parses as JSON
            try {
              const err = JSON.parse(body);
              reject(new Error(err.message || `Agent returned ${res.statusCode}`));
            } catch {
              reject(new Error(`Agent returned ${res.statusCode}: ${body.substring(0, 500)}`));
            }
            return;
          }
          try {
            resolve(JSON.parse(body) as AgentRestoreUploadResult);
          } catch (err) {
            reject(err);
          }
        });
      }
    );
    req.on('error', (err) => {
      reject(new AgentUnreachableError(this.agentUrl, err));
    });
    req.on('timeout', () => {
      req.destroy();
      reject(new AgentUnreachableError(this.agentUrl, new Error(`Timed out after ${timeoutMs}ms`)));
    });
    // Pipe the archive as the request body; a read error aborts the request
    const fileStream = fs.createReadStream(archivePath);
    fileStream.on('error', (err) => {
      req.destroy();
      reject(err);
    });
    fileStream.pipe(req);
  });
}
/**
 * Tell the agent to apply a previously-uploaded restore archive. Fire-and-
 * forget: the agent launches `scripts/restore.sh` in the background and
 * returns immediately. Poll getRestoreProgress() for completion.
 */
async applyRestore(uploadId: string, options: AgentRestoreOptions = {}): Promise<void> {
  const route = `/instance/${this.slug}/restore/${encodeURIComponent(uploadId)}/apply`;
  const payload = { confirm: true, ...options };
  await this.request({ method: 'POST', path: route, body: payload });
}
/** Poll the agent for the current state of a restore. */
async getRestoreProgress(uploadId: string): Promise<AgentRestoreState> {
  const route = `/instance/${this.slug}/restore/${encodeURIComponent(uploadId)}/progress`;
  return this.request<AgentRestoreState>({ method: 'GET', path: route });
}
/**
 * Remove a restore upload directory from the agent's disk, once the CCP has
 * finalized the corresponding InstanceRestore row.
 */
async deleteRestoreUpload(uploadId: string): Promise<void> {
  const route = `/instance/${this.slug}/restore/${encodeURIComponent(uploadId)}`;
  await this.request({ method: 'DELETE', path: route });
}
// ─── Upgrade Operations ─────────────────────────────────────
/** Run upgrade-check.sh on the remote and return the parsed status.json. */
async checkForUpdates(): Promise<AgentUpdateStatus> {
  const route = `/instance/${this.slug}/upgrade/check`;
  // Generous timeout: the check contacts the upstream git remote
  const timeoutMs = 90_000;
  return this.request<AgentUpdateStatus>({ method: 'POST', path: route, timeoutMs });
}
/**
 * Trigger upgrade.sh --api-mode on the remote. Fire-and-forget; the agent
 * spawns the script in the background and replies immediately (202).
 * Track completion via getUpgradeProgress() / getUpgradeResult().
 */
async startUpgrade(options: StartAgentUpgradeOptions = {}): Promise<void> {
  const route = `/instance/${this.slug}/upgrade/start`;
  await this.request({ method: 'POST', path: route, body: options, timeoutMs: 30_000 });
}
/**
 * Read the agent's data/upgrade/progress.json. Returns the default zero-state
 * if no progress has been written yet.
 */
async getUpgradeProgress(): Promise<AgentUpgradeProgress> {
  const route = `/instance/${this.slug}/upgrade/progress`;
  return this.request<AgentUpgradeProgress>({ method: 'GET', path: route });
}
/**
 * Read the agent's data/upgrade/result.json. Throws if no result is yet
 * available; the caller should treat that as "still running".
 */
async getUpgradeResult(): Promise<AgentUpgradeResult> {
  const route = `/instance/${this.slug}/upgrade/result`;
  return this.request<AgentUpgradeResult>({ method: 'GET', path: route });
}
}

View File

@ -0,0 +1,376 @@
import fs from 'fs/promises';
import path from 'path';
import crypto from 'crypto';
import { createReadStream } from 'fs';
import { Prisma, RestoreStatus, AuditAction, InstanceStatus } from '@prisma/client';
import { prisma } from '../lib/prisma';
import { env } from '../config/env';
import { AppError } from '../middleware/error-handler';
import { logger } from '../utils/logger';
import { getRemoteDriverForInstance } from './execution-driver';
import type { AgentRestoreOptions, AgentRestoreState } from './remote-driver';
/**
 * Throw 403 unless `filePath` resolves to a location strictly inside
 * `boundary`. Guards restore reads against path traversal out of the
 * backup storage directory.
 */
function assertPathWithinBoundary(filePath: string, boundary: string, label: string): void {
  const resolved = path.resolve(filePath);
  const root = path.resolve(boundary);
  const inside = resolved.startsWith(root + path.sep);
  if (!inside) {
    throw new AppError(403, `${label} path outside allowed directory`, 'FORBIDDEN');
  }
}
/**
 * Stream a file through SHA-256 and resolve with the lowercase hex digest.
 */
async function fileHash(filePath: string): Promise<string> {
  const digest = crypto.createHash('sha256');
  const input = createReadStream(filePath);
  return new Promise<string>((resolve, reject) => {
    input.on('error', reject);
    input.on('data', (chunk) => digest.update(chunk));
    input.on('end', () => resolve(digest.digest('hex')));
  });
}
// How often the orchestrator polls the agent for restore progress.
const POLL_INTERVAL_MS = 3_000;
// Abandon the restore if the agent hasn't reached a terminal state by then.
const POLL_TIMEOUT_MS = 15 * 60 * 1_000; // 15 min

/** Arguments accepted by createRestore(). */
interface StartRestoreArgs {
  backupId: string;
  // Acting user for audit logging; when omitted, no audit row is written.
  triggeredById?: string;
  ipAddress?: string | null;
  // Flags forwarded verbatim to the agent's restore apply endpoint.
  options?: AgentRestoreOptions;
}
/**
 * Kick off a restore for the given backup. Validates backup/instance
 * preconditions, creates an InstanceRestore row, and runs the full
 * upload → apply → poll loop asynchronously. Returns the row so the
 * caller (HTTP handler) can respond immediately.
 *
 * @throws AppError 404 when the backup is missing or its archive file is gone
 * @throws AppError 400 when the backup is not COMPLETED, has no archive path,
 *         or the instance is not RUNNING
 * @throws AppError 501 for non-remote instances (Phase B is remote-only)
 * @throws AppError 403 when the archive path escapes BACKUP_STORAGE_PATH
 */
export async function createRestore(args: StartRestoreArgs) {
  const backup = await prisma.backup.findUnique({
    where: { id: args.backupId },
    include: { instance: true },
  });
  if (!backup) {
    throw new AppError(404, 'Backup not found', 'NOT_FOUND');
  }
  if (backup.status !== 'COMPLETED') {
    throw new AppError(400, `Backup is ${backup.status}, not COMPLETED`, 'INVALID_STATE');
  }
  if (!backup.archivePath) {
    throw new AppError(400, 'Backup has no archive path', 'NO_ARCHIVE');
  }
  const instance = backup.instance;
  if (instance.status !== InstanceStatus.RUNNING) {
    throw new AppError(400, `Cannot restore to instance in ${instance.status} state`, 'INVALID_STATE');
  }
  // Phase B only supports remote restore. Local restore is deliberately stubbed
  // — if you need it, add a performLocalRestore branch below. This also covers
  // the registered-but-local case (CCP-adopted instances) since they have
  // isRemote=false.
  if (!instance.isRemote) {
    throw new AppError(501, 'Local restore is not implemented — Phase B covers remote only', 'NOT_IMPLEMENTED');
  }
  // Make sure the archive is where it says it is and inside the boundary
  assertPathWithinBoundary(backup.archivePath, env.BACKUP_STORAGE_PATH, 'Backup archive');
  try {
    await fs.access(backup.archivePath);
  } catch {
    throw new AppError(404, 'Archive file is missing on disk', 'ARCHIVE_MISSING');
  }
  const restore = await prisma.instanceRestore.create({
    data: {
      instanceId: instance.id,
      backupId: backup.id,
      status: RestoreStatus.PENDING,
      triggeredById: args.triggeredById ?? null,
    },
  });
  // Fire-and-forget orchestration. performRemoteRestore records failures on
  // the restore row itself; this catch only prevents an unhandled rejection
  // from a truly unexpected throw.
  performRemoteRestore(restore.id, backup.archivePath, args.options ?? {}, args.triggeredById, args.ipAddress ?? null)
    .catch((err) => {
      logger.error(`[restore] ${restore.id} failed: ${(err as Error).message}`);
    });
  return restore;
}
/**
 * End-to-end remote restore orchestration — implemented by
 * performRemoteRestore() below (writeRestoreAuditLog sits in between).
 *
 * Flow:
 * 1. Compute sha256 of the archive on CCP disk
 * 2. Upload to agent with sha256 query param (agent re-verifies on stream)
 * 3. Apply via agent (shells out to restore.sh --force)
 * 4. Poll progress every 3s until COMPLETED/FAILED or timeout
 * 5. Delete the agent-side upload
 * 6. Update the InstanceRestore row + audit log
 */
/**
 * Write a BACKUP_RESTORE audit log entry. Wrapped in a try/catch so that an
 * audit-log DB failure can never mask the underlying restore status update.
 * No-ops when there is no acting user (userId is presumably required by the
 * AuditLog schema — confirm before enabling system-triggered restores).
 *
 * Called in all three terminal paths:
 * - success (outcome: 'success')
 * - agent reported failure (outcome: 'agent_failed')
 * - orchestration error / timeout / unexpected throw (outcome: 'orchestration_error')
 */
async function writeRestoreAuditLog(args: {
  restoreId: string;
  instanceId: string;
  backupId: string;
  triggeredById?: string;
  ipAddress?: string | null;
  options: AgentRestoreOptions;
  outcome: 'success' | 'agent_failed' | 'orchestration_error';
  sha256?: string;
  uploadId?: string | null;
  errorMessage?: string;
}): Promise<void> {
  if (!args.triggeredById) return;
  try {
    await prisma.auditLog.create({
      data: {
        userId: args.triggeredById,
        instanceId: args.instanceId,
        action: AuditAction.BACKUP_RESTORE,
        details: {
          backupId: args.backupId,
          restoreId: args.restoreId,
          source: 'remote',
          outcome: args.outcome,
          options: args.options as unknown as Prisma.InputJsonValue,
          // Optional fields only spread in when known; errorMessage is
          // truncated to keep audit rows bounded.
          ...(args.sha256 ? { sha256: args.sha256 } : {}),
          ...(args.uploadId ? { agentUploadId: args.uploadId } : {}),
          ...(args.errorMessage ? { errorMessage: args.errorMessage.substring(0, 500) } : {}),
        },
        ipAddress: args.ipAddress ?? null,
      },
    });
  } catch (err) {
    logger.error(`[restore] failed to write audit log for ${args.restoreId}: ${(err as Error).message}`);
  }
}
/**
 * Orchestrate one remote restore end-to-end: hash the local archive, upload
 * it to the agent, trigger the apply, poll until a terminal state (or the
 * 15-minute timeout), clean up the agent-side upload, then finalize the
 * InstanceRestore row and audit log. All failures are recorded on the row;
 * the function itself resolves rather than throwing.
 */
async function performRemoteRestore(
  restoreId: string,
  archivePath: string,
  options: AgentRestoreOptions,
  triggeredById?: string,
  ipAddress?: string | null
) {
  const restore = await prisma.instanceRestore.findUnique({
    where: { id: restoreId },
    include: { instance: true, backup: true },
  });
  if (!restore) {
    logger.error(`[restore] row ${restoreId} vanished mid-flight`);
    return;
  }
  const instance = restore.instance;
  // Tracked across the try/catch so the error path can clean up and audit
  let uploadId: string | null = null;
  let sha256: string | undefined;
  try {
    await prisma.instanceRestore.update({
      where: { id: restoreId },
      data: { status: RestoreStatus.UPLOADING },
    });
    const driver = await getRemoteDriverForInstance({
      id: instance.id,
      slug: instance.slug,
      isRemote: instance.isRemote,
      agentUrl: instance.agentUrl,
    });
    // 1. Compute local SHA256 (authoritative — the agent will verify against this).
    // We persist this in the audit log so there's an immutable record of exactly
    // which bytes were restored, useful for post-incident comparison.
    logger.info(`[restore] ${instance.slug}: hashing archive ${path.basename(archivePath)}`);
    sha256 = await fileHash(archivePath);
    // 2. Stream upload to agent
    logger.info(`[restore] ${instance.slug}: uploading archive (sha256=${sha256.substring(0, 16)}...)`);
    const uploadResult = await driver.uploadRestore(archivePath, sha256);
    uploadId = uploadResult.uploadId;
    await prisma.instanceRestore.update({
      where: { id: restoreId },
      data: { uploadId, status: RestoreStatus.RUNNING },
    });
    // 3. Apply (agent runs restore.sh in the background and returns at once)
    logger.info(`[restore] ${instance.slug}: applying restore ${uploadId}`);
    await driver.applyRestore(uploadId, options);
    // 4. Poll progress until terminal or deadline
    const deadline = Date.now() + POLL_TIMEOUT_MS;
    let finalState: AgentRestoreState | null = null;
    while (Date.now() < deadline) {
      await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
      try {
        const state = await driver.getRestoreProgress(uploadId);
        // Mirror progress to the DB row so the UI shows updates
        await prisma.instanceRestore.update({
          where: { id: restoreId },
          data: {
            progressJson: state as unknown as Prisma.InputJsonValue,
            logTail: state.logTail ?? null,
          },
        });
        if (state.status === 'COMPLETED' || state.status === 'FAILED') {
          finalState = state;
          break;
        }
      } catch (err) {
        logger.warn(`[restore] ${instance.slug}: poll error: ${(err as Error).message}`);
        // Keep polling — transient network blips shouldn't fail the restore
      }
    }
    if (!finalState) {
      // Handled by the outer catch: row marked FAILED + audit entry
      throw new Error(`Restore timed out after ${Math.round(POLL_TIMEOUT_MS / 1000)}s`);
    }
    // 5. Clean up agent-side upload (best effort)
    try {
      await driver.deleteRestoreUpload(uploadId);
    } catch (err) {
      logger.warn(`[restore] ${instance.slug}: failed to delete agent upload ${uploadId}: ${(err as Error).message}`);
    }
    // 6. Finalize DB row
    if (finalState.status === 'COMPLETED') {
      await prisma.instanceRestore.update({
        where: { id: restoreId },
        data: {
          status: RestoreStatus.COMPLETED,
          progressJson: finalState as unknown as Prisma.InputJsonValue,
          logTail: finalState.logTail ?? null,
          completedAt: new Date(),
        },
      });
      await writeRestoreAuditLog({
        restoreId,
        instanceId: instance.id,
        backupId: restore.backupId,
        triggeredById,
        ipAddress,
        options,
        outcome: 'success',
        sha256,
        uploadId,
      });
      logger.info(`[restore] ${instance.slug}: restore ${restoreId} COMPLETED`);
    } else {
      const errMsg = finalState.errorMessage || `Agent reported FAILED (exit ${finalState.exitCode})`;
      await prisma.instanceRestore.update({
        where: { id: restoreId },
        data: {
          status: RestoreStatus.FAILED,
          progressJson: finalState as unknown as Prisma.InputJsonValue,
          logTail: finalState.logTail ?? null,
          errorMessage: errMsg,
          completedAt: new Date(),
        },
      });
      await writeRestoreAuditLog({
        restoreId,
        instanceId: instance.id,
        backupId: restore.backupId,
        triggeredById,
        ipAddress,
        options,
        outcome: 'agent_failed',
        sha256,
        uploadId,
        errorMessage: errMsg,
      });
      logger.warn(`[restore] ${instance.slug}: restore ${restoreId} FAILED (exit ${finalState.exitCode})`);
    }
  } catch (err) {
    // Orchestration error: upload/apply threw, polling timed out, or a DB
    // update failed. Record it and attempt agent-side cleanup.
    const errMsg = (err as Error).message;
    await prisma.instanceRestore.update({
      where: { id: restoreId },
      data: {
        status: RestoreStatus.FAILED,
        errorMessage: errMsg,
        completedAt: new Date(),
      },
    });
    await writeRestoreAuditLog({
      restoreId,
      instanceId: instance.id,
      backupId: restore.backupId,
      triggeredById,
      ipAddress,
      options,
      outcome: 'orchestration_error',
      sha256,
      uploadId,
      errorMessage: errMsg,
    });
    logger.error(`[restore] ${restore.instance.slug}: ${errMsg}`);
    // Best-effort cleanup of the agent upload if we got that far
    if (uploadId) {
      try {
        const driver = await getRemoteDriverForInstance({
          id: instance.id,
          slug: instance.slug,
          isRemote: instance.isRemote,
          agentUrl: instance.agentUrl,
        });
        await driver.deleteRestoreUpload(uploadId);
      } catch { /* ignore */ }
    }
  }
}
/**
 * List restores, optionally filtered to one instance, newest first, with
 * simple page/limit pagination. Includes instance and backup summaries.
 */
export async function listRestores(instanceId?: string, page = 1, limit = 50) {
  const where = instanceId ? { instanceId } : {};
  const rowsQuery = prisma.instanceRestore.findMany({
    where,
    orderBy: { startedAt: 'desc' },
    skip: (page - 1) * limit,
    take: limit,
    include: {
      instance: { select: { id: true, name: true, slug: true } },
      backup: { select: { id: true, archivePath: true, sizeBytes: true } },
    },
  });
  const countQuery = prisma.instanceRestore.count({ where });
  const [data, total] = await Promise.all([rowsQuery, countQuery]);
  return { data, total, page, limit };
}
/**
 * Fetch a single restore by ID, with instance and backup summaries.
 * @throws AppError 404 when no such restore exists.
 */
export async function getRestore(restoreId: string) {
  const found = await prisma.instanceRestore.findUnique({
    where: { id: restoreId },
    include: {
      instance: { select: { id: true, name: true, slug: true } },
      backup: { select: { id: true, archivePath: true, sizeBytes: true, manifest: true } },
    },
  });
  if (found) return found;
  throw new AppError(404, 'Restore not found', 'NOT_FOUND');
}

View File

@ -0,0 +1,599 @@
/**
 * Remote tunnel management service.
 *
 * Orchestrates Pangolin site/resource/target creation on behalf of remote CML
 * instances, then pushes Newt credentials to the remote host via the mTLS agent.
 * The CCP holds the Pangolin API token centrally — remote instances never touch
 * the Pangolin API themselves.
 */
import { AuditAction, Prisma } from '@prisma/client';
import { prisma } from '../lib/prisma';
import { env } from '../config/env';
import { AppError } from '../middleware/error-handler';
import { logger } from '../utils/logger';
import { getRemoteDriverForInstance } from './execution-driver';
import {
CcpPangolinClient,
type PangolinDomain,
type PangolinResource,
} from './ccp-pangolin.client';
// ─── Resource definitions ──────────────────────────────────────────

/** One Pangolin resource to expose for an instance. */
interface ResourceDef {
  subdomain: string;    // Service subdomain suffix ('' = root/public site)
  name: string;         // Human-readable resource name shown in Pangolin
  required?: boolean;   // Always created; creation failure aborts setup
  featureFlag?: string; // Instance field gating creation (e.g. 'enableChat')
}

/**
 * Catalogue of resources a CML instance can expose. Required entries always
 * apply; feature-flagged entries only when the instance has the flag truthy;
 * the remaining optional entries are created on a best-effort basis.
 */
const RESOURCE_DEFINITIONS: ResourceDef[] = [
  { subdomain: 'app', name: 'Admin GUI', required: true },
  { subdomain: 'api', name: 'API', required: true },
  { subdomain: '', name: 'Public Site', required: true },
  { subdomain: 'media', name: 'Media API', featureFlag: 'enableMedia' },
  { subdomain: 'db', name: 'NocoDB', required: false },
  { subdomain: 'docs', name: 'Docs', required: false },
  { subdomain: 'code', name: 'Code Server', required: false },
  { subdomain: 'git', name: 'Gitea', required: false },
  { subdomain: 'home', name: 'Homepage', required: false },
  { subdomain: 'listmonk', name: 'Listmonk', featureFlag: 'enableListmonk' },
  { subdomain: 'qr', name: 'Mini QR', required: false },
  { subdomain: 'draw', name: 'Excalidraw', required: false },
  { subdomain: 'vault', name: 'Vaultwarden', required: false },
  { subdomain: 'mail', name: 'MailHog', required: false },
  { subdomain: 'chat', name: 'Rocket.Chat', featureFlag: 'enableChat' },
  { subdomain: 'events', name: 'Gancio', featureFlag: 'enableGancio' },
  { subdomain: 'meet', name: 'Jitsi Meet', featureFlag: 'enableMeet' },
  { subdomain: 'grafana', name: 'Grafana', featureFlag: 'enableMonitoring' },
];
// ─── Helpers ───────────────────────────────────────────────────────
/**
 * Build a Pangolin API client from the CCP environment.
 * @throws AppError 501 when any of the three required env vars is unset.
 */
function getPangolinClient(): CcpPangolinClient {
  const { PANGOLIN_API_URL, PANGOLIN_API_KEY, PANGOLIN_ORG_ID } = env;
  if (PANGOLIN_API_URL && PANGOLIN_API_KEY && PANGOLIN_ORG_ID) {
    return new CcpPangolinClient(PANGOLIN_API_URL, PANGOLIN_API_KEY, PANGOLIN_ORG_ID);
  }
  throw new AppError(
    501,
    'Pangolin API not configured on this CCP. Set PANGOLIN_API_URL, PANGOLIN_API_KEY, PANGOLIN_ORG_ID in the CCP .env file.',
    'PANGOLIN_NOT_CONFIGURED'
  );
}
/**
 * Join the instance prefix with a service subdomain.
 * An empty `sub` denotes the root resource and yields the prefix alone
 * (e.g. "ck"); otherwise the result is "<prefix>-<sub>" (e.g. "ck-app").
 */
function fullSubdomain(prefix: string, sub: string): string {
  return sub ? `${prefix}-${sub}` : prefix;
}
/**
 * Decide whether a resource definition applies to this instance.
 * Required defs always apply; feature-flagged defs apply only when the
 * instance carries a truthy flag; optional defs with no flag always apply.
 */
function shouldCreateResource(
  def: ResourceDef,
  instance: Record<string, unknown>
): boolean {
  if (def.required) return true;
  // `instance` is already typed as Record<string, unknown>; the redundant
  // inline cast the original carried has been dropped.
  if (def.featureFlag) return !!instance[def.featureFlag];
  return true; // optional with no feature flag → always create
}
/**
 * Resolve which registered Pangolin base domain should host an instance's
 * resources: exact match first, then progressively shorter parent domains.
 *
 * @throws AppError 400 when no registered base domain matches.
 */
async function findDomainForInstance(
  client: CcpPangolinClient,
  instanceDomain: string
): Promise<PangolinDomain> {
  const domains = await client.listDomains();
  // Match the instance's domain against registered Pangolin base domains
  // e.g., instance.domain = "cursedknowledge.org" → look for base domain "cursedknowledge.org"
  // or broader: instance.domain = "app.example.com" → look for "example.com"
  const exact = domains.find((d) => d.baseDomain === instanceDomain);
  if (exact) return exact;
  // Try matching parent domain (e.g., sub.example.com → example.com).
  // The bound `i < parts.length - 1` keeps candidates at two labels minimum.
  const parts = instanceDomain.split('.');
  for (let i = 1; i < parts.length - 1; i++) {
    const parent = parts.slice(i).join('.');
    const match = domains.find((d) => d.baseDomain === parent);
    if (match) return match;
  }
  throw new AppError(
    400,
    `No Pangolin domain matches instance domain "${instanceDomain}". Available: ${domains.map((d) => d.baseDomain).join(', ')}`,
    'DOMAIN_NOT_FOUND'
  );
}
// ─── Setup ─────────────────────────────────────────────────────────

/** Options for setupTunnel(). */
export interface SetupTunnelOptions {
  subdomainPrefix?: string; // Defaults to the instance slug when omitted
}

/** Summary returned after a successful tunnel setup. */
export interface TunnelSetupResult {
  siteId: string;
  newtId: string;
  endpoint: string;      // Pangolin endpoint Newt connects to
  resourceCount: number; // Resources created or reused during setup
  resources: Array<{ subdomain: string; name: string; resourceId: string }>;
}
/**
 * Create a Pangolin tunnel for a remote instance end-to-end:
 * pick Newt site defaults → create the site → create one resource + target
 * per applicable RESOURCE_DEFINITIONS entry → push the Newt credentials into
 * the remote .env → persist tunnel fields on the Instance row → recreate the
 * newt container → write an audit log entry.
 *
 * Idempotent at the resource level (existing fullDomains are reused), but
 * refuses to run when a site is already configured — use sync or teardown.
 *
 * Fixes vs. previous revision: removed the dead `envLines` local (its content
 * was never used; the .env write goes through buildUpdatedEnv) and repaired
 * the garbled resource-creation log message.
 *
 * @throws AppError 404 when the instance does not exist
 * @throws AppError 400 when the instance is not remote, a tunnel is already
 *         configured, or no Pangolin domain matches the instance domain
 * @throws AppError 501 when Pangolin is not configured on this CCP
 */
export async function setupTunnel(
  instanceId: string,
  options: SetupTunnelOptions,
  userId?: string,
  ipAddress?: string | null
): Promise<TunnelSetupResult> {
  const client = getPangolinClient();
  const instance = await prisma.instance.findUnique({ where: { id: instanceId } });
  if (!instance) throw new AppError(404, 'Instance not found', 'NOT_FOUND');
  if (!instance.isRemote) throw new AppError(400, 'Tunnel setup via Pangolin API is only for remote instances', 'NOT_REMOTE');
  if (instance.pangolinSiteId) {
    throw new AppError(400, 'Tunnel is already configured. Use sync to update resources, or teardown first.', 'ALREADY_CONFIGURED');
  }
  const prefix = options.subdomainPrefix || instance.slug;
  const driver = await getRemoteDriverForInstance({
    id: instance.id,
    slug: instance.slug,
    isRemote: instance.isRemote,
    agentUrl: instance.agentUrl,
  });
  // 1. Get Newt credentials
  logger.info(`[tunnel] ${instance.slug}: picking site defaults`);
  const defaults = await client.pickSiteDefaults();
  // 2. Create site
  logger.info(`[tunnel] ${instance.slug}: creating Pangolin site`);
  const site = await client.createSite({
    name: instance.slug,
    type: 'newt',
    newtId: defaults.newtId,
    secret: defaults.newtSecret,
    address: defaults.address,
  });
  const siteId = String(site.siteId);
  const newtId = site.newt?.newtId || defaults.newtId;
  const newtSecret = site.newt?.secret || defaults.newtSecret;
  // The Pangolin endpoint (what Newt connects to) may be different from
  // the API URL. E.g., API = api.bnkserve.org/v1, endpoint = pangolin.bnkserve.org.
  // If PANGOLIN_ENDPOINT is set, use it. Otherwise derive from API URL.
  let endpoint = env.PANGOLIN_ENDPOINT || '';
  if (!endpoint) {
    const endpointUrl = new URL(env.PANGOLIN_API_URL);
    endpoint = `${endpointUrl.protocol}//${endpointUrl.hostname}${endpointUrl.port ? ':' + endpointUrl.port : ''}`;
  }
  // 3. Find matching domain
  const domain = await findDomainForInstance(client, instance.domain);
  logger.info(`[tunnel] ${instance.slug}: matched domain ${domain.baseDomain} (id: ${domain.domainId})`);
  // 4. Create resources + targets
  const createdResources: Array<{ subdomain: string; name: string; resourceId: string }> = [];
  const existingResources = await client.listResources();
  for (const def of RESOURCE_DEFINITIONS) {
    if (!shouldCreateResource(def, instance as unknown as Record<string, unknown>)) {
      logger.debug(`[tunnel] ${instance.slug}: skipping ${def.name} (feature not enabled)`);
      continue;
    }
    const sub = fullSubdomain(prefix, def.subdomain);
    // Build the expected full domain so we can do an idempotent check against
    // Pangolin's existing resources. Pangolin returns `fullDomain` not `subdomain`.
    const expectedFullDomain = sub
      ? `${sub}.${domain.baseDomain}`
      : domain.baseDomain;
    // Idempotent: skip if a resource with this fullDomain already exists
    const existing = existingResources.find(
      (r) => r.fullDomain === expectedFullDomain
    );
    if (existing) {
      logger.debug(`[tunnel] ${instance.slug}: resource ${def.name} (${expectedFullDomain}) already exists`);
      createdResources.push({ subdomain: sub, name: def.name, resourceId: String(existing.resourceId) });
      continue;
    }
    try {
      const resourcePayload: Record<string, unknown> = {
        name: def.name,
        domainId: domain.domainId,
        http: true,
        protocol: 'tcp',
      };
      // Root domain: omit subdomain entirely (empty string is rejected by Pangolin)
      if (sub) resourcePayload.subdomain = sub;
      const resource = await client.createResource(resourcePayload as unknown as Parameters<typeof client.createResource>[0]);
      // Make the resource public (no SSO, no access block)
      try {
        await client.updateResource(resource.resourceId, { sso: false, blockAccess: false });
      } catch (err) {
        logger.warn(`[tunnel] ${instance.slug}: failed to make ${def.name} public: ${(err as Error).message}`);
      }
      // Create target pointing to nginx:80 on the remote host
      await client.createTarget(resource.resourceId, {
        siteId: Number(siteId),
        ip: 'nginx',
        port: 80,
        method: 'http',
        enabled: true,
      });
      createdResources.push({ subdomain: sub, name: def.name, resourceId: resource.resourceId });
      logger.info(`[tunnel] ${instance.slug}: created resource ${def.name} -> ${expectedFullDomain}`);
    } catch (err) {
      // Optional resources are best-effort; required ones abort the setup
      if (def.required) throw err;
      logger.warn(`[tunnel] ${instance.slug}: failed to create optional resource ${def.name}: ${(err as Error).message}`);
    }
  }
  // 5. Push Newt credentials to remote .env: read the current file and
  // append/replace the Pangolin vars
  logger.info(`[tunnel] ${instance.slug}: pushing Newt credentials to remote .env`);
  const currentEnv = await driver.readEnvFile('');
  const envContent = buildUpdatedEnv(currentEnv, {
    PANGOLIN_ENDPOINT: endpoint,
    PANGOLIN_SITE_ID: siteId,
    PANGOLIN_NEWT_ID: newtId,
    PANGOLIN_NEWT_SECRET: newtSecret,
  });
  await driver.writeFiles('', [{ relativePath: '.env', content: envContent }]);
  // 6. Persist on Instance row
  await prisma.instance.update({
    where: { id: instanceId },
    data: {
      pangolinEndpoint: endpoint,
      pangolinSiteId: siteId,
      pangolinNewtId: newtId,
      pangolinNewtSecret: newtSecret,
      pangolinSubdomainPrefix: prefix,
    },
  });
  // 7. Recreate Newt container to pick up the new .env vars.
  // `docker compose restart` does NOT re-read .env — it only sends SIGTERM+restart.
  // `docker compose up -d newt` detects env var changes (via ${PANGOLIN_NEWT_ID}
  // expansion in docker-compose.yml) and recreates the container automatically.
  logger.info(`[tunnel] ${instance.slug}: recreating newt container with new credentials`);
  try {
    await driver.composeUp('', '', ['newt']);
  } catch (err) {
    logger.warn(`[tunnel] ${instance.slug}: composeUp(newt) failed: ${(err as Error).message}`);
  }
  // 8. Audit log
  if (userId) {
    await prisma.auditLog.create({
      data: {
        userId,
        instanceId,
        action: AuditAction.PANGOLIN_SETUP,
        details: {
          source: 'remote',
          siteId,
          newtId,
          endpoint,
          resourceCount: createdResources.length,
          subdomainPrefix: prefix,
        } as unknown as Prisma.InputJsonValue,
        ipAddress: ipAddress ?? null,
      },
    });
  }
  logger.info(`[tunnel] ${instance.slug}: tunnel setup complete — ${createdResources.length} resources created`);
  return {
    siteId,
    newtId,
    endpoint,
    resourceCount: createdResources.length,
    resources: createdResources,
  };
}
// ─── Sync ──────────────────────────────────────────────────────────
/**
 * Reconcile the Pangolin resource set for an instance that already has a
 * tunnel: every resource definition that should exist for this instance but
 * has no matching fullDomain in Pangolin is created, with its target pointed
 * at the instance's internal nginx. Existing resources are left untouched.
 *
 * @param instanceId Instance row ID; must have `pangolinSiteId` set.
 * @param userId     Optional actor — when present, a PANGOLIN_SYNC audit row is written.
 * @param ipAddress  Optional actor IP for the audit row.
 * @returns `{ synced: true, created }` where `created` counts new resources.
 * @throws AppError 404 when the instance is missing, 400 when no tunnel is configured.
 */
export async function syncResources(
  instanceId: string,
  userId?: string,
  ipAddress?: string | null
) {
  const pangolin = getPangolinClient();
  const instance = await prisma.instance.findUnique({ where: { id: instanceId } });
  if (!instance) throw new AppError(404, 'Instance not found', 'NOT_FOUND');
  if (!instance.pangolinSiteId) throw new AppError(400, 'No tunnel configured', 'NO_TUNNEL');

  const subdomainPrefix = instance.pangolinSubdomainPrefix || instance.slug;
  const domain = await findDomainForInstance(pangolin, instance.domain);
  const currentResources = await pangolin.listResources();
  const siteId = instance.pangolinSiteId;

  let createdCount = 0;
  for (const definition of RESOURCE_DEFINITIONS) {
    if (!shouldCreateResource(definition, instance as unknown as Record<string, unknown>)) continue;

    const sub = fullSubdomain(subdomainPrefix, definition.subdomain);
    // An empty subdomain means the resource lives on the bare base domain.
    const wantedFullDomain = sub ? `${sub}.${domain.baseDomain}` : domain.baseDomain;
    if (currentResources.some((r) => r.fullDomain === wantedFullDomain)) continue;

    try {
      const payload: Record<string, unknown> = {
        name: definition.name,
        domainId: domain.domainId,
        http: true,
        protocol: 'tcp',
      };
      if (sub) payload.subdomain = sub;
      const resource = await pangolin.createResource(payload as unknown as Parameters<typeof pangolin.createResource>[0]);
      await pangolin.updateResource(resource.resourceId, { sso: false, blockAccess: false });
      await pangolin.createTarget(resource.resourceId, {
        siteId: Number(siteId),
        ip: 'nginx',
        port: 80,
        method: 'http',
        enabled: true,
      });
      createdCount++;
      logger.info(`[tunnel] ${instance.slug}: sync created ${definition.name} (${sub})`);
    } catch (err) {
      // Required resources abort the whole sync; optional ones are best-effort.
      if (definition.required) throw err;
      logger.warn(`[tunnel] ${instance.slug}: sync failed for ${definition.name}: ${(err as Error).message}`);
    }
  }

  if (userId) {
    await prisma.auditLog.create({
      data: {
        userId,
        instanceId,
        action: AuditAction.PANGOLIN_SYNC,
        details: { source: 'remote', created: createdCount, siteId } as unknown as Prisma.InputJsonValue,
        ipAddress: ipAddress ?? null,
      },
    });
  }

  return { synced: true, created: createdCount };
}
// ─── Teardown ──────────────────────────────────────────────────────
/**
 * Tear down an instance's Pangolin tunnel: delete the site (Pangolin cascades
 * resources + targets), null out the credential columns on the Instance row,
 * and — for remote instances — push an .env without the Pangolin vars and
 * bounce the compose stack so the newt container stops using the old creds.
 *
 * Site deletion and the remote .env push are best-effort; failures are logged
 * but do not abort the teardown, so the DB state is always cleared.
 *
 * @param instanceId Instance row ID; must have `pangolinSiteId` set.
 * @param userId     Optional actor — when present, a PANGOLIN_TEARDOWN audit row is written.
 * @param ipAddress  Optional actor IP for the audit row.
 * @returns `{ tornDown: true }` on completion.
 * @throws AppError 404 when the instance is missing, 400 when no tunnel is configured.
 */
export async function teardownTunnel(
  instanceId: string,
  userId?: string,
  ipAddress?: string | null
) {
  const client = getPangolinClient();
  const instance = await prisma.instance.findUnique({ where: { id: instanceId } });
  if (!instance) throw new AppError(404, 'Instance not found', 'NOT_FOUND');
  if (!instance.pangolinSiteId) throw new AppError(400, 'No tunnel configured', 'NO_TUNNEL');
  const siteId = instance.pangolinSiteId;
  // Delete site from Pangolin (cascades resources + targets). Best-effort:
  // the site may already be gone if Pangolin was cleaned up out-of-band.
  try {
    await client.deleteSite(siteId);
    logger.info(`[tunnel] ${instance.slug}: deleted Pangolin site ${siteId}`);
  } catch (err) {
    logger.warn(`[tunnel] ${instance.slug}: deleteSite failed (may already be gone): ${(err as Error).message}`);
  }
  // Clear Instance credential fields.
  // NOTE(review): pangolinSubdomainPrefix is NOT cleared here, although setup
  // sets it — presumably kept so a future re-setup reuses the same prefix;
  // confirm that this asymmetry is intentional.
  await prisma.instance.update({
    where: { id: instanceId },
    data: {
      pangolinEndpoint: null,
      pangolinSiteId: null,
      pangolinNewtId: null,
      pangolinNewtSecret: null,
    },
  });
  // Push empty Pangolin vars to remote .env (buildUpdatedEnv drops keys whose
  // update value is the empty string, so the vars are removed from the file).
  if (instance.isRemote) {
    try {
      const driver = await getRemoteDriverForInstance({
        id: instance.id,
        slug: instance.slug,
        isRemote: instance.isRemote,
        agentUrl: instance.agentUrl,
      });
      const currentEnv = await driver.readEnvFile('');
      const envContent = buildUpdatedEnv(currentEnv, {
        PANGOLIN_ENDPOINT: '',
        PANGOLIN_SITE_ID: '',
        PANGOLIN_NEWT_ID: '',
        PANGOLIN_NEWT_SECRET: '',
      });
      await driver.writeFiles('', [{ relativePath: '.env', content: envContent }]);
      // Stop + restart the stack (best effort). Presumably newt stays down
      // after the restart because its credentials are gone — TODO confirm.
      try {
        await driver.composeStop('', '');
        await driver.composeUp('', ''); // restart everything except newt won't start without creds
      } catch { /* ignore */ }
    } catch (err) {
      logger.warn(`[tunnel] ${instance.slug}: failed to push empty env to remote: ${(err as Error).message}`);
    }
  }
  // Audit log (only when an acting user is known)
  if (userId) {
    await prisma.auditLog.create({
      data: {
        userId,
        instanceId,
        action: AuditAction.PANGOLIN_TEARDOWN,
        details: { source: 'remote', siteId } as unknown as Prisma.InputJsonValue,
        ipAddress: ipAddress ?? null,
      },
    });
  }
  return { tornDown: true };
}
// ─── Status ────────────────────────────────────────────────────────
/**
 * Snapshot of an instance's Pangolin tunnel state as reported by
 * getTunnelStatus(). Only `configured` is always present; the remaining
 * fields are filled in when a tunnel exists (and, for `online`/`resources`,
 * when the Pangolin API could be queried).
 */
export interface TunnelStatus {
  // False when the instance has no pangolinSiteId stored.
  configured: boolean;
  // Whether Pangolin reports the site as online (remote instances only).
  online?: boolean;
  // Stored Pangolin site ID for this instance.
  siteId?: string;
  // Stored Pangolin endpoint URL.
  endpoint?: string;
  // Resources whose targets point at this instance's site.
  resources?: Array<{
    subdomain: string;
    name: string;
    resourceId: string;
    // True when at least one target on the resource matches our site.
    hasTarget: boolean;
    targetIp?: string;
    targetPort?: number;
  }>;
}
/**
 * Report the tunnel status for an instance.
 *
 * - No pangolinSiteId stored → `{ configured: false }`.
 * - Local instance → stored siteId/endpoint only (no Pangolin API calls).
 * - Remote instance → queries Pangolin for site liveness and enumerates the
 *   resources whose targets point at this instance's site. All Pangolin
 *   calls are best-effort: failures are logged and yield partial status
 *   rather than an error.
 *
 * @param instanceId Instance row ID.
 * @throws AppError 404 when the instance does not exist.
 */
export async function getTunnelStatus(instanceId: string): Promise<TunnelStatus> {
  const instance = await prisma.instance.findUnique({ where: { id: instanceId } });
  if (!instance) throw new AppError(404, 'Instance not found', 'NOT_FOUND');
  if (!instance.pangolinSiteId) {
    return { configured: false };
  }
  // For local instances, return stored values without querying Pangolin API
  if (!instance.isRemote) {
    return {
      configured: true,
      siteId: instance.pangolinSiteId ?? undefined,
      endpoint: instance.pangolinEndpoint ?? undefined,
    };
  }
  const client = getPangolinClient();
  let online = false;
  try {
    const site = await client.getSite(instance.pangolinSiteId);
    online = site.online ?? false;
  } catch (err) {
    // Best-effort: an unreachable Pangolin just reports the site as offline.
    logger.warn(`[tunnel] ${instance.slug}: getSite failed: ${(err as Error).message}`);
  }
  const resources: TunnelStatus['resources'] = [];
  try {
    const allResources = await client.listResources();
    const siteIdNum = Number(instance.pangolinSiteId);
    // Filter to resources that have a target pointing to our siteId.
    // This is the most reliable filter since it uses the actual Pangolin
    // site association rather than guessing from subdomain names.
    // NOTE(review): this is one listTargets call per resource (N+1 against
    // the Pangolin API) — acceptable for small fleets, revisit if resource
    // counts grow.
    for (const res of allResources) {
      let hasTarget = false;
      let targetIp: string | undefined;
      let targetPort: number | undefined;
      let belongsToUs = false;
      try {
        const targets = await client.listTargets(String(res.resourceId));
        for (const t of targets) {
          if (Number(t.siteId) === siteIdNum) {
            belongsToUs = true;
            hasTarget = true;
            targetIp = t.ip;
            targetPort = t.port;
            break;
          }
        }
      } catch { /* ignore */ }
      if (belongsToUs) {
        // Extract subdomain from fullDomain for display.
        // NOTE(review): this strips instance.domain, while resources are
        // created under the Pangolin domain's baseDomain — presumably these
        // match; confirm, otherwise the full domain is shown unstripped.
        const fd = res.fullDomain || '';
        const domainSuffix = `.${instance.domain}`;
        const subdomain = fd.endsWith(domainSuffix)
          ? fd.slice(0, -domainSuffix.length)
          : fd === instance.domain ? '' : fd;
        resources.push({
          subdomain,
          name: res.name,
          resourceId: String(res.resourceId),
          hasTarget,
          targetIp,
          targetPort,
        });
      }
    }
  } catch (err) {
    // Best-effort: resource enumeration failure leaves `resources` empty.
    logger.warn(`[tunnel] ${instance.slug}: listResources failed: ${(err as Error).message}`);
  }
  return {
    configured: true,
    online,
    siteId: instance.pangolinSiteId ?? undefined,
    endpoint: instance.pangolinEndpoint ?? undefined,
    resources,
  };
}
// ─── .env Helpers ──────────────────────────────────────────────────
/**
 * Quote a .env value if it contains characters that dotenv parsers interpret:
 * whitespace, # (comment), = (separator), quotes, backslashes, CR/LF.
 * Pangolin-issued UUIDs/base64 secrets typically don't need quoting, but
 * defensive quoting prevents silent corruption if they ever do.
 *
 * Escapes backslash, double quote, LF and CR inside the quoted form.
 * (Previously a bare CR triggered quoting but was written through raw,
 * which could still split/corrupt the line for CRLF-aware parsers.)
 */
function quoteEnvValue(value: string): string {
  if (/[\s#"'\\=\n\r]/.test(value)) {
    const escaped = value
      .replace(/\\/g, '\\\\')
      .replace(/"/g, '\\"')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r');
    return `"${escaped}"`;
  }
  return value;
}
/**
 * Build an updated .env string by replacing/appending the given key-value pairs.
 * Preserves all existing keys not in the update set, in their original order.
 *
 * @param currentEnv Parsed existing .env (key → value), or null when the file
 *                   is missing/unreadable — then only `updates` are emitted.
 * @param updates    Keys to set. An empty-string value REMOVES the key from
 *                   the output instead of writing `KEY=`.
 * @returns The full .env file content, newline-terminated.
 */
function buildUpdatedEnv(
  currentEnv: Record<string, string> | null,
  updates: Record<string, string>
): string {
  const lines: string[] = [];
  const seen = new Set<string>();
  // If we have the current env, reproduce it with replacements
  if (currentEnv) {
    for (const [key, value] of Object.entries(currentEnv)) {
      if (key in updates) {
        if (updates[key]) lines.push(`${key}=${quoteEnvValue(updates[key]!)}`);
        // If update value is empty, omit the line (remove the var)
        seen.add(key);
      } else {
        lines.push(`${key}=${quoteEnvValue(value)}`);
      }
    }
  }
  // Append new keys not already in the file (empty values are not appended)
  for (const [key, value] of Object.entries(updates)) {
    if (!seen.has(key) && value) {
      lines.push(`${key}=${quoteEnvValue(value)}`);
    }
  }
  return lines.join('\n') + '\n';
}

View File

@ -2,14 +2,61 @@ import { exec as execCb } from 'child_process';
import { promisify } from 'util';
import fs from 'fs/promises';
import path from 'path';
import { UpgradeStatus, AuditAction, InstanceStatus, Prisma } from '@prisma/client';
import { UpgradeStatus, AuditAction, InstanceStatus, Prisma, Instance } from '@prisma/client';
import { prisma } from '../lib/prisma';
import { logger } from '../utils/logger';
import { createEvent } from './event.service';
import { getRemoteDriverForInstance } from './execution-driver';
import type { AgentUpdateStatus } from './remote-driver';
/**
 * Write an INSTANCE_UPGRADE audit log entry capturing a terminal outcome.
 * Wrapped in try/catch so that an audit-log DB failure cannot mask the
 * underlying upgrade row status update.
 *
 * Called from all three terminal paths (both local and remote):
 * - 'completed' — upgrade.sh/agent reported success
 * - 'failed' — upgrade.sh/agent reported failure
 * - 'orchestration_error' — CCP-side exception, timeout, or unreachable agent
 *
 * No-ops when there is no triggering user (nothing to attribute the entry to).
 */
async function writeUpgradeAuditLog(args: {
  upgradeId: string;
  instanceId: string;
  triggeredById: string | null;
  source: 'local' | 'remote';
  outcome: 'completed' | 'failed' | 'orchestration_error';
  previousCommit: string | null;
  newCommit: string | null;
  durationSeconds: number | null;
  errorMessage?: string | null;
}): Promise<void> {
  if (!args.triggeredById) return;

  // Assemble the details payload up front; errorMessage is optional and
  // truncated to keep the JSON column bounded.
  const details: Record<string, unknown> = {
    upgradeId: args.upgradeId,
    source: args.source,
    outcome: args.outcome,
    previousCommit: args.previousCommit,
    newCommit: args.newCommit,
    durationSeconds: args.durationSeconds,
  };
  if (args.errorMessage) {
    details.errorMessage = args.errorMessage.substring(0, 500);
  }

  try {
    await prisma.auditLog.create({
      data: {
        userId: args.triggeredById,
        instanceId: args.instanceId,
        action: AuditAction.INSTANCE_UPGRADE,
        details: details as unknown as Prisma.InputJsonValue,
      },
    });
  } catch (err) {
    // Audit logging must never break the upgrade flow — log and move on.
    logger.error(`[upgrade] failed to write audit log for ${args.upgradeId}: ${(err as Error).message}`);
  }
}
const exec = promisify(execCb);
const UPGRADE_TIMEOUT = 600_000; // 10 minutes
const UPGRADE_TIMEOUT = 600_000; // 10 minutes — local upgrades
const REMOTE_UPGRADE_TIMEOUT = 15 * 60 * 1000; // 15 minutes — remote (network round trips)
const PROGRESS_POLL_INTERVAL = 2_000; // 2 seconds
// ─── Update Check ─────────────────────────────────────────────────
@ -26,13 +73,57 @@ export interface UpdateStatus {
}
/**
* Check for available updates by running upgrade-check.sh in the instance's basePath.
* Falls back to reading an existing status.json if the script isn't available.
* Check for available updates. Branches on instance.isRemote:
* - Local: runs upgrade-check.sh in the instance's basePath and reads status.json
* - Remote: calls the agent's POST /upgrade/check endpoint over mTLS
*/
/**
 * Check for available updates for an instance. Dispatches on isRemote:
 * remote instances are asked through their agent, local instances run the
 * on-disk check script.
 *
 * @throws Error when the instance does not exist.
 */
export async function checkForUpdates(instanceId: string): Promise<UpdateStatus> {
  const instance = await prisma.instance.findUnique({ where: { id: instanceId } });
  if (!instance) throw new Error('Instance not found');
  return instance.isRemote
    ? checkForUpdatesRemote(instance)
    : checkForUpdatesLocal(instance);
}
/**
 * Remote check: ask the agent to run upgrade-check.sh and return its status.json.
 * Any driver/agent failure is converted into an UpdateStatus carrying an
 * `error` string (built from the DB-tracked branch/commit) rather than thrown,
 * so callers always receive a renderable result.
 */
async function checkForUpdatesRemote(instance: Instance): Promise<UpdateStatus> {
  try {
    const driver = await getRemoteDriverForInstance({
      id: instance.id,
      slug: instance.slug,
      isRemote: instance.isRemote,
      agentUrl: instance.agentUrl,
    });
    const agentStatus: AgentUpdateStatus = await driver.checkForUpdates();
    const {
      branch,
      currentCommit,
      currentMessage,
      remoteCommit,
      commitsBehind,
      changelog,
      checkedAt,
      error,
    } = agentStatus;
    return {
      branch,
      currentCommit,
      currentMessage,
      remoteCommit,
      commitsBehind,
      changelog,
      checkedAt,
      error,
    };
  } catch (err) {
    const message = (err as Error).message;
    logger.warn(`[upgrade] remote check failed for ${instance.slug}: ${message}`);
    // Fall back to what the DB knows; signal the failure via `error`.
    return {
      branch: instance.gitBranch,
      currentCommit: instance.gitCommit || 'unknown',
      remoteCommit: null,
      commitsBehind: 0,
      changelog: [],
      checkedAt: new Date().toISOString(),
      error: `Remote check failed: ${message}`,
    };
  }
}
async function checkForUpdatesLocal(instance: Instance): Promise<UpdateStatus> {
const basePath = instance.basePath;
const statusFile = path.join(basePath, 'data', 'upgrade', 'status.json');
const scriptPath = path.join(basePath, 'scripts', 'upgrade-check.sh');
@ -119,16 +210,21 @@ export async function startUpgrade(
throw new Error('An upgrade is already in progress for this instance');
}
// Get current commit for tracking
let currentCommit: string | null = null;
try {
const { stdout } = await exec('git rev-parse --short HEAD', {
cwd: instance.basePath,
timeout: 5_000,
});
currentCommit = stdout.trim();
} catch {
// Non-critical — may be a release install without .git
// Get current commit for tracking. For local instances we can read it from
// git directly; for remote instances we either trust the DB-tracked value
// (set by previous upgrade-check) or leave it null and let upgrade.sh
// report the previous commit in result.json.
let currentCommit: string | null = instance.gitCommit;
if (!instance.isRemote) {
try {
const { stdout } = await exec('git rev-parse --short HEAD', {
cwd: instance.basePath,
timeout: 5_000,
});
currentCommit = stdout.trim();
} catch {
// Non-critical — may be a release install without .git
}
}
const branch = options?.branch || instance.gitBranch;
@ -154,20 +250,222 @@ export async function startUpgrade(
upgradeId: upgrade.id,
previousCommit: currentCommit,
branch,
source: instance.isRemote ? 'remote' : 'local',
options: options || {},
} as unknown as Prisma.InputJsonValue,
ipAddress,
},
});
// Fire-and-forget: run the upgrade asynchronously
runUpgrade(upgrade.id, instance.basePath, instance.slug, options).catch((err) => {
logger.error(`[upgrade] Upgrade orchestration failed for ${instance.slug}: ${err}`);
});
// Fire-and-forget: branch on isRemote
if (instance.isRemote) {
runRemoteUpgrade(upgrade.id, instance, options).catch((err) => {
logger.error(`[upgrade] Remote upgrade orchestration failed for ${instance.slug}: ${err}`);
});
} else {
runUpgrade(upgrade.id, instance.basePath, instance.slug, options).catch((err) => {
logger.error(`[upgrade] Upgrade orchestration failed for ${instance.slug}: ${err}`);
});
}
return upgrade;
}
/**
* Async REMOTE upgrade runner.
*
* Flow:
* 1. Get RemoteDriver
* 2. Mark InstanceUpgrade IN_PROGRESS
* 3. Tell agent to start upgrade.sh in --api-mode
* 4. Poll agent /upgrade/progress every 2s, mirror to DB
* 5. Try /upgrade/result every poll cycle; when present, finalize
* 6. On timeout (15 min), mark FAILED and create error event
*
* Note: there is no shell or filesystem access on the CCP side everything
* goes through the mTLS agent. The agent's spawn of upgrade.sh is itself
* fire-and-forget under a slug mutex.
*/
async function runRemoteUpgrade(
  upgradeId: string,
  instance: Instance,
  options?: StartUpgradeOptions
) {
  const slug = instance.slug;
  try {
    const driver = await getRemoteDriverForInstance({
      id: instance.id,
      slug: instance.slug,
      isRemote: instance.isRemote,
      agentUrl: instance.agentUrl,
    });
    // Mark IN_PROGRESS before asking the agent to start, so the UI reflects
    // the attempt even if the start call itself fails.
    await prisma.instanceUpgrade.update({
      where: { id: upgradeId },
      data: {
        status: UpgradeStatus.IN_PROGRESS,
        progressMessage: 'Starting remote upgrade...',
      },
    });
    // Tell the agent to start. The agent has its own mutex + stale-progress
    // check, so this can return 409 if a previous upgrade is still running
    // (surfaced here as a thrown error → orchestration_error path below).
    logger.info(`[upgrade] ${slug}: triggering remote upgrade.sh start`);
    await driver.startUpgrade({
      skipBackup: options?.skipBackup,
      useRegistry: options?.useRegistry,
      branch: options?.branch,
    });
    // Poll progress + result. We treat /result returning 200 as the signal
    // that upgrade.sh exited (successfully or with code != 0 — the script
    // writes result.json either way in --api-mode).
    const deadline = Date.now() + REMOTE_UPGRADE_TIMEOUT;
    let lastProgress: { phase?: number; phaseName?: string; percentage?: number; message?: string } = {};
    while (Date.now() < deadline) {
      await new Promise((r) => setTimeout(r, PROGRESS_POLL_INTERVAL));
      // Try to fetch the result first; if it exists, we're done
      let result = null;
      try {
        result = await driver.getUpgradeResult();
      } catch {
        // No result yet — keep polling progress
      }
      if (result) {
        // Final result available — finalize the upgrade row and exit.
        // Read the row first so the audit entry can carry triggeredById
        // and previousCommit.
        const upgradeRowBefore = await prisma.instanceUpgrade.findUnique({ where: { id: upgradeId } });
        await prisma.instanceUpgrade.update({
          where: { id: upgradeId },
          data: {
            status: result.success ? UpgradeStatus.COMPLETED : UpgradeStatus.FAILED,
            newCommit: result.newCommit || null,
            commitCount: result.commitCount || 0,
            percentage: 100,
            phaseName: 'Complete',
            progressMessage: result.message || 'Upgrade completed',
            durationSeconds: result.durationSeconds || null,
            warnings: result.warnings?.length ? (result.warnings as unknown as Prisma.InputJsonValue) : undefined,
            errorMessage: result.success ? null : (result.message || 'Upgrade failed'),
            completedAt: new Date(),
          },
        });
        // Update Instance.gitCommit if we have a new commit
        if (result.newCommit) {
          await prisma.instance.update({
            where: { id: instance.id },
            data: { gitCommit: result.newCommit },
          });
        }
        // Failed upgrades also get a visible instance event.
        if (!result.success) {
          await createEvent(
            instance.id,
            'ERROR',
            'upgrade',
            'Remote upgrade failed',
            result.message || 'The remote upgrade process failed. Check the agent log for details.',
            { upgradeId, source: 'remote', warnings: result.warnings }
          );
        }
        await writeUpgradeAuditLog({
          upgradeId,
          instanceId: instance.id,
          triggeredById: upgradeRowBefore?.triggeredById ?? null,
          source: 'remote',
          outcome: result.success ? 'completed' : 'failed',
          previousCommit: upgradeRowBefore?.previousCommit ?? null,
          newCommit: result.newCommit || null,
          durationSeconds: result.durationSeconds || null,
          errorMessage: result.success ? null : (result.message || 'Upgrade failed'),
        });
        logger.info(`[upgrade] ${slug}: remote upgrade ${result.success ? 'COMPLETED' : 'FAILED'}`);
        return;
      }
      // No result yet — pull progress
      try {
        const progress = await driver.getUpgradeProgress();
        // Only update DB if something actually changed (avoid hot-loop writes)
        if (
          progress.phase !== lastProgress.phase ||
          progress.percentage !== lastProgress.percentage ||
          progress.message !== lastProgress.message
        ) {
          lastProgress = {
            phase: progress.phase,
            phaseName: progress.phaseName,
            percentage: progress.percentage,
            message: progress.message,
          };
          await prisma.instanceUpgrade.update({
            where: { id: upgradeId },
            data: {
              currentPhase: progress.phase || 0,
              phaseName: progress.phaseName || null,
              percentage: progress.percentage || 0,
              progressMessage: progress.message || null,
            },
          });
        }
      } catch (err) {
        // Transient network blip during a long upgrade — keep polling
        logger.debug(`[upgrade] ${slug}: progress poll error: ${(err as Error).message}`);
      }
    }
    // Timeout — mark FAILED (handled by the catch below alongside all other
    // orchestration errors)
    throw new Error(`Remote upgrade timed out after ${Math.round(REMOTE_UPGRADE_TIMEOUT / 60_000)} minutes`);
  } catch (err) {
    const errorMsg = (err as Error).message;
    // NOTE(review): timeout detection is by message substring — fragile if
    // the message above is ever reworded; consider a dedicated error class.
    const isTimeout = errorMsg.includes('timed out');
    const upgradeRowBefore = await prisma.instanceUpgrade.findUnique({ where: { id: upgradeId } });
    await prisma.instanceUpgrade.update({
      where: { id: upgradeId },
      data: {
        status: UpgradeStatus.FAILED,
        errorMessage: isTimeout ? errorMsg : errorMsg.slice(0, 2000),
        progressMessage: 'Failed',
        completedAt: new Date(),
      },
    });
    await createEvent(
      instance.id,
      'ERROR',
      'upgrade',
      isTimeout ? 'Remote upgrade timed out' : 'Remote upgrade failed',
      errorMsg.slice(0, 500),
      { upgradeId, source: 'remote' }
    );
    await writeUpgradeAuditLog({
      upgradeId,
      instanceId: instance.id,
      triggeredById: upgradeRowBefore?.triggeredById ?? null,
      source: 'remote',
      outcome: 'orchestration_error',
      previousCommit: upgradeRowBefore?.previousCommit ?? null,
      newCommit: null,
      durationSeconds: null,
      errorMessage: errorMsg,
    });
    // Don't flip the instance to ERROR state for remote upgrades — the agent
    // health check will reflect the real state on the next poll, and we don't
    // want to mask a recovered instance with stale CCP-side ERROR.
    logger.error(`[upgrade] ${slug}: ${errorMsg}`);
  }
}
/**
* Async upgrade runner. Runs upgrade.sh and polls progress.
*/
@ -271,19 +569,32 @@ async function runUpgrade(
});
}
if (!result.success) {
const upgradeRow = await prisma.instanceUpgrade.findUnique({ where: { id: upgradeId } });
if (!result.success && upgradeRow) {
// Create error event
const upgrade = await prisma.instanceUpgrade.findUnique({ where: { id: upgradeId } });
if (upgrade) {
await createEvent(
upgrade.instanceId,
'ERROR',
'upgrade',
'Upgrade failed',
result.message || 'The upgrade process failed. Check logs for details.',
{ upgradeId, previousCommit: upgrade.previousCommit, warnings: result.warnings }
);
}
await createEvent(
upgradeRow.instanceId,
'ERROR',
'upgrade',
'Upgrade failed',
result.message || 'The upgrade process failed. Check logs for details.',
{ upgradeId, previousCommit: upgradeRow.previousCommit, warnings: result.warnings }
);
}
if (upgradeRow) {
await writeUpgradeAuditLog({
upgradeId,
instanceId: upgradeRow.instanceId,
triggeredById: upgradeRow.triggeredById,
source: 'local',
outcome: result.success ? 'completed' : 'failed',
previousCommit: upgradeRow.previousCommit,
newCommit: result.newCommit || newCommit,
durationSeconds: result.durationSeconds || null,
errorMessage: result.success ? null : (result.message || 'Upgrade failed'),
});
}
logger.info(`[upgrade] ${slug}: Upgrade ${result.success ? 'completed' : 'failed'}`);
@ -327,6 +638,18 @@ async function runUpgrade(
statusMessage: `Upgrade failed: ${isTimeout ? 'timeout' : errorMsg.slice(0, 200)}`,
},
});
await writeUpgradeAuditLog({
upgradeId,
instanceId: upgrade.instanceId,
triggeredById: upgrade.triggeredById,
source: 'local',
outcome: 'orchestration_error',
previousCommit: upgrade.previousCommit,
newCommit: null,
durationSeconds: result.durationSeconds || null,
errorMessage: errorMsg,
});
}
logger.error(`[upgrade] ${slug}: Upgrade failed: ${errorMsg}`);

View File

@ -38,6 +38,11 @@ NI_MAPBOX_KEY=""
NI_MAXMIND_ACCOUNT_ID=""
NI_MAXMIND_LICENSE_KEY=""
# CCP (Changemaker Control Panel) registration flags
NI_CCP_URL=""
NI_CCP_INVITE_CODE=""
NI_CCP_AGENT_URL=""
# --- Arg parser ---
while [[ $# -gt 0 ]]; do
case "$1" in
@ -62,6 +67,10 @@ while [[ $# -gt 0 ]]; do
--mapbox-key) NI_MAPBOX_KEY="$2"; shift 2 ;;
--maxmind-account-id) NI_MAXMIND_ACCOUNT_ID="$2"; shift 2 ;;
--maxmind-license-key) NI_MAXMIND_LICENSE_KEY="$2"; shift 2 ;;
# CCP (Changemaker Control Panel)
--ccp-url) NI_CCP_URL="$2"; shift 2 ;;
--ccp-invite-code) NI_CCP_INVITE_CODE="$2"; shift 2 ;;
--ccp-agent-url) NI_CCP_AGENT_URL="$2"; shift 2 ;;
--help|-h)
echo "Usage: bash config.sh [OPTIONS]"
echo ""
@ -91,6 +100,11 @@ while [[ $# -gt 0 ]]; do
echo " --maxmind-account-id ID MaxMind GeoIP account ID"
echo " --maxmind-license-key K MaxMind GeoIP license key"
echo ""
echo "CCP (Changemaker Control Panel) — all 3 flags required to register:"
echo " --ccp-url URL CCP server URL (e.g., https://ccp.example.com)"
echo " --ccp-invite-code CODE One-time invite code from CCP"
echo " --ccp-agent-url URL Agent URL the CCP reaches (e.g., https://this-host:7443)"
echo ""
echo "Example:"
echo " bash config.sh --non-interactive --domain example.org --admin-password MyStr0ngPass123"
echo " bash config.sh -y --domain example.org --admin-password MyStr0ngPass123 \\"
@ -798,6 +812,17 @@ configure_features() {
else
warn "Set JVB_ADVERTISE_IP in .env before starting Jitsi containers."
fi
else
# Non-interactive: auto-detect public IP for NAT traversal
local detected_ip
detected_ip=$(curl -sf --max-time 5 https://ifconfig.me 2>/dev/null || \
curl -sf --max-time 5 https://api.ipify.org 2>/dev/null || true)
if [[ -n "$detected_ip" ]]; then
update_env_var "JVB_ADVERTISE_IP" "$detected_ip"
success "JVB advertise IP auto-detected: $detected_ip"
else
warn "Could not auto-detect public IP. Set JVB_ADVERTISE_IP in .env before starting Jitsi."
fi
fi
else
MEET_ENABLED="no"
@ -838,13 +863,6 @@ configure_features() {
update_env_var "ENABLE_PEOPLE" "false"
fi
if prompt_yes_no "Enable Analytics & GeoIP (visitor tracking, geo dashboard)?"; then
update_env_var "ENABLE_ANALYTICS" "true"
success "Analytics enabled"
else
update_env_var "ENABLE_ANALYTICS" "false"
fi
if prompt_yes_no "Enable Docs Comments & Version History (Gitea-backed)?"; then
update_env_var "GITEA_COMMENTS_ENABLED" "true"
success "Docs Comments & Version History enabled"
@ -881,8 +899,14 @@ configure_features() {
fi
if prompt_yes_no "Enable Monitoring stack (Prometheus, Grafana, Alertmanager, cAdvisor)?" "y"; then
update_env_var "COMPOSE_PROFILES" "monitoring"
success "Monitoring enabled (COMPOSE_PROFILES=monitoring)"
local existing_profiles
existing_profiles=$(grep -oP 'COMPOSE_PROFILES=\K.*' "$ENV_FILE" 2>/dev/null || echo "")
if [[ -z "$existing_profiles" ]]; then
update_env_var "COMPOSE_PROFILES" "monitoring"
elif [[ "$existing_profiles" != *"monitoring"* ]]; then
update_env_var "COMPOSE_PROFILES" "${existing_profiles},monitoring"
fi
success "Monitoring enabled (COMPOSE_PROFILES includes monitoring)"
MONITORING_ENABLED="yes"
else
MONITORING_ENABLED="no"
@ -1401,6 +1425,35 @@ pangolin_connect_first_site() {
configure_control_panel() {
header "Control Panel Registration"
# Non-interactive: use --ccp-* flags if all three provided, otherwise skip
if [[ "$NON_INTERACTIVE" == "true" ]]; then
if [[ -n "$NI_CCP_URL" && -n "$NI_CCP_INVITE_CODE" && -n "$NI_CCP_AGENT_URL" ]]; then
update_env_var "ENABLE_CCP_AGENT" "true"
update_env_var "CCP_URL" "$NI_CCP_URL"
update_env_var "CCP_INVITE_CODE" "$NI_CCP_INVITE_CODE"
update_env_var "CCP_AGENT_URL" "$NI_CCP_AGENT_URL"
# Append ccp-agent to existing profiles (don't clobber monitoring)
local existing_profiles
existing_profiles=$(grep -oP 'COMPOSE_PROFILES=\K.*' "$ENV_FILE" 2>/dev/null || echo "")
if [[ -z "$existing_profiles" ]]; then
update_env_var "COMPOSE_PROFILES" "ccp-agent"
elif [[ "$existing_profiles" != *"ccp-agent"* ]]; then
update_env_var "COMPOSE_PROFILES" "${existing_profiles},ccp-agent"
fi
success "CCP registration configured ($NI_CCP_URL)"
else
update_env_var "ENABLE_CCP_AGENT" "false"
if [[ -n "$NI_CCP_URL" || -n "$NI_CCP_INVITE_CODE" || -n "$NI_CCP_AGENT_URL" ]]; then
warn "CCP registration needs all 3 flags: --ccp-url, --ccp-invite-code, --ccp-agent-url"
else
info "Skipping CCP registration (no --ccp-url provided)"
fi
fi
return
fi
if prompt_yes_no "Register this instance with a Changemaker Control Panel?"; then
echo ""
read -rp " Enter Control Panel URL (e.g., https://ccp.example.com): " ccp_url
@ -2152,9 +2205,15 @@ main() {
header "Release Mode Settings"
update_env_var "IMAGE_TAG" "latest"
update_env_var "NODE_ENV" "production"
# Ensure monitoring is included if user opted in
# Ensure monitoring is included if user opted in (preserve existing profiles)
if [[ "${MONITORING_ENABLED:-no}" == "yes" ]]; then
update_env_var "COMPOSE_PROFILES" "monitoring"
local existing_profiles
existing_profiles=$(grep -oP 'COMPOSE_PROFILES=\K.*' "$ENV_FILE" 2>/dev/null || echo "")
if [[ -z "$existing_profiles" ]]; then
update_env_var "COMPOSE_PROFILES" "monitoring"
elif [[ "$existing_profiles" != *"monitoring"* ]]; then
update_env_var "COMPOSE_PROFILES" "${existing_profiles},monitoring"
fi
fi
success "Set IMAGE_TAG=latest, NODE_ENV=production (pre-built images)"
fi

View File

@ -103,7 +103,8 @@ cp "$PROJECT_DIR/api/prisma/init-nocodb-db.sh" "$STAGE_DIR/scripts/"
cp "$PROJECT_DIR/api/prisma/init-gancio-db.sh" "$STAGE_DIR/scripts/"
# Runtime scripts
for script in nocodb-init.sh gitea-init.sh mkdocs-entrypoint.sh backup.sh \
for script in nocodb-init.sh gitea-init.sh mkdocs-entrypoint.sh \
backup.sh restore.sh \
upgrade.sh upgrade-check.sh upgrade-watcher.sh \
uninstall.sh test-deployment.sh; do
if [[ -f "$PROJECT_DIR/scripts/$script" ]]; then

View File

@ -294,7 +294,7 @@ if [[ "$START_SERVICES" =~ ^[Yy]$ ]]; then
info " Database migrations and seeding run automatically on first boot."
echo ""
CORE_SERVICES=("v2-postgres" "redis" "api" "admin")
CORE_SERVICES=("v2-postgres" "redis" "api" "admin" "nginx")
ELAPSED=0
ALL_HEALTHY=false

View File

@ -359,9 +359,13 @@ trap on_failure EXIT
acquire_lock
load_env
# Determine branch
# Determine branch (source mode only — release installs have no git)
if [[ -z "$BRANCH" ]]; then
BRANCH="$(git rev-parse --abbrev-ref HEAD)"
if [[ "$INSTALL_MODE" == "release" ]]; then
BRANCH="release"
else
BRANCH="$(git rev-parse --abbrev-ref HEAD)"
fi
fi
# =============================================================================
@ -461,13 +465,15 @@ else
exit 1
fi
# Remote reachable
info "Checking git remote..."
if timeout 10 git ls-remote origin HEAD &>/dev/null 2>&1; then
success "Git remote reachable"
else
error "Cannot reach git remote. Check your network or remote configuration."
exit 1
# Remote reachable (source mode only — release mode pulls from Gitea API later)
if [[ "$INSTALL_MODE" == "source" ]]; then
info "Checking git remote..."
if timeout 10 git ls-remote origin HEAD &>/dev/null 2>&1; then
success "Git remote reachable"
else
error "Cannot reach git remote. Check your network or remote configuration."
exit 1
fi
fi
# Working directory checks
@ -490,9 +496,16 @@ fi
success "Disk space: ${AVAILABLE_MB}MB available"
# Record pre-upgrade state
PRE_UPGRADE_COMMIT="$(git rev-parse HEAD)"
PRE_UPGRADE_SHORT="$(git rev-parse --short HEAD)"
info "Current commit: $PRE_UPGRADE_SHORT ($(git log -1 --format='%s' HEAD))"
if [[ "$INSTALL_MODE" == "source" ]]; then
PRE_UPGRADE_COMMIT="$(git rev-parse HEAD)"
PRE_UPGRADE_SHORT="$(git rev-parse --short HEAD)"
info "Current commit: $PRE_UPGRADE_SHORT ($(git log -1 --format='%s' HEAD))"
else
# Release mode: derive "commit" from VERSION file (format: <tag>\n<sha>)
PRE_UPGRADE_COMMIT="$(head -2 "$PROJECT_DIR/VERSION" 2>/dev/null | tail -1 || echo "release")"
PRE_UPGRADE_SHORT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
info "Current version: $PRE_UPGRADE_SHORT"
fi
info "Target branch: $BRANCH"
# Record running containers (for restoring monitoring profile later)
@ -502,31 +515,36 @@ if docker ps --format '{{.Names}}' | grep -q 'prometheus-changemaker'; then
info "Monitoring stack detected (will restart after upgrade)"
fi
# Warn about uncommitted changes in project-owned paths
PROJECT_OWNED_PATHS="api/ admin/ docker-compose.yml"
DIRTY_PROJECT_FILES="$(git diff --name-only HEAD -- $PROJECT_OWNED_PATHS 2>/dev/null || true)"
if [[ -n "$DIRTY_PROJECT_FILES" ]]; then
warn "Uncommitted changes in project-owned files:"
echo "$DIRTY_PROJECT_FILES" | while read -r f; do echo " $f"; done
if [[ "$FORCE" != "true" ]]; then
error "Commit or stash these changes first, or use --force to continue."
exit 1
# Source-mode-only checks: dirty files + upstream commit comparison
if [[ "$INSTALL_MODE" == "source" ]]; then
# Warn about uncommitted changes in project-owned paths
PROJECT_OWNED_PATHS="api/ admin/ docker-compose.yml"
DIRTY_PROJECT_FILES="$(git diff --name-only HEAD -- $PROJECT_OWNED_PATHS 2>/dev/null || true)"
if [[ -n "$DIRTY_PROJECT_FILES" ]]; then
warn "Uncommitted changes in project-owned files:"
echo "$DIRTY_PROJECT_FILES" | while read -r f; do echo " $f"; done
if [[ "$FORCE" != "true" ]]; then
error "Commit or stash these changes first, or use --force to continue."
exit 1
fi
warn "Continuing with --force (changes will be stashed)"
fi
warn "Continuing with --force (changes will be stashed)"
fi
# Check for available updates
LOCAL_HEAD="$(git rev-parse HEAD)"
REMOTE_HEAD="$(git ls-remote origin "$BRANCH" | cut -f1)"
if [[ "$LOCAL_HEAD" == "$REMOTE_HEAD" ]]; then
info "Already up to date ($PRE_UPGRADE_SHORT). No upstream changes."
if [[ "$FORCE" != "true" ]]; then
success "Nothing to upgrade."
release_lock
exit 0
# Check for available updates
LOCAL_HEAD="$(git rev-parse HEAD)"
REMOTE_HEAD="$(git ls-remote origin "$BRANCH" | cut -f1)"
if [[ "$LOCAL_HEAD" == "$REMOTE_HEAD" ]]; then
info "Already up to date ($PRE_UPGRADE_SHORT). No upstream changes."
if [[ "$FORCE" != "true" ]]; then
success "Nothing to upgrade."
release_lock
exit 0
fi
warn "Continuing with --force despite no upstream changes."
fi
warn "Continuing with --force despite no upstream changes."
fi
# Release mode: the upstream-version comparison happens later in the
# release-mode block (line ~597) which queries the Gitea Releases API.
# =============================================================================
# Phase 2: Backup
@ -669,100 +687,105 @@ elif [[ "$DRY_RUN" == "true" ]]; then
exit 0
fi
# NOTE(review): flattened-diff residue continues here — each "Step" below
# appears twice, once unguarded (old) and once inside the
# `if [[ "$INSTALL_MODE" == "source" ]]` wrapper (new). Only the guarded copy
# should survive in the real file.
# Step 0: Save user-modifiable paths before any git operations
save_user_paths
# Source-mode git pull flow. Release mode handles its update via tarball
# download in the block above and skips this entire section.
if [[ "$INSTALL_MODE" == "source" ]]; then
# Step 0: Save user-modifiable paths before any git operations
save_user_paths
# Step 0b: Clear skip-worktree flags that prevent merge (e.g., repo-data JSON files)
# `git ls-files -v` prefixes skip-worktree entries with "S "; strip to paths.
SKIP_WORKTREE_FILES="$(git ls-files -v | grep '^S ' | awk '{print $2}' || true)"
if [[ -n "$SKIP_WORKTREE_FILES" ]]; then
info "Clearing skip-worktree flags on $(echo "$SKIP_WORKTREE_FILES" | wc -l | xargs) file(s)..."
echo "$SKIP_WORKTREE_FILES" | xargs git update-index --no-skip-worktree
success "Skip-worktree flags cleared"
fi
# Step 0c: Fix Docker-owned directories that block git checkout
for owned_dir in api/upgrade api/uploads api/configs; do
if [[ -d "$PROJECT_DIR/$owned_dir" ]] && [[ ! -w "$PROJECT_DIR/$owned_dir" ]]; then
info "Fixing permissions on $owned_dir..."
# chown via a throwaway alpine container so this works without host sudo.
docker run --rm -v "$PROJECT_DIR/$owned_dir:/fix" alpine chown -R "$(id -u):$(id -g)" /fix 2>/dev/null || true
# Step 0b: Clear skip-worktree flags that prevent merge (e.g., repo-data JSON files)
SKIP_WORKTREE_FILES="$(git ls-files -v | grep '^S ' | awk '{print $2}' || true)"
if [[ -n "$SKIP_WORKTREE_FILES" ]]; then
info "Clearing skip-worktree flags on $(echo "$SKIP_WORKTREE_FILES" | wc -l | xargs) file(s)..."
echo "$SKIP_WORKTREE_FILES" | xargs git update-index --no-skip-worktree
success "Skip-worktree flags cleared"
fi
done
# Step 1: Stash user changes if any exist
HAS_CHANGES=false
if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then
HAS_CHANGES=true
STASH_NAME="upgrade-${TIMESTAMP}"
info "Stashing local changes as '$STASH_NAME'..."
git stash push --include-untracked -m "$STASH_NAME"
success "Local changes stashed"
fi
# Step 0c: Fix Docker-owned directories that block git checkout
for owned_dir in api/upgrade api/uploads api/configs; do
if [[ -d "$PROJECT_DIR/$owned_dir" ]] && [[ ! -w "$PROJECT_DIR/$owned_dir" ]]; then
info "Fixing permissions on $owned_dir..."
docker run --rm -v "$PROJECT_DIR/$owned_dir:/fix" alpine chown -R "$(id -u):$(id -g)" /fix 2>/dev/null || true
fi
done
# Step 3: Pull updates
info "Pulling updates from origin/$BRANCH..."
if ! git pull origin "$BRANCH" --no-edit 2>&1; then
error "git pull failed. This may indicate upstream force-push or branch issues."
# Step 1: Stash user changes if any exist
HAS_CHANGES=false
# `git status --porcelain` emits one line per dirty path; non-empty = dirty tree.
if [[ -n "$(git status --porcelain 2>/dev/null)" ]]; then
HAS_CHANGES=true
STASH_NAME="upgrade-${TIMESTAMP}"
info "Stashing local changes as '$STASH_NAME'..."
git stash push --include-untracked -m "$STASH_NAME"
success "Local changes stashed"
fi
# Step 3: Pull updates
info "Pulling updates from origin/$BRANCH..."
if ! git pull origin "$BRANCH" --no-edit 2>&1; then
error "git pull failed. This may indicate upstream force-push or branch issues."
if [[ "$HAS_CHANGES" == "true" ]]; then
warn "Your stashed changes can be recovered with: git stash pop"
fi
exit 1
fi
POST_PULL_COMMIT="$(git rev-parse --short HEAD)"
success "Updated to $POST_PULL_COMMIT"
# Step 4: Pop stash and handle conflicts
if [[ "$HAS_CHANGES" == "true" ]]; then
warn "Your stashed changes can be recovered with: git stash pop"
fi
exit 1
fi
info "Restoring local changes..."
if git stash pop 2>&1; then
success "Local changes restored cleanly"
else
warn "Merge conflicts detected during stash pop"
POST_PULL_COMMIT="$(git rev-parse --short HEAD)"
success "Updated to $POST_PULL_COMMIT"
# Auto-resolve user-modifiable paths by keeping user's version
RESOLVED_COUNT=0
for user_path in "${USER_PATHS[@]}"; do
CONFLICTED="$(git diff --name-only --diff-filter=U -- "$user_path" 2>/dev/null || true)"
if [[ -n "$CONFLICTED" ]]; then
while IFS= read -r cf; do
info " Auto-resolving (keeping yours): $cf"
git checkout --theirs "$cf" 2>/dev/null || true
git add "$cf"
RESOLVED_COUNT=$((RESOLVED_COUNT + 1))
done < <(echo "$CONFLICTED")
fi
done
# Step 4: Pop stash and handle conflicts
if [[ "$HAS_CHANGES" == "true" ]]; then
info "Restoring local changes..."
if git stash pop 2>&1; then
success "Local changes restored cleanly"
else
warn "Merge conflicts detected during stash pop"
# Auto-resolve user-modifiable paths by keeping user's version
RESOLVED_COUNT=0
for user_path in "${USER_PATHS[@]}"; do
# --diff-filter=U lists only unmerged (conflicted) paths under $user_path.
CONFLICTED="$(git diff --name-only --diff-filter=U -- "$user_path" 2>/dev/null || true)"
if [[ -n "$CONFLICTED" ]]; then
while IFS= read -r cf; do
info " Auto-resolving (keeping yours): $cf"
# NOTE(review): during `git stash pop` the stashed (user) side is
# "theirs" — confirm this matches the intended "keep yours" message.
git checkout --theirs "$cf" 2>/dev/null || true
git add "$cf"
RESOLVED_COUNT=$((RESOLVED_COUNT + 1))
done < <(echo "$CONFLICTED")
# Check if any conflicts remain in project-owned files
REMAINING_CONFLICTS="$(git diff --name-only --diff-filter=U 2>/dev/null || true)"
if [[ -n "$REMAINING_CONFLICTS" ]]; then
error "Unresolved conflicts in project-owned files:"
echo "$REMAINING_CONFLICTS" | while read -r f; do echo " $f"; done
echo ""
error "These files have upstream changes that conflict with your edits."
error "Resolve manually, then run the upgrade again."
info "Your pre-upgrade commit: $PRE_UPGRADE_COMMIT"
info "To abort: git merge --abort OR git checkout $PRE_UPGRADE_COMMIT"
exit 1
fi
done
# Check if any conflicts remain in project-owned files
REMAINING_CONFLICTS="$(git diff --name-only --diff-filter=U 2>/dev/null || true)"
if [[ -n "$REMAINING_CONFLICTS" ]]; then
error "Unresolved conflicts in project-owned files:"
echo "$REMAINING_CONFLICTS" | while read -r f; do echo " $f"; done
echo ""
error "These files have upstream changes that conflict with your edits."
error "Resolve manually, then run the upgrade again."
info "Your pre-upgrade commit: $PRE_UPGRADE_COMMIT"
info "To abort: git merge --abort OR git checkout $PRE_UPGRADE_COMMIT"
exit 1
fi
if [[ $RESOLVED_COUNT -gt 0 ]]; then
success "Auto-resolved $RESOLVED_COUNT user-modifiable path(s) (kept your versions)"
if [[ $RESOLVED_COUNT -gt 0 ]]; then
success "Auto-resolved $RESOLVED_COUNT user-modifiable path(s) (kept your versions)"
fi
fi
fi
fi
# Step 4b: Restore user-modifiable paths (unconditionally overwrites with saved copies)
restore_user_paths
# Step 4b: Restore user-modifiable paths (unconditionally overwrites with saved copies)
restore_user_paths
# Step 4c: Restore any tracked files accidentally deleted by restore_user_paths
# (can happen when save_user_paths can't read root-owned files in user paths)
DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)"
if [[ -n "$DELETED_TRACKED" ]]; then
info "Restoring $(echo "$DELETED_TRACKED" | wc -l | xargs) tracked file(s) deleted during restore..."
echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true
success "Tracked files restored from HEAD"
# Step 4c: Restore any tracked files accidentally deleted by restore_user_paths
# (can happen when save_user_paths can't read root-owned files in user paths)
DELETED_TRACKED="$(git ls-files --deleted 2>/dev/null || true)"
if [[ -n "$DELETED_TRACKED" ]]; then
info "Restoring $(echo "$DELETED_TRACKED" | wc -l | xargs) tracked file(s) deleted during restore..."
echo "$DELETED_TRACKED" | xargs git checkout HEAD -- 2>/dev/null || true
success "Tracked files restored from HEAD"
fi
fi
# End of source-mode git pull flow
# Step 5: Detect new env vars
info "Checking for new environment variables..."
@ -791,24 +814,30 @@ if [[ -f "$PROJECT_DIR/.env.example" ]] && [[ -f "$PROJECT_DIR/.env" ]]; then
fi
fi
# NOTE(review): old (unguarded) and new (INSTALL_MODE-guarded) versions of
# "Step 6" are interleaved below — diff-rendering residue; keep only the
# guarded copy in the real file.
# Step 6: Print update summary
COMMIT_RANGE="${PRE_UPGRADE_SHORT}..${POST_PULL_COMMIT}"
COMMIT_COUNT="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | wc -l | xargs)"
echo ""
info "Update summary: $COMMIT_COUNT commit(s) ($COMMIT_RANGE)"
git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | head -20
if [[ "$COMMIT_COUNT" -gt 20 ]]; then
info " ... and $((COMMIT_COUNT - 20)) more"
fi
# Flag commits that may require manual attention
BREAKING_COMMITS="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" --grep="BREAKING" --grep="\[manual\]" 2>/dev/null || true)"
if [[ -n "$BREAKING_COMMITS" ]]; then
# Step 6: Print update summary (source mode only — release mode has no commit range)
# COMMIT_COUNT defaults to 0 so the release-mode branch (and the final
# banner that prints it) always has a value.
COMMIT_COUNT=0
if [[ "$INSTALL_MODE" == "source" ]]; then
COMMIT_RANGE="${PRE_UPGRADE_SHORT}..${POST_PULL_COMMIT}"
# Use || true and check pipefail-safe to survive git failures
COMMIT_COUNT="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | wc -l | xargs || echo 0)"
echo ""
warn "Commits requiring manual attention:"
echo "$BREAKING_COMMITS" | while read -r line; do
echo -e " ${YELLOW}$line${NC}"
done
info "Update summary: $COMMIT_COUNT commit(s) ($COMMIT_RANGE)"
git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" 2>/dev/null | head -20 || true
if [[ "$COMMIT_COUNT" -gt 20 ]]; then
info " ... and $((COMMIT_COUNT - 20)) more"
fi
# Flag commits that may require manual attention
# Two --grep flags are OR'd by git log: matches "BREAKING" or "[manual]".
BREAKING_COMMITS="$(git log --oneline "$PRE_UPGRADE_COMMIT..HEAD" --grep="BREAKING" --grep="\[manual\]" 2>/dev/null || true)"
if [[ -n "$BREAKING_COMMITS" ]]; then
echo ""
warn "Commits requiring manual attention:"
echo "$BREAKING_COMMITS" | while read -r line; do
echo -e " ${YELLOW}$line${NC}"
done
fi
else
info "Update summary: ${PRE_UPGRADE_SHORT} → release"
fi
# =============================================================================
@ -1135,7 +1164,10 @@ verify_service_health() {
# NOTE(review): only the tail of verify_service_health() is visible in this
# hunk; the polling loop that precedes `done` is outside this view.
done
warn "$name: not responding after ${max_wait}s"
VERIFY_FAILED=true
return 1
# Always return 0 — under set -e a non-zero return from this helper would
# exit the script before write_result runs. The VERIFY_FAILED flag is the
# signal the caller actually checks.
return 0
}
# API health (with polling — may still be running migrations)
@ -1194,7 +1226,11 @@ fi
# =============================================================================
ELAPSED="$(elapsed)"
# NOTE(review): the unguarded FINAL_COMMIT assignment below is the old diff
# side; the mode-aware if/else that follows supersedes it.
FINAL_COMMIT="$(git rev-parse --short HEAD)"
if [[ "$INSTALL_MODE" == "source" ]]; then
FINAL_COMMIT="$(git rev-parse --short HEAD)"
else
# Release installs have no git history; use the VERSION file if present.
FINAL_COMMIT="$(head -1 "$PROJECT_DIR/VERSION" 2>/dev/null || echo "release")"
fi
# Collect warnings for API mode result
UPGRADE_WARNINGS="[]"
@ -1211,7 +1247,11 @@ echo -e "${BOLD}${GREEN} Upgrade Complete${NC}"
echo -e "${BOLD}${GREEN}══════════════════════════════════════════════════${NC}"
echo ""
echo -e " ${BOLD}Previous:${NC} $PRE_UPGRADE_SHORT"
echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT ($(git log -1 --format='%s' HEAD))"
if [[ "$INSTALL_MODE" == "source" ]]; then
# Fall back to the bare id if git cannot produce the subject line.
echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT ($(git log -1 --format='%s' HEAD 2>/dev/null || echo "$FINAL_COMMIT"))"
else
echo -e " ${BOLD}Current:${NC} $FINAL_COMMIT"
fi
echo -e " ${BOLD}Commits:${NC} $COMMIT_COUNT"
echo -e " ${BOLD}Duration:${NC} $ELAPSED"
echo -e " ${BOLD}Log:${NC} $LOG_FILE"