feat(ota): phase 3 — rollouts + automated rollback

Rollouts (server side):
- /admin/firmware/rollouts page lists + creates campaigns. Pick release,
  target kiosk_ids (empty = whole channel), percentage (1-100).
- Active rollouts override channel-latest in /api/kiosk/firmware/check.
- Deterministic bucket via sha256(rollout_id:kiosk_id) % 100 — same kiosk
  consistently lands in the same bucket across re-checks.
- Pause / resume / complete state controls.

Rollback (kiosk side):
- Before swap, kiosk writes firmware-applying.json marker.
- After clean boot + first successful heartbeat, marker deleted.
- New ExecStartPre hook (/usr/local/sbin/betterframe-firmware-rollback.sh)
  runs every service start; stale marker (>120s) + .prev present →
  restore .prev. Pairs with systemd's StartLimit to catch crash loops.
This commit is contained in:
Mitchell R 2026-05-14 07:28:20 +02:00
parent 6a8f6d76af
commit 69cd0391b5
9 changed files with 308 additions and 2 deletions

View file

@ -229,10 +229,12 @@ if [ "${INSTALL_KIOSK}" = "1" ]; then
printf 'BetterFrame Kiosk\n\n' > /etc/issue
rm -f /etc/update-motd.d/10-uname /etc/update-motd.d/* 2>/dev/null || true
echo "==> Installing PAM + systemd unit"
echo "==> Installing PAM + systemd unit + firmware rollback hook"
install -m 644 "${REPO_ROOT}/deploy/pam.d/cage" /etc/pam.d/cage
install -m 644 "${REPO_ROOT}/deploy/systemd/betterframe-kiosk.service" \
/etc/systemd/system/betterframe-kiosk.service
install -m 755 "${REPO_ROOT}/deploy/systemd/betterframe-firmware-rollback.sh" \
/usr/local/sbin/betterframe-firmware-rollback.sh
if [ ! -e /etc/default/betterframe-kiosk ]; then
cat > /etc/default/betterframe-kiosk <<'EOF'

View file

@ -0,0 +1,43 @@
#!/usr/bin/env bash
# Roll back the kiosk binary if a recent OTA update never reached a healthy
# heartbeat. Run as ExecStartPre on the betterframe-kiosk service.
#
# Logic:
#   - The kiosk writes a firmware-applying.json marker just before swapping
#     in the new binary and deletes it after a successful heartbeat post-boot.
#   - If the service is starting and a marker older than 120s still exists,
#     the previous start failed before heartbeat → restore .prev and drop
#     the marker.
#
# Marker location: the kiosk writes the marker into the parent directory of
# its own binary path, so look next to $BIN as well as in the state directory
# under /var/lib. Checking only one of the two would make this hook a silent
# no-op whenever the kiosk uses the other.
# NOTE(review): confirm which directory the kiosk's binary path resolves to
# and drop the unused candidate.
#
# Idempotent. Silent on the happy path. Logs to journal otherwise.
set -euo pipefail

BIN="/opt/betterframe/kiosk/betterframe-kiosk"
PREV="${BIN}.prev"
MARKER_NAME="firmware-applying.json"
STALE_AFTER_SECS=120

MARKER_CANDIDATES=(
    "/var/lib/betterframe/kiosk/${MARKER_NAME}"
    "$(dirname "$BIN")/${MARKER_NAME}"
)

# First existing candidate decides the marker age.
MARKER=""
for candidate in "${MARKER_CANDIDATES[@]}"; do
    if [ -f "$candidate" ]; then
        MARKER="$candidate"
        break
    fi
done

# No marker anywhere → no update in flight.
if [ -z "$MARKER" ]; then
    exit 0
fi

# Marker mtime in epoch seconds (GNU stat first, BSD stat second). If neither
# works the mtime falls back to 0, so the marker is treated as stale.
marker_mtime=$(stat -c %Y "$MARKER" 2>/dev/null || stat -f %m "$MARKER" 2>/dev/null || echo 0)
now=$(date +%s)
age=$(( now - marker_mtime ))

# Marker fresh → previous boot is still in progress, leave it.
if [ "$age" -lt "$STALE_AFTER_SECS" ]; then
    exit 0
fi

# Stale marker + .prev present → rollback.
if [ -f "$PREV" ]; then
    echo "[bf-firmware-rollback] stale apply marker (${age}s old) + .prev exists — rolling back" >&2
    # Copy over the existing file (rather than rename) so the binary keeps
    # its current owner; the marker is only removed once the copy succeeded,
    # so an interrupted rollback is retried on the next start.
    cp -f "$PREV" "$BIN"
    chmod +x "$BIN"
    rm -f "${MARKER_CANDIDATES[@]}"
else
    echo "[bf-firmware-rollback] stale marker but no .prev — clearing marker, manual intervention needed" >&2
    rm -f "${MARKER_CANDIDATES[@]}"
fi

View file

@ -32,6 +32,7 @@ Environment=GST_DEBUG=1
Environment=BETTERFRAME_SERVER=http://localhost
# Let the unprivileged kiosk process control the Pi fan PWM sysfs files.
ExecStartPre=+/bin/sh -c 'for d in /sys/class/hwmon/hwmon*; do [ -e "$d/pwm1" ] || continue; chgrp bfkiosk "$d/pwm1" "$d/pwm1_enable" 2>/dev/null || true; chmod g+w "$d/pwm1" "$d/pwm1_enable" 2>/dev/null || true; done'
ExecStartPre=+/usr/local/sbin/betterframe-firmware-rollback.sh
ExecStart=/usr/bin/cage -s -- /opt/betterframe/kiosk/betterframe-kiosk
Restart=always
RestartSec=2

View file

@ -157,6 +157,21 @@ pub fn apply(server: &str, key: &str, info: &UpdateInfo) -> Result<(), String> {
f.sync_all().ok();
}
// Drop a marker file the systemd ExecStartPre script reads to detect a
// failed first boot of the new binary. We delete it after a clean boot
// (see `mark_firmware_applied()`). If we crash before that, next start
// sees a stale marker → restores .prev.
if let Some(dir) = bin.parent() {
let marker = dir.join("firmware-applying.json");
let payload = serde_json::json!({
"version": info.version,
"attempt_at": chrono_now_iso(),
"bin": bin.to_string_lossy(),
"prev": prev_path.to_string_lossy(),
});
let _ = fs::write(&marker, payload.to_string());
}
// Save current binary as .prev so an out-of-band rollback can restore it.
if bin.exists() {
let _ = fs::remove_file(&prev_path);
@ -190,6 +205,28 @@ fn verify_signature(public_key_pem: &str, sha256_hex: &str, sig_b64url: &str) ->
.map_err(|e| format!("verify: {e}"))
}
/// Clear the in-progress marker. Call after the kiosk has booted cleanly and
/// reported back to the server — proves the new binary survives startup.
///
/// The marker is `firmware-applying.json` in the directory that contains the
/// kiosk binary. A missing marker is the normal case (no update in flight)
/// and is ignored. Any other removal failure is logged to stderr instead of
/// being dropped: a marker that survives a healthy boot is later seen as
/// stale by the start-up rollback check and would restore `.prev` over a
/// working install.
pub fn mark_firmware_applied() {
    let bin = binary_path();
    if let Some(dir) = bin.parent() {
        let marker = dir.join("firmware-applying.json");
        // Remove unconditionally rather than testing `exists()` first: that
        // avoids a check-then-act race and surfaces permission errors.
        match fs::remove_file(&marker) {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
            Err(e) => eprintln!(
                "[bf-firmware] could not clear apply marker {}: {e}",
                marker.display()
            ),
        }
    }
}
/// Current wall-clock time as whole **seconds** since the Unix epoch,
/// rendered as a decimal string.
///
/// Despite the name this is not an ISO-8601 timestamp; plain epoch seconds
/// avoid pulling in a `chrono` dependency. Yields `"0"` when the system
/// clock reads earlier than the epoch.
fn chrono_now_iso() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};

    SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_secs())
        .to_string()
}
fn hex_lower(bytes: &[u8]) -> String {
const HEX: &[u8; 16] = b"0123456789abcdef";
let mut s = String::with_capacity(bytes.len() * 2);

View file

@ -252,8 +252,16 @@ fn activate(app: &Application) {
// Heartbeat loop — reports display geometry + hwmon, also checks for
// firmware updates so kiosks pick up new builds without admin push.
let mut first_iter = true;
loop {
send_heartbeat_now(&server, &key);
if first_iter {
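// First heartbeat of this boot has been sent → treat the boot as healthy
// and clear the rollback-pending marker so the next start doesn't try to
// roll back a healthy install.
// NOTE(review): the outcome of send_heartbeat_now is not checked here —
// confirm it only returns once the server has acknowledged the heartbeat.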
firmware::mark_firmware_applied();
first_iter = false;
}
maybe_apply_firmware_update(&server, &key);
std::thread::sleep(std::time::Duration::from_secs(60));
}

View file

@ -15,6 +15,7 @@ import { htmlPage, htmlFragment } from "./html-response.js";
import type { AdminDeps } from "./index.js";
import {
FirmwarePage,
FirmwareRolloutsPage,
KioskFirmwarePanel,
} from "../../web-templates/admin-pages.js";
import { getCoordinator } from "../../shared/coordinator-registry.js";
@ -168,4 +169,68 @@ export function registerFirmwareRoutes(app: H3, deps: AdminDeps): void {
const dispatched = getCoordinator().sendToKiosk(id, { type: "firmware_check" });
return { ok: true, dispatched };
});
// ---- Rollouts -----------------------------------------------------------
// Rollouts admin page: renders every rollout together with the releases and
// kiosks the "new rollout" form and the table need for lookups.
app.get("/admin/firmware/rollouts", (event) => {
  const { repo } = deps;
  const pageProps = {
    rollouts: repo.listFirmwareRollouts(),
    releases: repo.listFirmwareReleases(),
    kiosks: repo.listKiosks(),
    user: event.context.user!.username,
  };
  return htmlPage(FirmwareRolloutsPage(pageProps));
});
// Create a rollout for an existing release, activate it immediately and nudge
// the affected kiosks to re-check for firmware.
//
// Form fields:
//   release_id        required; must reference an existing release (400 / 404).
//   percentage        clamped to 1..100; defaults to 100 when absent.
//   target_kiosk_ids  repeated field (array), single value, or comma-separated
//                     list; empty means "no explicit targets".
app.post("/admin/firmware/rollouts/new", async (event) => {
  const body = await readBody<Record<string, string | string[]>>(event);
  const releaseId = String(body?.["release_id"] ?? "");
  if (!releaseId) throw createError({ statusCode: 400, statusMessage: "release_id required" });
  const release = deps.repo.getFirmwareRelease(releaseId);
  if (!release) throw createError({ statusCode: 404, statusMessage: "release not found" });
  const percentage = clamp(Number(body?.["percentage"] ?? 100), 1, 100);

  const targetsRaw = body?.["target_kiosk_ids"];
  const tokens: string[] = Array.isArray(targetsRaw)
    ? targetsRaw.map((s) => String(s))
    : typeof targetsRaw === "string"
      ? targetsRaw.split(",")
      : [];
  // Blank tokens must be skipped before the numeric conversion: Number("")
  // is 0, which would otherwise become a bogus kiosk id and silently turn a
  // fleet-wide rollout into one targeted at nobody. Ids must be integers;
  // duplicates are collapsed.
  const targets: number[] = Array.from(
    new Set(
      tokens
        .map((t) => t.trim())
        .filter((t) => t !== "")
        .map((t) => Number(t))
        .filter((n) => Number.isInteger(n)),
    ),
  );

  const user = event.context.user!;
  const rollout = deps.repo.createFirmwareRollout({
    id: randomUUID(),
    release_id: releaseId,
    target_kiosk_ids: targets,
    percentage,
    created_by: user.id ?? null,
  });
  deps.repo.updateFirmwareRolloutState(rollout.id, "active");

  // Bump every targeted kiosk to check now (best-effort over WS).
  const coord = getCoordinator();
  if (targets.length === 0) {
    const allKiosks = deps.repo.listKiosks();
    for (const k of allKiosks) coord.sendToKiosk(k.id, { type: "firmware_check" });
  } else {
    for (const id of targets) coord.sendToKiosk(id, { type: "firmware_check" });
  }
  return new Response(null, { status: 302, headers: { location: "/admin/firmware/rollouts" } });
});
// Pause / resume / complete a rollout. Only "paused", "active" and "complete"
// are accepted; anything else is rejected with 400. Redirects back to the
// rollouts page afterwards.
app.post("/admin/firmware/rollouts/:id/state", async (event) => {
  const rolloutId = String(getRouterParam(event, "id"));
  const payload = await readBody<{ state: string }>(event);
  const requested = payload?.state;
  switch (requested) {
    case "paused":
    case "active":
    case "complete":
      deps.repo.updateFirmwareRolloutState(rolloutId, requested);
      break;
    default:
      throw createError({ statusCode: 400, statusMessage: "invalid state" });
  }
  return new Response(null, { status: 302, headers: { location: "/admin/firmware/rollouts" } });
});
}
/**
 * Floor `n` to an integer and confine it to the inclusive range [lo, hi].
 * Non-finite input (NaN, ±Infinity) yields `lo`.
 */
function clamp(n: number, lo: number, hi: number): number {
  if (!Number.isFinite(n)) {
    return lo;
  }
  const floored = Math.floor(n);
  const capped = Math.min(hi, floored);
  return Math.max(lo, capped);
}

View file

@ -22,6 +22,7 @@ import { initiatePairing, claimPairing } from "../../shared/pairing.js";
import { generateBundle } from "../../shared/bundle.js";
import { initNoderedBridge, type NoderedBridge } from "../../shared/nodered-bridge.js";
import { initFirmware, type FirmwareApi } from "../../shared/firmware.js";
import { createHash } from "node:crypto";
import type { Repository } from "../service-store/repository.js";
import type { AuthApi } from "../../shared/auth.js";
import type { SecretsApi } from "../../shared/secrets.js";
@ -428,10 +429,24 @@ function registerKioskRoutes(
const currentVersion = url.searchParams.get("current")?.trim() ?? kiosk.kiosk_app_version ?? "";
let release = null;
// Explicit per-kiosk pin wins over all rollout / channel selection.
if (kiosk.firmware_target_version) {
release = repo.getFirmwareReleaseByVersionArch(kiosk.firmware_target_version, arch);
if (release?.yanked_at) release = null;
}
// Active rollouts: most-recent matching, with bucket eligibility.
if (!release) {
const rollouts = repo.listActiveRolloutsForKiosk(kiosk.id);
for (const rollout of rollouts) {
if (!isKioskInRolloutBucket(kiosk.id, rollout.id, rollout.percentage)) continue;
const r = repo.getFirmwareRelease(rollout.release_id);
if (!r || r.yanked_at) continue;
if (r.arch !== arch) continue;
release = r;
break;
}
}
// Channel-latest fallback.
if (!release) {
const channel = (kiosk.firmware_channel ?? "stable") as FirmwareChannel;
release = repo.getLatestFirmwareRelease(channel, arch);
@ -508,3 +523,19 @@ function registerKioskRoutes(
return { ok: true };
});
}
/**
 * Deterministic bucket assignment for gradual rollouts.
 *
 * The bucket (0..99) is derived from the first four bytes of
 * sha256("<rolloutId>:<kioskId>"), so the same (kioskId, rolloutId) pair
 * always lands in the same bucket and a 50% rollout consistently targets the
 * same half of the fleet across re-checks. Raising a rollout from 50% → 100%
 * adds the previously-excluded half rather than reshuffling.
 *
 * A percentage of 100 or more always matches; 0 or less never does.
 */
function isKioskInRolloutBucket(kioskId: number, rolloutId: string, percentage: number): boolean {
  if (percentage <= 0) {
    return false;
  }
  if (percentage >= 100) {
    return true;
  }
  const seed = `${rolloutId}:${String(kioskId)}`;
  const digest = createHash("sha256").update(seed).digest();
  return digest.readUInt32BE(0) % 100 < percentage;
}

View file

@ -1210,6 +1210,20 @@ export class Repository {
return r ? rowToFirmwareRollout(r as Record<string, unknown>) : null;
}
/**
 * Active rollouts that apply to the given kiosk: those whose target list
 * contains `kioskId`, plus those with an empty target list (untargeted).
 * Ordered most-recent first so a newer rollout supersedes older ones.
 *
 * Only the rollout state and target list are checked here; no channel or
 * architecture filtering happens in this method.
 */
listActiveRolloutsForKiosk(kioskId: number): FirmwareRollout[] {
  const rows = this.prep(
    `SELECT * FROM firmware_rollouts WHERE state = 'active' ORDER BY created_at DESC`,
  ).all();
  const rollouts = rows.map((row) => rowToFirmwareRollout(row as Record<string, unknown>));
  const applicable: FirmwareRollout[] = [];
  for (const rollout of rollouts) {
    const ids = rollout.target_kiosk_ids;
    if (ids.length === 0 || ids.includes(kioskId)) {
      applicable.push(rollout);
    }
  }
  return applicable;
}
listFirmwareRollouts(): FirmwareRollout[] {
const rs = this.prep(
"SELECT * FROM firmware_rollouts ORDER BY created_at DESC",

View file

@ -8,6 +8,7 @@ import type {
Display,
Entity,
FirmwareRelease,
FirmwareRollout,
Kiosk,
KioskGpioBinding,
Label,
@ -2673,10 +2674,11 @@ interface FirmwarePageProps {
export function FirmwarePage(props: FirmwarePageProps) {
return (
<Layout title="Firmware" user={props.user} activeNav="kiosks">
<Layout title="Firmware" user={props.user} activeNav="firmware">
<p style="color:#666; margin-bottom:1rem">
Signed kiosk firmware artifacts. Uploaded binaries are hashed +
Ed25519-signed by the server before kiosks can install them.
<a href="/admin/firmware/rollouts" style="margin-left:0.5rem">Rollouts </a>
</p>
<div class="card" style="margin-bottom:1.5rem">
@ -2876,3 +2878,106 @@ export function KioskLocalPanel(props: KioskLocalPanelProps) {
</div>
);
}
// ---- Firmware rollouts -----------------------------------------------------
/** Props for the firmware rollouts admin page. */
interface FirmwareRolloutsPageProps {
// Value handed to the page layout's `user` prop.
user: string;
// Rollouts shown in the table, rendered in the order given.
rollouts: FirmwareRollout[];
// Releases offered in the "new rollout" selector (yanked ones are hidden)
// and used to resolve a rollout's release_id to version/channel/arch.
releases: FirmwareRelease[];
// Kiosks offered as explicit targets and used to resolve target ids to names.
kiosks: Kiosk[];
}
/**
 * Admin page for firmware rollouts: a "new rollout" form followed by a table
 * of existing rollouts with pause/resume and complete controls.
 *
 * The form posts to /admin/firmware/rollouts/new; the per-row controls post a
 * `state` field to /admin/firmware/rollouts/<id>/state.
 */
export function FirmwareRolloutsPage(props: FirmwareRolloutsPageProps) {
  // Lookup tables so each table row can resolve ids to display names.
  const releaseById = new Map(props.releases.map((r) => [r.id, r]));
  const kioskById = new Map(props.kiosks.map((k) => [k.id, k]));
  return (
    <Layout title="Firmware rollouts" user={props.user} activeNav="firmware">
      <p style="color:#666; margin-bottom:1rem">
        Push a specific release to a slice of the fleet.{" "}
        <code>percentage</code>{" "}
        buckets kiosks deterministically by rollout and kiosk id, so each
        kiosk keeps the same eligibility for a given rollout every time it
        re-checks.
      </p>

      <div class="card" style="margin-bottom:1.5rem">
        <h2 style="margin:0 0 1rem; font-size:1.1rem">New rollout</h2>
        <form method="post" action="/admin/firmware/rollouts/new"
              style="display:grid; grid-template-columns:1fr 1fr; gap:0.75rem">
          <div class="form-group">
            <label for="release_id">Release</label>
            <select id="release_id" name="release_id" class="form-input" required>
              <option value="">--</option>
              {props.releases.filter((r) => !r.yanked_at).map((r) => (
                <option value={r.id}>{r.version} · {r.channel} · {r.arch}</option>
              ))}
            </select>
          </div>
          <div class="form-group">
            <label for="percentage">Percentage</label>
            <input id="percentage" name="percentage" type="number" min="1" max="100" value="100" class="form-input" />
          </div>
          <div class="form-group" style="grid-column:1/-1">
            <label for="target_kiosk_ids">Targets (leave empty = all kiosks on release channel)</label>
            <select id="target_kiosk_ids" name="target_kiosk_ids" class="form-input" multiple size="6">
              {props.kiosks.map((k) => (
                <option value={String(k.id)}>{k.name} (#{String(k.id)})</option>
              ))}
            </select>
            <div class="form-hint">Cmd/Ctrl-click to multi-select. Or post a comma-separated id list via API.</div>
          </div>
          <button type="submit" class="btn btn-primary" style="grid-column:1/-1">Create + activate</button>
        </form>
      </div>

      <div class="table-wrap">
        <table>
          <thead>
            <tr>
              <th>Release</th>
              <th>State</th>
              <th>%</th>
              <th>Targets</th>
              <th>Created</th>
              <th></th>
            </tr>
          </thead>
          <tbody>
            {props.rollouts.length === 0 ? (
              <tr><td colspan="6" style="text-align:center; color:#999; padding:2rem">No rollouts yet.</td></tr>
            ) : (
              props.rollouts.map((r) => {
                const rel = releaseById.get(r.release_id);
                const targetCount = r.target_kiosk_ids.length;
                // Show at most three target names, then a "+N more" suffix.
                const targetSummary = targetCount === 0
                  ? "(all on channel)"
                  : r.target_kiosk_ids.slice(0, 3).map((id) => kioskById.get(id)?.name ?? `#${String(id)}`).join(", ")
                    + (targetCount > 3 ? ` +${String(targetCount - 3)} more` : "");
                return (
                  <tr>
                    <td><strong>{rel?.version ?? r.release_id}</strong>{rel && <span style="color:#999"> ({rel.channel}/{rel.arch})</span>}</td>
                    <td><span class={`badge ${r.state === "active" ? "badge-green" : r.state === "paused" ? "badge-yellow" : r.state === "complete" ? "badge-gray" : "badge-blue"}`}>{r.state}</span></td>
                    <td>{String(r.percentage)}%</td>
                    <td style="font-size:0.85rem">{targetSummary}</td>
                    <td style="font-size:0.85rem; white-space:nowrap">{formatTime(r.created_at)}</td>
                    <td>
                      <form method="post" action={`/admin/firmware/rollouts/${r.id}/state`} style="display:inline">
                        <input type="hidden" name="state" value={r.state === "paused" ? "active" : "paused"} />
                        <button type="submit" class="btn btn-sm" style="margin-right:0.25rem">
                          {r.state === "paused" ? "Resume" : "Pause"}
                        </button>
                      </form>
                      <form method="post" action={`/admin/firmware/rollouts/${r.id}/state`} style="display:inline">
                        <input type="hidden" name="state" value="complete" />
                        <button type="submit" class="btn btn-sm btn-danger">Complete</button>
                      </form>
                    </td>
                  </tr>
                );
              })
            )}
          </tbody>
        </table>
      </div>
    </Layout>
  );
}