mirror of
https://github.com/BetterCorp/BetterFrame.git
synced 2026-05-26 17:56:34 +00:00
feat(ota): phase 3 — rollouts + automated rollback
Rollouts (server side): - /admin/firmware/rollouts page lists + creates campaigns. Pick release, target kiosk_ids (empty = whole channel), percentage (1-100). - Active rollouts override channel-latest in /api/kiosk/firmware/check. - Deterministic bucket via sha256(rollout_id:kiosk_id) % 100 — same kiosk consistently lands in the same bucket across re-checks. - Pause / resume / complete state controls. Rollback (kiosk side): - Before swap, kiosk writes firmware-applying.json marker. - After clean boot + first successful heartbeat, marker deleted. - New ExecStartPre hook (/usr/local/sbin/betterframe-firmware-rollback.sh) runs every service start; stale marker (>120s) + .prev present → restore .prev. Pairs with systemd's StartLimit to catch crash loops.
This commit is contained in:
parent
6a8f6d76af
commit
69cd0391b5
9 changed files with 308 additions and 2 deletions
|
|
@ -229,10 +229,12 @@ if [ "${INSTALL_KIOSK}" = "1" ]; then
|
|||
printf 'BetterFrame Kiosk\n\n' > /etc/issue
|
||||
rm -f /etc/update-motd.d/10-uname /etc/update-motd.d/* 2>/dev/null || true
|
||||
|
||||
echo "==> Installing PAM + systemd unit"
|
||||
echo "==> Installing PAM + systemd unit + firmware rollback hook"
|
||||
install -m 644 "${REPO_ROOT}/deploy/pam.d/cage" /etc/pam.d/cage
|
||||
install -m 644 "${REPO_ROOT}/deploy/systemd/betterframe-kiosk.service" \
|
||||
/etc/systemd/system/betterframe-kiosk.service
|
||||
install -m 755 "${REPO_ROOT}/deploy/systemd/betterframe-firmware-rollback.sh" \
|
||||
/usr/local/sbin/betterframe-firmware-rollback.sh
|
||||
|
||||
if [ ! -e /etc/default/betterframe-kiosk ]; then
|
||||
cat > /etc/default/betterframe-kiosk <<'EOF'
|
||||
|
|
|
|||
43
deploy/systemd/betterframe-firmware-rollback.sh
Executable file
43
deploy/systemd/betterframe-firmware-rollback.sh
Executable file
|
|
@ -0,0 +1,43 @@
|
|||
#!/usr/bin/env bash
# Roll back the kiosk binary if a recent OTA update never reached a healthy
# heartbeat. Run as ExecStartPre on the betterframe-kiosk service.
#
# Logic:
#   - The kiosk writes a firmware-applying.json marker just before swapping
#     in the new binary, and deletes it after its first heartbeat post-boot.
#   - A marker younger than 120s means that first boot may still be in
#     progress, so it is left alone.
#   - A marker that is 120s or older means the previous start never got as
#     far as the heartbeat → restore .prev, drop the marker.
#
# Marker location: the documented path is under /var/lib/betterframe/kiosk,
# but the kiosk derives the marker path from the directory that holds its own
# binary. Both locations are checked so the two sides cannot silently
# disagree about where the marker lives.
#
# Idempotent. Silent on the happy path. Logs to journal otherwise.

set -euo pipefail

BIN="/opt/betterframe/kiosk/betterframe-kiosk"
PREV="${BIN}.prev"
STALE_AFTER_SECS=120
MARKERS=(
  "/var/lib/betterframe/kiosk/firmware-applying.json"
  "$(dirname "$BIN")/firmware-applying.json"
)

# Use the first marker that exists; no marker at all is the happy path.
MARKER=""
for candidate in "${MARKERS[@]}"; do
  if [ -f "$candidate" ]; then
    MARKER="$candidate"
    break
  fi
done

if [ -z "$MARKER" ]; then
  exit 0
fi

# Marker mtime in epoch seconds (GNU stat first, BSD stat as a fallback).
marker_mtime=$(stat -c %Y "$MARKER" 2>/dev/null || stat -f %m "$MARKER" 2>/dev/null || echo 0)
now=$(date +%s)
age=$(( now - marker_mtime ))

# Marker fresh → previous boot is still in progress, leave it.
if [ "$age" -lt "$STALE_AFTER_SECS" ]; then
  exit 0
fi

# Stale marker + .prev present → rollback.
if [ -f "$PREV" ]; then
  echo "[bf-firmware-rollback] stale apply marker (${age}s old) + .prev exists — rolling back" >&2
  # Stage the copy next to the target and rename it into place, so an
  # interrupted copy (power loss, full disk) can never leave a truncated
  # binary at $BIN. -p keeps the owner/mode of .prev on the restored file.
  tmp="${BIN}.rollback.$$"
  trap 'rm -f "$tmp"' EXIT
  cp -pf "$PREV" "$tmp"
  chmod +x "$tmp"
  mv -f "$tmp" "$BIN"
  trap - EXIT
else
  echo "[bf-firmware-rollback] stale marker but no .prev — clearing marker, manual intervention needed" >&2
fi

# Clear every known marker location so the next start is a no-op.
rm -f "${MARKERS[@]}"
|
||||
|
|
@ -32,6 +32,7 @@ Environment=GST_DEBUG=1
|
|||
Environment=BETTERFRAME_SERVER=http://localhost
|
||||
# Let the unprivileged kiosk process control the Pi fan PWM sysfs files.
|
||||
ExecStartPre=+/bin/sh -c 'for d in /sys/class/hwmon/hwmon*; do [ -e "$d/pwm1" ] || continue; chgrp bfkiosk "$d/pwm1" "$d/pwm1_enable" 2>/dev/null || true; chmod g+w "$d/pwm1" "$d/pwm1_enable" 2>/dev/null || true; done'
|
||||
ExecStartPre=+/usr/local/sbin/betterframe-firmware-rollback.sh
|
||||
ExecStart=/usr/bin/cage -s -- /opt/betterframe/kiosk/betterframe-kiosk
|
||||
Restart=always
|
||||
RestartSec=2
|
||||
|
|
|
|||
|
|
@ -157,6 +157,21 @@ pub fn apply(server: &str, key: &str, info: &UpdateInfo) -> Result<(), String> {
|
|||
f.sync_all().ok();
|
||||
}
|
||||
|
||||
// Drop a marker file the systemd ExecStartPre script reads to detect a
|
||||
// failed first boot of the new binary. We delete it after a clean boot
|
||||
// (see `mark_firmware_applied()`). If we crash before that, next start
|
||||
// sees a stale marker → restores .prev.
|
||||
if let Some(dir) = bin.parent() {
|
||||
let marker = dir.join("firmware-applying.json");
|
||||
let payload = serde_json::json!({
|
||||
"version": info.version,
|
||||
"attempt_at": chrono_now_iso(),
|
||||
"bin": bin.to_string_lossy(),
|
||||
"prev": prev_path.to_string_lossy(),
|
||||
});
|
||||
let _ = fs::write(&marker, payload.to_string());
|
||||
}
|
||||
|
||||
// Save current binary as .prev so an out-of-band rollback can restore it.
|
||||
if bin.exists() {
|
||||
let _ = fs::remove_file(&prev_path);
|
||||
|
|
@ -190,6 +205,28 @@ fn verify_signature(public_key_pem: &str, sha256_hex: &str, sig_b64url: &str) ->
|
|||
.map_err(|e| format!("verify: {e}"))
|
||||
}
|
||||
|
||||
/// Clear the in-progress marker. Call after the kiosk has booted cleanly and
|
||||
/// reported back to the server — proves the new binary survives startup.
|
||||
pub fn mark_firmware_applied() {
|
||||
let bin = binary_path();
|
||||
if let Some(dir) = bin.parent() {
|
||||
let marker = dir.join("firmware-applying.json");
|
||||
if marker.exists() {
|
||||
let _ = fs::remove_file(marker);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Current wall-clock time as a decimal string of whole seconds since the
/// Unix epoch.
///
/// Despite the name this is not an ISO-8601 timestamp; plain epoch seconds
/// avoid pulling in a chrono dependency. Falls back to `"0"` if the system
/// clock reports a time before the epoch.
fn chrono_now_iso() -> String {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs().to_string(),
        Err(_) => 0u64.to_string(),
    }
}
|
||||
|
||||
fn hex_lower(bytes: &[u8]) -> String {
|
||||
const HEX: &[u8; 16] = b"0123456789abcdef";
|
||||
let mut s = String::with_capacity(bytes.len() * 2);
|
||||
|
|
|
|||
|
|
@ -252,8 +252,16 @@ fn activate(app: &Application) {
|
|||
|
||||
// Heartbeat loop — reports display geometry + hwmon, also checks for
|
||||
// firmware updates so kiosks pick up new builds without admin push.
|
||||
let mut first_iter = true;
|
||||
loop {
|
||||
send_heartbeat_now(&server, &key);
|
||||
if first_iter {
|
||||
// Successfully heart-beat at least once → consider this boot a
|
||||
// healthy one. Clears the rollback-pending marker so the next
|
||||
// start doesn't try to roll back a healthy install.
|
||||
firmware::mark_firmware_applied();
|
||||
first_iter = false;
|
||||
}
|
||||
maybe_apply_firmware_update(&server, &key);
|
||||
std::thread::sleep(std::time::Duration::from_secs(60));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import { htmlPage, htmlFragment } from "./html-response.js";
|
|||
import type { AdminDeps } from "./index.js";
|
||||
import {
|
||||
FirmwarePage,
|
||||
FirmwareRolloutsPage,
|
||||
KioskFirmwarePanel,
|
||||
} from "../../web-templates/admin-pages.js";
|
||||
import { getCoordinator } from "../../shared/coordinator-registry.js";
|
||||
|
|
@ -168,4 +169,68 @@ export function registerFirmwareRoutes(app: H3, deps: AdminDeps): void {
|
|||
const dispatched = getCoordinator().sendToKiosk(id, { type: "firmware_check" });
|
||||
return { ok: true, dispatched };
|
||||
});
|
||||
|
||||
// ---- Rollouts -----------------------------------------------------------
|
||||
|
||||
// Rollouts admin page: renders every rollout together with the releases and
// kiosks the page needs for its "new rollout" form and name lookups.
app.get("/admin/firmware/rollouts", (event) => {
  const currentUser = event.context.user!;
  const pageData = {
    rollouts: deps.repo.listFirmwareRollouts(),
    releases: deps.repo.listFirmwareReleases(),
    kiosks: deps.repo.listKiosks(),
  };
  return htmlPage(FirmwareRolloutsPage({ user: currentUser.username, ...pageData }));
});
|
||||
|
||||
// Create a rollout for an existing release and activate it immediately.
//
// Body fields:
//   release_id        required; 400 when missing, 404 when unknown.
//   percentage        clamped to 1-100, defaults to 100.
//   target_kiosk_ids  repeated form field or one comma-separated string.
//                     Missing/blank = no explicit targets.
//
// Responds with a 302 back to the rollouts page.
app.post("/admin/firmware/rollouts/new", async (event) => {
  const body = await readBody<Record<string, string | string[]>>(event);
  const releaseId = String(body?.["release_id"] ?? "");
  if (!releaseId) throw createError({ statusCode: 400, statusMessage: "release_id required" });
  const release = deps.repo.getFirmwareRelease(releaseId);
  if (!release) throw createError({ statusCode: 404, statusMessage: "release not found" });
  const percentage = clamp(Number(body?.["percentage"] ?? 100), 1, 100);

  const targetsRaw = body?.["target_kiosk_ids"];
  const rawTokens: string[] = Array.isArray(targetsRaw)
    ? targetsRaw.map((s) => String(s))
    : typeof targetsRaw === "string"
      ? targetsRaw.split(",")
      : [];
  // Blank tokens are dropped BEFORE Number(): Number("") is 0, which would
  // otherwise be stored as a bogus kiosk id.
  const parsedTargets = rawTokens
    .map((s) => s.trim())
    .filter((s) => s !== "")
    .map((s) => Number(s));
  // An empty target list means "every kiosk", so a list that fails to parse
  // must be rejected rather than silently widened to the whole fleet.
  if (parsedTargets.some((n) => !Number.isInteger(n))) {
    throw createError({ statusCode: 400, statusMessage: "invalid target_kiosk_ids" });
  }
  const targets: number[] = Array.from(new Set(parsedTargets));

  const user = event.context.user!;
  const rollout = deps.repo.createFirmwareRollout({
    id: randomUUID(),
    release_id: releaseId,
    target_kiosk_ids: targets,
    percentage,
    created_by: user.id ?? null,
  });
  deps.repo.updateFirmwareRolloutState(rollout.id, "active");

  // Bump every targeted kiosk to check now (best-effort over WS).
  const coord = getCoordinator();
  if (targets.length === 0) {
    const allKiosks = deps.repo.listKiosks();
    for (const k of allKiosks) coord.sendToKiosk(k.id, { type: "firmware_check" });
  } else {
    for (const id of targets) coord.sendToKiosk(id, { type: "firmware_check" });
  }
  return new Response(null, { status: 302, headers: { location: "/admin/firmware/rollouts" } });
});
|
||||
|
||||
// Move a rollout to "paused", "active" or "complete". Any other requested
// state is rejected with 400; on success the client is redirected back to
// the rollouts page.
app.post("/admin/firmware/rollouts/:id/state", async (event) => {
  const rolloutId = String(getRouterParam(event, "id"));
  const payload = await readBody<{ state: string }>(event);
  const requested = payload?.state;
  switch (requested) {
    case "paused":
    case "active":
    case "complete":
      deps.repo.updateFirmwareRolloutState(rolloutId, requested);
      return new Response(null, { status: 302, headers: { location: "/admin/firmware/rollouts" } });
    default:
      throw createError({ statusCode: 400, statusMessage: "invalid state" });
  }
});
|
||||
}
|
||||
|
||||
/**
 * Floor `n` to an integer and constrain it to the range [lo, hi].
 * Non-finite input (NaN, ±Infinity) yields `lo`.
 */
function clamp(n: number, lo: number, hi: number): number {
  if (Number.isFinite(n)) {
    const floored = Math.floor(n);
    const capped = Math.min(hi, floored);
    return Math.max(lo, capped);
  }
  return lo;
}
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ import { initiatePairing, claimPairing } from "../../shared/pairing.js";
|
|||
import { generateBundle } from "../../shared/bundle.js";
|
||||
import { initNoderedBridge, type NoderedBridge } from "../../shared/nodered-bridge.js";
|
||||
import { initFirmware, type FirmwareApi } from "../../shared/firmware.js";
|
||||
import { createHash } from "node:crypto";
|
||||
import type { Repository } from "../service-store/repository.js";
|
||||
import type { AuthApi } from "../../shared/auth.js";
|
||||
import type { SecretsApi } from "../../shared/secrets.js";
|
||||
|
|
@ -428,10 +429,24 @@ function registerKioskRoutes(
|
|||
const currentVersion = url.searchParams.get("current")?.trim() ?? kiosk.kiosk_app_version ?? "";
|
||||
|
||||
let release = null;
|
||||
// Explicit per-kiosk pin wins over all rollout / channel selection.
|
||||
if (kiosk.firmware_target_version) {
|
||||
release = repo.getFirmwareReleaseByVersionArch(kiosk.firmware_target_version, arch);
|
||||
if (release?.yanked_at) release = null;
|
||||
}
|
||||
// Active rollouts: most-recent matching, with bucket eligibility.
|
||||
if (!release) {
|
||||
const rollouts = repo.listActiveRolloutsForKiosk(kiosk.id);
|
||||
for (const rollout of rollouts) {
|
||||
if (!isKioskInRolloutBucket(kiosk.id, rollout.id, rollout.percentage)) continue;
|
||||
const r = repo.getFirmwareRelease(rollout.release_id);
|
||||
if (!r || r.yanked_at) continue;
|
||||
if (r.arch !== arch) continue;
|
||||
release = r;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Channel-latest fallback.
|
||||
if (!release) {
|
||||
const channel = (kiosk.firmware_channel ?? "stable") as FirmwareChannel;
|
||||
release = repo.getLatestFirmwareRelease(channel, arch);
|
||||
|
|
@ -508,3 +523,19 @@ function registerKioskRoutes(
|
|||
return { ok: true };
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Decide whether a kiosk falls inside a rollout's percentage slice.
 *
 * The bucket is the first four bytes of sha256("<rolloutId>:<kioskId>"),
 * read as a big-endian unsigned integer, modulo 100. A kiosk is included
 * when its bucket is below `percentage`, so the answer for a given
 * (rolloutId, kioskId) pair is stable across re-checks, and raising the
 * percentage only ever adds kiosks to the slice.
 *
 * `percentage >= 100` always includes; `percentage <= 0` never does.
 */
function isKioskInRolloutBucket(kioskId: number, rolloutId: string, percentage: number): boolean {
  if (percentage >= 100) return true;
  if (percentage <= 0) return false;
  const hasher = createHash("sha256");
  hasher.update(`${rolloutId}:${String(kioskId)}`);
  const digest = hasher.digest();
  return digest.readUInt32BE(0) % 100 < percentage;
}
|
||||
|
|
|
|||
|
|
@ -1210,6 +1210,20 @@ export class Repository {
|
|||
return r ? rowToFirmwareRollout(r as Record<string, unknown>) : null;
|
||||
}
|
||||
|
||||
/**
 * Rollouts in state 'active' that apply to the given kiosk: either the
 * rollout's target list contains `kioskId`, or the list is empty (no
 * explicit targets). Results are ordered by created_at, newest first, so
 * callers that take the first match let a newer rollout supersede older
 * ones.
 *
 * Note: no release-channel filtering happens here; only state and the
 * target list are considered.
 */
listActiveRolloutsForKiosk(kioskId: number): FirmwareRollout[] {
  const rows = this.prep(
    `SELECT * FROM firmware_rollouts WHERE state = 'active' ORDER BY created_at DESC`,
  ).all();
  const applicable: FirmwareRollout[] = [];
  for (const rollout of rows.map((row) => rowToFirmwareRollout(row as Record<string, unknown>))) {
    const ids = rollout.target_kiosk_ids;
    if (ids.length === 0 || ids.includes(kioskId)) applicable.push(rollout);
  }
  return applicable;
}
|
||||
|
||||
listFirmwareRollouts(): FirmwareRollout[] {
|
||||
const rs = this.prep(
|
||||
"SELECT * FROM firmware_rollouts ORDER BY created_at DESC",
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ import type {
|
|||
Display,
|
||||
Entity,
|
||||
FirmwareRelease,
|
||||
FirmwareRollout,
|
||||
Kiosk,
|
||||
KioskGpioBinding,
|
||||
Label,
|
||||
|
|
@ -2673,10 +2674,11 @@ interface FirmwarePageProps {
|
|||
|
||||
export function FirmwarePage(props: FirmwarePageProps) {
|
||||
return (
|
||||
<Layout title="Firmware" user={props.user} activeNav="kiosks">
|
||||
<Layout title="Firmware" user={props.user} activeNav="firmware">
|
||||
<p style="color:#666; margin-bottom:1rem">
|
||||
Signed kiosk firmware artifacts. Uploaded binaries are hashed +
|
||||
Ed25519-signed by the server before kiosks can install them.
|
||||
<a href="/admin/firmware/rollouts" style="margin-left:0.5rem">Rollouts →</a>
|
||||
</p>
|
||||
|
||||
<div class="card" style="margin-bottom:1.5rem">
|
||||
|
|
@ -2876,3 +2878,106 @@ export function KioskLocalPanel(props: KioskLocalPanelProps) {
|
|||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
// ---- Firmware rollouts -----------------------------------------------------
|
||||
|
||||
/** Props for the firmware rollouts admin page. */
interface FirmwareRolloutsPageProps {
  // Passed through to the page layout as the current user.
  user: string;
  // Rollouts rendered as table rows, in the order given.
  rollouts: FirmwareRollout[];
  // Releases offered in the "new rollout" form (yanked ones are hidden) and
  // used to resolve each rollout's release_id to a version label.
  releases: FirmwareRelease[];
  // Kiosks offered as rollout targets and used to resolve target ids to names.
  kiosks: Kiosk[];
}
|
||||
|
||||
/**
 * Admin page for firmware rollouts: a "new rollout" form followed by a table
 * of existing rollouts with pause/resume and complete controls.
 *
 * The form posts to /admin/firmware/rollouts/new; each row's controls post a
 * `state` field to /admin/firmware/rollouts/<id>/state.
 */
export function FirmwareRolloutsPage(props: FirmwareRolloutsPageProps) {
  // Lookup tables so each row can show a release version and kiosk names
  // instead of raw ids.
  const releaseById = new Map(props.releases.map((r) => [r.id, r]));
  const kioskById = new Map(props.kiosks.map((k) => [k.id, k]));
  return (
    <Layout title="Firmware rollouts" user={props.user} activeNav="firmware">
      <p style="color:#666; margin-bottom:1rem">
        Push a specific release to a slice of the fleet. <code>percentage</code>{" "}
        buckets kiosks deterministically per rollout, so a kiosk stays in (or
        out of) a given rollout across re-checks. A new rollout draws its own
        slice, even with the same targets and percentage.
      </p>

      <div class="card" style="margin-bottom:1.5rem">
        <h2 style="margin:0 0 1rem; font-size:1.1rem">New rollout</h2>
        <form method="post" action="/admin/firmware/rollouts/new"
              style="display:grid; grid-template-columns:1fr 1fr; gap:0.75rem">
          <div class="form-group">
            <label for="release_id">Release</label>
            <select id="release_id" name="release_id" class="form-input" required>
              <option value="">--</option>
              {props.releases.filter((r) => !r.yanked_at).map((r) => (
                <option value={r.id}>{r.version} · {r.channel} · {r.arch}</option>
              ))}
            </select>
          </div>
          <div class="form-group">
            <label for="percentage">Percentage</label>
            <input id="percentage" name="percentage" type="number" min="1" max="100" value="100" class="form-input" />
          </div>
          <div class="form-group" style="grid-column:1/-1">
            <label for="target_kiosk_ids">Targets (leave empty = all kiosks on release channel)</label>
            <select id="target_kiosk_ids" name="target_kiosk_ids" class="form-input" multiple size="6">
              {props.kiosks.map((k) => (
                <option value={String(k.id)}>{k.name} (#{String(k.id)})</option>
              ))}
            </select>
            <div class="form-hint">Cmd/Ctrl-click to multi-select. Or post a comma-separated id list via API.</div>
          </div>
          <button type="submit" class="btn btn-primary" style="grid-column:1/-1">Create + activate</button>
        </form>
      </div>

      <div class="table-wrap">
        <table>
          <thead>
            <tr>
              <th>Release</th>
              <th>State</th>
              <th>%</th>
              <th>Targets</th>
              <th>Created</th>
              <th></th>
            </tr>
          </thead>
          <tbody>
            {props.rollouts.length === 0 ? (
              <tr><td colspan="6" style="text-align:center; color:#999; padding:2rem">No rollouts yet.</td></tr>
            ) : (
              props.rollouts.map((r) => {
                const rel = releaseById.get(r.release_id);
                const targetCount = r.target_kiosk_ids.length;
                // Show at most three target names, then a "+N more" suffix.
                const targetSummary = targetCount === 0
                  ? "(all on channel)"
                  : r.target_kiosk_ids.slice(0, 3).map((id) => kioskById.get(id)?.name ?? `#${String(id)}`).join(", ")
                    + (targetCount > 3 ? ` +${String(targetCount - 3)} more` : "");
                return (
                  <tr>
                    <td><strong>{rel?.version ?? r.release_id}</strong>{rel && <span style="color:#999"> ({rel.channel}/{rel.arch})</span>}</td>
                    <td><span class={`badge ${r.state === "active" ? "badge-green" : r.state === "paused" ? "badge-yellow" : r.state === "complete" ? "badge-gray" : "badge-blue"}`}>{r.state}</span></td>
                    <td>{String(r.percentage)}%</td>
                    <td style="font-size:0.85rem">{targetSummary}</td>
                    <td style="font-size:0.85rem; white-space:nowrap">{formatTime(r.created_at)}</td>
                    <td>
                      <form method="post" action={`/admin/firmware/rollouts/${r.id}/state`} style="display:inline">
                        <input type="hidden" name="state" value={r.state === "paused" ? "active" : "paused"} />
                        <button type="submit" class="btn btn-sm" style="margin-right:0.25rem">
                          {r.state === "paused" ? "Resume" : "Pause"}
                        </button>
                      </form>
                      <form method="post" action={`/admin/firmware/rollouts/${r.id}/state`} style="display:inline">
                        <input type="hidden" name="state" value="complete" />
                        <button type="submit" class="btn btn-sm btn-danger">Complete</button>
                      </form>
                    </td>
                  </tr>
                );
              })
            )}
          </tbody>
        </table>
      </div>
    </Layout>
  );
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue