feat(os-ota): kiosk-side RAUC bundle consumer

Phase 3 of the OS OTA pipeline. New module kiosk/src/os_update.rs polls
/api/kiosk/os/check with the kiosk's compatibility string and current OS
version (read from /etc/betterframe/os-compatibility +
/etc/betterframe/os-version, both written by the image build), downloads
the bundle, sha256-verifies the transport, and hands off to
`rauc install`. RAUC takes it from there: CMS signature verify against
/etc/rauc/keyring.pem, copy into inactive A/B slot, arm tryboot via the
custom bootloader backend, return. We then post /api/kiosk/os/applied
and `systemctl reboot` into the new slot.

Wired into the existing 60s heartbeat loop in ui.rs, gated by
BF_ENABLE_OS_OTA=1 (default OFF so dev kiosks on non-A/B images don't
keep trying + failing). Runs BEFORE the kiosk-binary check on each tick
so an OS bundle that ships an updated kiosk binary doesn't race the
firmware path.

On clean-boot heartbeat success we now also call `rauc status
mark-good` so the boot-attempts counter resets — three bad boots in a
row will auto-roll back without us needing a separate rollback path.

What's NOT in this commit:
  - A/B partition layout in the pi-gen image (task #6, blocks actual
    deployment — bundles can be served + accepted but `rauc install`
    will refuse without two valid slots).
  - Admin UI for managing releases + rollouts (task #4).
This commit is contained in:
Mitchell R 2026-05-21 10:47:45 +02:00
parent 084c119c44
commit 659670b494
No known key found for this signature in database
4 changed files with 301 additions and 2 deletions

View file

@ -40,6 +40,7 @@ gpiod = "0.3"
sha2 = "0.10" sha2 = "0.10"
ed25519-dalek = { version = "2", features = ["pem"] } ed25519-dalek = { version = "2", features = ["pem"] }
base64 = "0.22" base64 = "0.22"
urlencoding = "2"
# Local HTTP server on kiosk (LAN GET-only layout switch + admin proxy) # Local HTTP server on kiosk (LAN GET-only layout switch + admin proxy)
axum = "0.7" axum = "0.7"

View file

@ -4,6 +4,7 @@ mod firmware;
mod gpio; mod gpio;
mod hwmon; mod hwmon;
mod local_server; mod local_server;
mod os_update;
mod pipeline; mod pipeline;
mod server; mod server;
mod ui; mod ui;

233
kiosk/src/os_update.rs Normal file
View file

@ -0,0 +1,233 @@
//! Kiosk-side full-OS OTA via RAUC.
//!
//! Mirrors `firmware.rs` (which handles the kiosk binary) but for the
//! complete OS image. Server endpoints:
//!
//! GET /api/kiosk/os/check?compatibility=<X>&current=<ver>
//! → { up_to_date: true } | { up_to_date: false, update: {...} }
//! GET /api/kiosk/os/download/:release_id
//! → octet-stream .raucb bytes
//! POST /api/kiosk/os/applied { version, error? }
//!
//! Signature verification is RAUC's job — bundles are signed with the
//! X.509 cert pair generated by scripts/gen-rauc-signing-keys.sh, and
//! the corresponding CA cert is baked into the image at
//! /etc/rauc/keyring.pem. We only sha256-check the download here to
//! catch transport corruption before handing off to `rauc install`.
//!
//! Slot switching, atomic copy, and rollback are RAUC's job too —
//! we just shell out to `rauc install`, post the outcome, and tell
//! systemd to reboot. The custom bootloader backend
//! (deploy/rauc/betterframe-rauc-boot.sh) flips Pi 5 tryboot on the
//! next boot.
//!
//! Gated by env `BF_ENABLE_OS_OTA=1`. Default OFF so dev kiosks running
//! a non-A/B layout don't try (and fail) to RAUC-install bundles.
//!
//! Compatibility: read from `/etc/betterframe/os-compatibility` (written
//! at image build time). Falls back to env `BF_RAUC_COMPATIBILITY`, then
//! a hardcoded default matching deploy/rauc/system.conf.
use std::fs;
use std::path::PathBuf;
use std::process::Command;
use std::time::Duration;
use serde::Deserialize;
use sha2::{Digest, Sha256};
use tracing::{info, warn};
/// Compatibility string used when neither the image file nor the
/// environment provides one.
pub const DEFAULT_COMPATIBILITY: &str = "betterframe-rpi5-aarch64";

/// Resolve the RAUC compatibility string sent to the update server.
///
/// Sources, in priority order:
/// 1. `/etc/betterframe/os-compatibility`
/// 2. env `BF_RAUC_COMPATIBILITY`
/// 3. [`DEFAULT_COMPATIBILITY`]
///
/// Every source is trimmed, and a blank value is treated as "not set" so an
/// empty file or an empty environment variable falls through to the next
/// source instead of producing an empty compatibility string.
fn compatibility() -> String {
    /// Trim `raw`; `None` when nothing but whitespace remains.
    fn non_blank(raw: String) -> Option<String> {
        let trimmed = raw.trim();
        (!trimmed.is_empty()).then(|| trimmed.to_string())
    }

    fs::read_to_string("/etc/betterframe/os-compatibility")
        .ok()
        .and_then(non_blank)
        // The environment is only consulted when the file gave us nothing.
        .or_else(|| {
            std::env::var("BF_RAUC_COMPATIBILITY")
                .ok()
                .and_then(non_blank)
        })
        .unwrap_or_else(|| DEFAULT_COMPATIBILITY.to_string())
}
/// Version of the OS currently running, as recorded in
/// `/etc/betterframe/os-version`.
///
/// Returns the file contents with surrounding whitespace removed, or an
/// empty string when the file cannot be read or holds only whitespace.
fn current_os_version() -> String {
    fs::read_to_string("/etc/betterframe/os-version")
        .map(|contents| contents.trim().to_owned())
        .unwrap_or_default()
}
/// JSON body returned by `GET /api/kiosk/os/check`.
///
/// Either `{ up_to_date: true }` or `{ up_to_date: false, update: {...} }`.
#[derive(Debug, Deserialize)]
pub struct CheckResponse {
/// `true` when the server has no newer release for this kiosk.
pub up_to_date: bool,
/// The offered release. `check` only hands this back when `up_to_date`
/// is `false`; it may still be absent in that case.
pub update: Option<UpdateInfo>,
}
/// Description of one OS bundle offered by the server, deserialized from the
/// `update` object of the check response.
#[derive(Debug, Clone, Deserialize)]
pub struct UpdateInfo {
/// Server-side identifier of the release; used to name the staged bundle
/// file and in log output.
pub release_id: String,
/// Version label of the release; logged and sent back in the
/// `/api/kiosk/os/applied` report.
pub version: String,
// Not read inside this module.
// NOTE(review): the heartbeat loop logs `info.channel`, so this allow may
// no longer be needed — confirm.
#[allow(dead_code)]
pub channel: String,
// Not read inside this module; deserialized so the field stays required.
#[allow(dead_code)]
pub compatibility: String,
/// Hex-encoded SHA-256 of the bundle, compared against the digest of the
/// downloaded bytes.
pub sha256: String,
/// Expected bundle size in bytes, compared against the downloaded length.
pub size_bytes: u64,
// Not read inside this module; optional in the server response.
#[allow(dead_code)]
pub bundle_format: Option<String>,
/// Download location, appended directly to the server base URL — so
/// presumably a server-relative path starting with `/` (verify against the
/// server's response).
pub download_url: String,
}
/// Ask `/api/kiosk/os/check` whether a newer OS bundle exists for this kiosk.
///
/// The request carries the kiosk's compatibility string and current OS
/// version as URL-encoded query parameters and authenticates with `key` as a
/// bearer token. Returns `Some(UpdateInfo)` when the server offers an
/// upgrade; `None` when the kiosk is up to date, the request fails, the
/// server answers with a non-success status, or the body does not parse.
/// Failures are logged at warn level.
pub fn check(server: &str, key: &str) -> Option<UpdateInfo> {
    let query_compat = compatibility();
    let query_current = current_os_version();
    let endpoint = format!(
        "{server}/api/kiosk/os/check?compatibility={}&current={}",
        urlencoding::encode(&query_compat),
        urlencoding::encode(&query_current),
    );

    let request = reqwest::blocking::Client::new()
        .get(&endpoint)
        .header("Authorization", format!("Bearer {key}"))
        .timeout(Duration::from_secs(10));

    let response = request
        .send()
        .map_err(|err| warn!("os-update check: request failed: {err}"))
        .ok()?;

    let status = response.status();
    if !status.is_success() {
        warn!("os-update check: HTTP {}", status);
        return None;
    }

    let parsed = response
        .json::<CheckResponse>()
        .map_err(|err| warn!("os-update check: parse failed: {err}"))
        .ok()?;

    if parsed.up_to_date {
        None
    } else {
        parsed.update
    }
}
/// Download → sha256 verify → `rauc install` → post outcome → reboot.
///
/// The bundle is streamed to `/var/tmp/betterframe/os-<release_id>.raucb`
/// while its size and sha256 are checked, then handed to `rauc install`.
/// The staged file is removed afterwards regardless of the outcome.
///
/// On success: reports to /api/kiosk/os/applied, runs `systemctl reboot`
/// and does not return. On any failure up to and including `rauc install`:
/// posts the error to /api/kiosk/os/applied (best effort) and returns `Err`
/// so the caller logs it. If `systemctl reboot` itself cannot be spawned or
/// exits non-zero, `Err` is returned and the caller logs it.
pub fn apply(server: &str, key: &str, info: &UpdateInfo) -> Result<(), String> {
    info!(
        "os-update: applying {} ({} bytes, release {})",
        info.version, info.size_bytes, info.release_id
    );

    // Steps 1-4. Every failure goes through this single reporting point so
    // download / verification errors reach the server as well.
    if let Err(msg) = stage_and_install(server, key, info) {
        let _ = report_applied(server, key, &info.version, Some(&msg));
        return Err(msg);
    }

    // 5. Report success BEFORE reboot so the outcome is recorded even though
    //    the process is about to go away. Best effort: a failed report must
    //    not block the reboot.
    let _ = report_applied(server, key, &info.version, None);
    info!("os-update: rauc install OK → rebooting into the new slot");

    match Command::new("systemctl").arg("reboot").status() {
        Ok(status) if status.success() => {
            // systemctl reboot returns before the reboot completes; sleep
            // briefly so we don't race main() into a re-entry.
            std::thread::sleep(Duration::from_secs(30));
            std::process::exit(0);
        }
        // A non-zero exit means no reboot was scheduled; surface it instead
        // of exiting as if the reboot were under way.
        Ok(status) => Err(format!("systemctl reboot exit {status:?}")),
        Err(e) => Err(format!("systemctl reboot: {e}")),
    }
}

/// Stage the bundle under /var/tmp/betterframe, run `rauc install` on it and
/// always remove the staged file afterwards.
fn stage_and_install(server: &str, key: &str, info: &UpdateInfo) -> Result<(), String> {
    // /var/tmp rather than /tmp: /tmp may be a size-capped tmpfs and bundles
    // can be big.
    let staging_dir = PathBuf::from("/var/tmp/betterframe");
    fs::create_dir_all(&staging_dir).map_err(|e| format!("mkdir staging: {e}"))?;
    let bundle_path = staging_dir.join(staging_file_name(&info.release_id));

    let outcome = download_bundle(server, key, info, &bundle_path)
        .and_then(|()| run_rauc_install(&bundle_path));

    // Runs on success and on every failure, so partial downloads and
    // bundles rauc never got to open do not pile up on disk.
    let _ = fs::remove_file(&bundle_path);
    outcome
}

/// Build the staged file name, replacing anything but `[A-Za-z0-9._-]` in the
/// server-supplied release id so it cannot escape the staging directory.
fn staging_file_name(release_id: &str) -> String {
    let safe: String = release_id
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | '.') {
                c
            } else {
                '_'
            }
        })
        .collect();
    format!("os-{safe}.raucb")
}

/// Stream the bundle to `dest`, checking size and sha256 on the fly so the
/// whole image never has to sit in memory.
fn download_bundle(
    server: &str,
    key: &str,
    info: &UpdateInfo,
    dest: &std::path::Path,
) -> Result<(), String> {
    use std::io::{Read, Write};

    let url = format!("{}{}", server, info.download_url);
    let mut resp = reqwest::blocking::Client::new()
        .get(&url)
        .header("Authorization", format!("Bearer {key}"))
        .timeout(Duration::from_secs(600)) // OS bundles run hundreds of MB
        .send()
        .map_err(|e| format!("download request: {e}"))?;
    if !resp.status().is_success() {
        return Err(format!("download HTTP {}", resp.status()));
    }

    let mut file = fs::File::create(dest).map_err(|e| format!("write bundle: {e}"))?;
    let mut hasher = Sha256::new();
    let mut chunk = vec![0u8; 64 * 1024];
    let mut received: u64 = 0;
    loop {
        let n = match resp.read(&mut chunk) {
            Ok(0) => break,
            Ok(n) => n,
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(format!("download body: {e}")),
        };
        received += n as u64;
        // Stop early rather than letting an oversized body fill the disk.
        if received > info.size_bytes {
            return Err(format!(
                "size mismatch: expected {}, got more than that",
                info.size_bytes
            ));
        }
        hasher.update(&chunk[..n]);
        file.write_all(&chunk[..n])
            .map_err(|e| format!("write bundle: {e}"))?;
    }
    file.flush().map_err(|e| format!("write bundle: {e}"))?;

    if received != info.size_bytes {
        return Err(format!(
            "size mismatch: expected {}, got {}",
            info.size_bytes, received
        ));
    }

    // sha256 only catches transport corruption; RAUC verifies the CMS
    // signature itself when it opens the bundle.
    let digest = hasher.finalize();
    let got_sha = hex_lower(&digest);
    // Hex case carries no meaning, so accept upper-case digests too.
    if !got_sha.eq_ignore_ascii_case(info.sha256.trim()) {
        return Err(format!(
            "sha256 mismatch: expected {}, got {}",
            info.sha256, got_sha
        ));
    }
    Ok(())
}

/// Run `rauc install <bundle>` and wait for it. Exit code 0 = success;
/// anything else leaves the current slot booted.
fn run_rauc_install(bundle: &std::path::Path) -> Result<(), String> {
    let status = Command::new("rauc")
        .arg("install")
        // Pass the path as an OsStr; no lossy UTF-8 round-trip.
        .arg(bundle)
        .status()
        .map_err(|e| format!("rauc spawn: {e}"))?;
    if status.success() {
        Ok(())
    } else {
        Err(format!("rauc install exit {status:?}"))
    }
}
fn report_applied(server: &str, key: &str, version: &str, error: Option<&str>) -> Result<(), String> {
let payload = if let Some(err) = error {
serde_json::json!({ "version": version, "error": err })
} else {
serde_json::json!({ "version": version })
};
reqwest::blocking::Client::new()
.post(format!("{server}/api/kiosk/os/applied"))
.header("Authorization", format!("Bearer {key}"))
.json(&payload)
.timeout(Duration::from_secs(5))
.send()
.map(|_| ())
.map_err(|e| format!("report applied: {e}"))
}
/// Encode `bytes` as a lowercase hexadecimal string, two digits per byte.
fn hex_lower(bytes: &[u8]) -> String {
    const DIGITS: [char; 16] = [
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
    ];
    bytes
        .iter()
        .fold(String::with_capacity(bytes.len() * 2), |mut out, &byte| {
            // High nibble first, then low nibble.
            out.push(DIGITS[usize::from(byte >> 4)]);
            out.push(DIGITS[usize::from(byte & 0x0f)]);
            out
        })
}

View file

@ -267,18 +267,26 @@ fn activate(app: &Application) {
}); });
// Heartbeat loop — reports display geometry + hwmon, also checks for // Heartbeat loop — reports display geometry + hwmon, also checks for
// firmware updates so kiosks pick up new builds without admin push. // firmware + OS bundle updates so kiosks pick up new builds without
// admin push.
let mut first_iter = true; let mut first_iter = true;
loop { loop {
let heartbeat_ok = send_heartbeat_now(&server, &key); let heartbeat_ok = send_heartbeat_now(&server, &key);
if first_iter && heartbeat_ok { if first_iter && heartbeat_ok {
// Successfully heart-beat at least once → consider this boot a // Successfully heart-beat at least once → consider this boot a
// healthy one. Clears the rollback-pending marker so the next // healthy one. Clears the rollback-pending marker so the next
// start doesn't try to roll back a healthy install. // start doesn't try to roll back a healthy install, AND tells
// RAUC the current slot is good so its boot-attempts counter
// resets (otherwise three bad boots auto-roll back).
firmware::mark_firmware_applied(); firmware::mark_firmware_applied();
mark_kiosk_healthy(); mark_kiosk_healthy();
mark_rauc_slot_good();
first_iter = false; first_iter = false;
} }
// OS bundle first — if it succeeds it reboots and we never reach
// the firmware check below this iteration. Order matters: an OS
// bundle update can ship an app-binary change anyway.
maybe_apply_os_update(&server, &key);
maybe_apply_firmware_update(&server, &key); maybe_apply_firmware_update(&server, &key);
std::thread::sleep(std::time::Duration::from_secs(60)); std::thread::sleep(std::time::Duration::from_secs(60));
} }
@ -442,6 +450,62 @@ fn mark_kiosk_healthy() {
} }
} }
/// Mark the currently booted RAUC slot as good by running
/// `rauc status mark-good`, so a clean boot does not count towards an
/// automatic rollback.
///
/// Best effort: the command's output is discarded and both a failure to
/// spawn `rauc` (e.g. not installed on dev / non-A/B kiosks) and a non-zero
/// exit are ignored.
fn mark_rauc_slot_good() {
    use std::process::{Command, Stdio};

    let mut mark_good = Command::new("rauc");
    mark_good
        .arg("status")
        .arg("mark-good")
        .stdout(Stdio::null())
        .stderr(Stdio::null());
    // Outcome intentionally dropped; there is nothing useful to do on error.
    let _ = mark_good.status();
}
/// Check for a full-OS RAUC bundle and, if one is offered, apply it.
///
/// Does nothing unless env `BF_ENABLE_OS_OTA` is exactly `1`. When the
/// server offers a bundle, the offer is logged locally and to the server,
/// then `os_update::apply` is invoked; on success that call reboots the
/// system and never returns. A failed apply is logged locally and reported
/// to the server, and the kiosk keeps running.
fn maybe_apply_os_update(server_url: &str, kiosk_key: &str) {
    let enabled = matches!(std::env::var("BF_ENABLE_OS_OTA").as_deref(), Ok("1"));
    if !enabled {
        return;
    }

    let update = match os_update::check(server_url, kiosk_key) {
        Some(found) => found,
        None => return,
    };

    info!("os-update: bundle {} available", update.version);
    let offer_details = serde_json::json!({
        "target_version": &update.version,
        "channel": &update.channel,
        "release_id": &update.release_id,
        "size_bytes": update.size_bytes,
    });
    server::report_kiosk_log(
        server_url,
        kiosk_key,
        "info",
        "os update available",
        offer_details,
    );

    // Only the failure path comes back here — a successful apply reboots.
    let Err(err) = os_update::apply(server_url, kiosk_key, &update) else {
        return;
    };
    warn!("os-update: apply failed: {err}");
    let failure_details = serde_json::json!({
        "target_version": &update.version,
        "release_id": &update.release_id,
        "error": &err,
    });
    server::report_kiosk_log(
        server_url,
        kiosk_key,
        "error",
        "os update failed",
        failure_details,
    );
}
/// Ask the server whether an update is available. On hit, download + verify /// Ask the server whether an update is available. On hit, download + verify
/// + swap + report + exit (systemd brings up the new binary). On miss or /// + swap + report + exit (systemd brings up the new binary). On miss or
/// error: log + keep running. Designed to be safe to call from any thread. /// error: log + keep running. Designed to be safe to call from any thread.