Bootloop detection & recovery (#4793)

* added boot loop detection and config backup
* automatic OTA rollback if loading backup does not fix it
* added new file handling functions
* adding verification of json files, added config restore at bootup if broken
* added function to compare contents of two files for future use (currently not used)
pull/4841/head
Damian Schneider 2025-08-15 20:43:04 +02:00 zatwierdzone przez GitHub
rodzic b8b59b2bb1
commit c9c442a933
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
6 zmienionych plików z 327 dodań i 2 usunięć

Wyświetl plik

@ -772,9 +772,30 @@ bool deserializeConfig(JsonObject doc, bool fromFS) {
return (doc["sv"] | true); return (doc["sv"] | true);
} }
static const char s_cfg_json[] PROGMEM = "/cfg.json"; static const char s_cfg_json[] PROGMEM = "/cfg.json";
// Back up the main config file (s_cfg_json) using backupFile().
// Returns whatever backupFile() reports.
bool backupConfig() {
  const bool backedUp = backupFile(s_cfg_json);
  return backedUp;
}
bool restoreConfig() {
return restoreFile(s_cfg_json);
}
// Check that the main config file (s_cfg_json) passes validateJsonFile().
// Returns whatever validateJsonFile() reports.
bool verifyConfig() {
  const bool configIsValid = validateJsonFile(s_cfg_json);
  return configIsValid;
}
// Move the current config file out of the way and request a reboot.
// The file is renamed to "<config path>.rst.json" rather than deleted, so its contents are kept.
// Does nothing when there is no config file (e.g. fresh install) or when the rename fails:
// in both cases nothing changed on the filesystem, so a reboot would only repeat the same
// situation and could restart the device over and over.
void resetConfig() {
  if (!WLED_FS.exists(s_cfg_json)) return; // nothing to reset
  DEBUG_PRINTLN(F("Reset config"));
  char backupname[32];
  // bounded formatting instead of strcpy/strcat; bail out rather than use a truncated name
  const int len = snprintf(backupname, sizeof(backupname), "%s.rst.json", s_cfg_json);
  if (len < 0 || len >= static_cast<int>(sizeof(backupname))) return;
  if (!WLED_FS.rename(s_cfg_json, backupname)) {
    DEBUG_PRINTLN(F("Reset config failed"));
    return;
  }
  doReboot = true;
}
bool deserializeConfigFromFS() { bool deserializeConfigFromFS() {
[[maybe_unused]] bool success = deserializeConfigSec(); [[maybe_unused]] bool success = deserializeConfigSec();
#ifdef WLED_ADD_EEPROM_SUPPORT #ifdef WLED_ADD_EEPROM_SUPPORT
@ -800,6 +821,7 @@ bool deserializeConfigFromFS() {
void serializeConfigToFS() { void serializeConfigToFS() {
serializeConfigSec(); serializeConfigSec();
backupConfig(); // backup before writing new config
DEBUG_PRINTLN(F("Writing settings to /cfg.json...")); DEBUG_PRINTLN(F("Writing settings to /cfg.json..."));

Wyświetl plik

@ -24,6 +24,10 @@ void handleIO();
void IRAM_ATTR touchButtonISR(); void IRAM_ATTR touchButtonISR();
//cfg.cpp //cfg.cpp
bool backupConfig();   // back up /cfg.json (delegates to backupFile()); returns its result
bool restoreConfig();  // restore /cfg.json from its backup (delegates to restoreFile()); returns its result
bool verifyConfig();   // check that /cfg.json is a valid JSON file (delegates to validateJsonFile())
void resetConfig();    // rename the config file and request a reboot
bool deserializeConfig(JsonObject doc, bool fromFS = false); bool deserializeConfig(JsonObject doc, bool fromFS = false);
bool deserializeConfigFromFS(); bool deserializeConfigFromFS();
bool deserializeConfigSec(); bool deserializeConfigSec();
@ -223,6 +227,11 @@ inline bool writeObjectToFileUsingId(const String &file, uint16_t id, const Json
inline bool writeObjectToFile(const String &file, const char* key, const JsonDocument* content) { return writeObjectToFile(file.c_str(), key, content); }; inline bool writeObjectToFile(const String &file, const char* key, const JsonDocument* content) { return writeObjectToFile(file.c_str(), key, content); };
inline bool readObjectFromFileUsingId(const String &file, uint16_t id, JsonDocument* dest, const JsonDocument* filter = nullptr) { return readObjectFromFileUsingId(file.c_str(), id, dest); }; inline bool readObjectFromFileUsingId(const String &file, uint16_t id, JsonDocument* dest, const JsonDocument* filter = nullptr) { return readObjectFromFileUsingId(file.c_str(), id, dest); };
inline bool readObjectFromFile(const String &file, const char* key, JsonDocument* dest, const JsonDocument* filter = nullptr) { return readObjectFromFile(file.c_str(), key, dest); }; inline bool readObjectFromFile(const String &file, const char* key, JsonDocument* dest, const JsonDocument* filter = nullptr) { return readObjectFromFile(file.c_str(), key, dest); };
bool copyFile(const char* src_path, const char* dst_path); // copy a file; an incomplete destination is deleted; true on success
bool backupFile(const char* filename);                     // validate a JSON file, then copy it to "/bkp.<name>"; true on success
bool restoreFile(const char* filename);                    // validate "/bkp.<name>", then copy it over the file; true on success
bool validateJsonFile(const char* filename);               // true if the file exists and parses as JSON
void dumpFilesToSerial();                                  // print the .json files in the root dir to Serial (wsec files excluded)
//hue.cpp //hue.cpp
void handleHue(); void handleHue();
@ -580,6 +589,10 @@ extern "C" {
#define d_free free #define d_free free
#endif #endif
void handleBootLoop(); // detect a bootloop and run the next recovery action, then restart
#ifndef ESP8266
void bootloopCheckOTA(); // set the next bootloop action to "swap boot image" instead of "restore config" (not available on ESP8266)
#endif
// RAII guard class for the JSON Buffer lock // RAII guard class for the JSON Buffer lock
// Modeled after std::lock_guard // Modeled after std::lock_guard
class JSONBufferGuard { class JSONBufferGuard {

Wyświetl plik

@ -439,3 +439,156 @@ bool handleFileRead(AsyncWebServerRequest* request, String path){
} }
return false; return false;
} }
// Copy src_path to dst_path in 128-byte blocks.
// Returns true only if every byte was copied.
// If the source is missing or cannot be opened, the destination is not touched at all.
// If the destination cannot be opened or the copy stops early, the destination is deleted
// so that no incomplete (corrupted) file is left behind.
bool copyFile(const char* src_path, const char* dst_path) {
  DEBUG_PRINTF("copyFile from %s to %s\n", src_path, dst_path);
  if (!WLED_FS.exists(src_path)) {
    DEBUG_PRINTLN(F("file not found"));
    return false;
  }
  // Open the source before the destination: opening the destination for writing ("w")
  // is expected to truncate an existing file, which must not happen if there is nothing to copy.
  File src = WLED_FS.open(src_path, "r");
  if (!src) {
    DEBUG_PRINTLN(F("copy failed"));
    return false;
  }
  File dst = WLED_FS.open(dst_path, "w");
  bool success = false; // stays false if the destination could not be opened
  if (dst) {
    success = true;
    uint8_t buf[128]; // copy file in 128-byte blocks
    while (src.available() > 0) {
      const size_t bytesRead = src.read(buf, sizeof(buf));
      // 0 means no data was read; a value larger than the buffer can only be an error code
      if (bytesRead == 0 || bytesRead > sizeof(buf)) {
        success = false;
        break;
      }
      if (dst.write(buf, bytesRead) != bytesRead) {
        success = false; // error, not all data written
        break;
      }
    }
    dst.close();
  }
  src.close();
  if (!success) {
    DEBUG_PRINTLN(F("copy failed"));
    WLED_FS.remove(dst_path); // delete incomplete file
  }
  return success;
}
// Compare two files block by block (128 bytes at a time).
// Returns true only if both files exist, can be opened and have identical contents.
// A read that yields no data while data is still reported as available is treated as a
// mismatch; otherwise such a read error would keep the loop spinning forever.
bool compareFiles(const char* path1, const char* path2) {
  DEBUG_PRINTF("compareFile %s and %s\n", path1, path2);
  if (!WLED_FS.exists(path1) || !WLED_FS.exists(path2)) {
    DEBUG_PRINTLN(F("file not found"));
    return false;
  }
  File f1 = WLED_FS.open(path1, "r");
  File f2 = WLED_FS.open(path2, "r");
  bool identical = f1 && f2; // false if either file could not be opened
  if (identical) {
    uint8_t buf1[128], buf2[128];
    while (f1.available() > 0 || f2.available() > 0) {
      const size_t len1 = f1.read(buf1, sizeof(buf1));
      const size_t len2 = f2.read(buf2, sizeof(buf2));
      if (len1 != len2) {
        identical = false; // files differ in size or read failed
        break;
      }
      if (len1 == 0 || len1 > sizeof(buf1)) {
        identical = false; // read error: nothing (or an error code) returned although data was pending
        break;
      }
      if (memcmp(buf1, buf2, len1) != 0) {
        identical = false; // files differ in content
        break;
      }
    }
  }
  if (f1) f1.close();
  if (f2) f2.close();
  return identical;
}
static const char s_backup_json[] PROGMEM = "/bkp.";

// Create a backup copy of a JSON file.
// filename must be an absolute path starting with '/'; the backup is written to
// s_backup_json followed by the name without its leading '/' (e.g. "/cfg.json" -> "/bkp.cfg.json").
// Returns true if the file is valid JSON and was copied completely.
// Returns false for a null/non-absolute name, an invalid source file, a backup name that
// does not fit the buffer, or a failed copy.
bool backupFile(const char* filename) {
  if (filename == nullptr || filename[0] != '/') return false; // "filename + 1" below requires a leading '/'
  DEBUG_PRINTF("backup %s \n", filename);
  if (!validateJsonFile(filename)) {
    DEBUG_PRINTLN(F("broken file"));
    return false;
  }
  char backupname[32];
  const int len = snprintf(backupname, sizeof(backupname), "%s%s", s_backup_json, filename + 1); // skip leading '/' in filename
  // a truncated name would point at a different file, so treat it as a failure
  const bool nameFits = (len > 0) && (len < static_cast<int>(sizeof(backupname)));
  if (nameFits && copyFile(filename, backupname)) {
    DEBUG_PRINTLN(F("backup ok"));
    return true;
  }
  DEBUG_PRINTLN(F("backup failed"));
  return false;
}
// Restore a file from its backup copy.
// filename must be an absolute path starting with '/'; the backup is expected at
// s_backup_json followed by the name without its leading '/' (e.g. "/cfg.json" <- "/bkp.cfg.json").
// Returns true if the backup exists, is valid JSON and was copied over filename completely.
// Returns false for a null/non-absolute name, a backup name that does not fit the buffer,
// a missing or invalid backup, or a failed copy.
bool restoreFile(const char* filename) {
  if (filename == nullptr || filename[0] != '/') return false; // "filename + 1" below requires a leading '/'
  DEBUG_PRINTF("restore %s \n", filename);
  char backupname[32];
  const int len = snprintf(backupname, sizeof(backupname), "%s%s", s_backup_json, filename + 1); // skip leading '/' in filename
  if (len <= 0 || len >= static_cast<int>(sizeof(backupname))) {
    // a truncated name would point at a different file
    DEBUG_PRINTLN(F("restore failed"));
    return false;
  }
  if (!WLED_FS.exists(backupname)) {
    DEBUG_PRINTLN(F("no backup found"));
    return false;
  }
  if (!validateJsonFile(backupname)) {
    DEBUG_PRINTLN(F("broken backup"));
    return false;
  }
  if (copyFile(backupname, filename)) {
    DEBUG_PRINTLN(F("restore ok"));
    return true;
  }
  DEBUG_PRINTLN(F("restore failed"));
  return false;
}
bool validateJsonFile(const char* filename) {
if (!WLED_FS.exists(filename)) return false;
File file = WLED_FS.open(filename, "r");
if (!file) return false;
StaticJsonDocument<0> doc, filter; // https://arduinojson.org/v6/how-to/validate-json/
bool result = deserializeJson(doc, file, DeserializationOption::Filter(filter)) == DeserializationError::Ok;
file.close();
if (!result) {
DEBUG_PRINTF("Invalid JSON file %s\n", filename);
} else {
DEBUG_PRINTF("Valid JSON file %s\n", filename);
}
return result;
}
// Print the contents of all .json files in the root directory to Serial, except files whose
// name starts with "wsec".
// Some cores/filesystems report File::name() with a leading '/', others without; the
// leading '/' is skipped before matching so that the "wsec" exclusion works in both cases.
void dumpFilesToSerial() {
  File rootdir = WLED_FS.open("/", "r");
  if (!rootdir) return; // root directory could not be opened
  File rootfile = rootdir.openNextFile();
  while (rootfile) {
    const char* fullname = rootfile.name();
    if (fullname != nullptr) {
      const char* shortname = (fullname[0] == '/') ? fullname + 1 : fullname;
      const size_t len = strlen(shortname);
      const bool isSecret = strncmp(shortname, "wsec", 4) == 0;
      const bool isJson = len >= 6 && strcmp(shortname + len - 5, ".json") == 0;
      // only dump files that end in ".json" and do not start with "wsec"
      if (!isSecret && isJson) {
        Serial.println(fullname);
        while (rootfile.available()) {
          Serial.write(rootfile.read());
        }
        Serial.println();
        Serial.println();
      }
    }
    rootfile.close();
    rootfile = rootdir.openNextFile();
  }
  rootdir.close();
}

Wyświetl plik

@ -1,6 +1,12 @@
#include "wled.h" #include "wled.h"
#include "fcn_declare.h" #include "fcn_declare.h"
#include "const.h" #include "const.h"
#ifdef ESP8266
#include "user_interface.h" // for bootloop detection
#elif ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0)
#include "esp32/rtc.h" // for bootloop detection
#include <Update.h>
#endif
//helper to get int value at a position in string //helper to get int value at a position in string
@ -706,6 +712,125 @@ void *realloc_malloc(void *ptr, size_t size) {
} }
#endif #endif
// bootloop detection and handling
// checks if the ESP reboots multiple times due to a crash or watchdog timeout
// if a bootloop is detected, one recovery action is executed per detection, in this order:
// restore config from backup -> reset config -> switch boot image (not on ESP8266) -> dump files to serial
#define BOOTLOOP_THRESHOLD 5 // number of consecutive crashes to trigger bootloop detection
#define BOOTLOOP_ACTION_RESTORE 0 // default action: restore config from its backup copy (/bkp.cfg.json)
#define BOOTLOOP_ACTION_RESET 1 // if restore does not work, reset config (rename /cfg.json to /cfg.json.rst.json)
#define BOOTLOOP_ACTION_OTA 2 // swap the boot partition (handled only on non-ESP8266 targets)
#define BOOTLOOP_ACTION_DUMP 3 // nothing seems to help, dump files to serial and reboot (until hardware reset)
#ifdef ESP8266
#define BOOTLOOP_INTERVAL_TICKS (5 * 160000) // time limit between crashes: ~5 seconds in RTC ticks
// the values below are offsets passed to ESP.rtcUserMemoryRead()/rtcUserMemoryWrite()
#define BOOT_TIME_IDX 0 // index in RTC memory for boot time
#define CRASH_COUNTER_IDX 1 // index in RTC memory for crash counter
#define ACTIONT_TRACKER_IDX 2 // index in RTC memory for boot action
#else
#define BOOTLOOP_INTERVAL_TICKS 5000 // time limit between crashes: ~5 seconds in milliseconds
// variables in RTC_NOINIT memory persist between reboots (but not on hardware reset)
RTC_NOINIT_ATTR static uint32_t bl_last_boottime; // RTC time (ms) recorded at the previous boot
RTC_NOINIT_ATTR static uint32_t bl_crashcounter;  // number of crashes counted so far
RTC_NOINIT_ATTR static uint32_t bl_actiontracker; // next recovery action (one of BOOTLOOP_ACTION_*)
// called after a successful OTA update: makes "swap boot image" the next bootloop action instead of "restore config"
void bootloopCheckOTA() { bl_actiontracker = BOOTLOOP_ACTION_OTA; } // swap boot image if bootloop is detected instead of restoring config
#endif
// Detect a bootloop by checking the reset reason and the time since the last boot.
// A crash (exception/panic or watchdog reset) that follows the previous boot within
// BOOTLOOP_INTERVAL_TICKS increments the crash counter; a crash after a longer interval
// clears it, so only consecutive quick crashes add up to BOOTLOOP_THRESHOLD.
// Any non-crash reset clears the counter; unless it was a software restart, it also
// re-arms the action tracker to BOOTLOOP_ACTION_RESTORE.
// Returns true when the threshold is reached (the counter is then cleared), false otherwise.
// On non-ESP8266 targets with ESP-IDF older than 4.4.0 no detection is compiled in and
// false is always returned.
static bool detectBootLoop() {
#if !defined(ESP8266)
  #if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(4, 4, 0)
  uint32_t rtctime = esp_rtc_get_time_us() / 1000; // convert to milliseconds
  esp_reset_reason_t reason = esp_reset_reason();
  if (!(reason == ESP_RST_PANIC || reason == ESP_RST_WDT || reason == ESP_RST_INT_WDT || reason == ESP_RST_TASK_WDT)) {
    // no crash detected, init variables
    bl_crashcounter = 0;
    bl_last_boottime = rtctime;
    if (reason != ESP_RST_SW)
      bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler)
    if (reason == ESP_RST_BROWNOUT) {
      // a brownout is not one of the crash reasons above, so it has to be reported from this branch
      // crash due to brownout can't be detected unless using flash memory to store bootloop variables
      // this is a simpler way to preemptively revert the config in case current brownout is caused by a bad choice of settings
      DEBUG_PRINTLN(F("brownout detected"));
      //restoreConfig(); // TODO: blindly restoring config if brownout detected is a bad idea, need a better way (if at all)
    }
  } else {
    // system has crashed
    uint32_t rebootinterval = rtctime - bl_last_boottime;
    bl_last_boottime = rtctime; // store current runtime for next reboot
    if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) {
      bl_crashcounter++;
      if (bl_crashcounter >= BOOTLOOP_THRESHOLD) {
        DEBUG_PRINTLN(F("!BOOTLOOP DETECTED!"));
        bl_crashcounter = 0;
        return true;
      }
    } else {
      bl_crashcounter = 0; // long interval: not part of a bootloop, count only consecutive quick crashes
    }
  }
  #endif
#else // ESP8266
  rst_info* resetreason = system_get_rst_info();
  uint32_t bl_last_boottime;
  uint32_t bl_crashcounter;
  uint32_t bl_actiontracker;
  uint32_t rtctime = system_get_rtc_time();
  if (!(resetreason->reason == REASON_EXCEPTION_RST || resetreason->reason == REASON_WDT_RST)) {
    // no crash detected, init variables
    bl_crashcounter = 0;
    ESP.rtcUserMemoryWrite(BOOT_TIME_IDX, &rtctime, sizeof(uint32_t));
    ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
    if (resetreason->reason != REASON_SOFT_RESTART) {
      bl_actiontracker = BOOTLOOP_ACTION_RESTORE; // init action tracker if not an intentional reboot (e.g. from OTA or bootloop handler)
      ESP.rtcUserMemoryWrite(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t));
    }
  } else {
    // system has crashed
    ESP.rtcUserMemoryRead(BOOT_TIME_IDX, &bl_last_boottime, sizeof(uint32_t));
    ESP.rtcUserMemoryRead(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
    uint32_t rebootinterval = rtctime - bl_last_boottime;
    ESP.rtcUserMemoryWrite(BOOT_TIME_IDX, &rtctime, sizeof(uint32_t)); // store current ticks for next reboot
    if (rebootinterval < BOOTLOOP_INTERVAL_TICKS) {
      bl_crashcounter++;
      ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
      if (bl_crashcounter >= BOOTLOOP_THRESHOLD) {
        DEBUG_PRINTLN(F("BOOTLOOP DETECTED"));
        bl_crashcounter = 0;
        ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
        return true;
      }
    } else {
      bl_crashcounter = 0; // long interval: not part of a bootloop, count only consecutive quick crashes
      ESP.rtcUserMemoryWrite(CRASH_COUNTER_IDX, &bl_crashcounter, sizeof(uint32_t));
    }
  }
#endif
  return false; // no bootloop detected
}
// Check for a bootloop and, if one is detected, run the next recovery action and restart.
// Actions advance one step per detected bootloop:
//   restore config from backup -> reset config -> swap boot partition (not on ESP8266) -> dump files to serial
// The next action is kept in bl_actiontracker. On ESP8266 that value lives in RTC user
// memory, so it is read at the start and written back before restarting; without the
// write-back the sequence would never get past the first step.
void handleBootLoop() {
  DEBUG_PRINTLN(F("checking for bootloop"));
  if (!detectBootLoop()) return; // no bootloop detected
#ifdef ESP8266
  uint32_t bl_actiontracker;
  ESP.rtcUserMemoryRead(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t));
#endif
  if (bl_actiontracker == BOOTLOOP_ACTION_RESTORE) {
    restoreConfig(); // note: if this fails, could reset immediately. instead just let things play out and save a few lines of code
    bl_actiontracker = BOOTLOOP_ACTION_RESET; // reset config if it keeps bootlooping
  } else if (bl_actiontracker == BOOTLOOP_ACTION_RESET) {
    resetConfig();
    bl_actiontracker = BOOTLOOP_ACTION_OTA; // swap boot partition if it keeps bootlooping. On ESP8266 there is no OTA branch, so this leads to the dump below
  }
#ifndef ESP8266
  else if (bl_actiontracker == BOOTLOOP_ACTION_OTA) {
    if (Update.canRollBack()) {
      DEBUG_PRINTLN(F("Swapping boot partition..."));
      Update.rollBack(); // swap boot partition
    }
    bl_actiontracker = BOOTLOOP_ACTION_DUMP; // out of options
  }
#endif
  else {
    dumpFilesToSerial();
  }
#ifdef ESP8266
  // persist the next action; the local copy is lost on restart
  ESP.rtcUserMemoryWrite(ACTIONT_TRACKER_IDX, &bl_actiontracker, sizeof(uint32_t));
#endif
  ESP.restart(); // restart cleanly and don't wait for another crash
}
/* /*
* Fixed point integer based Perlin noise functions by @dedehai * Fixed point integer based Perlin noise functions by @dedehai
* Note: optimized for speed and to mimic fastled inoise functions, not for accuracy or best randomness * Note: optimized for speed and to mimic fastled inoise functions, not for accuracy or best randomness

Wyświetl plik

@ -410,6 +410,9 @@ void WLED::setup()
DEBUGFS_PRINTLN(F("FS failed!")); DEBUGFS_PRINTLN(F("FS failed!"));
errorFlag = ERR_FS_BEGIN; errorFlag = ERR_FS_BEGIN;
} }
handleBootLoop(); // check for bootloop and take action (requires WLED_FS)
#ifdef WLED_ADD_EEPROM_SUPPORT #ifdef WLED_ADD_EEPROM_SUPPORT
else deEEP(); else deEEP();
#else #else
@ -425,6 +428,11 @@ void WLED::setup()
WLED_SET_AP_SSID(); // otherwise it is empty on first boot until config is saved WLED_SET_AP_SSID(); // otherwise it is empty on first boot until config is saved
multiWiFi.push_back(WiFiConfig(CLIENT_SSID,CLIENT_PASS)); // initialise vector with default WiFi multiWiFi.push_back(WiFiConfig(CLIENT_SSID,CLIENT_PASS)); // initialise vector with default WiFi
if(!verifyConfig()) {
if(!restoreConfig()) {
resetConfig();
}
}
DEBUG_PRINTLN(F("Reading config")); DEBUG_PRINTLN(F("Reading config"));
bool needsCfgSave = deserializeConfigFromFS(); bool needsCfgSave = deserializeConfigFromFS();
DEBUG_PRINTF_P(PSTR("heap %u\n"), ESP.getFreeHeap()); DEBUG_PRINTF_P(PSTR("heap %u\n"), ESP.getFreeHeap());

Wyświetl plik

@ -411,6 +411,9 @@ void initServer()
serveMessage(request, 500, F("Update failed!"), F("Please check your file and retry!"), 254); serveMessage(request, 500, F("Update failed!"), F("Please check your file and retry!"), 254);
} else { } else {
serveMessage(request, 200, F("Update successful!"), FPSTR(s_rebooting), 131); serveMessage(request, 200, F("Update successful!"), FPSTR(s_rebooting), 131);
#ifndef ESP8266
bootloopCheckOTA(); // let the bootloop-checker know there was an OTA update
#endif
doReboot = true; doReboot = true;
} }
},[](AsyncWebServerRequest *request, String filename, size_t index, uint8_t *data, size_t len, bool isFinal){ },[](AsyncWebServerRequest *request, String filename, size_t index, uint8_t *data, size_t len, bool isFinal){
@ -429,8 +432,9 @@ void initServer()
UsermodManager::onUpdateBegin(true); // notify usermods that update is about to begin (some may require task de-init) UsermodManager::onUpdateBegin(true); // notify usermods that update is about to begin (some may require task de-init)
lastEditTime = millis(); // make sure PIN does not lock during update lastEditTime = millis(); // make sure PIN does not lock during update
strip.suspend(); strip.suspend();
#ifdef ESP8266 backupConfig(); // backup current config in case the update ends badly
strip.resetSegments(); // free as much memory as you can strip.resetSegments(); // free as much memory as you can
#ifdef ESP8266
Update.runAsync(true); Update.runAsync(true);
#endif #endif
Update.begin((ESP.getFreeSketchSpace() - 0x1000) & 0xFFFFF000); Update.begin((ESP.getFreeSketchSpace() - 0x1000) & 0xFFFFF000);