bug #4184: fix config file loss due to filesystem write errors (#4397)

* Use SafeFile for atomic file writing (with xor checksum readback)
* Write db.proto last because it could be the largest file on the FS (and less critical)
* Don't keep a tmp file around while writing db.proto (because too big to fit two files in the filesystem)
* generate a new critial fault if we encounter errors writing to flash
either CriticalErrorCode_FLASH_CORRUPTION_RECOVERABLE or CriticalErrorCode_FLASH_CORRUPTION_UNRECOVERABLE
(depending on if the second write attempt worked)
* reformat the filesystem if we detect it is corrupted (then rewrite our config files) (only on nrf52 - not sure
yet if we should bother on ESP32)
* If we have to format the FS, make sure to preserve the oem.proto if it exists

Co-authored-by: Ben Meadors <benmmeadors@gmail.com>
pull/4408/head
geeksville 2024-08-06 11:59:06 -07:00 zatwierdzone przez GitHub
rodzic c1870f91fc
commit 66c41e683d
Nie znaleziono w bazie danych klucza dla tego podpisu
ID klucza GPG: B5690EEEBB952194
7 zmienionych plików z 278 dodań i 83 usunięć

99
src/SafeFile.cpp 100644
Wyświetl plik

@ -0,0 +1,99 @@
#include "SafeFile.h"
#ifdef FSCom
// Only way to work on both esp32 and nrf52
static File openFile(const char *filename, bool fullAtomic)
{
if (!fullAtomic)
FSCom.remove(filename); // Nuke the old file to make space (ignore if it !exists)
String filenameTmp = filename;
filenameTmp += ".tmp";
return FSCom.open(filenameTmp.c_str(), FILE_O_WRITE);
}
SafeFile::SafeFile(const char *_filename, bool fullAtomic)
: filename(_filename), f(openFile(_filename, fullAtomic)), fullAtomic(fullAtomic)
{
}
size_t SafeFile::write(uint8_t ch)
{
if (!f)
return 0;
hash ^= ch;
return f.write(ch);
}
size_t SafeFile::write(const uint8_t *buffer, size_t size)
{
if (!f)
return 0;
for (size_t i = 0; i < size; i++) {
hash ^= buffer[i];
}
return f.write((uint8_t const *)buffer, size); // This nasty cast is _IMPORTANT_ otherwise the correct adafruit method does
// not get used (they made a mistake in their typing)
}
/**
* Atomically close the file (deleting any old versions) and readback the contents to confirm the hash matches
*
* @return false for failure
*/
bool SafeFile::close()
{
if (!f)
return false;
f.close();
if (!testReadback())
return false;
// brief window of risk here ;-)
if (fullAtomic && FSCom.exists(filename.c_str()) && !FSCom.remove(filename.c_str())) {
LOG_ERROR("Can't remove old pref file\n");
return false;
}
String filenameTmp = filename;
filenameTmp += ".tmp";
if (!renameFile(filenameTmp.c_str(), filename.c_str())) {
LOG_ERROR("Error: can't rename new pref file\n");
return false;
}
return true;
}
/// Read our (closed) tempfile back in and compare the hash
bool SafeFile::testReadback()
{
String filenameTmp = filename;
filenameTmp += ".tmp";
auto f2 = FSCom.open(filenameTmp.c_str(), FILE_O_READ);
if (!f2) {
LOG_ERROR("Can't open tmp file for readback\n");
return false;
}
int c = 0;
uint8_t test_hash = 0;
while ((c = f2.read()) >= 0) {
test_hash ^= (uint8_t)c;
}
f2.close();
if (test_hash != hash) {
LOG_ERROR("Readback failed hash mismatch\n");
return false;
}
return true;
}
#endif

49
src/SafeFile.h 100644
Wyświetl plik

@ -0,0 +1,49 @@
#pragma once
#include "FSCommon.h"
#include "configuration.h"
#ifdef FSCom
/**
* This class provides 'safe'/paranoid file writing.
*
* Some of our filesystems (in particular the nrf52) may have bugs beneath our layer. Therefore we want to
* be very careful about how we write files. This class provides a restricted (Stream only) writing API for writing to files.
*
* Notably:
* - we keep a simple xor hash of all characters that were written.
* - We do not allow seeking (because we want to maintain our hash)
* - we provide an close() method which is similar to close but returns false if we were unable to successfully write the
* file. Also this method
* - atomically replaces any old version of the file on the disk with our new file (after first rereading the file from the disk
* to confirm the hash matches)
* - Some files are super huge so we can't do the full atomic rename/copy (because of filesystem size limits). If !fullAtomic
* then we still do the readback to verify file is valid so higher level code can handle failures.
*/
class SafeFile : public Print
{
public:
SafeFile(char const *filepath, bool fullAtomic = false);
virtual size_t write(uint8_t);
virtual size_t write(const uint8_t *buffer, size_t size);
/**
* Atomically close the file (deleting any old versions) and readback the contents to confirm the hash matches
*
* @return false for failure
*/
bool close();
private:
/// Read our (closed) tempfile back in and compare the hash
bool testReadback();
String filename;
File f;
bool fullAtomic;
uint8_t hash = 0;
};
#endif

Wyświetl plik

@ -1090,7 +1090,7 @@ int32_t GPS::runOnce()
if (devicestate.did_gps_reset && scheduling.elapsedSearchMs() > 60 * 1000UL && !hasFlow()) {
LOG_DEBUG("GPS is not communicating, trying factory reset on next bootup.\n");
devicestate.did_gps_reset = false;
nodeDB->saveDeviceStateToDisk();
nodeDB->saveToDisk(SEGMENT_DEVICESTATE);
return disable(); // Stop the GPS thread as it can do nothing useful until next reboot.
}
}

Wyświetl plik

@ -14,6 +14,7 @@
#include "PowerFSM.h"
#include "RTC.h"
#include "Router.h"
#include "SafeFile.h"
#include "TypeConversions.h"
#include "error.h"
#include "main.h"
@ -53,6 +54,28 @@ meshtastic_LocalConfig config;
meshtastic_LocalModuleConfig moduleConfig;
meshtastic_ChannelFile channelFile;
meshtastic_OEMStore oemStore;
static bool hasOemStore = false;
// These are not publically exposed - copied from InternalFileSystem.cpp
// #define FLASH_NRF52_PAGE_SIZE 4096
// #define LFS_FLASH_TOTAL_SIZE (7*FLASH_NRF52_PAGE_SIZE)
// #define LFS_BLOCK_SIZE 128
/// List all files in the FS and test write and readback.
/// Useful for filesystem stress testing - normally stripped from build by the linker.
void flashTest()
{
auto filesManifest = getFiles("/", 5);
uint32_t totalSize = 0;
for (size_t i = 0; i < filesManifest.size(); i++) {
LOG_INFO("File %s (size %d)\n", filesManifest[i].file_name, filesManifest[i].size_bytes);
totalSize += filesManifest[i].size_bytes;
}
LOG_INFO("%d files (total size %u)\n", filesManifest.size(), totalSize);
// LOG_INFO("Filesystem block size %u, total bytes %u", LFS_FLASH_TOTAL_SIZE, LFS_BLOCK_SIZE);
nodeDB->saveToDisk();
}
bool meshtastic_DeviceState_callback(pb_istream_t *istream, pb_ostream_t *ostream, const pb_field_iter_t *field)
{
@ -608,22 +631,30 @@ LoadFileResult NodeDB::loadProto(const char *filename, size_t protoSize, size_t
void NodeDB::loadFromDisk()
{
devicestate.version =
0; // Mark the current device state as completely unusable, so that if we fail reading the entire file from
// disk we will still factoryReset to restore things.
// static DeviceState scratch; We no longer read into a tempbuf because this structure is 15KB of valuable RAM
auto state = loadProto(prefFileName, sizeof(meshtastic_DeviceState) + MAX_NUM_NODES * sizeof(meshtastic_NodeInfo),
sizeof(meshtastic_DeviceState), &meshtastic_DeviceState_msg, &devicestate);
if (state != LoadFileResult::LOAD_SUCCESS) {
installDefaultDeviceState(); // Our in RAM copy might now be corrupt
// See https://github.com/meshtastic/firmware/issues/4184#issuecomment-2269390786
// It is very important to try and use the saved prefs even if we fail to read meshtastic_DeviceState. Because most of our
// critical config may still be valid (in the other files - loaded next).
// Also, if we did fail on reading we probably failed on the enormous (and non critical) nodeDB. So DO NOT install default
// device state.
// if (state != LoadFileResult::LOAD_SUCCESS) {
// installDefaultDeviceState(); // Our in RAM copy might now be corrupt
//} else {
if (devicestate.version < DEVICESTATE_MIN_VER) {
LOG_WARN("Devicestate %d is old, discarding\n", devicestate.version);
factoryReset();
} else {
if (devicestate.version < DEVICESTATE_MIN_VER) {
LOG_WARN("Devicestate %d is old, discarding\n", devicestate.version);
factoryReset();
} else {
LOG_INFO("Loaded saved devicestate version %d, with nodecount: %d\n", devicestate.version,
devicestate.node_db_lite.size());
meshNodes = &devicestate.node_db_lite;
numMeshNodes = devicestate.node_db_lite.size();
}
LOG_INFO("Loaded saved devicestate version %d, with nodecount: %d\n", devicestate.version,
devicestate.node_db_lite.size());
meshNodes = &devicestate.node_db_lite;
numMeshNodes = devicestate.node_db_lite.size();
}
meshNodes->resize(MAX_NUM_NODES);
@ -669,6 +700,7 @@ void NodeDB::loadFromDisk()
state = loadProto(oemConfigFile, meshtastic_OEMStore_size, sizeof(meshtastic_OEMStore), &meshtastic_OEMStore_msg, &oemStore);
if (state == LoadFileResult::LOAD_SUCCESS) {
LOG_INFO("Loaded OEMStore\n");
hasOemStore = true;
}
// 2.4.X - configuration migration to update new default intervals
@ -693,48 +725,26 @@ void NodeDB::loadFromDisk()
}
/** Save a protobuf from a file, return true for success */
bool NodeDB::saveProto(const char *filename, size_t protoSize, const pb_msgdesc_t *fields, const void *dest_struct)
bool NodeDB::saveProto(const char *filename, size_t protoSize, const pb_msgdesc_t *fields, const void *dest_struct,
bool fullAtomic)
{
bool okay = false;
#ifdef FSCom
// static DeviceState scratch; We no longer read into a tempbuf because this structure is 15KB of valuable RAM
String filenameTmp = filename;
filenameTmp += ".tmp";
auto f = FSCom.open(filenameTmp.c_str(), FILE_O_WRITE);
if (f) {
LOG_INFO("Saving %s\n", filename);
pb_ostream_t stream = {&writecb, &f, protoSize};
auto f = SafeFile(filename, fullAtomic);
if (!pb_encode(&stream, fields, dest_struct)) {
LOG_ERROR("Error: can't encode protobuf %s\n", PB_GET_ERROR(&stream));
} else {
okay = true;
}
f.flush();
f.close();
LOG_INFO("Saving %s\n", filename);
pb_ostream_t stream = {&writecb, static_cast<Print *>(&f), protoSize};
// brief window of risk here ;-)
if (FSCom.exists(filename) && !FSCom.remove(filename)) {
LOG_WARN("Can't remove old pref file\n");
}
if (!renameFile(filenameTmp.c_str(), filename)) {
LOG_ERROR("Error: can't rename new pref file\n");
}
if (!pb_encode(&stream, fields, dest_struct)) {
LOG_ERROR("Error: can't encode protobuf %s\n", PB_GET_ERROR(&stream));
} else {
LOG_ERROR("Can't write prefs\n");
#ifdef ARCH_NRF52
static uint8_t failedCounter = 0;
failedCounter++;
if (failedCounter >= 2) {
LOG_ERROR("Failed to save file twice. Rebooting...\n");
delay(100);
NVIC_SystemReset();
// We used to blow away the filesystem here, but that's a bit extreme
// FSCom.format();
// // After formatting, the device needs to be restarted
// nodeDB->resetRadioConfig(true);
}
#endif
okay = true;
}
bool writeSucceeded = f.close();
if (!okay || !writeSucceeded) {
LOG_ERROR("Can't write prefs!\n");
}
#else
LOG_ERROR("ERROR: Filesystem not implemented\n");
@ -742,32 +752,32 @@ bool NodeDB::saveProto(const char *filename, size_t protoSize, const pb_msgdesc_
return okay;
}
void NodeDB::saveChannelsToDisk()
bool NodeDB::saveChannelsToDisk()
{
#ifdef FSCom
FSCom.mkdir("/prefs");
#endif
saveProto(channelFileName, meshtastic_ChannelFile_size, &meshtastic_ChannelFile_msg, &channelFile);
return saveProto(channelFileName, meshtastic_ChannelFile_size, &meshtastic_ChannelFile_msg, &channelFile);
}
void NodeDB::saveDeviceStateToDisk()
bool NodeDB::saveDeviceStateToDisk()
{
#ifdef FSCom
FSCom.mkdir("/prefs");
#endif
saveProto(prefFileName, sizeof(devicestate) + numMeshNodes * meshtastic_NodeInfoLite_size, &meshtastic_DeviceState_msg,
&devicestate);
// Note: if MAX_NUM_NODES=100 and meshtastic_NodeInfoLite_size=166, so will be approximately 17KB
// Because so huge we _must_ not use fullAtomic, because the filesystem is probably too small to hold two copies of this
return saveProto(prefFileName, sizeof(devicestate) + numMeshNodes * meshtastic_NodeInfoLite_size, &meshtastic_DeviceState_msg,
&devicestate, false);
}
void NodeDB::saveToDisk(int saveWhat)
bool NodeDB::saveToDiskNoRetry(int saveWhat)
{
bool success = true;
#ifdef FSCom
FSCom.mkdir("/prefs");
#endif
if (saveWhat & SEGMENT_DEVICESTATE) {
saveDeviceStateToDisk();
}
if (saveWhat & SEGMENT_CONFIG) {
config.has_device = true;
config.has_display = true;
@ -777,7 +787,7 @@ void NodeDB::saveToDisk(int saveWhat)
config.has_network = true;
config.has_bluetooth = true;
saveProto(configFileName, meshtastic_LocalConfig_size, &meshtastic_LocalConfig_msg, &config);
success &= saveProto(configFileName, meshtastic_LocalConfig_size, &meshtastic_LocalConfig_msg, &config);
}
if (saveWhat & SEGMENT_MODULECONFIG) {
@ -794,12 +804,45 @@ void NodeDB::saveToDisk(int saveWhat)
moduleConfig.has_audio = true;
moduleConfig.has_paxcounter = true;
saveProto(moduleConfigFileName, meshtastic_LocalModuleConfig_size, &meshtastic_LocalModuleConfig_msg, &moduleConfig);
success &=
saveProto(moduleConfigFileName, meshtastic_LocalModuleConfig_size, &meshtastic_LocalModuleConfig_msg, &moduleConfig);
}
// We might need to rewrite the OEM data if we are reformatting the FS
if ((saveWhat & SEGMENT_OEM) && hasOemStore) {
success &= saveProto(oemConfigFile, meshtastic_OEMStore_size, &meshtastic_OEMStore_msg, &oemStore);
}
if (saveWhat & SEGMENT_CHANNELS) {
saveChannelsToDisk();
success &= saveChannelsToDisk();
}
if (saveWhat & SEGMENT_DEVICESTATE) {
success &= saveDeviceStateToDisk();
}
return success;
}
bool NodeDB::saveToDisk(int saveWhat)
{
bool success = saveToDiskNoRetry(saveWhat);
if (!success) {
LOG_ERROR("Failed to save to disk, retrying...\n");
#ifdef ARCH_NRF52 // @geeksville is not ready yet to say we should do this on other platforms. See bug #4184 discussion
FSCom.format();
// We need to rewrite the OEM data if we are reformatting the FS
saveWhat |= SEGMENT_OEM;
#endif
success = saveToDiskNoRetry(saveWhat);
RECORD_CRITICALERROR(success ? meshtastic_CriticalErrorCode_FLASH_CORRUPTION_RECOVERABLE
: meshtastic_CriticalErrorCode_FLASH_CORRUPTION_UNRECOVERABLE);
}
return success;
}
const meshtastic_NodeInfoLite *NodeDB::readNextMeshNode(uint32_t &readIndex)
@ -1059,4 +1102,4 @@ void recordCriticalError(meshtastic_CriticalErrorCode code, uint32_t address, co
LOG_ERROR("A critical failure occurred, portduino is exiting...");
exit(2);
#endif
}
}

Wyświetl plik

@ -19,6 +19,7 @@ DeviceState versions used to be defined in the .proto file but really only this
#define SEGMENT_MODULECONFIG 2
#define SEGMENT_DEVICESTATE 4
#define SEGMENT_CHANNELS 8
#define SEGMENT_OEM 16
#define DEVICESTATE_CUR_VER 23
#define DEVICESTATE_MIN_VER 22
@ -72,8 +73,8 @@ class NodeDB
NodeDB();
/// write to flash
void saveToDisk(int saveWhat = SEGMENT_CONFIG | SEGMENT_MODULECONFIG | SEGMENT_DEVICESTATE | SEGMENT_CHANNELS),
saveChannelsToDisk(), saveDeviceStateToDisk();
/// @return true if the save was successful
bool saveToDisk(int saveWhat = SEGMENT_CONFIG | SEGMENT_MODULECONFIG | SEGMENT_DEVICESTATE | SEGMENT_CHANNELS);
/** Reinit radio config if needed, because either:
* a) sometimes a buggy android app might send us bogus settings or
@ -130,7 +131,8 @@ class NodeDB
LoadFileResult loadProto(const char *filename, size_t protoSize, size_t objSize, const pb_msgdesc_t *fields,
void *dest_struct);
bool saveProto(const char *filename, size_t protoSize, const pb_msgdesc_t *fields, const void *dest_struct);
bool saveProto(const char *filename, size_t protoSize, const pb_msgdesc_t *fields, const void *dest_struct,
bool fullAtomic = true);
void installRoleDefaults(meshtastic_Config_DeviceConfig_Role role);
@ -181,6 +183,13 @@ class NodeDB
/// Reinit device state from scratch (not loading from disk)
void installDefaultDeviceState(), installDefaultChannels(), installDefaultConfig(), installDefaultModuleConfig();
/// write to flash
/// @return true if the save was successful
bool saveToDiskNoRetry(int saveWhat);
bool saveChannelsToDisk();
bool saveDeviceStateToDisk();
};
extern NodeDB *nodeDB;
@ -228,4 +237,4 @@ extern uint32_t error_address;
ModuleConfig_RangeTestConfig_size + ModuleConfig_SerialConfig_size + ModuleConfig_StoreForwardConfig_size + \
ModuleConfig_TelemetryConfig_size + ModuleConfig_size)
// Please do not remove this comment, it makes trunk and compiler happy at the same time.
// Please do not remove this comment, it makes trunk and compiler happy at the same time.

Wyświetl plik

@ -58,7 +58,7 @@ bool readcb(pb_istream_t *stream, uint8_t *buf, size_t count)
/// Write to an arduino file
bool writecb(pb_ostream_t *stream, const uint8_t *buf, size_t count)
{
File *file = (File *)stream->state;
auto file = (Print *)stream->state;
// LOG_DEBUG("writing %d bytes to protobuf file\n", count);
return file->write(buf, count) == count;
}
@ -71,4 +71,4 @@ bool is_in_helper(uint32_t n, const uint32_t *array, pb_size_t count)
return true;
return false;
}
}

Wyświetl plik

@ -5,6 +5,7 @@
#include "../mesh/generated/meshtastic/telemetry.pb.h"
#include "FSCommon.h"
#include "NAU7802Sensor.h"
#include "SafeFile.h"
#include "TelemetrySensor.h"
#include <pb_decode.h>
#include <pb_encode.h>
@ -95,27 +96,21 @@ void NAU7802Sensor::tare()
bool NAU7802Sensor::saveCalibrationData()
{
if (FSCom.exists(nau7802ConfigFileName) && !FSCom.remove(nau7802ConfigFileName)) {
LOG_WARN("Can't remove old state file\n");
}
auto file = FSCom.open(nau7802ConfigFileName, FILE_O_WRITE);
auto file = SafeFile(nau7802ConfigFileName);
nau7802config.zeroOffset = nau7802.getZeroOffset();
nau7802config.calibrationFactor = nau7802.getCalibrationFactor();
bool okay = false;
if (file) {
LOG_INFO("%s state write to %s.\n", sensorName, nau7802ConfigFileName);
pb_ostream_t stream = {&writecb, &file, meshtastic_Nau7802Config_size};
if (!pb_encode(&stream, &meshtastic_Nau7802Config_msg, &nau7802config)) {
LOG_ERROR("Error: can't encode protobuf %s\n", PB_GET_ERROR(&stream));
} else {
okay = true;
}
file.flush();
file.close();
LOG_INFO("%s state write to %s.\n", sensorName, nau7802ConfigFileName);
pb_ostream_t stream = {&writecb, static_cast<Print *>(&file), meshtastic_Nau7802Config_size};
if (!pb_encode(&stream, &meshtastic_Nau7802Config_msg, &nau7802config)) {
LOG_ERROR("Error: can't encode protobuf %s\n", PB_GET_ERROR(&stream));
} else {
LOG_INFO("Can't write %s state (File: %s).\n", sensorName, nau7802ConfigFileName);
okay = true;
}
okay &= file.close();
return okay;
}