Complete rebuild, with subsystems for each component and RTOS tasks. (With help from Claude.)
vesper/src/HealthMonitor/HealthMonitor.cpp (new file, 428 lines)
@@ -0,0 +1,428 @@
/*
 * ═══════════════════════════════════════════════════════════════════════════════════
 * HEALTHMONITOR.CPP - System Health Monitoring Implementation
 * ═══════════════════════════════════════════════════════════════════════════════════
 */

#include "HealthMonitor.hpp"
#include "../BellEngine/BellEngine.hpp"
#include "../OutputManager/OutputManager.hpp"
#include "../Communication/Communication.hpp"
#include "../Player/Player.hpp"
#include "../TimeKeeper/TimeKeeper.hpp"
#include "../Telemetry/Telemetry.hpp"
#include "../OTAManager/OTAManager.hpp"
#include "../Networking/Networking.hpp"
#include "../ConfigManager/ConfigManager.hpp"
#include "../FileManager/FileManager.hpp"
#include <ArduinoJson.h>

HealthMonitor::HealthMonitor() {
    initializeSubsystemHealth();
}

HealthMonitor::~HealthMonitor() {
    if (_monitoringTaskHandle != nullptr) {
        vTaskDelete(_monitoringTaskHandle);
        _monitoringTaskHandle = nullptr;
    }
}

bool HealthMonitor::begin() {
    LOG_INFO("🏥 Initializing Health Monitor System");

    // Create monitoring task if auto-monitoring is enabled
    if (_autoMonitoring) {
        xTaskCreatePinnedToCore(
            monitoringTask,
            "HealthMonitor",
            4096,
            this,
            3,                       // Medium priority
            &_monitoringTaskHandle,
            0                        // Core 0 (different from BellEngine, which uses Core 1)
        );

        if (_monitoringTaskHandle != nullptr) {
            LOG_INFO("✅ Health Monitor initialized with automatic monitoring");
            return true;
        } else {
            LOG_ERROR("❌ Failed to create Health Monitor task");
            return false;
        }
    } else {
        LOG_INFO("✅ Health Monitor initialized (manual mode)");
        return true;
    }
}

void HealthMonitor::initializeSubsystemHealth() {
    // Initialize all subsystem health entries.
    // Mark critical subsystems that must be healthy for operation.

    _subsystemHealth["BellEngine"]    = SubsystemHealth("BellEngine", true);
    _subsystemHealth["OutputManager"] = SubsystemHealth("OutputManager", true);
    _subsystemHealth["ConfigManager"] = SubsystemHealth("ConfigManager", true);
    _subsystemHealth["FileManager"]   = SubsystemHealth("FileManager", true);
    _subsystemHealth["Communication"] = SubsystemHealth("Communication", false);  // Non-critical
    _subsystemHealth["Player"]        = SubsystemHealth("Player", true);
    _subsystemHealth["TimeKeeper"]    = SubsystemHealth("TimeKeeper", false);     // Non-critical
    _subsystemHealth["Telemetry"]     = SubsystemHealth("Telemetry", false);      // Non-critical
    _subsystemHealth["OTAManager"]    = SubsystemHealth("OTAManager", false);     // Non-critical
    _subsystemHealth["Networking"]    = SubsystemHealth("Networking", false);     // Non-critical

    LOG_DEBUG("🏗️ Initialized health monitoring for %u subsystems", (unsigned)_subsystemHealth.size());
}

void HealthMonitor::monitoringTask(void* parameter) {
    HealthMonitor* monitor = static_cast<HealthMonitor*>(parameter);
    LOG_INFO("🏥 Health Monitor task started on Core %d", xPortGetCoreID());

    while (true) {
        monitor->monitoringLoop();
        vTaskDelay(pdMS_TO_TICKS(monitor->_healthCheckInterval));
    }
}

void HealthMonitor::monitoringLoop() {
    // Skip health checks while playback is active
    if (_player) {
        if (_player->_status != PlayerStatus::STOPPED) {
            LOG_VERBOSE("⏸️ Skipping health check during active playback");
            return;
        }
    }

    LOG_VERBOSE("🔍 Performing periodic health check...");

    performFullHealthCheck();

    // Log warnings for any unhealthy subsystems
    uint8_t criticalCount = getCriticalFailureCount();
    uint8_t warningCount = getWarningCount();

    if (criticalCount > 0) {
        LOG_WARNING("🚨 Health Monitor: %d critical failures detected!", criticalCount);

        // List critical failures
        for (const auto& [name, health] : _subsystemHealth) {
            if (health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED) {
                LOG_ERROR("❌ CRITICAL: %s - %s", name.c_str(), health.lastError.c_str());
            }
        }

        // Check if firmware rollback is recommended
        if (shouldRollbackFirmware()) {
            LOG_ERROR("🔄 FIRMWARE ROLLBACK RECOMMENDED - Too many critical failures");
            // In a real system, this would trigger an OTA rollback.
            // For now, we just log the recommendation.
        }
    } else if (warningCount > 0) {
        LOG_WARNING("⚠️ Health Monitor: %d warnings detected", warningCount);
    } else {
        LOG_VERBOSE("✅ All subsystems healthy");
    }
}

HealthStatus HealthMonitor::performFullHealthCheck() {
    unsigned long startTime = millis();
    uint8_t checkedSystems = 0;

    // Check BellEngine
    if (_bellEngine) {
        bool healthy = _bellEngine->isHealthy();
        updateSubsystemHealth("BellEngine",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "BellEngine health check failed");
        checkedSystems++;
    }

    // Check OutputManager
    if (_outputManager) {
        bool healthy = _outputManager->isHealthy();
        updateSubsystemHealth("OutputManager",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "OutputManager health check failed");
        checkedSystems++;
    }

    // Check Communication
    if (_communication) {
        bool healthy = _communication->isHealthy();
        updateSubsystemHealth("Communication",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "Communication health check failed");
        checkedSystems++;
    }

    // Check Player
    if (_player) {
        bool healthy = _player->isHealthy();
        updateSubsystemHealth("Player",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "Player health check failed");
        checkedSystems++;
    }

    // Check TimeKeeper
    if (_timeKeeper) {
        bool healthy = _timeKeeper->isHealthy();
        updateSubsystemHealth("TimeKeeper",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "TimeKeeper health check failed");
        checkedSystems++;
    }

    // Check Telemetry
    if (_telemetry) {
        bool healthy = _telemetry->isHealthy();
        updateSubsystemHealth("Telemetry",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "Telemetry health check failed");
        checkedSystems++;
    }

    // Check OTAManager
    if (_otaManager) {
        bool healthy = _otaManager->isHealthy();
        updateSubsystemHealth("OTAManager",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "OTAManager health check failed");
        checkedSystems++;
    }

    // Check Networking
    if (_networking) {
        bool healthy = _networking->isHealthy();
        updateSubsystemHealth("Networking",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "Networking health check failed");
        checkedSystems++;
    }

    // Check ConfigManager
    if (_configManager) {
        bool healthy = _configManager->isHealthy();
        updateSubsystemHealth("ConfigManager",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "ConfigManager health check failed");
        checkedSystems++;
    }

    // Check FileManager
    if (_fileManager) {
        bool healthy = _fileManager->isHealthy();
        updateSubsystemHealth("FileManager",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "FileManager health check failed");
        checkedSystems++;
    }

    unsigned long elapsed = millis() - startTime;
    LOG_VERBOSE("🔍 Health check completed: %d systems in %lums", checkedSystems, elapsed);

    return calculateOverallHealth();
}

HealthStatus HealthMonitor::checkSubsystemHealth(const String& subsystemName) {
    // Perform health check on specific subsystem
    auto it = _subsystemHealth.find(subsystemName);
    if (it == _subsystemHealth.end()) {
        LOG_WARNING("❓ Unknown subsystem: %s", subsystemName.c_str());
        return HealthStatus::FAILED;
    }

    bool healthy = false;

    // Check specific subsystem
    if (subsystemName == "BellEngine" && _bellEngine) {
        healthy = _bellEngine->isHealthy();
    } else if (subsystemName == "OutputManager" && _outputManager) {
        healthy = _outputManager->isHealthy();
    } else if (subsystemName == "Communication" && _communication) {
        healthy = _communication->isHealthy();
    } else if (subsystemName == "Player" && _player) {
        healthy = _player->isHealthy();
    } else if (subsystemName == "TimeKeeper" && _timeKeeper) {
        healthy = _timeKeeper->isHealthy();
    } else if (subsystemName == "Telemetry" && _telemetry) {
        healthy = _telemetry->isHealthy();
    } else if (subsystemName == "OTAManager" && _otaManager) {
        healthy = _otaManager->isHealthy();
    } else if (subsystemName == "Networking" && _networking) {
        healthy = _networking->isHealthy();
    } else if (subsystemName == "ConfigManager" && _configManager) {
        healthy = _configManager->isHealthy();
    } else if (subsystemName == "FileManager" && _fileManager) {
        healthy = _fileManager->isHealthy();
    } else {
        LOG_WARNING("🔌 Subsystem %s not connected to health monitor", subsystemName.c_str());
        return HealthStatus::FAILED;
    }

    HealthStatus status = healthy ? HealthStatus::HEALTHY :
                          (it->second.isCritical ? HealthStatus::CRITICAL : HealthStatus::WARNING);

    updateSubsystemHealth(subsystemName, status,
                          healthy ? "" : subsystemName + " health check failed");

    return status;
}

const std::map<String, SubsystemHealth>& HealthMonitor::getAllSubsystemHealth() const {
    return _subsystemHealth;
}

SubsystemHealth HealthMonitor::getSubsystemHealth(const String& subsystemName) const {
    auto it = _subsystemHealth.find(subsystemName);
    if (it != _subsystemHealth.end()) {
        return it->second;
    }

    // Return default unhealthy status for unknown subsystems
    SubsystemHealth unknown(subsystemName);
    unknown.status = HealthStatus::FAILED;
    unknown.lastError = "Subsystem not found";
    return unknown;
}

bool HealthMonitor::isFirmwareStable() const {
    return areCriticalSubsystemsHealthy() && (getCriticalFailureCount() == 0);
}

uint8_t HealthMonitor::getCriticalFailureCount() const {
    uint8_t count = 0;

    for (const auto& [name, health] : _subsystemHealth) {
        if (health.isCritical &&
            (health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED)) {
            count++;
        }
    }

    return count;
}

uint8_t HealthMonitor::getWarningCount() const {
    uint8_t count = 0;

    for (const auto& [name, health] : _subsystemHealth) {
        if (health.status == HealthStatus::WARNING) {
            count++;
        }
    }

    return count;
}

bool HealthMonitor::shouldRollbackFirmware() const {
    uint8_t criticalFailures = getCriticalFailureCount();

    // Rollback if more than 2 critical subsystems have failed.
    // This is configurable based on system requirements.
    const uint8_t MAX_CRITICAL_FAILURES = 2;

    return criticalFailures > MAX_CRITICAL_FAILURES;
}

String HealthMonitor::generateHealthReport() const {
    StaticJsonDocument<2048> doc;

    doc["timestamp"] = millis();
    doc["overall_health"] = healthStatusToString(calculateOverallHealth());
    doc["critical_failures"] = getCriticalFailureCount();
    doc["warnings"] = getWarningCount();
    doc["firmware_stable"] = isFirmwareStable();
    doc["rollback_recommended"] = shouldRollbackFirmware();

    JsonObject subsystems = doc.createNestedObject("subsystems");

    for (const auto& [name, health] : _subsystemHealth) {
        JsonObject subsystem = subsystems.createNestedObject(name);
        subsystem["status"] = healthStatusToString(health.status);
        subsystem["critical"] = health.isCritical;
        subsystem["last_check"] = health.lastCheck;

        if (!health.lastError.isEmpty()) {
            subsystem["error"] = health.lastError;
        }
    }

    String report;
    serializeJsonPretty(doc, report);
    return report;
}

String HealthMonitor::getHealthSummary() const {
    HealthStatus overall = calculateOverallHealth();
    uint8_t critical = getCriticalFailureCount();
    uint8_t warnings = getWarningCount();

    String summary = "System Health: " + healthStatusToString(overall);

    if (critical > 0) {
        summary += " (" + String(critical) + " critical failures)";
    }

    if (warnings > 0) {
        summary += " (" + String(warnings) + " warnings)";
    }

    if (shouldRollbackFirmware()) {
        summary += " - ROLLBACK RECOMMENDED";
    }

    return summary;
}

void HealthMonitor::updateSubsystemHealth(const String& name, HealthStatus status, const String& error) {
    auto it = _subsystemHealth.find(name);
    if (it != _subsystemHealth.end()) {
        it->second.status = status;
        it->second.lastError = error;
        it->second.lastCheck = millis();

        LOG_VERBOSE("🔍 %s: %s %s",
                    name.c_str(),
                    healthStatusToString(status).c_str(),
                    error.isEmpty() ? "" : ("(" + error + ")").c_str());
    }
}

bool HealthMonitor::areCriticalSubsystemsHealthy() const {
    for (const auto& [name, health] : _subsystemHealth) {
        if (health.isCritical &&
            (health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED)) {
            return false;
        }
    }
    return true;
}

HealthStatus HealthMonitor::calculateOverallHealth() const {
    bool hasCriticalFailures = (getCriticalFailureCount() > 0);
    bool hasWarnings = (getWarningCount() > 0);

    if (hasCriticalFailures) {
        return HealthStatus::CRITICAL;
    } else if (hasWarnings) {
        return HealthStatus::WARNING;
    } else {
        return HealthStatus::HEALTHY;
    }
}

String HealthMonitor::healthStatusToString(HealthStatus status) const {
    switch (status) {
        case HealthStatus::HEALTHY:
            return "HEALTHY";
        case HealthStatus::WARNING:
            return "WARNING";
        case HealthStatus::CRITICAL:
            return "CRITICAL";
        case HealthStatus::FAILED:
            return "FAILED";
        default:
            return "UNKNOWN";
    }
}
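
For reference, here is a minimal sketch of the interface that HealthMonitor.hpp presumably declares, inferred from the implementation above. It is not part of this commit: enum values, struct fields, method signatures, and member names are taken from their usage in the .cpp, while the defaults marked as assumed and the note about the LOG_* macros are guesses.

// HealthMonitor.hpp -- hypothetical reconstruction, inferred from HealthMonitor.cpp above.
// NOTE: the LOG_INFO/LOG_ERROR/... macros used by the .cpp are assumed to come from a
// project-wide logging header that this sketch does not reproduce.
#pragma once

#include <Arduino.h>
#include <map>
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"

// Forward declarations of the subsystems the monitor holds pointers to.
class BellEngine;
class OutputManager;
class Communication;
class Player;
class TimeKeeper;
class Telemetry;
class OTAManager;
class Networking;
class ConfigManager;
class FileManager;

enum class HealthStatus { HEALTHY, WARNING, CRITICAL, FAILED };

struct SubsystemHealth {
    String        name;
    bool          isCritical = false;
    HealthStatus  status     = HealthStatus::HEALTHY;
    String        lastError;
    unsigned long lastCheck  = 0;

    SubsystemHealth() = default;
    explicit SubsystemHealth(const String& n, bool critical = false)
        : name(n), isCritical(critical) {}
};

class HealthMonitor {
public:
    HealthMonitor();
    ~HealthMonitor();

    bool begin();
    HealthStatus performFullHealthCheck();
    HealthStatus checkSubsystemHealth(const String& subsystemName);
    const std::map<String, SubsystemHealth>& getAllSubsystemHealth() const;
    SubsystemHealth getSubsystemHealth(const String& subsystemName) const;
    bool isFirmwareStable() const;
    uint8_t getCriticalFailureCount() const;
    uint8_t getWarningCount() const;
    bool shouldRollbackFirmware() const;
    String generateHealthReport() const;
    String getHealthSummary() const;

private:
    void initializeSubsystemHealth();
    static void monitoringTask(void* parameter);
    void monitoringLoop();
    void updateSubsystemHealth(const String& name, HealthStatus status, const String& error);
    bool areCriticalSubsystemsHealthy() const;
    HealthStatus calculateOverallHealth() const;
    String healthStatusToString(HealthStatus status) const;

    std::map<String, SubsystemHealth> _subsystemHealth;
    TaskHandle_t _monitoringTaskHandle = nullptr;
    bool _autoMonitoring = true;             // assumed default
    uint32_t _healthCheckInterval = 30000;   // assumed default, in ms

    // Raw pointers to subsystems, presumably injected elsewhere (not shown in this commit).
    BellEngine*    _bellEngine    = nullptr;
    OutputManager* _outputManager = nullptr;
    Communication* _communication = nullptr;
    Player*        _player        = nullptr;
    TimeKeeper*    _timeKeeper    = nullptr;
    Telemetry*     _telemetry     = nullptr;
    OTAManager*    _otaManager    = nullptr;
    Networking*    _networking    = nullptr;
    ConfigManager* _configManager = nullptr;
    FileManager*   _fileManager   = nullptr;
};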
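
The comment in monitoringLoop() notes that a real system would trigger an OTA rollback instead of only logging the recommendation. On an ESP32 with rollback enabled in the bootloader (CONFIG_BOOTLOADER_APP_ROLLBACK_ENABLE) and a previously valid app image, one way to do that is through the ESP-IDF OTA API. A minimal sketch, not part of the commit; the helper name is hypothetical and LOG_ERROR is assumed to be the project's own macro:

// Sketch only: mark the running firmware invalid and reboot into the previous image.
// Requires CONFIG_BOOTLOADER_APP_ROLLBACK_ENABLE and a valid fallback OTA partition.
#include "esp_ota_ops.h"

void triggerFirmwareRollback() {   // hypothetical helper
    const esp_partition_t* running = esp_ota_get_running_partition();
    LOG_ERROR("🔄 Rolling back firmware from partition %s", running->label);

    // Marks the current app as invalid and reboots; the bootloader then selects
    // the last app that was marked valid.
    esp_err_t err = esp_ota_mark_app_invalid_rollback_and_reboot();

    // Only reached if rollback is not possible (e.g. no valid fallback image).
    LOG_ERROR("❌ Rollback failed: %s", esp_err_to_name(err));
}

Calling a helper like this from the shouldRollbackFirmware() branch in monitoringLoop() would make the rollback automatic; whether that is desirable depends on how conservative the system needs to be about rebooting itself.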