/*
 * ═══════════════════════════════════════════════════════════════════════════════════
 * HEALTHMONITOR.CPP - System Health Monitoring Implementation
 * ═══════════════════════════════════════════════════════════════════════════════════
 */

#include "HealthMonitor.hpp"
#include "../BellEngine/BellEngine.hpp"
#include "../OutputManager/OutputManager.hpp"
#include "../Communication/Communication.hpp"
#include "../Player/Player.hpp"
#include "../TimeKeeper/TimeKeeper.hpp"
#include "../Telemetry/Telemetry.hpp"
#include "../OTAManager/OTAManager.hpp"
#include "../Networking/Networking.hpp"
#include "../ConfigManager/ConfigManager.hpp"
#include "../FileManager/FileManager.hpp"
#include <ArduinoJson.h>

HealthMonitor::HealthMonitor() {
    initializeSubsystemHealth();
}

HealthMonitor::~HealthMonitor() {
    if (_monitoringTaskHandle != nullptr) {
        vTaskDelete(_monitoringTaskHandle);
        _monitoringTaskHandle = nullptr;
    }
}

bool HealthMonitor::begin() {
    LOG_INFO("🏥 Initializing Health Monitor System");

    // Create monitoring task if auto-monitoring is enabled
    if (_autoMonitoring) {
        xTaskCreatePinnedToCore(
            monitoringTask,           // Task entry function
            "HealthMonitor",          // Task name
            4096,                     // Stack size in bytes
            this,                     // Instance pointer passed as the task parameter
            3,                        // Medium priority
            &_monitoringTaskHandle,   // Receives the created task handle
            0                         // Core 0 (different from BellEngine which uses Core 1)
        );

        if (_monitoringTaskHandle != nullptr) {
            LOG_INFO("✅ Health Monitor initialized with automatic monitoring");
            return true;
        } else {
            LOG_ERROR("❌ Failed to create Health Monitor task");
            return false;
        }
    } else {
        LOG_INFO("✅ Health Monitor initialized (manual mode)");
        return true;
    }
}

void HealthMonitor::initializeSubsystemHealth() {
    // Initialize all subsystem health entries.
    // Mark critical subsystems that must be healthy for operation.

    _subsystemHealth["BellEngine"] = SubsystemHealth("BellEngine", true);
    _subsystemHealth["OutputManager"] = SubsystemHealth("OutputManager", true);
    _subsystemHealth["ConfigManager"] = SubsystemHealth("ConfigManager", true);
    _subsystemHealth["FileManager"] = SubsystemHealth("FileManager", true);
    _subsystemHealth["Communication"] = SubsystemHealth("Communication", false); // Non-critical
    _subsystemHealth["Player"] = SubsystemHealth("Player", true);
    _subsystemHealth["TimeKeeper"] = SubsystemHealth("TimeKeeper", false);       // Non-critical
    _subsystemHealth["Telemetry"] = SubsystemHealth("Telemetry", false);         // Non-critical
    _subsystemHealth["OTAManager"] = SubsystemHealth("OTAManager", false);       // Non-critical
    _subsystemHealth["Networking"] = SubsystemHealth("Networking", false);       // Non-critical

    LOG_DEBUG("🏗️ Initialized health monitoring for %u subsystems", (unsigned)_subsystemHealth.size());
}

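// FreeRTOS task entry point. Task functions must be static (or free) functions,
// so the HealthMonitor instance is passed in through the void* task parameter
// and the actual work is delegated to monitoringLoop().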
void HealthMonitor::monitoringTask(void* parameter) {
    HealthMonitor* monitor = static_cast<HealthMonitor*>(parameter);
    LOG_INFO("🏥 Health Monitor task started on Core %d", xPortGetCoreID());

    while (true) {
        monitor->monitoringLoop();
        vTaskDelay(pdMS_TO_TICKS(monitor->_healthCheckInterval));
    }
}

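// One pass of the periodic health check. Checks are skipped entirely while the
// Player is active so that health polling never competes with playback.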
void HealthMonitor::monitoringLoop() {
    if (_player) {
        if (_player->_status != PlayerStatus::STOPPED) {
            LOG_VERBOSE("⏸️ Skipping health check during active playback");
            return;
        }
    }

    LOG_VERBOSE("🔍 Performing periodic health check...");

    performFullHealthCheck();

    // Log warnings for any unhealthy subsystems
    uint8_t criticalCount = getCriticalFailureCount();
    uint8_t warningCount = getWarningCount();

    if (criticalCount > 0) {
        LOG_WARNING("🚨 Health Monitor: %d critical failures detected!", criticalCount);

        // List critical failures
        for (const auto& [name, health] : _subsystemHealth) {
            if (health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED) {
                LOG_ERROR("❌ CRITICAL: %s - %s", name.c_str(), health.lastError.c_str());
            }
        }

        // Check if firmware rollback is recommended
        if (shouldRollbackFirmware()) {
            LOG_ERROR("🔄 FIRMWARE ROLLBACK RECOMMENDED - Too many critical failures");
            // In a real system, this would trigger an OTA rollback.
            // For now, we just log the recommendation.
        }
    } else if (warningCount > 0) {
        LOG_WARNING("⚠️ Health Monitor: %d warnings detected", warningCount);
    } else {
        LOG_VERBOSE("✅ All subsystems healthy");
    }
}

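// Polls every connected subsystem and records the result. Failures in critical
// subsystems (BellEngine, OutputManager, Player, ConfigManager, FileManager) are
// recorded as CRITICAL; failures in optional subsystems are recorded as WARNING.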
HealthStatus HealthMonitor::performFullHealthCheck() {
    unsigned long startTime = millis();
    uint8_t checkedSystems = 0;

    // Check BellEngine
    if (_bellEngine) {
        bool healthy = _bellEngine->isHealthy();
        updateSubsystemHealth("BellEngine",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "BellEngine health check failed");
        checkedSystems++;
    }

    // Check OutputManager
    if (_outputManager) {
        bool healthy = _outputManager->isHealthy();
        updateSubsystemHealth("OutputManager",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "OutputManager health check failed");
        checkedSystems++;
    }

    // Check Communication
    if (_communication) {
        bool healthy = _communication->isHealthy();
        updateSubsystemHealth("Communication",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "Communication health check failed");
        checkedSystems++;
    }

    // Check Player
    if (_player) {
        bool healthy = _player->isHealthy();
        updateSubsystemHealth("Player",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "Player health check failed");
        checkedSystems++;
    }

    // Check TimeKeeper
    if (_timeKeeper) {
        bool healthy = _timeKeeper->isHealthy();
        updateSubsystemHealth("TimeKeeper",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "TimeKeeper health check failed");
        checkedSystems++;
    }

    // Check Telemetry
    if (_telemetry) {
        bool healthy = _telemetry->isHealthy();
        updateSubsystemHealth("Telemetry",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "Telemetry health check failed");
        checkedSystems++;
    }

    // Check OTAManager
    if (_otaManager) {
        bool healthy = _otaManager->isHealthy();
        updateSubsystemHealth("OTAManager",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "OTAManager health check failed");
        checkedSystems++;
    }

    // Check Networking
    if (_networking) {
        bool healthy = _networking->isHealthy();
        updateSubsystemHealth("Networking",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::WARNING,
                              healthy ? "" : "Networking health check failed");
        checkedSystems++;
    }

    // Check ConfigManager
    if (_configManager) {
        bool healthy = _configManager->isHealthy();
        updateSubsystemHealth("ConfigManager",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "ConfigManager health check failed");
        checkedSystems++;
    }

    // Check FileManager
    if (_fileManager) {
        bool healthy = _fileManager->isHealthy();
        updateSubsystemHealth("FileManager",
                              healthy ? HealthStatus::HEALTHY : HealthStatus::CRITICAL,
                              healthy ? "" : "FileManager health check failed");
        checkedSystems++;
    }

    unsigned long elapsed = millis() - startTime;
    LOG_VERBOSE("🔍 Health check completed: %d systems in %lums", checkedSystems, elapsed);

    return calculateOverallHealth();
}

|
|
|
|
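// On-demand health check for a single subsystem. The severity of a failure is
// derived from the subsystem's isCritical flag: critical subsystems report
// CRITICAL, optional ones report WARNING. Unknown or unconnected subsystems
// report FAILED.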
HealthStatus HealthMonitor::checkSubsystemHealth(const String& subsystemName) {
    // Perform health check on specific subsystem
    auto it = _subsystemHealth.find(subsystemName);
    if (it == _subsystemHealth.end()) {
        LOG_WARNING("❓ Unknown subsystem: %s", subsystemName.c_str());
        return HealthStatus::FAILED;
    }

    bool healthy = false;

    // Check specific subsystem
    if (subsystemName == "BellEngine" && _bellEngine) {
        healthy = _bellEngine->isHealthy();
    } else if (subsystemName == "OutputManager" && _outputManager) {
        healthy = _outputManager->isHealthy();
    } else if (subsystemName == "Communication" && _communication) {
        healthy = _communication->isHealthy();
    } else if (subsystemName == "Player" && _player) {
        healthy = _player->isHealthy();
    } else if (subsystemName == "TimeKeeper" && _timeKeeper) {
        healthy = _timeKeeper->isHealthy();
    } else if (subsystemName == "Telemetry" && _telemetry) {
        healthy = _telemetry->isHealthy();
    } else if (subsystemName == "OTAManager" && _otaManager) {
        healthy = _otaManager->isHealthy();
    } else if (subsystemName == "Networking" && _networking) {
        healthy = _networking->isHealthy();
    } else if (subsystemName == "ConfigManager" && _configManager) {
        healthy = _configManager->isHealthy();
    } else if (subsystemName == "FileManager" && _fileManager) {
        healthy = _fileManager->isHealthy();
    } else {
        LOG_WARNING("🔌 Subsystem %s not connected to health monitor", subsystemName.c_str());
        return HealthStatus::FAILED;
    }

    HealthStatus status = healthy ? HealthStatus::HEALTHY :
                          (it->second.isCritical ? HealthStatus::CRITICAL : HealthStatus::WARNING);

    updateSubsystemHealth(subsystemName, status,
                          healthy ? "" : subsystemName + " health check failed");

    return status;
}

const std::map<String, SubsystemHealth>& HealthMonitor::getAllSubsystemHealth() const {
    return _subsystemHealth;
}

SubsystemHealth HealthMonitor::getSubsystemHealth(const String& subsystemName) const {
    auto it = _subsystemHealth.find(subsystemName);
    if (it != _subsystemHealth.end()) {
        return it->second;
    }

    // Return default unhealthy status for unknown subsystems
    SubsystemHealth unknown(subsystemName);
    unknown.status = HealthStatus::FAILED;
    unknown.lastError = "Subsystem not found";
    return unknown;
}

bool HealthMonitor::isFirmwareStable() const {
    return areCriticalSubsystemsHealthy() && (getCriticalFailureCount() == 0);
}

uint8_t HealthMonitor::getCriticalFailureCount() const {
    uint8_t count = 0;

    for (const auto& [name, health] : _subsystemHealth) {
        if (health.isCritical &&
            (health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED)) {
            count++;
        }
    }

    return count;
}

uint8_t HealthMonitor::getWarningCount() const {
    uint8_t count = 0;

    for (const auto& [name, health] : _subsystemHealth) {
        if (health.status == HealthStatus::WARNING) {
            count++;
        }
    }

    return count;
}

bool HealthMonitor::shouldRollbackFirmware() const {
    uint8_t criticalFailures = getCriticalFailureCount();

    // Rollback if more than 2 critical subsystems have failed.
    // This threshold can be tuned to the system's requirements.
    const uint8_t MAX_CRITICAL_FAILURES = 2;

    return criticalFailures > MAX_CRITICAL_FAILURES;
}

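// Serializes a snapshot of the current health state into a pretty-printed JSON
// string: overall status, failure/warning counters, and one entry per subsystem.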
String HealthMonitor::generateHealthReport() const {
    StaticJsonDocument<2048> doc;

    doc["timestamp"] = millis();
    doc["overall_health"] = healthStatusToString(calculateOverallHealth());
    doc["critical_failures"] = getCriticalFailureCount();
    doc["warnings"] = getWarningCount();
    doc["firmware_stable"] = isFirmwareStable();
    doc["rollback_recommended"] = shouldRollbackFirmware();

    JsonObject subsystems = doc.createNestedObject("subsystems");

    for (const auto& [name, health] : _subsystemHealth) {
        JsonObject subsystem = subsystems.createNestedObject(name);
        subsystem["status"] = healthStatusToString(health.status);
        subsystem["critical"] = health.isCritical;
        subsystem["last_check"] = health.lastCheck;

        if (!health.lastError.isEmpty()) {
            subsystem["error"] = health.lastError;
        }
    }

    String report;
    serializeJsonPretty(doc, report);
    return report;
}

String HealthMonitor::getHealthSummary() const {
    HealthStatus overall = calculateOverallHealth();
    uint8_t critical = getCriticalFailureCount();
    uint8_t warnings = getWarningCount();

    String summary = "System Health: " + healthStatusToString(overall);

    if (critical > 0) {
        summary += " (" + String(critical) + " critical failures)";
    }

    if (warnings > 0) {
        summary += " (" + String(warnings) + " warnings)";
    }

    if (shouldRollbackFirmware()) {
        summary += " - ROLLBACK RECOMMENDED";
    }

    return summary;
}

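// Records the latest status, error message, and check timestamp for a known
// subsystem. Names that were never registered are ignored.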
void HealthMonitor::updateSubsystemHealth(const String& name, HealthStatus status, const String& error) {
    auto it = _subsystemHealth.find(name);
    if (it != _subsystemHealth.end()) {
        it->second.status = status;
        it->second.lastError = error;
        it->second.lastCheck = millis();

        LOG_VERBOSE("🔍 %s: %s %s",
                    name.c_str(),
                    healthStatusToString(status).c_str(),
                    error.isEmpty() ? "" : ("(" + error + ")").c_str());
    }
}

bool HealthMonitor::areCriticalSubsystemsHealthy() const {
    for (const auto& [name, health] : _subsystemHealth) {
        if (health.isCritical &&
            (health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED)) {
            return false;
        }
    }
    return true;
}

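// Overall health is the worst observed level: any critical failure yields
// CRITICAL, otherwise any warning yields WARNING, otherwise HEALTHY.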
HealthStatus HealthMonitor::calculateOverallHealth() const {
    bool hasCriticalFailures = (getCriticalFailureCount() > 0);
    bool hasWarnings = (getWarningCount() > 0);

    if (hasCriticalFailures) {
        return HealthStatus::CRITICAL;
    } else if (hasWarnings) {
        return HealthStatus::WARNING;
    } else {
        return HealthStatus::HEALTHY;
    }
}

String HealthMonitor::healthStatusToString(HealthStatus status) const {
    switch (status) {
        case HealthStatus::HEALTHY:
            return "HEALTHY";
        case HealthStatus::WARNING:
            return "WARNING";
        case HealthStatus::CRITICAL:
            return "CRITICAL";
        case HealthStatus::FAILED:
            return "FAILED";
        default:
            return "UNKNOWN";
    }
}