/*
 * ═══════════════════════════════════════════════════════════════════════════════════
 * HEALTHMONITOR.CPP - System Health Monitoring Implementation
 * ═══════════════════════════════════════════════════════════════════════════════════
 */

#include "HealthMonitor.hpp"
#include "../BellEngine/BellEngine.hpp"
#include "../OutputManager/OutputManager.hpp"
#include "../Communication/Communication.hpp"
#include "../Player/Player.hpp"
#include "../TimeKeeper/TimeKeeper.hpp"
#include "../Telemetry/Telemetry.hpp"
#include "../OTAManager/OTAManager.hpp"
#include "../Networking/Networking.hpp"
#include "../ConfigManager/ConfigManager.hpp"
#include "../FileManager/FileManager.hpp"
#include <ArduinoJson.h>  // StaticJsonDocument, JsonObject, serializeJsonPretty

/**
 * @brief Constructs the monitor and registers the known subsystems.
 */
HealthMonitor::HealthMonitor() {
    initializeSubsystemHealth();
}

/**
 * @brief Deletes the monitoring task if one was created by begin().
 */
HealthMonitor::~HealthMonitor() {
    if (_monitoringTaskHandle != nullptr) {
        vTaskDelete(_monitoringTaskHandle);
        _monitoringTaskHandle = nullptr;
    }
}

/**
 * @brief Starts the health monitor.
 *
 * When auto-monitoring is enabled, a FreeRTOS task running monitoringTask()
 * is created on core 0. In manual mode nothing is started.
 *
 * @return true on success (including when the task is already running or
 *         when in manual mode), false if the task could not be created.
 */
bool HealthMonitor::begin() {
    LOG_INFO("🏥 Initializing Health Monitor System");

    if (!_autoMonitoring) {
        LOG_INFO("✅ Health Monitor initialized (manual mode)");
        return true;
    }

    // A second begin() must not spawn another task and overwrite the handle
    // of the first one, which could then never be deleted.
    if (_monitoringTaskHandle != nullptr) {
        LOG_WARNING("⚠️ Health Monitor task already running");
        return true;
    }

    // Create monitoring task since auto-monitoring is enabled
    const BaseType_t created = xTaskCreatePinnedToCore(
        monitoringTask,
        "HealthMonitor",
        4096,
        this,
        3,                       // Medium priority
        &_monitoringTaskHandle,
        0                        // Core 0 (different from BellEngine which uses Core 1)
    );

    if (created != pdPASS || _monitoringTaskHandle == nullptr) {
        LOG_ERROR("❌ Failed to create Health Monitor task");
        return false;
    }

    LOG_INFO("✅ Health Monitor initialized with automatic monitoring");
    return true;
}

/**
 * @brief Populates the subsystem table with one entry per known subsystem.
 *
 * The second constructor argument marks subsystems that must be healthy for
 * operation (critical) versus those that only produce warnings.
 */
void HealthMonitor::initializeSubsystemHealth() {
    // Critical subsystems
    _subsystemHealth["BellEngine"]    = SubsystemHealth("BellEngine", true);
    _subsystemHealth["OutputManager"] = SubsystemHealth("OutputManager", true);
    _subsystemHealth["ConfigManager"] = SubsystemHealth("ConfigManager", true);
    _subsystemHealth["FileManager"]   = SubsystemHealth("FileManager", true);
    _subsystemHealth["Player"]        = SubsystemHealth("Player", true);

    // Non-critical subsystems
    _subsystemHealth["Communication"] = SubsystemHealth("Communication", false);
    _subsystemHealth["TimeKeeper"]    = SubsystemHealth("TimeKeeper", false);
    _subsystemHealth["Telemetry"]     = SubsystemHealth("Telemetry", false);
    _subsystemHealth["OTAManager"]    = SubsystemHealth("OTAManager", false);
    _subsystemHealth["Networking"]    = SubsystemHealth("Networking", false);

    // size() is size_t; cast so the argument matches the %d conversion.
    LOG_DEBUG("🏗️ Initialized health monitoring for %d subsystems",
              static_cast<int>(_subsystemHealth.size()));
}

/**
 * @brief FreeRTOS task entry point; never returns.
 *
 * @param parameter The HealthMonitor instance passed to xTaskCreatePinnedToCore.
 */
void HealthMonitor::monitoringTask(void* parameter) {
    HealthMonitor* monitor = static_cast<HealthMonitor*>(parameter);

    LOG_INFO("🏥 Health Monitor task started on Core %d", xPortGetCoreID());

    while (true) {
        monitor->monitoringLoop();
        vTaskDelay(pdMS_TO_TICKS(monitor->_healthCheckInterval));
    }
}

/**
 * @brief One iteration of the periodic check: runs a full health check and
 *        logs critical failures, warnings and rollback recommendations.
 *
 * The check is skipped entirely while the player is not in the STOPPED state.
 */
void HealthMonitor::monitoringLoop() {
    if (_player && _player->_status != PlayerStatus::STOPPED) {
        LOG_VERBOSE("⏸️ Skipping health check during active playback");
        return;
    }

    LOG_VERBOSE("🔍 Performing periodic health check...");

    // Only the side effect (refreshing _subsystemHealth) is needed here; the
    // counts below are derived from the refreshed table.
    static_cast<void>(performFullHealthCheck());

    const uint8_t criticalCount = getCriticalFailureCount();
    const uint8_t warningCount = getWarningCount();

    if (criticalCount > 0) {
        LOG_WARNING("🚨 Health Monitor: %d critical failures detected!", criticalCount);

        // List critical failures
        for (const auto& [name, health] : _subsystemHealth) {
            if (health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED) {
                LOG_ERROR("❌ CRITICAL: %s - %s", name.c_str(), health.lastError.c_str());
            }
        }

        // Check if firmware rollback is recommended
        if (shouldRollbackFirmware()) {
            LOG_ERROR("🔄 FIRMWARE ROLLBACK RECOMMENDED - Too many critical failures");
            // In a real system, this would trigger an OTA rollback
            // For now, we just log the recommendation
        }
    } else if (warningCount > 0) {
        LOG_WARNING("⚠️ Health Monitor: %d warnings detected", warningCount);
    } else {
        LOG_VERBOSE("✅ All subsystems healthy");
    }
}

/**
 * @brief Queries isHealthy() on every connected subsystem and records the result.
 *
 * Subsystems whose pointer is not set are skipped and keep their previous
 * table entry. An unhealthy subsystem is recorded with the failure status
 * given below (CRITICAL or WARNING) and the error "<name> health check failed".
 *
 * @return The overall health derived from the refreshed table.
 */
HealthStatus HealthMonitor::performFullHealthCheck() {
    const unsigned long startTime = millis();
    uint8_t checkedSystems = 0;

    // Shared probe for all subsystems: skip if not connected, otherwise record
    // HEALTHY or the supplied failure status and count the check.
    const auto probe = [this, &checkedSystems](const char* name, const auto& subsystem,
                                               HealthStatus failureStatus) {
        if (!subsystem) {
            return;
        }
        if (subsystem->isHealthy()) {
            this->updateSubsystemHealth(name, HealthStatus::HEALTHY, "");
        } else {
            this->updateSubsystemHealth(name, failureStatus,
                                        String(name) + " health check failed");
        }
        ++checkedSystems;
    };

    probe("BellEngine",    _bellEngine,    HealthStatus::CRITICAL);
    probe("OutputManager", _outputManager, HealthStatus::CRITICAL);
    probe("Communication", _communication, HealthStatus::WARNING);
    probe("Player",        _player,        HealthStatus::CRITICAL);
    probe("TimeKeeper",    _timeKeeper,    HealthStatus::WARNING);
    probe("Telemetry",     _telemetry,     HealthStatus::WARNING);
    probe("OTAManager",    _otaManager,    HealthStatus::WARNING);
    probe("Networking",    _networking,    HealthStatus::WARNING);
    probe("ConfigManager", _configManager, HealthStatus::CRITICAL);
    probe("FileManager",   _fileManager,   HealthStatus::CRITICAL);

    const unsigned long elapsed = millis() - startTime;
    LOG_VERBOSE("🔍 Health check completed: %d systems in %lums", checkedSystems, elapsed);

    return calculateOverallHealth();
}

/**
 * @brief Runs the health check for a single subsystem and records the result.
 *
 * @param subsystemName Name of a registered subsystem (e.g. "BellEngine").
 * @return HEALTHY if the subsystem reports healthy; CRITICAL or WARNING
 *         (depending on the entry's isCritical flag) if it does not; FAILED
 *         if the name is unknown or the subsystem is not connected. In the
 *         FAILED cases the table entry is left untouched.
 */
HealthStatus HealthMonitor::checkSubsystemHealth(const String& subsystemName) {
    const auto it = _subsystemHealth.find(subsystemName);
    if (it == _subsystemHealth.end()) {
        LOG_WARNING("❓ Unknown subsystem: %s", subsystemName.c_str());
        return HealthStatus::FAILED;
    }

    bool healthy = false;

    // Dispatch to the matching subsystem, provided it is connected
    if (subsystemName == "BellEngine" && _bellEngine) {
        healthy = _bellEngine->isHealthy();
    } else if (subsystemName == "OutputManager" && _outputManager) {
        healthy = _outputManager->isHealthy();
    } else if (subsystemName == "Communication" && _communication) {
        healthy = _communication->isHealthy();
    } else if (subsystemName == "Player" && _player) {
        healthy = _player->isHealthy();
    } else if (subsystemName == "TimeKeeper" && _timeKeeper) {
        healthy = _timeKeeper->isHealthy();
    } else if (subsystemName == "Telemetry" && _telemetry) {
        healthy = _telemetry->isHealthy();
    } else if (subsystemName == "OTAManager" && _otaManager) {
        healthy = _otaManager->isHealthy();
    } else if (subsystemName == "Networking" && _networking) {
        healthy = _networking->isHealthy();
    } else if (subsystemName == "ConfigManager" && _configManager) {
        healthy = _configManager->isHealthy();
    } else if (subsystemName == "FileManager" && _fileManager) {
        healthy = _fileManager->isHealthy();
    } else {
        LOG_WARNING("🔌 Subsystem %s not connected to health monitor", subsystemName.c_str());
        return HealthStatus::FAILED;
    }

    if (healthy) {
        updateSubsystemHealth(subsystemName, HealthStatus::HEALTHY, "");
        return HealthStatus::HEALTHY;
    }

    // Severity of a failed check follows the entry's criticality flag.
    const HealthStatus failureStatus =
        it->second.isCritical ? HealthStatus::CRITICAL : HealthStatus::WARNING;
    updateSubsystemHealth(subsystemName, failureStatus, subsystemName + " health check failed");
    return failureStatus;
}

/**
 * @brief Read-only access to the full subsystem table.
 *
 * NOTE(review): the template arguments were reconstructed from how
 * _subsystemHealth is used in this file (String keys, SubsystemHealth values);
 * confirm they match the declaration in HealthMonitor.hpp.
 */
const std::map<String, SubsystemHealth>& HealthMonitor::getAllSubsystemHealth() const {
    return _subsystemHealth;
}

/**
 * @brief Returns a copy of the table entry for one subsystem.
 *
 * @param subsystemName Name of the subsystem to look up.
 * @return The stored entry, or a placeholder with status FAILED and error
 *         "Subsystem not found" when the name is not registered.
 */
SubsystemHealth HealthMonitor::getSubsystemHealth(const String& subsystemName) const {
    const auto it = _subsystemHealth.find(subsystemName);
    if (it != _subsystemHealth.end()) {
        return it->second;
    }

    // Return default unhealthy status for unknown subsystems
    SubsystemHealth unknown(subsystemName);
    unknown.status = HealthStatus::FAILED;
    unknown.lastError = "Subsystem not found";
    return unknown;
}

/**
 * @brief True when no critical subsystem is in the CRITICAL or FAILED state.
 */
bool HealthMonitor::isFirmwareStable() const {
    // areCriticalSubsystemsHealthy() is false exactly when
    // getCriticalFailureCount() is non-zero, so one predicate suffices.
    return areCriticalSubsystemsHealthy();
}

/**
 * @brief Counts critical subsystems whose status is CRITICAL or FAILED.
 */
uint8_t HealthMonitor::getCriticalFailureCount() const {
    uint8_t count = 0;

    for (const auto& [name, health] : _subsystemHealth) {
        const bool failed = (health.status == HealthStatus::CRITICAL ||
                             health.status == HealthStatus::FAILED);
        if (health.isCritical && failed) {
            count++;
        }
    }

    return count;
}

/**
 * @brief Counts subsystems whose status is WARNING.
 */
uint8_t HealthMonitor::getWarningCount() const {
    uint8_t count = 0;

    for (const auto& [name, health] : _subsystemHealth) {
        if (health.status == HealthStatus::WARNING) {
            count++;
        }
    }

    return count;
}

/**
 * @brief Rollback is recommended when more than two critical subsystems have failed.
 */
bool HealthMonitor::shouldRollbackFirmware() const {
    // Threshold above which a firmware rollback is recommended.
    const uint8_t MAX_CRITICAL_FAILURES = 2;

    return getCriticalFailureCount() > MAX_CRITICAL_FAILURES;
}

/**
 * @brief Builds a pretty-printed JSON report of overall and per-subsystem health.
 *
 * The report contains the timestamp (millis()), overall health, failure and
 * warning counts, stability / rollback flags and one object per subsystem
 * with its status, criticality, last check time and (if set) last error.
 *
 * NOTE(review): the document is a fixed 2048-byte StaticJsonDocument; content
 * beyond that capacity would be dropped by ArduinoJson — confirm the size is
 * sufficient for the longest expected error strings.
 */
String HealthMonitor::generateHealthReport() const {
    StaticJsonDocument<2048> doc;

    doc["timestamp"] = millis();
    doc["overall_health"] = healthStatusToString(calculateOverallHealth());
    doc["critical_failures"] = getCriticalFailureCount();
    doc["warnings"] = getWarningCount();
    doc["firmware_stable"] = isFirmwareStable();
    doc["rollback_recommended"] = shouldRollbackFirmware();

    JsonObject subsystems = doc.createNestedObject("subsystems");

    for (const auto& [name, health] : _subsystemHealth) {
        JsonObject subsystem = subsystems.createNestedObject(name);
        subsystem["status"] = healthStatusToString(health.status);
        subsystem["critical"] = health.isCritical;
        subsystem["last_check"] = health.lastCheck;
        if (!health.lastError.isEmpty()) {
            subsystem["error"] = health.lastError;
        }
    }

    String report;
    serializeJsonPretty(doc, report);
    return report;
}

/**
 * @brief Builds a one-line human-readable summary of system health.
 *
 * Format: "System Health: <STATUS>" followed by optional
 * " (<n> critical failures)", " (<n> warnings)" and " - ROLLBACK RECOMMENDED".
 */
String HealthMonitor::getHealthSummary() const {
    const HealthStatus overall = calculateOverallHealth();
    const uint8_t critical = getCriticalFailureCount();
    const uint8_t warnings = getWarningCount();

    String summary = "System Health: " + healthStatusToString(overall);

    if (critical > 0) {
        summary += " (" + String(critical) + " critical failures)";
    }

    if (warnings > 0) {
        summary += " (" + String(warnings) + " warnings)";
    }

    if (shouldRollbackFirmware()) {
        summary += " - ROLLBACK RECOMMENDED";
    }

    return summary;
}

/**
 * @brief Stores a new status, error text and check time for one subsystem.
 *
 * @param name   Registered subsystem name; unknown names are logged and ignored.
 * @param status New health status.
 * @param error  Error description; empty when healthy.
 */
void HealthMonitor::updateSubsystemHealth(const String& name, HealthStatus status, const String& error) {
    const auto it = _subsystemHealth.find(name);
    if (it == _subsystemHealth.end()) {
        // Previously ignored silently; surface it the same way
        // checkSubsystemHealth() reports an unknown name.
        LOG_WARNING("❓ Unknown subsystem: %s", name.c_str());
        return;
    }

    it->second.status = status;
    it->second.lastError = error;
    it->second.lastCheck = millis();

    LOG_VERBOSE("🔍 %s: %s %s",
                name.c_str(),
                healthStatusToString(status).c_str(),
                error.isEmpty() ? "" : ("(" + error + ")").c_str());
}

/**
 * @brief True when no critical subsystem is in the CRITICAL or FAILED state.
 */
bool HealthMonitor::areCriticalSubsystemsHealthy() const {
    for (const auto& [name, health] : _subsystemHealth) {
        const bool failed = (health.status == HealthStatus::CRITICAL ||
                             health.status == HealthStatus::FAILED);
        if (health.isCritical && failed) {
            return false;
        }
    }
    return true;
}

/**
 * @brief Reduces the subsystem table to a single status.
 *
 * @return CRITICAL if any critical failure exists, otherwise WARNING if any
 *         subsystem is in WARNING, otherwise HEALTHY.
 */
HealthStatus HealthMonitor::calculateOverallHealth() const {
    if (getCriticalFailureCount() > 0) {
        return HealthStatus::CRITICAL;
    }
    if (getWarningCount() > 0) {
        return HealthStatus::WARNING;
    }
    return HealthStatus::HEALTHY;
}

/**
 * @brief Maps a HealthStatus to its upper-case name; "UNKNOWN" for any other value.
 */
String HealthMonitor::healthStatusToString(HealthStatus status) const {
    switch (status) {
        case HealthStatus::HEALTHY:
            return "HEALTHY";
        case HealthStatus::WARNING:
            return "WARNING";
        case HealthStatus::CRITICAL:
            return "CRITICAL";
        case HealthStatus::FAILED:
            return "FAILED";
        default:
            return "UNKNOWN";
    }
}