/* * ═══════════════════════════════════════════════════════════════════════════════════ * HEALTHMONITOR.HPP - System Health Monitoring and Firmware Validation * ═══════════════════════════════════════════════════════════════════════════════════ * * 🏥 THE SYSTEM HEALTH GUARDIAN OF VESPER 🏥 * * This class provides comprehensive system health monitoring across all subsystems. * It determines whether the current firmware is stable and functional, or if a * rollback to the previous firmware version should be performed. * * 🏗️ ARCHITECTURE: * • Periodic health checks across all major subsystems * • Critical vs non-critical failure classification * • Firmware stability validation and rollback decision making * • Centralized health status reporting * • Thread-safe operation with configurable check intervals * * 🔍 MONITORED SUBSYSTEMS: * • BellEngine: Core timing and bell control system * • OutputManager: Hardware abstraction layer * • Communication: MQTT, WebSocket, and UDP protocols * • Player: Melody playback management * • TimeKeeper: RTC and time synchronization * • Telemetry: System monitoring and analytics * • OTAManager: Firmware update management * • Networking: Network connectivity management * • ConfigManager: Configuration and persistence * • FileManager: SD card and file operations * * 🚨 FAILURE CLASSIFICATION: * • CRITICAL: Failures that make the device unusable * • WARNING: Failures that affect functionality but allow operation * • INFO: Minor issues that don't affect core functionality * * 🔄 FIRMWARE VALIDATION: * • Boot-time stability check * • Runtime health monitoring * • Automatic rollback decision making * • Health status persistence * * 📋 VERSION: 1.0 (Initial health monitoring system) * 📅 DATE: 2025 * 👨‍💻 AUTHOR: Advanced Bell Systems * ═══════════════════════════════════════════════════════════════════════════════════ */ #pragma once #include #include #include #include "freertos/FreeRTOS.h" #include "freertos/task.h" #include "../Logging/Logging.hpp" // Forward declarations for all monitored subsystems class BellEngine; class OutputManager; class CommunicationRouter; class Player; class Timekeeper; class Telemetry; class OTAManager; class Networking; class ConfigManager; class FileManager; /** * @enum HealthStatus * @brief Health status levels for subsystems */ enum class HealthStatus { HEALTHY, // System is functioning normally WARNING, // System has minor issues but is operational CRITICAL, // System has major issues affecting functionality FAILED // System is non-functional }; /** * @struct SubsystemHealth * @brief Health information for a single subsystem */ struct SubsystemHealth { String name; // Subsystem name HealthStatus status; // Current health status String lastError; // Last error message (if any) unsigned long lastCheck; // Timestamp of last health check bool isCritical; // Whether this subsystem is critical for operation // Default constructor for std::map compatibility SubsystemHealth() : name(""), status(HealthStatus::HEALTHY), lastCheck(0), isCritical(false) {} SubsystemHealth(const String& n, bool critical = false) : name(n), status(HealthStatus::HEALTHY), lastCheck(0), isCritical(critical) {} }; /** * @class HealthMonitor * @brief Comprehensive system health monitoring and firmware validation * * The HealthMonitor continuously monitors all subsystems to ensure the firmware * is stable and functional. It can make decisions about firmware rollbacks * based on the overall system health. */ class HealthMonitor { public: // ═══════════════════════════════════════════════════════════════════════════════ // CONSTRUCTOR & INITIALIZATION // ═══════════════════════════════════════════════════════════════════════════════ /** * @brief Constructor - Initialize health monitoring system */ HealthMonitor(); /** * @brief Destructor - Clean up resources */ ~HealthMonitor(); /** * @brief Initialize health monitoring system * @return true if initialization successful */ bool begin(); // ═══════════════════════════════════════════════════════════════════════════════ // SUBSYSTEM REGISTRATION // ═══════════════════════════════════════════════════════════════════════════════ /** @brief Register BellEngine for monitoring */ void setBellEngine(BellEngine* bellEngine) { _bellEngine = bellEngine; } /** @brief Register OutputManager for monitoring */ void setOutputManager(OutputManager* outputManager) { _outputManager = outputManager; } /** @brief Register Communication for monitoring */ void setCommunication(CommunicationRouter* communication) { _communication = communication; } /** @brief Register Player for monitoring */ void setPlayer(Player* player) { _player = player; } /** @brief Register TimeKeeper for monitoring */ void setTimeKeeper(Timekeeper* timeKeeper) { _timeKeeper = timeKeeper; } /** @brief Register Telemetry for monitoring */ void setTelemetry(Telemetry* telemetry) { _telemetry = telemetry; } /** @brief Register OTAManager for monitoring */ void setOTAManager(OTAManager* otaManager) { _otaManager = otaManager; } /** @brief Register Networking for monitoring */ void setNetworking(Networking* networking) { _networking = networking; } /** @brief Register ConfigManager for monitoring */ void setConfigManager(ConfigManager* configManager) { _configManager = configManager; } /** @brief Register FileManager for monitoring */ void setFileManager(FileManager* fileManager) { _fileManager = fileManager; } // ═══════════════════════════════════════════════════════════════════════════════ // HEALTH CHECK METHODS // ═══════════════════════════════════════════════════════════════════════════════ /** * @brief Perform comprehensive health check on all subsystems * @return Overall system health status */ HealthStatus performFullHealthCheck(); /** * @brief Perform health check on a specific subsystem * @param subsystemName Name of the subsystem to check * @return Health status of the specified subsystem */ HealthStatus checkSubsystemHealth(const String& subsystemName); /** * @brief Get current health status of all subsystems * @return Map of subsystem names to their health information */ const std::map& getAllSubsystemHealth() const; /** * @brief Get health status of a specific subsystem * @param subsystemName Name of the subsystem * @return Health information for the subsystem */ SubsystemHealth getSubsystemHealth(const String& subsystemName) const; // ═══════════════════════════════════════════════════════════════════════════════ // FIRMWARE VALIDATION // ═══════════════════════════════════════════════════════════════════════════════ /** * @brief Check if current firmware is stable and should be kept * @return true if firmware is stable, false if rollback is recommended */ bool isFirmwareStable() const; /** * @brief Get the number of critical failures detected * @return Count of subsystems with critical failures */ uint8_t getCriticalFailureCount() const; /** * @brief Get the number of warning-level issues detected * @return Count of subsystems with warning-level issues */ uint8_t getWarningCount() const; /** * @brief Check if a firmware rollback is recommended * @return true if rollback is recommended due to critical failures */ bool shouldRollbackFirmware() const; // ═══════════════════════════════════════════════════════════════════════════════ // HEALTH REPORTING // ═══════════════════════════════════════════════════════════════════════════════ /** * @brief Generate a comprehensive health report * @return JSON string containing detailed health information */ String generateHealthReport() const; /** * @brief Get a summary of system health * @return Brief health summary string */ String getHealthSummary() const; // ═══════════════════════════════════════════════════════════════════════════════ // CONFIGURATION // ═══════════════════════════════════════════════════════════════════════════════ /** * @brief Set health check interval * @param intervalMs Interval between health checks in milliseconds */ void setHealthCheckInterval(unsigned long intervalMs) { _healthCheckInterval = intervalMs; } /** * @brief Enable or disable automatic health monitoring * @param enabled Whether to enable automatic monitoring */ void setAutoMonitoring(bool enabled) { _autoMonitoring = enabled; } private: // ═══════════════════════════════════════════════════════════════════════════════ // SUBSYSTEM REFERENCES // ═══════════════════════════════════════════════════════════════════════════════ BellEngine* _bellEngine = nullptr; OutputManager* _outputManager = nullptr; CommunicationRouter* _communication = nullptr; Player* _player = nullptr; Timekeeper* _timeKeeper = nullptr; Telemetry* _telemetry = nullptr; OTAManager* _otaManager = nullptr; Networking* _networking = nullptr; ConfigManager* _configManager = nullptr; FileManager* _fileManager = nullptr; // ═══════════════════════════════════════════════════════════════════════════════ // HEALTH MONITORING STATE // ═══════════════════════════════════════════════════════════════════════════════ std::map _subsystemHealth; TaskHandle_t _monitoringTaskHandle = nullptr; unsigned long _healthCheckInterval = 300000; // 5 minutes default bool _autoMonitoring = true; // ═══════════════════════════════════════════════════════════════════════════════ // PRIVATE HELPER METHODS // ═══════════════════════════════════════════════════════════════════════════════ /** * @brief Initialize all subsystem health entries */ void initializeSubsystemHealth(); /** * @brief Monitoring task function */ static void monitoringTask(void* parameter); /** * @brief Main monitoring loop */ void monitoringLoop(); /** * @brief Update health status for a specific subsystem */ void updateSubsystemHealth(const String& name, HealthStatus status, const String& error = ""); /** * @brief Check if enough critical subsystems are healthy */ bool areCriticalSubsystemsHealthy() const; /** * @brief Calculate overall system health based on subsystem status */ HealthStatus calculateOverallHealth() const; /** * @brief Convert health status to string */ String healthStatusToString(HealthStatus status) const; };