/*
 * ═══════════════════════════════════════════════════════════════════════════════════
 * HEALTHMONITOR.HPP - System Health Monitoring and Firmware Validation
 * ═══════════════════════════════════════════════════════════════════════════════════
 *
 * 🏥 THE SYSTEM HEALTH GUARDIAN OF VESPER 🏥
 *
 * This class provides comprehensive system health monitoring across all subsystems.
 * It determines whether the current firmware is stable and functional, or if a
 * rollback to the previous firmware version should be performed.
 *
 * 🏗️ ARCHITECTURE:
 * • Periodic health checks across all major subsystems
 * • Critical vs non-critical failure classification
 * • Firmware stability validation and rollback decision making
 * • Centralized health status reporting
 * • Thread-safe operation with configurable check intervals
 *
 * 🔍 MONITORED SUBSYSTEMS:
 * • BellEngine: Core timing and bell control system
 * • OutputManager: Hardware abstraction layer
 * • Communication: MQTT, WebSocket, and UDP protocols
 * • Player: Melody playback management
 * • TimeKeeper: RTC and time synchronization
 * • Telemetry: System monitoring and analytics
 * • OTAManager: Firmware update management
 * • Networking: Network connectivity management
 * • ConfigManager: Configuration and persistence
 * • FileManager: SD card and file operations
 *
 * 🚨 FAILURE CLASSIFICATION:
 * • CRITICAL: Failures that make the device unusable
 * • WARNING: Failures that affect functionality but allow operation
 * • INFO: Minor issues that don't affect core functionality
 *
 * 🔄 FIRMWARE VALIDATION:
 * • Boot-time stability check
 * • Runtime health monitoring
 * • Automatic rollback decision making
 * • Health status persistence
 *
 * 📋 VERSION: 1.0 (Initial health monitoring system)
 * 📅 DATE: 2025
 * 👨‍💻 AUTHOR: Advanced Bell Systems
 * ═══════════════════════════════════════════════════════════════════════════════════
 */

#pragma once

#include <Arduino.h>

#include <map>
#include <vector>

// FreeRTOS.h must be included before any other FreeRTOS header.
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"

#include "../Logging/Logging.hpp"

// Forward declarations for all monitored subsystems.
// Only pointers to these types are used in this header, so the full
// definitions do not need to be included here.
class BellEngine;
class OutputManager;
class CommunicationRouter;
class Player;
class Timekeeper;
class Telemetry;
class OTAManager;
class Networking;
class ConfigManager;
class FileManager;

/**
 * @enum HealthStatus
 * @brief Health status levels for subsystems
 *
 * Enumerators are declared in order of increasing severity, so their
 * underlying values grow from HEALTHY (0) to FAILED (3).
 */
enum class HealthStatus {
    HEALTHY,   // System is functioning normally
    WARNING,   // System has minor issues but is operational
    CRITICAL,  // System has major issues affecting functionality
    FAILED     // System is non-functional
};

/**
|
|
* @struct SubsystemHealth
|
|
* @brief Health information for a single subsystem
|
|
*/
|
|
struct SubsystemHealth {
|
|
String name; // Subsystem name
|
|
HealthStatus status; // Current health status
|
|
String lastError; // Last error message (if any)
|
|
unsigned long lastCheck; // Timestamp of last health check
|
|
bool isCritical; // Whether this subsystem is critical for operation
|
|
|
|
// Default constructor for std::map compatibility
|
|
SubsystemHealth()
|
|
: name(""), status(HealthStatus::HEALTHY), lastCheck(0), isCritical(false) {}
|
|
|
|
SubsystemHealth(const String& n, bool critical = false)
|
|
: name(n), status(HealthStatus::HEALTHY), lastCheck(0), isCritical(critical) {}
|
|
};
|
|
|
|
/**
|
|
* @class HealthMonitor
|
|
* @brief Comprehensive system health monitoring and firmware validation
|
|
*
|
|
* The HealthMonitor continuously monitors all subsystems to ensure the firmware
|
|
* is stable and functional. It can make decisions about firmware rollbacks
|
|
* based on the overall system health.
|
|
*/
|
|
class HealthMonitor {
|
|
public:
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// CONSTRUCTOR & INITIALIZATION
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
/**
|
|
* @brief Constructor - Initialize health monitoring system
|
|
*/
|
|
HealthMonitor();
|
|
|
|
/**
|
|
* @brief Destructor - Clean up resources
|
|
*/
|
|
~HealthMonitor();
|
|
|
|
/**
|
|
* @brief Initialize health monitoring system
|
|
* @return true if initialization successful
|
|
*/
|
|
bool begin();
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// SUBSYSTEM REGISTRATION
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
/** @brief Register BellEngine for monitoring */
|
|
void setBellEngine(BellEngine* bellEngine) { _bellEngine = bellEngine; }
|
|
|
|
/** @brief Register OutputManager for monitoring */
|
|
void setOutputManager(OutputManager* outputManager) { _outputManager = outputManager; }
|
|
|
|
/** @brief Register Communication for monitoring */
|
|
void setCommunication(CommunicationRouter* communication) { _communication = communication; }
|
|
|
|
/** @brief Register Player for monitoring */
|
|
void setPlayer(Player* player) { _player = player; }
|
|
|
|
/** @brief Register TimeKeeper for monitoring */
|
|
void setTimeKeeper(Timekeeper* timeKeeper) { _timeKeeper = timeKeeper; }
|
|
|
|
/** @brief Register Telemetry for monitoring */
|
|
void setTelemetry(Telemetry* telemetry) { _telemetry = telemetry; }
|
|
|
|
/** @brief Register OTAManager for monitoring */
|
|
void setOTAManager(OTAManager* otaManager) { _otaManager = otaManager; }
|
|
|
|
/** @brief Register Networking for monitoring */
|
|
void setNetworking(Networking* networking) { _networking = networking; }
|
|
|
|
/** @brief Register ConfigManager for monitoring */
|
|
void setConfigManager(ConfigManager* configManager) { _configManager = configManager; }
|
|
|
|
/** @brief Register FileManager for monitoring */
|
|
void setFileManager(FileManager* fileManager) { _fileManager = fileManager; }
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// HEALTH CHECK METHODS
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
/**
|
|
* @brief Perform comprehensive health check on all subsystems
|
|
* @return Overall system health status
|
|
*/
|
|
HealthStatus performFullHealthCheck();
|
|
|
|
/**
|
|
* @brief Perform health check on a specific subsystem
|
|
* @param subsystemName Name of the subsystem to check
|
|
* @return Health status of the specified subsystem
|
|
*/
|
|
HealthStatus checkSubsystemHealth(const String& subsystemName);
|
|
|
|
/**
|
|
* @brief Get current health status of all subsystems
|
|
* @return Map of subsystem names to their health information
|
|
*/
|
|
const std::map<String, SubsystemHealth>& getAllSubsystemHealth() const;
|
|
|
|
/**
|
|
* @brief Get health status of a specific subsystem
|
|
* @param subsystemName Name of the subsystem
|
|
* @return Health information for the subsystem
|
|
*/
|
|
SubsystemHealth getSubsystemHealth(const String& subsystemName) const;
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// FIRMWARE VALIDATION
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
/**
|
|
* @brief Check if current firmware is stable and should be kept
|
|
* @return true if firmware is stable, false if rollback is recommended
|
|
*/
|
|
bool isFirmwareStable() const;
|
|
|
|
/**
|
|
* @brief Get the number of critical failures detected
|
|
* @return Count of subsystems with critical failures
|
|
*/
|
|
uint8_t getCriticalFailureCount() const;
|
|
|
|
/**
|
|
* @brief Get the number of warning-level issues detected
|
|
* @return Count of subsystems with warning-level issues
|
|
*/
|
|
uint8_t getWarningCount() const;
|
|
|
|
/**
|
|
* @brief Check if a firmware rollback is recommended
|
|
* @return true if rollback is recommended due to critical failures
|
|
*/
|
|
bool shouldRollbackFirmware() const;
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// HEALTH REPORTING
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
/**
|
|
* @brief Generate a comprehensive health report
|
|
* @return JSON string containing detailed health information
|
|
*/
|
|
String generateHealthReport() const;
|
|
|
|
/**
|
|
* @brief Get a summary of system health
|
|
* @return Brief health summary string
|
|
*/
|
|
String getHealthSummary() const;
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// CONFIGURATION
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
/**
|
|
* @brief Set health check interval
|
|
* @param intervalMs Interval between health checks in milliseconds
|
|
*/
|
|
void setHealthCheckInterval(unsigned long intervalMs) { _healthCheckInterval = intervalMs; }
|
|
|
|
/**
|
|
* @brief Enable or disable automatic health monitoring
|
|
* @param enabled Whether to enable automatic monitoring
|
|
*/
|
|
void setAutoMonitoring(bool enabled) { _autoMonitoring = enabled; }
|
|
|
|
private:
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// SUBSYSTEM REFERENCES
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
BellEngine* _bellEngine = nullptr;
|
|
OutputManager* _outputManager = nullptr;
|
|
CommunicationRouter* _communication = nullptr;
|
|
Player* _player = nullptr;
|
|
Timekeeper* _timeKeeper = nullptr;
|
|
Telemetry* _telemetry = nullptr;
|
|
OTAManager* _otaManager = nullptr;
|
|
Networking* _networking = nullptr;
|
|
ConfigManager* _configManager = nullptr;
|
|
FileManager* _fileManager = nullptr;
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// HEALTH MONITORING STATE
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
std::map<String, SubsystemHealth> _subsystemHealth;
|
|
TaskHandle_t _monitoringTaskHandle = nullptr;
|
|
unsigned long _healthCheckInterval = 300000; // 5 minutes default
|
|
bool _autoMonitoring = true;
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// PRIVATE HELPER METHODS
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
/**
|
|
* @brief Initialize all subsystem health entries
|
|
*/
|
|
void initializeSubsystemHealth();
|
|
|
|
/**
|
|
* @brief Monitoring task function
|
|
*/
|
|
static void monitoringTask(void* parameter);
|
|
|
|
/**
|
|
* @brief Main monitoring loop
|
|
*/
|
|
void monitoringLoop();
|
|
|
|
/**
|
|
* @brief Update health status for a specific subsystem
|
|
*/
|
|
void updateSubsystemHealth(const String& name, HealthStatus status, const String& error = "");
|
|
|
|
/**
|
|
* @brief Check if enough critical subsystems are healthy
|
|
*/
|
|
bool areCriticalSubsystemsHealthy() const;
|
|
|
|
/**
|
|
* @brief Calculate overall system health based on subsystem status
|
|
*/
|
|
HealthStatus calculateOverallHealth() const;
|
|
|
|
/**
|
|
* @brief Convert health status to string
|
|
*/
|
|
String healthStatusToString(HealthStatus status) const;
|
|
};
|