Complete Rebuild, with Subsystems for each component. RTOS Tasks. (help by Claude)
This commit is contained in:
314
vesper/src/HealthMonitor/HealthMonitor.hpp
Normal file
314
vesper/src/HealthMonitor/HealthMonitor.hpp
Normal file
@@ -0,0 +1,314 @@
|
||||
/*
|
||||
* ═══════════════════════════════════════════════════════════════════════════════════
|
||||
* HEALTHMONITOR.HPP - System Health Monitoring and Firmware Validation
|
||||
* ═══════════════════════════════════════════════════════════════════════════════════
|
||||
*
|
||||
* 🏥 THE SYSTEM HEALTH GUARDIAN OF VESPER 🏥
|
||||
*
|
||||
* This class provides comprehensive system health monitoring across all subsystems.
|
||||
* It determines whether the current firmware is stable and functional, or if a
|
||||
* rollback to the previous firmware version should be performed.
|
||||
*
|
||||
* 🏗️ ARCHITECTURE:
|
||||
* • Periodic health checks across all major subsystems
|
||||
* • Critical vs non-critical failure classification
|
||||
* • Firmware stability validation and rollback decision making
|
||||
* • Centralized health status reporting
|
||||
* • Thread-safe operation with configurable check intervals
|
||||
*
|
||||
* 🔍 MONITORED SUBSYSTEMS:
|
||||
* • BellEngine: Core timing and bell control system
|
||||
* • OutputManager: Hardware abstraction layer
|
||||
* • Communication: MQTT, WebSocket, and UDP protocols
|
||||
* • Player: Melody playback management
|
||||
* • TimeKeeper: RTC and time synchronization
|
||||
* • Telemetry: System monitoring and analytics
|
||||
* • OTAManager: Firmware update management
|
||||
* • Networking: Network connectivity management
|
||||
* • ConfigManager: Configuration and persistence
|
||||
* • FileManager: SD card and file operations
|
||||
*
|
||||
* 🚨 FAILURE CLASSIFICATION:
|
||||
* • CRITICAL: Failures that make the device unusable
|
||||
* • WARNING: Failures that affect functionality but allow operation
|
||||
* • INFO: Minor issues that don't affect core functionality
|
||||
*
|
||||
* 🔄 FIRMWARE VALIDATION:
|
||||
* • Boot-time stability check
|
||||
* • Runtime health monitoring
|
||||
* • Automatic rollback decision making
|
||||
* • Health status persistence
|
||||
*
|
||||
* 📋 VERSION: 1.0 (Initial health monitoring system)
|
||||
* 📅 DATE: 2025
|
||||
* 👨‍💻 AUTHOR: Advanced Bell Systems
|
||||
* ═══════════════════════════════════════════════════════════════════════════════════
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <Arduino.h>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#include "../Logging/Logging.hpp"
|
||||
|
||||
// Monitored subsystem types, forward-declared in alphabetical order.
// This header only stores pointers to them, so incomplete types are enough.
class BellEngine;
class Communication;
class ConfigManager;
class FileManager;
class Networking;
class OTAManager;
class OutputManager;
class Player;
class Telemetry;
class Timekeeper;
|
||||
|
||||
/**
 * @enum HealthStatus
 * @brief Severity levels reported for a subsystem
 *
 * Enumerators are listed from least to most severe; the explicit values
 * match the order in which they are declared.
 */
enum class HealthStatus {
    HEALTHY  = 0,  ///< System is functioning normally
    WARNING  = 1,  ///< System has minor issues but is operational
    CRITICAL = 2,  ///< System has major issues affecting functionality
    FAILED   = 3   ///< System is non-functional
};
|
||||
|
||||
/**
|
||||
* @struct SubsystemHealth
|
||||
* @brief Health information for a single subsystem
|
||||
*/
|
||||
struct SubsystemHealth {
|
||||
String name; // Subsystem name
|
||||
HealthStatus status; // Current health status
|
||||
String lastError; // Last error message (if any)
|
||||
unsigned long lastCheck; // Timestamp of last health check
|
||||
bool isCritical; // Whether this subsystem is critical for operation
|
||||
|
||||
// Default constructor for std::map compatibility
|
||||
SubsystemHealth()
|
||||
: name(""), status(HealthStatus::HEALTHY), lastCheck(0), isCritical(false) {}
|
||||
|
||||
SubsystemHealth(const String& n, bool critical = false)
|
||||
: name(n), status(HealthStatus::HEALTHY), lastCheck(0), isCritical(critical) {}
|
||||
};
|
||||
|
||||
/**
|
||||
* @class HealthMonitor
|
||||
* @brief Comprehensive system health monitoring and firmware validation
|
||||
*
|
||||
* The HealthMonitor continuously monitors all subsystems to ensure the firmware
|
||||
* is stable and functional. It can make decisions about firmware rollbacks
|
||||
* based on the overall system health.
|
||||
*/
|
||||
class HealthMonitor {
|
||||
public:
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// CONSTRUCTOR & INITIALIZATION
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* @brief Constructor - Initialize health monitoring system
|
||||
*/
|
||||
HealthMonitor();
|
||||
|
||||
/**
|
||||
* @brief Destructor - Clean up resources
|
||||
*/
|
||||
~HealthMonitor();
|
||||
|
||||
/**
|
||||
* @brief Initialize health monitoring system
|
||||
* @return true if initialization successful
|
||||
*/
|
||||
bool begin();
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// SUBSYSTEM REGISTRATION
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/** @brief Register BellEngine for monitoring */
|
||||
void setBellEngine(BellEngine* bellEngine) { _bellEngine = bellEngine; }
|
||||
|
||||
/** @brief Register OutputManager for monitoring */
|
||||
void setOutputManager(OutputManager* outputManager) { _outputManager = outputManager; }
|
||||
|
||||
/** @brief Register Communication for monitoring */
|
||||
void setCommunication(Communication* communication) { _communication = communication; }
|
||||
|
||||
/** @brief Register Player for monitoring */
|
||||
void setPlayer(Player* player) { _player = player; }
|
||||
|
||||
/** @brief Register TimeKeeper for monitoring */
|
||||
void setTimeKeeper(Timekeeper* timeKeeper) { _timeKeeper = timeKeeper; }
|
||||
|
||||
/** @brief Register Telemetry for monitoring */
|
||||
void setTelemetry(Telemetry* telemetry) { _telemetry = telemetry; }
|
||||
|
||||
/** @brief Register OTAManager for monitoring */
|
||||
void setOTAManager(OTAManager* otaManager) { _otaManager = otaManager; }
|
||||
|
||||
/** @brief Register Networking for monitoring */
|
||||
void setNetworking(Networking* networking) { _networking = networking; }
|
||||
|
||||
/** @brief Register ConfigManager for monitoring */
|
||||
void setConfigManager(ConfigManager* configManager) { _configManager = configManager; }
|
||||
|
||||
/** @brief Register FileManager for monitoring */
|
||||
void setFileManager(FileManager* fileManager) { _fileManager = fileManager; }
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// HEALTH CHECK METHODS
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* @brief Perform comprehensive health check on all subsystems
|
||||
* @return Overall system health status
|
||||
*/
|
||||
HealthStatus performFullHealthCheck();
|
||||
|
||||
/**
|
||||
* @brief Perform health check on a specific subsystem
|
||||
* @param subsystemName Name of the subsystem to check
|
||||
* @return Health status of the specified subsystem
|
||||
*/
|
||||
HealthStatus checkSubsystemHealth(const String& subsystemName);
|
||||
|
||||
/**
|
||||
* @brief Get current health status of all subsystems
|
||||
* @return Map of subsystem names to their health information
|
||||
*/
|
||||
const std::map<String, SubsystemHealth>& getAllSubsystemHealth() const;
|
||||
|
||||
/**
|
||||
* @brief Get health status of a specific subsystem
|
||||
* @param subsystemName Name of the subsystem
|
||||
* @return Health information for the subsystem
|
||||
*/
|
||||
SubsystemHealth getSubsystemHealth(const String& subsystemName) const;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// FIRMWARE VALIDATION
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* @brief Check if current firmware is stable and should be kept
|
||||
* @return true if firmware is stable, false if rollback is recommended
|
||||
*/
|
||||
bool isFirmwareStable() const;
|
||||
|
||||
/**
|
||||
* @brief Get the number of critical failures detected
|
||||
* @return Count of subsystems with critical failures
|
||||
*/
|
||||
uint8_t getCriticalFailureCount() const;
|
||||
|
||||
/**
|
||||
* @brief Get the number of warning-level issues detected
|
||||
* @return Count of subsystems with warning-level issues
|
||||
*/
|
||||
uint8_t getWarningCount() const;
|
||||
|
||||
/**
|
||||
* @brief Check if a firmware rollback is recommended
|
||||
* @return true if rollback is recommended due to critical failures
|
||||
*/
|
||||
bool shouldRollbackFirmware() const;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// HEALTH REPORTING
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* @brief Generate a comprehensive health report
|
||||
* @return JSON string containing detailed health information
|
||||
*/
|
||||
String generateHealthReport() const;
|
||||
|
||||
/**
|
||||
* @brief Get a summary of system health
|
||||
* @return Brief health summary string
|
||||
*/
|
||||
String getHealthSummary() const;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// CONFIGURATION
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* @brief Set health check interval
|
||||
* @param intervalMs Interval between health checks in milliseconds
|
||||
*/
|
||||
void setHealthCheckInterval(unsigned long intervalMs) { _healthCheckInterval = intervalMs; }
|
||||
|
||||
/**
|
||||
* @brief Enable or disable automatic health monitoring
|
||||
* @param enabled Whether to enable automatic monitoring
|
||||
*/
|
||||
void setAutoMonitoring(bool enabled) { _autoMonitoring = enabled; }
|
||||
|
||||
private:
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// SUBSYSTEM REFERENCES
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
BellEngine* _bellEngine = nullptr;
|
||||
OutputManager* _outputManager = nullptr;
|
||||
Communication* _communication = nullptr;
|
||||
Player* _player = nullptr;
|
||||
Timekeeper* _timeKeeper = nullptr;
|
||||
Telemetry* _telemetry = nullptr;
|
||||
OTAManager* _otaManager = nullptr;
|
||||
Networking* _networking = nullptr;
|
||||
ConfigManager* _configManager = nullptr;
|
||||
FileManager* _fileManager = nullptr;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// HEALTH MONITORING STATE
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
std::map<String, SubsystemHealth> _subsystemHealth;
|
||||
TaskHandle_t _monitoringTaskHandle = nullptr;
|
||||
unsigned long _healthCheckInterval = 300000; // 5 minutes default
|
||||
bool _autoMonitoring = true;
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
// PRIVATE HELPER METHODS
|
||||
// ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
/**
|
||||
* @brief Initialize all subsystem health entries
|
||||
*/
|
||||
void initializeSubsystemHealth();
|
||||
|
||||
/**
|
||||
* @brief Monitoring task function
|
||||
*/
|
||||
static void monitoringTask(void* parameter);
|
||||
|
||||
/**
|
||||
* @brief Main monitoring loop
|
||||
*/
|
||||
void monitoringLoop();
|
||||
|
||||
/**
|
||||
* @brief Update health status for a specific subsystem
|
||||
*/
|
||||
void updateSubsystemHealth(const String& name, HealthStatus status, const String& error = "");
|
||||
|
||||
/**
|
||||
* @brief Check if enough critical subsystems are healthy
|
||||
*/
|
||||
bool areCriticalSubsystemsHealthy() const;
|
||||
|
||||
/**
|
||||
* @brief Calculate overall system health based on subsystem status
|
||||
*/
|
||||
HealthStatus calculateOverallHealth() const;
|
||||
|
||||
/**
|
||||
* @brief Convert health status to string
|
||||
*/
|
||||
String healthStatusToString(HealthStatus status) const;
|
||||
};
|
||||
Reference in New Issue
Block a user