Complete rebuild, with subsystems for each component and RTOS tasks. (with help from Claude)

This commit is contained in:
2025-10-01 12:42:00 +03:00
parent 104c1d04d4
commit f696984cd1
57 changed files with 11757 additions and 2290 deletions

View File

@@ -0,0 +1,428 @@
/*
* ═══════════════════════════════════════════════════════════════════════════════════
* HEALTHMONITOR.CPP - System Health Monitoring Implementation
* ═══════════════════════════════════════════════════════════════════════════════════
*/
#include "HealthMonitor.hpp"
#include "../BellEngine/BellEngine.hpp"
#include "../OutputManager/OutputManager.hpp"
#include "../Communication/Communication.hpp"
#include "../Player/Player.hpp"
#include "../TimeKeeper/TimeKeeper.hpp"
#include "../Telemetry/Telemetry.hpp"
#include "../OTAManager/OTAManager.hpp"
#include "../Networking/Networking.hpp"
#include "../ConfigManager/ConfigManager.hpp"
#include "../FileManager/FileManager.hpp"
#include <ArduinoJson.h>
// Seed the health table so every subsystem has an entry before begin() runs.
HealthMonitor::HealthMonitor() {
    initializeSubsystemHealth();
}
// Tear down the background monitoring task, if begin() ever created one.
HealthMonitor::~HealthMonitor() {
    if (_monitoringTaskHandle == nullptr) {
        return;  // Manual mode, or begin() was never called.
    }
    vTaskDelete(_monitoringTaskHandle);
    _monitoringTaskHandle = nullptr;
}
bool HealthMonitor::begin() {
LOG_INFO("🏥 Initializing Health Monitor System");
// Create monitoring task if auto-monitoring is enabled
if (_autoMonitoring) {
xTaskCreatePinnedToCore(
monitoringTask,
"HealthMonitor",
4096,
this,
3, // Medium priority
&_monitoringTaskHandle,
0 // Core 0 (different from BellEngine which uses Core 1)
);
if (_monitoringTaskHandle != nullptr) {
LOG_INFO("✅ Health Monitor initialized with automatic monitoring");
return true;
} else {
LOG_ERROR("❌ Failed to create Health Monitor task");
return false;
}
} else {
LOG_INFO("✅ Health Monitor initialized (manual mode)");
return true;
}
}
/**
 * Register every monitored subsystem up front so later health queries
 * never encounter a missing map entry.  The flag marks subsystems that
 * are critical: ones that must be healthy for the device to operate.
 */
void HealthMonitor::initializeSubsystemHealth() {
    struct Registration {
        const char* name;
        bool isCritical;
    };
    static const Registration kSubsystems[] = {
        {"BellEngine",    true},
        {"OutputManager", true},
        {"ConfigManager", true},
        {"FileManager",   true},
        {"Communication", false},  // Non-critical
        {"Player",        true},
        {"TimeKeeper",    false},  // Non-critical
        {"Telemetry",     false},  // Non-critical
        {"OTAManager",    false},  // Non-critical
        {"Networking",    false},  // Non-critical
    };
    for (const auto& reg : kSubsystems) {
        _subsystemHealth[reg.name] = SubsystemHealth(reg.name, reg.isCritical);
    }
    // map::size() returns size_t; cast so it matches the %d specifier.
    LOG_DEBUG("🏗️ Initialized health monitoring for %d subsystems",
              static_cast<int>(_subsystemHealth.size()));
}
// FreeRTOS task trampoline: `parameter` is the HealthMonitor instance
// that spawned this task in begin().  Runs one health pass, then sleeps
// for the configured interval, forever (FreeRTOS tasks never return).
void HealthMonitor::monitoringTask(void* parameter) {
    auto* self = static_cast<HealthMonitor*>(parameter);
    LOG_INFO("🏥 Health Monitor task started on Core %d", xPortGetCoreID());
    for (;;) {
        self->monitoringLoop();
        vTaskDelay(pdMS_TO_TICKS(self->_healthCheckInterval));
    }
}
void HealthMonitor::monitoringLoop() {
if (_player) {
if (_player->_status != PlayerStatus::STOPPED) {
LOG_VERBOSE("⏸️ Skipping health check during active playback");
return;
}
}
LOG_VERBOSE("🔍 Performing periodic health check...");
HealthStatus overallHealth = performFullHealthCheck();
// Log warnings for any unhealthy subsystems
uint8_t criticalCount = getCriticalFailureCount();
uint8_t warningCount = getWarningCount();
if (criticalCount > 0) {
LOG_WARNING("🚨 Health Monitor: %d critical failures detected!", criticalCount);
// List critical failures
for (const auto& [name, health] : _subsystemHealth) {
if (health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED) {
LOG_ERROR("❌ CRITICAL: %s - %s", name.c_str(), health.lastError.c_str());
}
}
// Check if firmware rollback is recommended
if (shouldRollbackFirmware()) {
LOG_ERROR("🔄 FIRMWARE ROLLBACK RECOMMENDED - Too many critical failures");
// In a real system, this would trigger an OTA rollback
// For now, we just log the recommendation
}
} else if (warningCount > 0) {
LOG_WARNING("⚠️ Health Monitor: %d warnings detected", warningCount);
} else {
LOG_VERBOSE("✅ All subsystems healthy");
}
}
/**
 * Probe every attached subsystem, refresh its entry in _subsystemHealth,
 * and return the aggregated overall status.
 *
 * Subsystems that were never attached (null pointer) are skipped and keep
 * their last recorded state.  Critical subsystems degrade to CRITICAL on
 * failure; optional ones only to WARNING — same mapping as the original
 * ten copy-pasted stanzas this replaces.
 */
HealthStatus HealthMonitor::performFullHealthCheck() {
    const unsigned long startTime = millis();
    uint8_t checkedSystems = 0;
    // Probe one subsystem: ask isHealthy(), record the result, count it.
    // Generic lambda so it works for each subsystem pointer type.
    auto probe = [&](auto&& subsystem, const char* name, HealthStatus failStatus) {
        if (!subsystem) {
            return;  // Not attached to the monitor; nothing to check.
        }
        const bool healthy = subsystem->isHealthy();
        const String errorMsg =
            healthy ? String("") : String(name) + " health check failed";
        updateSubsystemHealth(name,
                              healthy ? HealthStatus::HEALTHY : failStatus,
                              errorMsg);
        checkedSystems++;
    };
    probe(_bellEngine,    "BellEngine",    HealthStatus::CRITICAL);
    probe(_outputManager, "OutputManager", HealthStatus::CRITICAL);
    probe(_communication, "Communication", HealthStatus::WARNING);
    probe(_player,        "Player",        HealthStatus::CRITICAL);
    probe(_timeKeeper,    "TimeKeeper",    HealthStatus::WARNING);
    probe(_telemetry,     "Telemetry",     HealthStatus::WARNING);
    probe(_otaManager,    "OTAManager",    HealthStatus::WARNING);
    probe(_networking,    "Networking",    HealthStatus::WARNING);
    probe(_configManager, "ConfigManager", HealthStatus::CRITICAL);
    probe(_fileManager,   "FileManager",   HealthStatus::CRITICAL);
    const unsigned long elapsed = millis() - startTime;
    LOG_VERBOSE("🔍 Health check completed: %d systems in %lums", checkedSystems, elapsed);
    return calculateOverallHealth();
}
/**
 * Run an on-demand health check for a single named subsystem and record
 * the result.  Returns FAILED for names that were never registered or
 * subsystems that are not attached to the monitor.
 */
HealthStatus HealthMonitor::checkSubsystemHealth(const String& subsystemName) {
    const auto entry = _subsystemHealth.find(subsystemName);
    if (entry == _subsystemHealth.end()) {
        LOG_WARNING("❓ Unknown subsystem: %s", subsystemName.c_str());
        return HealthStatus::FAILED;
    }
    // Probe result: 1 = healthy, 0 = unhealthy, -1 = not attached.
    const int probeResult = [&]() -> int {
        if (subsystemName == "BellEngine")
            return _bellEngine ? (_bellEngine->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "OutputManager")
            return _outputManager ? (_outputManager->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "Communication")
            return _communication ? (_communication->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "Player")
            return _player ? (_player->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "TimeKeeper")
            return _timeKeeper ? (_timeKeeper->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "Telemetry")
            return _telemetry ? (_telemetry->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "OTAManager")
            return _otaManager ? (_otaManager->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "Networking")
            return _networking ? (_networking->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "ConfigManager")
            return _configManager ? (_configManager->isHealthy() ? 1 : 0) : -1;
        if (subsystemName == "FileManager")
            return _fileManager ? (_fileManager->isHealthy() ? 1 : 0) : -1;
        return -1;
    }();
    if (probeResult < 0) {
        LOG_WARNING("🔌 Subsystem %s not connected to health monitor", subsystemName.c_str());
        return HealthStatus::FAILED;
    }
    const bool healthy = (probeResult == 1);
    // Unhealthy critical subsystems degrade to CRITICAL, others to WARNING.
    const HealthStatus status =
        healthy ? HealthStatus::HEALTHY
                : (entry->second.isCritical ? HealthStatus::CRITICAL : HealthStatus::WARNING);
    updateSubsystemHealth(subsystemName, status,
                          healthy ? "" : subsystemName + " health check failed");
    return status;
}
// Read-only view of the full health table, keyed by subsystem name.
const std::map<String, SubsystemHealth>& HealthMonitor::getAllSubsystemHealth() const {
    return _subsystemHealth;
}
/**
 * Return a copy of the health record for one subsystem.  Unknown names
 * yield a synthetic FAILED record rather than throwing or asserting.
 */
SubsystemHealth HealthMonitor::getSubsystemHealth(const String& subsystemName) const {
    const auto it = _subsystemHealth.find(subsystemName);
    if (it == _subsystemHealth.end()) {
        SubsystemHealth missing(subsystemName);
        missing.status = HealthStatus::FAILED;
        missing.lastError = "Subsystem not found";
        return missing;
    }
    return it->second;
}
/**
 * The firmware is considered stable when no critical subsystem is in
 * CRITICAL or FAILED state.
 *
 * NOTE: the original also ANDed `getCriticalFailureCount() == 0`, but
 * that predicate scans the same map with the same critical ∧ (CRITICAL ∨
 * FAILED) condition, so it is exactly equivalent — the redundant second
 * pass has been removed.
 */
bool HealthMonitor::isFirmwareStable() const {
    return areCriticalSubsystemsHealthy();
}
uint8_t HealthMonitor::getCriticalFailureCount() const {
uint8_t count = 0;
for (const auto& [name, health] : _subsystemHealth) {
if (health.isCritical &&
(health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED)) {
count++;
}
}
return count;
}
// Number of subsystems (critical or not) currently in WARNING state.
uint8_t HealthMonitor::getWarningCount() const {
    uint8_t warnings = 0;
    for (auto it = _subsystemHealth.cbegin(); it != _subsystemHealth.cend(); ++it) {
        if (it->second.status == HealthStatus::WARNING) {
            ++warnings;
        }
    }
    return warnings;
}
/**
 * Advisory: recommend a firmware rollback once more than
 * kMaxCriticalFailures critical subsystems have failed.  The threshold
 * is a local constant, tunable per system requirements.
 */
bool HealthMonitor::shouldRollbackFirmware() const {
    static constexpr uint8_t kMaxCriticalFailures = 2;
    return getCriticalFailureCount() > kMaxCriticalFailures;
}
/**
 * Serialize the current health state to a pretty-printed JSON string:
 * overall status, failure/warning counts, stability flags, and a
 * per-subsystem object with status, criticality, last-check time, and
 * (when present) the last error message.
 *
 * NOTE(review): StaticJsonDocument<2048> is a fixed-capacity buffer;
 * with ten subsystems it appears sufficient, but confirm it does not
 * silently truncate when many lastError strings are populated.
 */
String HealthMonitor::generateHealthReport() const {
    StaticJsonDocument<2048> doc;
    doc["timestamp"] = millis();
    doc["overall_health"] = healthStatusToString(calculateOverallHealth());
    doc["critical_failures"] = getCriticalFailureCount();
    doc["warnings"] = getWarningCount();
    doc["firmware_stable"] = isFirmwareStable();
    doc["rollback_recommended"] = shouldRollbackFirmware();
    JsonObject subsystems = doc.createNestedObject("subsystems");
    for (const auto& [name, health] : _subsystemHealth) {
        JsonObject subsystem = subsystems.createNestedObject(name);
        subsystem["status"] = healthStatusToString(health.status);
        subsystem["critical"] = health.isCritical;
        subsystem["last_check"] = health.lastCheck;
        // Omit the error field entirely when there is no error to report.
        if (!health.lastError.isEmpty()) {
            subsystem["error"] = health.lastError;
        }
    }
    String report;
    serializeJsonPretty(doc, report);
    return report;
}
/**
 * One-line, human-readable digest of system health, e.g.
 * "System Health: CRITICAL (1 critical failures) - ROLLBACK RECOMMENDED".
 */
String HealthMonitor::getHealthSummary() const {
    const uint8_t criticalFailures = getCriticalFailureCount();
    const uint8_t warningTotal = getWarningCount();
    String text = "System Health: " + healthStatusToString(calculateOverallHealth());
    if (criticalFailures > 0) {
        text += " (" + String(criticalFailures) + " critical failures)";
    }
    if (warningTotal > 0) {
        text += " (" + String(warningTotal) + " warnings)";
    }
    if (shouldRollbackFirmware()) {
        text += " - ROLLBACK RECOMMENDED";
    }
    return text;
}
void HealthMonitor::updateSubsystemHealth(const String& name, HealthStatus status, const String& error) {
auto it = _subsystemHealth.find(name);
if (it != _subsystemHealth.end()) {
it->second.status = status;
it->second.lastError = error;
it->second.lastCheck = millis();
LOG_VERBOSE("🔍 %s: %s %s",
name.c_str(),
healthStatusToString(status).c_str(),
error.isEmpty() ? "" : ("(" + error + ")").c_str());
}
}
bool HealthMonitor::areCriticalSubsystemsHealthy() const {
for (const auto& [name, health] : _subsystemHealth) {
if (health.isCritical &&
(health.status == HealthStatus::CRITICAL || health.status == HealthStatus::FAILED)) {
return false;
}
}
return true;
}
// Worst-first aggregation: any critical failure dominates, then warnings,
// otherwise the system is healthy.
HealthStatus HealthMonitor::calculateOverallHealth() const {
    if (getCriticalFailureCount() > 0) {
        return HealthStatus::CRITICAL;
    }
    if (getWarningCount() > 0) {
        return HealthStatus::WARNING;
    }
    return HealthStatus::HEALTHY;
}
// Fixed text labels for HealthStatus, used in logs and JSON reports.
String HealthMonitor::healthStatusToString(HealthStatus status) const {
    switch (status) {
        case HealthStatus::HEALTHY:  return "HEALTHY";
        case HealthStatus::WARNING:  return "WARNING";
        case HealthStatus::CRITICAL: return "CRITICAL";
        case HealthStatus::FAILED:   return "FAILED";
    }
    return "UNKNOWN";  // Defensive fallback for out-of-range values.
}