diff --git a/check-health.d/state.rsc b/check-health.d/state.rsc new file mode 100644 index 0000000..bcc1fbc --- /dev/null +++ b/check-health.d/state.rsc @@ -0,0 +1,48 @@ +#!rsc by RouterOS +# RouterOS script: check-health.d/state +# Copyright (c) 2019-2025 Christian Hesse +# https://rsc.eworm.de/COPYING.md +# +# requires RouterOS, version=7.14 +# +# check for RouterOS health state - state plugin +# https://rsc.eworm.de/doc/check-health.md + +:global CheckHealthPlugins; + +:set ($CheckHealthPlugins->[ :jobname ]) do={ + :local FuncName [ :tostr $0 ]; + + :global CheckHealthLast; + :global Identity; + + :global LogPrint; + :global SendNotification2; + :global SymbolForNotification; + + :if ([ :len [ /system/health/find where type="" name~"-state\$"] ] = 0) do={ + $LogPrint debug $FuncName ("Your device does not provide any state health values."); + :return false; + } + + :foreach State in=[ /system/health/find where type="" name~"-state\$" ] do={ + :local Name [ /system/health/get $State name ]; + :local Value [ /system/health/get $State value ]; + + :if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={ + :if ($CheckHealthLast->$Name = "ok" && \ + $Value != "ok") do={ + $SendNotification2 ({ origin=$FuncName; \ + subject=([ $SymbolForNotification "cross-mark" ] . "Health warning: " . $Name); \ + message=("The device '" . $Name . "' on " . $Identity . " failed!") }); + } + :if ($CheckHealthLast->$Name != "ok" && \ + $Value = "ok") do={ + $SendNotification2 ({ origin=$FuncName; \ + subject=([ $SymbolForNotification "white-heavy-check-mark" ] . "Health recovery: " . $Name); \ + message=("The device '" . $Name . "' on " . $Identity . " recovered!") }); + } + } + :set ($CheckHealthLast->$Name) $Value; + } +} diff --git a/check-health.d/temperature.rsc b/check-health.d/temperature.rsc new file mode 100644 index 0000000..9b84782 --- /dev/null +++ b/check-health.d/temperature.rsc @@ -0,0 +1,74 @@ +#!rsc by RouterOS +# RouterOS script: check-health.d/temperature +# Copyright (c) 2019-2025 Christian Hesse +# https://rsc.eworm.de/COPYING.md +# +# requires RouterOS, version=7.14 +# +# check for RouterOS health state - temperature plugin +# https://rsc.eworm.de/doc/check-health.md + +:global CheckHealthPlugins; + +:set ($CheckHealthPlugins->[ :jobname ]) do={ + :local FuncName [ :tostr $0 ]; + + :global CheckHealthLast; + :global CheckHealthTemperature; + :global CheckHealthTemperatureDeviation; + :global CheckHealthTemperatureNotified; + :global Identity; + + :global LogPrint; + :global SendNotification2; + :global SymbolForNotification; + + :if ([ :len [ /system/health/find where type="C" ] ] = 0) do={ + $LogPrint debug $FuncName ("Your device does not provide any voltage health values."); + :return false; + } + + :local TempToNum do={ + :global CharacterReplace; + :local T [ :toarray [ $CharacterReplace $1 "." "," ] ]; + :return ($T->0 * 10 + $T->1); + } + + :if ([ :typeof $CheckHealthTemperatureNotified ] != "array") do={ + :set CheckHealthTemperatureNotified ({}); + } + + :foreach Temperature in=[ /system/health/find where type="C" ] do={ + :local Name [ /system/health/get $Temperature name ]; + :local Value [ /system/health/get $Temperature value ]; + + :if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={ + :if ([ :typeof ($CheckHealthTemperature->$Name) ] != "num" ) do={ + $LogPrint info $FuncName ("No threshold given for " . $Name . ", assuming 50C."); + :set ($CheckHealthTemperature->$Name) 50; + } + :local Validate [ /system/health/get [ find where name=$Name ] value ]; + :while ($Value != $Validate) do={ + :set Value $Validate; + :set Validate [ /system/health/get [ find where name=$Name ] value ]; + } + :if ($Value > $CheckHealthTemperature->$Name && \ + $CheckHealthTemperatureNotified->$Name != true) do={ + $SendNotification2 ({ origin=$FuncName; \ + subject=([ $SymbolForNotification "fire" ] . "Health warning: " . $Name); \ + message=("The " . $Name . " on " . $Identity . " is above threshold: " . \ + $Value . "\C2\B0" . "C") }); + :set ($CheckHealthTemperatureNotified->$Name) true; + } + :if ($Value <= ($CheckHealthTemperature->$Name - $CheckHealthTemperatureDeviation) && \ + $CheckHealthTemperatureNotified->$Name = true) do={ + $SendNotification2 ({ origin=$FuncName; \ + subject=([ $SymbolForNotification "white-heavy-check-mark" ] . "Health recovery: " . $Name); \ + message=("The " . $Name . " on " . $Identity . " dropped below threshold: " . \ + $Value . "\C2\B0" . "C") }); + :set ($CheckHealthTemperatureNotified->$Name) false; + } + } + :set ($CheckHealthLast->$Name) $Value; + } +} diff --git a/check-health.d/voltage.rsc b/check-health.d/voltage.rsc new file mode 100644 index 0000000..6394795 --- /dev/null +++ b/check-health.d/voltage.rsc @@ -0,0 +1,63 @@ +#!rsc by RouterOS +# RouterOS script: check-health.d/voltage +# Copyright (c) 2019-2025 Christian Hesse +# https://rsc.eworm.de/COPYING.md +# +# requires RouterOS, version=7.14 +# +# check for RouterOS health state - voltage plugin +# https://rsc.eworm.de/doc/check-health.md + +:global CheckHealthPlugins; + +:set ($CheckHealthPlugins->[ :jobname ]) do={ + :local FuncName [ :tostr $0 ]; + + :global CheckHealthLast; + :global CheckHealthVoltageLow; + :global CheckHealthVoltagePercent; + :global Identity; + + :global FormatLine; + :global IfThenElse; + :global LogPrint; + :global SendNotification2; + :global SymbolForNotification; + + :if ([ :len [ /system/health/find where type="V" ] ] = 0) do={ + $LogPrint debug $FuncName ("Your device does not provide any voltage health values."); + :return false; + } + + :foreach Voltage in=[ /system/health/find where type="V" ] do={ + :local Name [ /system/health/get $Voltage name ]; + :local Value [ /system/health/get $Voltage value ]; + + :if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={ + :local NumCurr [ $TempToNum $Value ]; + :local NumLast [ $TempToNum ($CheckHealthLast->$Name) ]; + + :if ($NumLast * (100 + $CheckHealthVoltagePercent) < $NumCurr * 100 || \ + $NumLast * 100 > $NumCurr * (100 + $CheckHealthVoltagePercent)) do={ + $SendNotification2 ({ origin=$FuncName; \ + subject=([ $SymbolForNotification ("high-voltage-sign,chart-" . [ $IfThenElse ($NumLast < \ + $NumCurr) "in" "de" ] . "creasing") ] . "Health warning: " . $Name); \ + message=("The " . $Name . " on " . $Identity . " jumped more than " . $CheckHealthVoltagePercent . "%.\n\n" . \ + [ $FormatLine "old value" ($CheckHealthLast->$Name . " V") 12 ] . "\n" . \ + [ $FormatLine "new value" ($Value . " V") 12 ]) }); + } else={ + :if ($NumCurr <= $CheckHealthVoltageLow && $NumLast > $CheckHealthVoltageLow) do={ + $SendNotification2 ({ origin=$FuncName; \ + subject=([ $SymbolForNotification "high-voltage-sign,chart-decreasing" ] . "Health warning: Low " . $Name); \ + message=("The " . $Name . " on " . $Identity . " dropped to " . $Value . " V below hard limit.") }); + } + :if ($NumCurr > $CheckHealthVoltageLow && $NumLast <= $CheckHealthVoltageLow) do={ + $SendNotification2 ({ origin=$FuncName; \ + subject=([ $SymbolForNotification "high-voltage-sign,chart-increasing" ] . "Health recovery: Low " . $Name); \ + message=("The " . $Name . " on " . $Identity . " recovered to " . $Value . " V above hard limit.") }); + } + } + } + :set ($CheckHealthLast->$Name) $Value; + } +} diff --git a/check-health.rsc b/check-health.rsc index 34c508c..827f597 100644 --- a/check-health.rsc +++ b/check-health.rsc @@ -19,11 +19,6 @@ :global CheckHealthCPUUtilizationNotified; :global CheckHealthLast; :global CheckHealthRAMUtilizationNotified; - :global CheckHealthTemperature; - :global CheckHealthTemperatureDeviation; - :global CheckHealthTemperatureNotified; - :global CheckHealthVoltageLow; - :global CheckHealthVoltagePercent; :global Identity; :global FormatLine; @@ -33,6 +28,7 @@ :global ScriptLock; :global SendNotification2; :global SymbolForNotification; + :global ValidateSyntax; :local TempToNum do={ :global CharacterReplace; @@ -78,105 +74,37 @@ :set CheckHealthRAMUtilizationNotified false; } - :if ([ :len [ /system/health/find ] ] = 0) do={ - $LogPrint debug $ScriptName ("Your device does not provide any health values."); + :local Plugins [ /system/script/find where name~"^check-health.d/." ]; + :if ([ :len $Plugins ] = 0) do={ + $LogPrint debug $ScriptName ("No plugins installed."); :set ExitOK true; :error true; } + :global CheckHealthPlugins ({}); :if ([ :typeof $CheckHealthLast ] != "array") do={ :set CheckHealthLast ({}); } - :if ([ :typeof $CheckHealthTemperatureNotified ] != "array") do={ - :set CheckHealthTemperatureNotified ({}); - } - :foreach Voltage in=[ /system/health/find where type="V" ] do={ - :local Name [ /system/health/get $Voltage name ]; - :local Value [ /system/health/get $Voltage value ]; - - :if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={ - :local NumCurr [ $TempToNum $Value ]; - :local NumLast [ $TempToNum ($CheckHealthLast->$Name) ]; - - :if ($NumLast * (100 + $CheckHealthVoltagePercent) < $NumCurr * 100 || \ - $NumLast * 100 > $NumCurr * (100 + $CheckHealthVoltagePercent)) do={ - $SendNotification2 ({ origin=$ScriptName; \ - subject=([ $SymbolForNotification ("high-voltage-sign,chart-" . [ $IfThenElse ($NumLast < \ - $NumCurr) "in" "de" ] . "creasing") ] . "Health warning: " . $Name); \ - message=("The " . $Name . " on " . $Identity . " jumped more than " . $CheckHealthVoltagePercent . "%.\n\n" . \ - [ $FormatLine "old value" ($CheckHealthLast->$Name . " V") 12 ] . "\n" . \ - [ $FormatLine "new value" ($Value . " V") 12 ]) }); - } else={ - :if ($NumCurr <= $CheckHealthVoltageLow && $NumLast > $CheckHealthVoltageLow) do={ - $SendNotification2 ({ origin=$ScriptName; \ - subject=([ $SymbolForNotification "high-voltage-sign,chart-decreasing" ] . "Health warning: Low " . $Name); \ - message=("The " . $Name . " on " . $Identity . " dropped to " . $Value . " V below hard limit.") }); - } - :if ($NumCurr > $CheckHealthVoltageLow && $NumLast <= $CheckHealthVoltageLow) do={ - $SendNotification2 ({ origin=$ScriptName; \ - subject=([ $SymbolForNotification "high-voltage-sign,chart-increasing" ] . "Health recovery: Low " . $Name); \ - message=("The " . $Name . " on " . $Identity . " recovered to " . $Value . " V above hard limit.") }); - } + :foreach Plugin in=$Plugins do={ + :local PluginVal [ /system/script/get $Plugin ]; + :if ([ $ValidateSyntax ($PluginVal->"source") ] = true) do={ + :do { + /system/script/run $Plugin; + } on-error={ + $LogPrint error $ScriptName ("Plugin '" . $ScriptVal->"name" . "' failed to run."); } + } else={ + $LogPrint error $ScriptName ("Plugin '" . $ScriptVal->"name" . "' failed syntax validation, skipping."); } - :set ($CheckHealthLast->$Name) $Value; } - :foreach PSU in=[ /system/health/find where name~"^psu.*-state\$" ] do={ - :local Name [ /system/health/get $PSU name ]; - :local Value [ /system/health/get $PSU value ]; - - :if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={ - :if ($CheckHealthLast->$Name = "ok" && \ - $Value != "ok") do={ - $SendNotification2 ({ origin=$ScriptName; \ - subject=([ $SymbolForNotification "cross-mark" ] . "Health warning: " . $Name); \ - message=("The power supply unit '" . $Name . "' on " . $Identity . " failed!") }); - } - :if ($CheckHealthLast->$Name != "ok" && \ - $Value = "ok") do={ - $SendNotification2 ({ origin=$ScriptName; \ - subject=([ $SymbolForNotification "white-heavy-check-mark" ] . "Health recovery: " . $Name); \ - message=("The power supply unit '" . $Name . "' on " . $Identity . " recovered!") }); - } - } - :set ($CheckHealthLast->$Name) $Value; + :foreach PluginName,Discard in=$CheckHealthPlugins do={ + ($CheckHealthPlugins->$PluginName) \ + ("\$CheckHealthPlugins->\"" . $PluginName . "\""); } - :foreach Temperature in=[ /system/health/find where type="C" ] do={ - :local Name [ /system/health/get $Temperature name ]; - :local Value [ /system/health/get $Temperature value ]; - - :if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={ - :if ([ :typeof ($CheckHealthTemperature->$Name) ] != "num" ) do={ - $LogPrint info $ScriptName ("No threshold given for " . $Name . ", assuming 50C."); - :set ($CheckHealthTemperature->$Name) 50; - } - :local Validate [ /system/health/get [ find where name=$Name ] value ]; - :while ($Value != $Validate) do={ - :set Value $Validate; - :set Validate [ /system/health/get [ find where name=$Name ] value ]; - } - :if ($Value > $CheckHealthTemperature->$Name && \ - $CheckHealthTemperatureNotified->$Name != true) do={ - $SendNotification2 ({ origin=$ScriptName; \ - subject=([ $SymbolForNotification "fire" ] . "Health warning: " . $Name); \ - message=("The " . $Name . " on " . $Identity . " is above threshold: " . \ - $Value . "\C2\B0" . "C") }); - :set ($CheckHealthTemperatureNotified->$Name) true; - } - :if ($Value <= ($CheckHealthTemperature->$Name - $CheckHealthTemperatureDeviation) && \ - $CheckHealthTemperatureNotified->$Name = true) do={ - $SendNotification2 ({ origin=$ScriptName; \ - subject=([ $SymbolForNotification "white-heavy-check-mark" ] . "Health recovery: " . $Name); \ - message=("The " . $Name . " on " . $Identity . " dropped below threshold: " . \ - $Value . "\C2\B0" . "C") }); - :set ($CheckHealthTemperatureNotified->$Name) false; - } - } - :set ($CheckHealthLast->$Name) $Value; - } + :set CheckHealthPlugins; } on-error={ :global ExitError; $ExitError $ExitOK [ :jobname ]; } diff --git a/doc/check-health.d/notification-08-psu-fail.avif b/doc/check-health.d/notification-08-state-fail.avif similarity index 100% rename from doc/check-health.d/notification-08-psu-fail.avif rename to doc/check-health.d/notification-08-state-fail.avif diff --git a/doc/check-health.d/notification-09-psu-ok.avif b/doc/check-health.d/notification-09-state-ok.avif similarity index 100% rename from doc/check-health.d/notification-09-psu-ok.avif rename to doc/check-health.d/notification-09-state-ok.avif diff --git a/doc/check-health.md b/doc/check-health.md index a945371..51e71bc 100644 --- a/doc/check-health.md +++ b/doc/check-health.md @@ -17,21 +17,21 @@ Description ----------- This script is run from scheduler periodically, sending notification on -health related events: +health related events. Monitoring CPU and RAM utilization (available +processing and memory resources) works on all devices: * high CPU utilization * high RAM utilization (low available RAM) + +With additional plugins functionality can be extended, depending on +sensors available in hardware: + * voltage jumps up or down more than configured threshold * voltage drops below hard lower limit +* fan failed or recovered * power supply failed or recovered * temperature is above or below threshold -Monitoring CPU and RAM utilization (available processing and memory -resources) works on all devices. Other than that only sensors available -in hardware can be checked. See what your hardware supports: - - /system/health/print; - > ⚠️ **Warning**: Note that bad initial state will not trigger an event! For > example rebooting a device that is already too hot will not trigger an > alert on high temperature. @@ -59,8 +59,8 @@ in hardware can be checked. See what your hardware supports: #### PSU state -![check-health notification psu fail](check-health.d/notification-08-psu-fail.avif) -![check-health notification psu ok](check-health.d/notification-09-psu-ok.avif) +![check-health notification state fail](check-health.d/notification-08-state-fail.avif) +![check-health notification state ok](check-health.d/notification-09-state-ok.avif) Requirements and installation ----------------------------- @@ -74,6 +74,30 @@ Just install the script and create a scheduler: > precision of cpu utilization, escpecially on devices with limited > resources. Thus an unusual interval is used here. +### Plugins + +Additional plugins are available for sensors available in hardware. First +check what your hardware supports: + + /system/health/print; + +Then install the plugin for *fan* and *power supply unit* *state*: + + $ScriptInstallUpdate check-health,check-health.d/state; + +... or *temperature*: + + $ScriptInstallUpdate check-health,check-health.d/temperature; + +... or *voltage*: + + $ScriptInstallUpdate check-health,check-health.d/voltage; + +You can also combine the commands and install all or a subset of plugins +in one go: + + $ScriptInstallUpdate check-health,check-health.d/state,check-health.d/temperature,check-health.d/voltage; + Configuration ------------- diff --git a/global-functions.rsc b/global-functions.rsc index a2ec833..8eb6712 100644 --- a/global-functions.rsc +++ b/global-functions.rsc @@ -13,7 +13,7 @@ :local ScriptName [ :jobname ]; # expected configuration version -:global ExpectedConfigVersion 131; +:global ExpectedConfigVersion 132; # global variables not to be changed by user :global GlobalFunctionsReady false; diff --git a/news-and-changes.rsc b/news-and-changes.rsc index 7508b43..c7e566f 100644 --- a/news-and-changes.rsc +++ b/news-and-changes.rsc @@ -56,6 +56,7 @@ 129="Extended 'backup-partition' to support RouterOS copy-over - interactively or before feature update."; 130="Dropped intermediate certificates, depending on just root certificates now."; 131="Enhanced certificate download to fallback to mkcert.org, so all (commonly trusted) root certificates are available now."; + 132="Split off plugins from 'check-health', so the script works on all devices to monitor CPU and RAM. The supported plugins for sensors in hardware are installed automatically."; }; # Migration steps to be applied on script updates @@ -64,4 +65,5 @@ 100=":global ScriptInstallUpdate; :if ([ :len [ /system/script/find where name=\"ssh-keys-import\" source~\"^#!rsc by RouterOS\\r?\\n\" ] ] > 0) do={ /system/script/set name=\"mod/ssh-keys-import\" ssh-keys-import; \$ScriptInstallUpdate; }"; 104=":global CharacterReplace; :global ScriptInstallUpdate; :foreach Script in={ \"capsman-download-packages\"; \"capsman-rolling-upgrade\"; \"hotspot-to-wpa\"; \"hotspot-to-wpa-cleanup\" } do={ /system/script/set name=(\$Script . \".capsman\") [ find where name=\$Script ]; :foreach Scheduler in=[ /system/scheduler/find where on-event~(\$Script . \"([^-.]|\\\$)\") ] do={ /system/scheduler/set \$Scheduler on-event=[ \$CharacterReplace [ get \$Scheduler on-event ] \$Script (\$Script . \".capsman\") ]; }; }; /ip/hotspot/user/profile/set on-login=\"hotspot-to-wpa.capsman\" [ find where on-login=\"hotspot-to-wpa\" ]; \$ScriptInstallUpdate;"; 111=":local Rec [ /ip/dns/static/find where comment~\"^managed by dhcp-to-dns for \" ]; :if ([ :len \$Rec ] > 0) do={ /ip/dns/static/remove \$Rec; /system/script/run dhcp-to-dns; }"; + 132=":if ([ :len [ /system/script/find where name=\"check-health\" ] ] > 0) do={ :local Code \":local Install \\\"check-health\\\"; :if ([ :len [ /system/health/find where type=\\\"\\\" name~\\\"-state\\\\\\\$\\\" ] ] > 0) do={ :set Install (\\\$Install . \\\",check-health.d/state\\\"); }; :if ([ :len [ /system/health/find where type=\\\"C\\\" ] ] > 0) do={ :set Install (\\\$Install . \\\",check-health.d/temperature\\\"); }; :if ([ :len [ /system/health/find where type=\\\"V\\\" ] ] > 0) do={ :set Install (\\\$Install . \\\",check-health.d/voltage\\\"); }; :global ScriptInstallUpdate; \\\$ScriptInstallUpdate \\\$Install;\"; :global ValidateSyntax; :if ([ \$ValidateSyntax \$Code ] = true) do={ :do { [ :parse \$Code ]; } on-error={ }; }; }"; };