check-health: split off plugins...

... from 'check-health', so the script works on all devices to monitor
CPU and RAM. The supported plugins for sensors in hardware are installed
automatically.
This commit is contained in:
Christian Hesse 2025-02-04 12:27:35 +01:00
parent 84ba3a463a
commit 23d38927bc
9 changed files with 239 additions and 100 deletions

48
check-health.d/state.rsc Normal file
View file

@ -0,0 +1,48 @@
#!rsc by RouterOS
# RouterOS script: check-health.d/state
# Copyright (c) 2019-2025 Christian Hesse <mail@eworm.de>
# https://rsc.eworm.de/COPYING.md
#
# requires RouterOS, version=7.14
#
# check for RouterOS health state - state plugin
# https://rsc.eworm.de/doc/check-health.md
:global CheckHealthPlugins;

# Register the state plugin in $CheckHealthPlugins, keyed by this script's
# job name. The function inspects every health entry with an empty type
# whose name ends in "-state" and sends a notification whenever such an
# entry changes from "ok" to anything else, or back to "ok".
# Returns false when no matching health entries exist.
:set ($CheckHealthPlugins->[ :jobname ]) do={
  :local FuncName [ :tostr $0 ];

  :global CheckHealthLast;
  :global Identity;

  :global LogPrint;
  :global SendNotification2;
  :global SymbolForNotification;

  # look up the matching entries once, use the result for check and loop
  :local Entries [ /system/health/find where type="" name~"-state\$" ];

  :if ([ :len $Entries ] = 0) do={
    $LogPrint debug $FuncName ("Your device does not provide any state health values.");
    :return false;
  }

  :foreach Entry in=$Entries do={
    :local Sensor [ /system/health/get $Entry name ];
    :local Current [ /system/health/get $Entry value ];

    # without a stored value there is nothing to compare against, so the
    # first run only records the current value
    :if ([ :typeof ($CheckHealthLast->$Sensor) ] != "nothing") do={
      :local Previous ($CheckHealthLast->$Sensor);

      # transition ok -> not ok
      :if ($Previous = "ok" && $Current != "ok") do={
        $SendNotification2 ({ origin=$FuncName; \
          subject=([ $SymbolForNotification "cross-mark" ] . "Health warning: " . $Sensor); \
          message=("The device '" . $Sensor . "' on " . $Identity . " failed!") });
      }

      # transition not ok -> ok
      :if ($Previous != "ok" && $Current = "ok") do={
        $SendNotification2 ({ origin=$FuncName; \
          subject=([ $SymbolForNotification "white-heavy-check-mark" ] . "Health recovery: " . $Sensor); \
          message=("The device '" . $Sensor . "' on " . $Identity . " recovered!") });
      }
    }

    # remember the value for the next run
    :set ($CheckHealthLast->$Sensor) $Current;
  }
}

View file

@ -0,0 +1,74 @@
#!rsc by RouterOS
# RouterOS script: check-health.d/temperature
# Copyright (c) 2019-2025 Christian Hesse <mail@eworm.de>
# https://rsc.eworm.de/COPYING.md
#
# requires RouterOS, version=7.14
#
# check for RouterOS health state - temperature plugin
# https://rsc.eworm.de/doc/check-health.md
:global CheckHealthPlugins;

# Register the temperature plugin in $CheckHealthPlugins, keyed by this
# script's job name. The function walks all health entries of type "C",
# compares each value against the per-sensor threshold in
# $CheckHealthTemperature and sends a warning when the value rises above
# it, or a recovery when it falls to (threshold - deviation) or lower.
# Returns false when the device has no temperature entries.
:set ($CheckHealthPlugins->[ :jobname ]) do={
  :local FuncName [ :tostr $0 ];

  :global CheckHealthLast;
  :global CheckHealthTemperature;
  :global CheckHealthTemperatureDeviation;
  :global CheckHealthTemperatureNotified;
  :global Identity;

  :global LogPrint;
  :global SendNotification2;
  :global SymbolForNotification;

  :if ([ :len [ /system/health/find where type="C" ] ] = 0) do={
    $LogPrint debug $FuncName ("Your device does not provide any temperature health values.");
    :return false;
  }

  # per-sensor "already notified" flags, created on first use
  :if ([ :typeof $CheckHealthTemperatureNotified ] != "array") do={
    :set CheckHealthTemperatureNotified ({});
  }

  :foreach Temperature in=[ /system/health/find where type="C" ] do={
    :local Name [ /system/health/get $Temperature name ];
    :local Value [ /system/health/get $Temperature value ];

    # thresholds are only evaluated once a value from an earlier run is
    # stored; the first run just records the value below
    :if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={
      :if ([ :typeof ($CheckHealthTemperature->$Name) ] != "num" ) do={
        $LogPrint info $FuncName ("No threshold given for " . $Name . ", assuming 50C.");
        :set ($CheckHealthTemperature->$Name) 50;
      }

      # read the value again until two consecutive reads agree
      # NOTE(review): presumably guards against glitchy sensor reads - confirm
      :local Validate [ /system/health/get [ find where name=$Name ] value ];
      :while ($Value != $Validate) do={
        :set Value $Validate;
        :set Validate [ /system/health/get [ find where name=$Name ] value ];
      }

      # above threshold and not yet notified: warn once
      :if ($Value > $CheckHealthTemperature->$Name && \
           $CheckHealthTemperatureNotified->$Name != true) do={
        $SendNotification2 ({ origin=$FuncName; \
          subject=([ $SymbolForNotification "fire" ] . "Health warning: " . $Name); \
          message=("The " . $Name . " on " . $Identity . " is above threshold: " . \
            $Value . "\C2\B0" . "C") });
        :set ($CheckHealthTemperatureNotified->$Name) true;
      }

      # at or below (threshold - deviation) after a warning: report recovery
      :if ($Value <= ($CheckHealthTemperature->$Name - $CheckHealthTemperatureDeviation) && \
           $CheckHealthTemperatureNotified->$Name = true) do={
        $SendNotification2 ({ origin=$FuncName; \
          subject=([ $SymbolForNotification "white-heavy-check-mark" ] . "Health recovery: " . $Name); \
          message=("The " . $Name . " on " . $Identity . " dropped below threshold: " . \
            $Value . "\C2\B0" . "C") });
        :set ($CheckHealthTemperatureNotified->$Name) false;
      }
    }

    # remember the value for the next run
    :set ($CheckHealthLast->$Name) $Value;
  }
}

View file

@ -0,0 +1,63 @@
#!rsc by RouterOS
# RouterOS script: check-health.d/voltage
# Copyright (c) 2019-2025 Christian Hesse <mail@eworm.de>
# https://rsc.eworm.de/COPYING.md
#
# requires RouterOS, version=7.14
#
# check for RouterOS health state - voltage plugin
# https://rsc.eworm.de/doc/check-health.md
:global CheckHealthPlugins;

# Register the voltage plugin in $CheckHealthPlugins, keyed by this
# script's job name. The function walks all health entries of type "V"
# and compares each value with the one stored from the previous run. It
# notifies when the value changed by more than $CheckHealthVoltagePercent
# percent, or otherwise when it crossed $CheckHealthVoltageLow in either
# direction. Returns false when the device has no voltage entries.
:set ($CheckHealthPlugins->[ :jobname ]) do={
  :local FuncName [ :tostr $0 ];

  :global CheckHealthLast;
  :global CheckHealthVoltageLow;
  :global CheckHealthVoltagePercent;
  :global Identity;

  :global FormatLine;
  :global IfThenElse;
  :global LogPrint;
  :global SendNotification2;
  :global SymbolForNotification;

  :if ([ :len [ /system/health/find where type="V" ] ] = 0) do={
    $LogPrint debug $FuncName ("Your device does not provide any voltage health values.");
    :return false;
  }

  # Convert a health value to a plain number: replace "." with ",", turn
  # the result into an array and return (first element * 10 + second).
  # This helper has to be defined here; the plugin runs as its own
  # function and a local from another script is not in scope.
  :local TempToNum do={
    :global CharacterReplace;
    :local T [ :toarray [ $CharacterReplace $1 "." "," ] ];
    :return ($T->0 * 10 + $T->1);
  }

  :foreach Voltage in=[ /system/health/find where type="V" ] do={
    :local Name [ /system/health/get $Voltage name ];
    :local Value [ /system/health/get $Voltage value ];

    # without a stored value there is nothing to compare against, so the
    # first run only records the current value
    :if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={
      :local NumCurr [ $TempToNum $Value ];
      :local NumLast [ $TempToNum ($CheckHealthLast->$Name) ];

      # relative change of more than the configured percentage, up or down
      :if ($NumLast * (100 + $CheckHealthVoltagePercent) < $NumCurr * 100 || \
           $NumLast * 100 > $NumCurr * (100 + $CheckHealthVoltagePercent)) do={
        $SendNotification2 ({ origin=$FuncName; \
          subject=([ $SymbolForNotification ("high-voltage-sign,chart-" . [ $IfThenElse ($NumLast < \
            $NumCurr) "in" "de" ] . "creasing") ] . "Health warning: " . $Name); \
          message=("The " . $Name . " on " . $Identity . " jumped more than " . $CheckHealthVoltagePercent . "%.\n\n" . \
            [ $FormatLine "old value" ($CheckHealthLast->$Name . " V") 12 ] . "\n" . \
            [ $FormatLine "new value" ($Value . " V") 12 ]) });
      } else={
        # no big jump: check whether the hard lower limit was crossed
        :if ($NumCurr <= $CheckHealthVoltageLow && $NumLast > $CheckHealthVoltageLow) do={
          $SendNotification2 ({ origin=$FuncName; \
            subject=([ $SymbolForNotification "high-voltage-sign,chart-decreasing" ] . "Health warning: Low " . $Name); \
            message=("The " . $Name . " on " . $Identity . " dropped to " . $Value . " V below hard limit.") });
        }
        :if ($NumCurr > $CheckHealthVoltageLow && $NumLast <= $CheckHealthVoltageLow) do={
          $SendNotification2 ({ origin=$FuncName; \
            subject=([ $SymbolForNotification "high-voltage-sign,chart-increasing" ] . "Health recovery: Low " . $Name); \
            message=("The " . $Name . " on " . $Identity . " recovered to " . $Value . " V above hard limit.") });
        }
      }
    }

    # remember the value for the next run
    :set ($CheckHealthLast->$Name) $Value;
  }
}

View file

@ -19,11 +19,6 @@
:global CheckHealthCPUUtilizationNotified;
:global CheckHealthLast;
:global CheckHealthRAMUtilizationNotified;
:global CheckHealthTemperature;
:global CheckHealthTemperatureDeviation;
:global CheckHealthTemperatureNotified;
:global CheckHealthVoltageLow;
:global CheckHealthVoltagePercent;
:global Identity;
:global FormatLine;
@ -33,6 +28,7 @@
:global ScriptLock;
:global SendNotification2;
:global SymbolForNotification;
:global ValidateSyntax;
:local TempToNum do={
:global CharacterReplace;
@ -78,105 +74,37 @@
:set CheckHealthRAMUtilizationNotified false;
}
:if ([ :len [ /system/health/find ] ] = 0) do={
$LogPrint debug $ScriptName ("Your device does not provide any health values.");
:local Plugins [ /system/script/find where name~"^check-health.d/." ];
:if ([ :len $Plugins ] = 0) do={
$LogPrint debug $ScriptName ("No plugins installed.");
:set ExitOK true;
:error true;
}
:global CheckHealthPlugins ({});
:if ([ :typeof $CheckHealthLast ] != "array") do={
:set CheckHealthLast ({});
}
:if ([ :typeof $CheckHealthTemperatureNotified ] != "array") do={
:set CheckHealthTemperatureNotified ({});
}
:foreach Voltage in=[ /system/health/find where type="V" ] do={
:local Name [ /system/health/get $Voltage name ];
:local Value [ /system/health/get $Voltage value ];
:if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={
:local NumCurr [ $TempToNum $Value ];
:local NumLast [ $TempToNum ($CheckHealthLast->$Name) ];
:if ($NumLast * (100 + $CheckHealthVoltagePercent) < $NumCurr * 100 || \
$NumLast * 100 > $NumCurr * (100 + $CheckHealthVoltagePercent)) do={
$SendNotification2 ({ origin=$ScriptName; \
subject=([ $SymbolForNotification ("high-voltage-sign,chart-" . [ $IfThenElse ($NumLast < \
$NumCurr) "in" "de" ] . "creasing") ] . "Health warning: " . $Name); \
message=("The " . $Name . " on " . $Identity . " jumped more than " . $CheckHealthVoltagePercent . "%.\n\n" . \
[ $FormatLine "old value" ($CheckHealthLast->$Name . " V") 12 ] . "\n" . \
[ $FormatLine "new value" ($Value . " V") 12 ]) });
} else={
:if ($NumCurr <= $CheckHealthVoltageLow && $NumLast > $CheckHealthVoltageLow) do={
$SendNotification2 ({ origin=$ScriptName; \
subject=([ $SymbolForNotification "high-voltage-sign,chart-decreasing" ] . "Health warning: Low " . $Name); \
message=("The " . $Name . " on " . $Identity . " dropped to " . $Value . " V below hard limit.") });
}
:if ($NumCurr > $CheckHealthVoltageLow && $NumLast <= $CheckHealthVoltageLow) do={
$SendNotification2 ({ origin=$ScriptName; \
subject=([ $SymbolForNotification "high-voltage-sign,chart-increasing" ] . "Health recovery: Low " . $Name); \
message=("The " . $Name . " on " . $Identity . " recovered to " . $Value . " V above hard limit.") });
}
:foreach Plugin in=$Plugins do={
:local PluginVal [ /system/script/get $Plugin ];
# Run the plugin script only if its source passes syntax validation. Both
# failure paths log the plugin's name, taken from $PluginVal - the record
# fetched above.
:if ([ $ValidateSyntax ($PluginVal->"source") ] = true) do={
  :do {
    /system/script/run $Plugin;
  } on-error={
    $LogPrint error $ScriptName ("Plugin '" . ($PluginVal->"name") . "' failed to run.");
  }
} else={
  $LogPrint error $ScriptName ("Plugin '" . ($PluginVal->"name") . "' failed syntax validation, skipping.");
}
:set ($CheckHealthLast->$Name) $Value;
}
:foreach PSU in=[ /system/health/find where name~"^psu.*-state\$" ] do={
:local Name [ /system/health/get $PSU name ];
:local Value [ /system/health/get $PSU value ];
:if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={
:if ($CheckHealthLast->$Name = "ok" && \
$Value != "ok") do={
$SendNotification2 ({ origin=$ScriptName; \
subject=([ $SymbolForNotification "cross-mark" ] . "Health warning: " . $Name); \
message=("The power supply unit '" . $Name . "' on " . $Identity . " failed!") });
}
:if ($CheckHealthLast->$Name != "ok" && \
$Value = "ok") do={
$SendNotification2 ({ origin=$ScriptName; \
subject=([ $SymbolForNotification "white-heavy-check-mark" ] . "Health recovery: " . $Name); \
message=("The power supply unit '" . $Name . "' on " . $Identity . " recovered!") });
}
}
:set ($CheckHealthLast->$Name) $Value;
:foreach PluginName,Discard in=$CheckHealthPlugins do={
($CheckHealthPlugins->$PluginName) \
("\$CheckHealthPlugins->\"" . $PluginName . "\"");
}
:foreach Temperature in=[ /system/health/find where type="C" ] do={
:local Name [ /system/health/get $Temperature name ];
:local Value [ /system/health/get $Temperature value ];
:if ([ :typeof ($CheckHealthLast->$Name) ] != "nothing") do={
:if ([ :typeof ($CheckHealthTemperature->$Name) ] != "num" ) do={
$LogPrint info $ScriptName ("No threshold given for " . $Name . ", assuming 50C.");
:set ($CheckHealthTemperature->$Name) 50;
}
:local Validate [ /system/health/get [ find where name=$Name ] value ];
:while ($Value != $Validate) do={
:set Value $Validate;
:set Validate [ /system/health/get [ find where name=$Name ] value ];
}
:if ($Value > $CheckHealthTemperature->$Name && \
$CheckHealthTemperatureNotified->$Name != true) do={
$SendNotification2 ({ origin=$ScriptName; \
subject=([ $SymbolForNotification "fire" ] . "Health warning: " . $Name); \
message=("The " . $Name . " on " . $Identity . " is above threshold: " . \
$Value . "\C2\B0" . "C") });
:set ($CheckHealthTemperatureNotified->$Name) true;
}
:if ($Value <= ($CheckHealthTemperature->$Name - $CheckHealthTemperatureDeviation) && \
$CheckHealthTemperatureNotified->$Name = true) do={
$SendNotification2 ({ origin=$ScriptName; \
subject=([ $SymbolForNotification "white-heavy-check-mark" ] . "Health recovery: " . $Name); \
message=("The " . $Name . " on " . $Identity . " dropped below threshold: " . \
$Value . "\C2\B0" . "C") });
:set ($CheckHealthTemperatureNotified->$Name) false;
}
}
:set ($CheckHealthLast->$Name) $Value;
}
:set CheckHealthPlugins;
} on-error={
:global ExitError; $ExitError $ExitOK [ :jobname ];
}

View file

Before

Width:  |  Height:  |  Size: 3.4 KiB

After

Width:  |  Height:  |  Size: 3.4 KiB

Before After
Before After

View file

Before

Width:  |  Height:  |  Size: 3.4 KiB

After

Width:  |  Height:  |  Size: 3.4 KiB

Before After
Before After

View file

@ -17,21 +17,21 @@ Description
-----------
This script is run from scheduler periodically, sending notification on
health related events:
health related events. Monitoring CPU and RAM utilization (available
processing and memory resources) works on all devices:
* high CPU utilization
* high RAM utilization (low available RAM)
With additional plugins, functionality can be extended depending on
sensors available in hardware:
* voltage jumps up or down more than configured threshold
* voltage drops below hard lower limit
* fan failed or recovered
* power supply failed or recovered
* temperature is above or below threshold
Monitoring CPU and RAM utilization (available processing and memory
resources) works on all devices. Other than that only sensors available
in hardware can be checked. See what your hardware supports:
/system/health/print;
> ⚠️ **Warning**: Note that bad initial state will not trigger an event! For
> example rebooting a device that is already too hot will not trigger an
> alert on high temperature.
@ -59,8 +59,8 @@ in hardware can be checked. See what your hardware supports:
#### PSU state
![check-health notification psu fail](check-health.d/notification-08-psu-fail.avif)
![check-health notification psu ok](check-health.d/notification-09-psu-ok.avif)
![check-health notification state fail](check-health.d/notification-08-state-fail.avif)
![check-health notification state ok](check-health.d/notification-09-state-ok.avif)
Requirements and installation
-----------------------------
@ -74,6 +74,30 @@ Just install the script and create a scheduler:
> precision of CPU utilization, especially on devices with limited
> resources. Thus an unusual interval is used here.
### Plugins
Additional plugins are provided for sensors available in hardware. First
check what your hardware supports:
/system/health/print;
Then install the plugin for *fan* and *power supply unit* *state*:
$ScriptInstallUpdate check-health,check-health.d/state;
... or *temperature*:
$ScriptInstallUpdate check-health,check-health.d/temperature;
... or *voltage*:
$ScriptInstallUpdate check-health,check-health.d/voltage;
You can also combine the commands and install all or a subset of plugins
in one go:
$ScriptInstallUpdate check-health,check-health.d/state,check-health.d/temperature,check-health.d/voltage;
Configuration
-------------

View file

@ -13,7 +13,7 @@
:local ScriptName [ :jobname ];
# expected configuration version
:global ExpectedConfigVersion 131;
:global ExpectedConfigVersion 132;
# global variables not to be changed by user
:global GlobalFunctionsReady false;

View file

@ -56,6 +56,7 @@
129="Extended 'backup-partition' to support RouterOS copy-over - interactively or before feature update.";
130="Dropped intermediate certificates, depending on just root certificates now.";
131="Enhanced certificate download to fallback to mkcert.org, so all (commonly trusted) root certificates are available now.";
132="Split off plugins from 'check-health', so the script works on all devices to monitor CPU and RAM. The supported plugins for sensors in hardware are installed automatically.";
};
# Migration steps to be applied on script updates
@ -64,4 +65,5 @@
100=":global ScriptInstallUpdate; :if ([ :len [ /system/script/find where name=\"ssh-keys-import\" source~\"^#!rsc by RouterOS\\r?\\n\" ] ] > 0) do={ /system/script/set name=\"mod/ssh-keys-import\" ssh-keys-import; \$ScriptInstallUpdate; }";
104=":global CharacterReplace; :global ScriptInstallUpdate; :foreach Script in={ \"capsman-download-packages\"; \"capsman-rolling-upgrade\"; \"hotspot-to-wpa\"; \"hotspot-to-wpa-cleanup\" } do={ /system/script/set name=(\$Script . \".capsman\") [ find where name=\$Script ]; :foreach Scheduler in=[ /system/scheduler/find where on-event~(\$Script . \"([^-.]|\\\$)\") ] do={ /system/scheduler/set \$Scheduler on-event=[ \$CharacterReplace [ get \$Scheduler on-event ] \$Script (\$Script . \".capsman\") ]; }; }; /ip/hotspot/user/profile/set on-login=\"hotspot-to-wpa.capsman\" [ find where on-login=\"hotspot-to-wpa\" ]; \$ScriptInstallUpdate;";
111=":local Rec [ /ip/dns/static/find where comment~\"^managed by dhcp-to-dns for \" ]; :if ([ :len \$Rec ] > 0) do={ /ip/dns/static/remove \$Rec; /system/script/run dhcp-to-dns; }";
132=":if ([ :len [ /system/script/find where name=\"check-health\" ] ] > 0) do={ :local Code \":local Install \\\"check-health\\\"; :if ([ :len [ /system/health/find where type=\\\"\\\" name~\\\"-state\\\\\\\$\\\" ] ] > 0) do={ :set Install (\\\$Install . \\\",check-health.d/state\\\"); }; :if ([ :len [ /system/health/find where type=\\\"C\\\" ] ] > 0) do={ :set Install (\\\$Install . \\\",check-health.d/temperature\\\"); }; :if ([ :len [ /system/health/find where type=\\\"V\\\" ] ] > 0) do={ :set Install (\\\$Install . \\\",check-health.d/voltage\\\"); }; :global ScriptInstallUpdate; \\\$ScriptInstallUpdate \\\$Install;\"; :global ValidateSyntax; :if ([ \$ValidateSyntax \$Code ] = true) do={ :do { [ :parse \$Code ]; } on-error={ }; }; }";
};