#!/usr/bin/perl -w
#
# Example monitoring code illustrating alarming on consecutive or other
# failure conditions.
#
# Every detector below takes the same arguments:
#
#   $time             - timestamp of the sample (only passed on to the handler)
#   $monitor_name     - key under which per-monitor state is tracked
#   $state            - 0 when the monitor is healthy, non-zero on failure
#   $handle_alarm_ref - code ref invoked as
#                       ( $time, $monitor_name, $state, $detector_name )
#                       once the detector decides to alarm

use strict;
use warnings;

# Alarm after this many consecutive failures.
my $BREACHES = 3;

# Run the demonstration only when this file is executed directly. When it
# is loaded via require/do, only the detector subs are defined, so they can
# be exercised without the demo output.
run_demo() unless caller;

# Feed two canned event streams through the detectors, five time units
# apart, printing a line (via handle_alarm) whenever a detector alarms.
sub run_demo {
    my %events = (
        'conn#1' => [qw{0 0 1 1 1 1 1 1 0}],
        'conn#2' => [qw{0 1 0 1 0 1 0 1 0}],
    );

    for my $event ( sort keys %events ) {
        my $time = 0;
        for my $state ( @{ $events{$event} } ) {

            # Detect consecutive failures, reset alarm immediately when state
            # returns to normal. This will never detect flapping.
            simple( $time, $event, $state, \&handle_alarm );

            # Delay alarm reset: require the previous state to also be not in
            # alarm before clearing the alarm state. This will detect 50%
            # flapping at a three breach threshold.
            simple_delayreset( $time, $event, $state, \&handle_alarm );

            # Using a running average did not pan out (see running_average,
            # below), as it would both require knowing what the average would
            # come out to for a certain number of breaches over time, and
            # setting the alarm percentage for a particular number of breaches.
            # It would also require the delayed alarm reset code, as otherwise
            # a long-lasting incident maintains the alarm state well after the
            # issue has cleared.

            $time += 5;
        }
    }

    return;
}

{
    # Per-monitor failure counters, private to simple().
    my %alarm_errors;

    # Consecutive-failure detector: any healthy sample resets the counter.
    #
    # Returns $state (0) for a healthy sample, the handler's return value
    # once $BREACHES consecutive failures have been seen, and 0 for a
    # failure that is still below the threshold.
    sub simple {
        my ( $time, $monitor_name, $state, $handle_alarm_ref ) = @_;

        if ( $state == 0 ) {
            $alarm_errors{$monitor_name}->{count} = 0;

            # TODO may need code ref here if must cleanup various alarm states
            # or notify external services.
            return $state;
        }

        $alarm_errors{$monitor_name}->{count}++;

        if ( $alarm_errors{$monitor_name}->{count} >= $BREACHES ) {

            # ( caller(0) )[3] is this sub's fully qualified name, letting
            # the handler report which detector fired.
            return $handle_alarm_ref->(
                $time, $monitor_name, $state, ( caller(0) )[3]
            );
        }

        # Below threshold: return an explicit 0, matching simple_delayreset,
        # rather than whatever the failed comparison above left behind.
        return 0;
    }
}

{
    # Per-monitor failure counters and previous state, private to
    # simple_delayreset().
    my %alarm_errors;

    # Consecutive-failure detector with a delayed reset: the counter is only
    # cleared after two healthy samples in a row.
    #
    # Returns $state (0) for a healthy sample, the handler's return value
    # once the counter has reached $BREACHES, and 0 for a failure that is
    # still below the threshold.
    sub simple_delayreset {
        my ( $time, $monitor_name, $state, $handle_alarm_ref ) = @_;

        if ( $state == 0 ) {

            # Only reset alarm state if previous state was also clear. This
            # keeps the alarm active should a pattern of "1 1 1 0 1 1" occur.
            # This could also use the more complicated logic "only clear alarm
            # state after N intervals of a good state."
            #
            # This delayed reset allows an otherwise simple consecutive alarm
            # to detect some cases of flapping!
            if ( exists $alarm_errors{$monitor_name}->{prev_state}
                and $alarm_errors{$monitor_name}->{prev_state} == 0 )
            {
                $alarm_errors{$monitor_name}->{count} = 0;
            }
            $alarm_errors{$monitor_name}->{prev_state} = $state;

            # TODO may need code ref here if must cleanup various alarm states
            # or notify external services.
            return $state;
        }

        $alarm_errors{$monitor_name}->{count} += 1;
        $alarm_errors{$monitor_name}->{prev_state} = $state;

        if ( $alarm_errors{$monitor_name}->{count} >= $BREACHES ) {
            return $handle_alarm_ref->(
                $time, $monitor_name, $state, ( caller(0) )[3]
            );
        }

        return 0;
    }
}

{
    # Per-monitor sample count and last average, private to
    # running_average().
    my %alarm_errors;

    # Experimental detector kept for reference; the demo does not call it
    # (see the comment in run_demo for why it was abandoned).
    #
    # Updates the incremental mean of $state for $monitor_name, emits a
    # debug line via warn, and returns the new mean. $handle_alarm_ref is
    # accepted for signature parity with the other detectors but never
    # invoked.
    sub running_average {
        my ( $time, $monitor_name, $state, $handle_alarm_ref ) = @_;

        my $prev_avg =
          exists $alarm_errors{$monitor_name}->{prev_average}
          ? $alarm_errors{$monitor_name}->{prev_average}
          : 0;

        # Incremental mean: avg += ( sample - avg ) / n, with n bumped here.
        my $running_avg = $prev_avg +
          ( ( $state - $prev_avg ) / ++$alarm_errors{$monitor_name}->{count} );

        warn "DBG $monitor_name time $time state $state ravg $running_avg\n";

        $alarm_errors{$monitor_name}->{prev_average} = $running_avg;

        return $running_avg;
    }
}

# Default alarm handler: prints which detector alarmed for which monitor and
# when, then returns $state unchanged.
sub handle_alarm {
    my ( $time, $monitor_name, $state, $detector ) = @_;

    # Detectors pass their fully qualified sub name; drop the package prefix.
    $detector =~ s/^main:://;

    print "$monitor_name at $time alarm on $detector\n";
    return $state;
}

1;