246 lines
4.9 KiB
Go
246 lines
4.9 KiB
Go
![]() |
package watchdog
|
||
|
|
||
|
import (
|
||
|
"context"
|
||
|
"errors"
|
||
|
"slices"
|
||
|
"sync"
|
||
|
"time"
|
||
|
|
||
|
"golang.org/x/sync/errgroup"
|
||
|
)
|
||
|
|
||
|
// Sentinel errors returned by Watchdog methods; compare with errors.Is.
var (
	// ErrNotConfigured is returned when no checks have been added yet.
	ErrNotConfigured = errors.New("no checks configured")
	// ErrNotRunning is returned by Stop when monitoring is not active.
	ErrNotRunning = errors.New("watchdog is not running")
)
|
||
|
|
||
|
// Watchdog runs a configured set of checks, either on demand
// (RunImmediately) or periodically (Start/Stop), reporting status
// changes over a channel.
type Watchdog struct {
	// checks is the configured check set; guarded by mu.
	checks []Check
	// running is non-nil while a monitoring session is active; guarded by mu.
	running *running
	// mu guards checks and running.
	mu sync.Mutex
}
|
||
|
|
||
|
// NewWatchDog accepts settings for a number of groups to monitor.
|
||
|
// keys of a map are group names. values are slices of their respective checks.
|
||
|
func NewWatchDog(checks ...Check) *Watchdog {
|
||
|
w := &Watchdog{
|
||
|
checks: checks,
|
||
|
}
|
||
|
|
||
|
return w
|
||
|
}
|
||
|
|
||
|
// running holds the channels of one active monitoring session.
type running struct {
	// out delivers status-change events to the Start caller.
	out chan CheckResult
	// stop is closed by Stop to terminate all check goroutines.
	stop chan struct{}
}
|
||
|
|
||
|
func (w *Watchdog) ListChecks() ([]Check, error) {
|
||
|
w.mu.Lock()
|
||
|
defer w.mu.Unlock()
|
||
|
|
||
|
out := make([]Check, len(w.checks))
|
||
|
copy(out, w.checks)
|
||
|
|
||
|
return out, nil
|
||
|
}
|
||
|
|
||
|
// AddChecks adds checks to the group. This DOES NOT
|
||
|
// affect already runnning monitoring for group. Use Stop and
|
||
|
// then Start to restart monitoring when a new check is added.
|
||
|
// Check may have duplicate Name fields but note that RemoveChecks removes checks
|
||
|
// by their Name fields.
|
||
|
func (w *Watchdog) AddChecks(checks ...Check) error {
|
||
|
w.mu.Lock()
|
||
|
defer w.mu.Unlock()
|
||
|
|
||
|
w.checks = append(w.checks, checks...)
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// RemoveChecks removes the named checks.
|
||
|
// This does not affect the already running monitoring for the group.
|
||
|
func (w *Watchdog) RemoveChecks(names ...string) error {
|
||
|
w.mu.Lock()
|
||
|
defer w.mu.Unlock()
|
||
|
|
||
|
remaining := make([]Check, 0, len(w.checks)-len(names))
|
||
|
for _, e := range w.checks {
|
||
|
if slices.Contains(names, e.Name) {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
remaining = append(remaining, e)
|
||
|
}
|
||
|
|
||
|
w.checks = remaining
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// Start starts monitoring.
|
||
|
// Subsequent calls to start just returns the SAME channel. If you need
|
||
|
// to have more that one reader from the channel - fan out on your side.
|
||
|
// On start Watchdog runs all provided check and pushes current status of ALL checks of
|
||
|
// the group to the channel.
|
||
|
// Subsequently only changes of status are pushed regardless of return error values of
|
||
|
// CheckFunc provided to Watchdog.
|
||
|
func (w *Watchdog) Start(ctx context.Context) (<-chan CheckResult, error) {
|
||
|
w.mu.Lock()
|
||
|
defer w.mu.Unlock()
|
||
|
|
||
|
if w.running != nil {
|
||
|
return w.running.out, nil
|
||
|
}
|
||
|
|
||
|
// start new session
|
||
|
if len(w.checks) == 0 {
|
||
|
return nil, ErrNotConfigured
|
||
|
}
|
||
|
|
||
|
r := runMonitoringForGroup(ctx, w.checks)
|
||
|
w.running = r
|
||
|
|
||
|
return r.out, nil
|
||
|
}
|
||
|
|
||
|
// Stop stops execution of checks.
|
||
|
// Subsequent calls of Stop for the same groupID
|
||
|
// return ErrNoSuchGroup.
|
||
|
func (w *Watchdog) Stop() error {
|
||
|
w.mu.Lock()
|
||
|
defer w.mu.Unlock()
|
||
|
|
||
|
if w.running == nil {
|
||
|
return ErrNotRunning
|
||
|
}
|
||
|
|
||
|
close(w.running.stop)
|
||
|
|
||
|
w.running = nil
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// RunImmediately runs configured checks for group concurrently and returns results.
|
||
|
// Concurrency limits number of checks that are allowed run concurrently. Setting
|
||
|
// concurrency to 0 means that all check of the group are allowed to run simultaneously.
|
||
|
func (w *Watchdog) RunImmediately(ctx context.Context, concurrency int) ([]CheckResult, error) {
|
||
|
w.mu.Lock()
|
||
|
if len(w.checks) == 0 {
|
||
|
w.mu.Unlock()
|
||
|
return nil, ErrNotConfigured
|
||
|
}
|
||
|
|
||
|
// making a copy here because mutex should not be
|
||
|
// held while checks are running
|
||
|
cp := make([]Check, len(w.checks))
|
||
|
copy(cp, w.checks)
|
||
|
w.mu.Unlock() // release
|
||
|
|
||
|
if concurrency == 0 {
|
||
|
concurrency = len(cp)
|
||
|
}
|
||
|
|
||
|
statuses := runChecksConcurrently(ctx, cp, concurrency)
|
||
|
|
||
|
slices.SortFunc(statuses, func(a, b CheckResult) int {
|
||
|
if a.Name < b.Name {
|
||
|
return -1
|
||
|
} else if a.Name > b.Name {
|
||
|
return 1
|
||
|
}
|
||
|
return 0
|
||
|
})
|
||
|
|
||
|
return statuses, nil
|
||
|
}
|
||
|
|
||
|
func runMonitoringForGroup(ctx context.Context, ch []Check) *running {
|
||
|
events := make(chan CheckResult)
|
||
|
stop := make(chan struct{})
|
||
|
|
||
|
grp := errgroup.Group{}
|
||
|
|
||
|
for _, c := range ch {
|
||
|
grp.Go(func() error {
|
||
|
state := CheckResult{}
|
||
|
ticker := time.Tick(c.Interval)
|
||
|
|
||
|
for {
|
||
|
status, err := c.Check(ctx)
|
||
|
|
||
|
s := CheckResult{
|
||
|
Name: c.Name,
|
||
|
Status: status,
|
||
|
Error: err,
|
||
|
}
|
||
|
|
||
|
if s.Status != state.Status {
|
||
|
events <- s
|
||
|
}
|
||
|
|
||
|
state = s
|
||
|
|
||
|
select {
|
||
|
case <-ticker:
|
||
|
// continue looping
|
||
|
case <-stop:
|
||
|
return nil
|
||
|
case <-ctx.Done():
|
||
|
return ctx.Err()
|
||
|
}
|
||
|
}
|
||
|
})
|
||
|
}
|
||
|
|
||
|
// separate goroutine to close the output chan
|
||
|
// when everyone's dead
|
||
|
go func() {
|
||
|
grp.Wait()
|
||
|
close(events)
|
||
|
}()
|
||
|
|
||
|
return &running{out: events, stop: stop}
|
||
|
}
|
||
|
|
||
|
func runChecksConcurrently(ctx context.Context, ch []Check, concurrency int) []CheckResult {
|
||
|
statuses := make([]CheckResult, 0, len(ch))
|
||
|
m := sync.Mutex{} // for append operations
|
||
|
|
||
|
group := errgroup.Group{}
|
||
|
|
||
|
sema := make(chan struct{}, concurrency) // semaphore to limit concurrency
|
||
|
|
||
|
for _, e := range ch {
|
||
|
sema <- struct{}{} // acquire
|
||
|
|
||
|
group.Go(func() error {
|
||
|
defer func() {
|
||
|
}()
|
||
|
|
||
|
status, err := e.Check(ctx)
|
||
|
|
||
|
r := CheckResult{
|
||
|
Name: e.Name,
|
||
|
Status: status,
|
||
|
Error: err,
|
||
|
}
|
||
|
|
||
|
m.Lock()
|
||
|
defer m.Unlock()
|
||
|
statuses = append(statuses, r)
|
||
|
|
||
|
<-sema // release
|
||
|
|
||
|
return nil
|
||
|
})
|
||
|
}
|
||
|
|
||
|
group.Wait()
|
||
|
|
||
|
return statuses
|
||
|
}
|