package watchdog

import (
"context"
"errors"
"slices"
"sync"
"time"
)
var (
ErrNotConfigured = errors.New("no checks configured")
ErrNotRunning = errors.New("watchdog is not running")
)
// Watchdog keeps checks to run either periodically
// or on demand.
type Watchdog struct {
checks map[string]*wdCheck
mu sync.Mutex
events chan CheckResult // output channel
limiter chan struct{} // TODO: use proper limiter here
timeout time.Duration // timeout for checks to complete
monitoring bool // is monitoring currently in progress
running int // number of active checks monitored
}
type wdCheck struct {
check Check
stop chan struct{}
}
// New creates an instance of Watchdog with
// the provided checks.
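//
// A minimal construction sketch (illustrative only; it assumes Check exposes
// the Name, Interval and Check fields used elsewhere in this file, and
// pingDB is a hypothetical func(ctx context.Context) error):
//
//	w := New(Check{
//		Name:     "db",
//		Interval: 30 * time.Second,
//		Check:    pingDB, // hypothetical check func
//	})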
func New(checks ...Check) *Watchdog {
w := Watchdog{
checks: make(map[string]*wdCheck),
}
for _, c := range checks {
nc := &wdCheck{
check: c,
}
w.checks[c.Name] = nc
}
return &w
}
// ListChecks returns the currently configured checks.
func (w *Watchdog) ListChecks() []Check {
w.mu.Lock()
defer w.mu.Unlock()
out := w.copyChecks()
return out
}
// SetTimeout sets the timeout for all checks that
// get started with the Start method. Changing this
// value does not affect already running checks.
// Watchdog does not enforce this timeout; it just
// passes a context created with context.WithTimeout
// to the check functions.
// If this method is not called, the default timeout
// of 10 seconds is used.
func (w *Watchdog) SetTimeout(d time.Duration) {
w.mu.Lock()
defer w.mu.Unlock()
w.timeout = d
}
// AddChecks adds checks to the group.
// If monitoring is in progress then monitoring is started for the
// newly added checks as well.
// Checks may not have duplicate Name fields; a new check with the
// same name overwrites the previous one.
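//
// Sketch of replacing a check in a running group (the "db" name and
// pingReplica func are hypothetical):
//
//	w.AddChecks(Check{
//		Name:     "db", // overwrites an existing check named "db"
//		Interval: time.Minute,
//		Check:    pingReplica, // hypothetical check func
//	})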
func (w *Watchdog) AddChecks(checks ...Check) {
w.mu.Lock()
defer w.mu.Unlock()
if w.checks == nil {
w.checks = make(map[string]*wdCheck)
}
for _, c := range checks {
nc := &wdCheck{
check: c,
}
old, haveOld := w.checks[c.Name]
w.checks[c.Name] = nc
if w.monitoring {
w.startMonitoring(nc)
if haveOld {
w.stopMonitoring(old)
}
}
}
}
// RemoveChecks removes the named checks.
func (w *Watchdog) RemoveChecks(names ...string) {
w.mu.Lock()
defer w.mu.Unlock()
for _, name := range names {
c, ok := w.checks[name]
if !ok {
continue
}
if w.monitoring {
w.stopMonitoring(c)
}
delete(w.checks, name)
}
}
// Start starts monitoring.
// Subsequent calls to Start return the SAME channel. If you need
// more than one reader of the channel, fan out on your side.
// On start Watchdog runs all provided checks and pushes the current
// status of each check to the channel.
// Subsequently, whenever a check func returns a status different from
// its previous run (the returned error changes), the result is pushed
// to the channel.
// The concurrency argument limits the number of checks that can run
// concurrently. 0 means no limit (all checks may run concurrently).
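//
// A typical consumer loop (sketch; error handling elided). The range ends
// once Stop closes the events channel:
//
//	events, err := w.Start(4) // at most 4 checks in flight at a time
//	if err != nil {
//		// no checks configured
//	}
//	for r := range events {
//		// r.Error is nil while the check is healthy
//		log.Printf("check %q changed state: %v", r.Name, r.Error)
//	}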
func (w *Watchdog) Start(concurrency int) (<-chan CheckResult, error) {
w.mu.Lock()
defer w.mu.Unlock()
if len(w.checks) == 0 {
return nil, ErrNotConfigured
}
if w.monitoring {
return w.events, nil
}
if concurrency == 0 {
concurrency = len(w.checks)
}
if w.timeout == 0 {
w.timeout = DefaultTimeout
}
w.events = make(chan CheckResult, concurrency)
w.limiter = make(chan struct{}, concurrency)
for _, c := range w.checks {
w.startMonitoring(c)
}
return w.events, nil
}
// Stop stops execution of checks.
// Subsequent calls return ErrNotRunning.
func (w *Watchdog) Stop() error {
w.mu.Lock()
defer w.mu.Unlock()
if !w.monitoring {
return ErrNotRunning
}
for _, c := range w.checks {
w.stopMonitoring(c)
}
return nil
}
// RunImmediately runs the configured checks concurrently and returns their results.
// Setting concurrency to 0 means that all checks of the group are allowed to run simultaneously.
// Otherwise at most concurrency checks will be allowed to run simultaneously.
// Results are returned sorted by check Name.
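//
// One-shot usage (sketch). The ctx is handed to every check, relying on the
// check funcs honoring cancellation:
//
//	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
//	defer cancel()
//	results, err := w.RunImmediately(ctx, 0) // 0: run all checks at once
//	if err != nil {
//		// no checks configured
//	}
//	for _, r := range results {
//		fmt.Println(r.Name, r.Error)
//	}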
func (w *Watchdog) RunImmediately(ctx context.Context, concurrency int) ([]CheckResult, error) {
w.mu.Lock()
if len(w.checks) == 0 {
w.mu.Unlock()
return nil, ErrNotConfigured
}
cp := w.copyChecks()
w.mu.Unlock()
if concurrency == 0 {
concurrency = len(cp)
}
statuses := runChecksConcurrently(ctx, cp, concurrency)
slices.SortFunc(statuses, func(a, b CheckResult) int {
if a.Name < b.Name {
return -1
} else if a.Name > b.Name {
return 1
}
return 0
})
return statuses, nil
}
func (w *Watchdog) copyChecks() []Check {
cp := make([]Check, 0, len(w.checks))
for _, v := range w.checks {
cp = append(cp, v.check)
}
return cp
}
func (w *Watchdog) startMonitoring(wdc *wdCheck) {
wdc.stop = make(chan struct{})
c := wdc.check
if !w.monitoring {
w.monitoring = true
}
w.running++
go func() {
var curr error
ticker := time.Tick(wdc.check.Interval)
for {
w.limiter <- struct{}{}
ctx, cancel := context.WithTimeout(context.Background(), w.timeout)
err := c.Check(ctx)
cancel() // release the context right away; a deferred cancel would pile up across loop iterations
<-w.limiter
r := CheckResult{
Name: c.Name,
Error: err,
}
// report only when the error status differs
// from the previous run
if !errors.Is(r.Error, curr) {
w.events <- r
}
curr = r.Error
select {
case <-ticker:
// continue looping
case <-wdc.stop:
// stopping this specific check
return
}
}
}()
}
func (w *Watchdog) stopMonitoring(wdc *wdCheck) {
close(wdc.stop)
w.running--
if w.running == 0 {
w.monitoring = false
close(w.events)
}
}
func runChecksConcurrently(ctx context.Context, ch []Check, concurrency int) []CheckResult {
sema := make(chan struct{}, concurrency) // semaphore to limit concurrency
done := make(chan CheckResult, len(ch))
wg := new(sync.WaitGroup)
wg.Add(len(ch))
for _, e := range ch {
go func() {
sema <- struct{}{} // acquire
defer func() {
<-sema // release
wg.Done()
}()
// relying on assumption that CheckFunc obeys context
// cancellation
err := e.Check(ctx)
r := CheckResult{
Name: e.Name,
Error: err,
}
done <- r
}()
}
go func() {
wg.Wait()
close(done)
}()
results := make([]CheckResult, 0, len(ch))
// collect results
for r := range done {
results = append(results, r)
}
return results
}