From 94faf092ead43f8ddc4ce04da41bd0bed38fcfe2 Mon Sep 17 00:00:00 2001
From: Dmitry S <11892559+swift1337@users.noreply.github.com>
Date: Fri, 20 Dec 2024 19:10:49 +0100
Subject: [PATCH] feat(pkg): scheduler package (#3319)

* Implement ticker.StopBlocking()
* Add Scheduler package
* Scheduler improvements & test coverage
* Rename ticker.Run to ticker.Start for consistency
* Implement block ticker
* Update changelog
* Rename Task to Executable. Rename Definition to Task
* Use atomic.Bool
* Fix blockTicker concurrency issues. Add intervalTicker
* Simplify Task. Add support for different tickers
* Add metrics
---
 changelog.md                    |   1 +
 pkg/scheduler/metrics.go        |  29 +++
 pkg/scheduler/opts.go           |  46 ++++
 pkg/scheduler/scheduler.go      | 211 +++++++++++++++++
 pkg/scheduler/scheduler_test.go | 403 ++++++++++++++++++++++++++++++++
 pkg/scheduler/tickers.go        | 172 ++++++++++++++
 pkg/ticker/ticker.go            |  56 +++--
 pkg/ticker/ticker_test.go       | 122 +++++++++-
 zetaclient/metrics/metrics.go   |  21 ++
 9 files changed, 1040 insertions(+), 21 deletions(-)
 create mode 100644 pkg/scheduler/metrics.go
 create mode 100644 pkg/scheduler/opts.go
 create mode 100644 pkg/scheduler/scheduler.go
 create mode 100644 pkg/scheduler/scheduler_test.go
 create mode 100644 pkg/scheduler/tickers.go

diff --git a/changelog.md b/changelog.md
index 590878fccc..8060dbd540 100644
--- a/changelog.md
+++ b/changelog.md
@@ -17,6 +17,7 @@
 * [3170](https://github.com/zeta-chain/node/pull/3170) - revamp TSS package in zetaclient
 * [3291](https://github.com/zeta-chain/node/pull/3291) - revamp zetaclient initialization (+ graceful shutdown)
+* [3319](https://github.com/zeta-chain/node/pull/3319) - implement scheduler for zetaclient
 
 ### Fixes
diff --git a/pkg/scheduler/metrics.go b/pkg/scheduler/metrics.go
new file mode 100644
index 0000000000..96d581ebfb
--- /dev/null
+++ b/pkg/scheduler/metrics.go
@@ -0,0 +1,29 @@
+package scheduler
+
+import (
+	"time"
+
+	"github.com/zeta-chain/node/zetaclient/metrics"
+)
+
+// recordMetrics records task invocation metrics. Note that currently the hard-coded "global" metrics are used.
+func recordMetrics(task *Task, startedAt time.Time, err error, skipped bool) {
+	var status string
+	switch {
+	case skipped:
+		status = "skipped"
+	case err != nil:
+		status = "failed"
+	default:
+		status = "ok"
+	}
+
+	var (
+		group = string(task.group)
+		name  = task.name
+		dur   = time.Since(startedAt).Seconds()
+	)
+
+	metrics.SchedulerTaskInvocationCounter.WithLabelValues(status, group, name).Inc()
+	metrics.SchedulerTaskExecutionDuration.WithLabelValues(status, group, name).Observe(dur)
+}
diff --git a/pkg/scheduler/opts.go b/pkg/scheduler/opts.go
new file mode 100644
index 0000000000..8e5d54e370
--- /dev/null
+++ b/pkg/scheduler/opts.go
@@ -0,0 +1,46 @@
+package scheduler
+
+import (
+	"time"
+
+	cometbft "github.com/cometbft/cometbft/types"
+)
+
+// Opt is a Task option.
+type Opt func(task *Task, taskOpts *taskOpts)
+
+// Name sets the task name.
+func Name(name string) Opt {
+	return func(t *Task, _ *taskOpts) { t.name = name }
+}
+
+// GroupName sets the task group. If not set, the task belongs to DefaultGroup.
+func GroupName(group Group) Opt {
+	return func(t *Task, _ *taskOpts) { t.group = group }
+}
+
+// LogFields augments the Task's logger with the given fields.
+func LogFields(fields map[string]any) Opt {
+	return func(_ *Task, opts *taskOpts) { opts.logFields = fields }
+}
+
+// Interval sets the initial task interval.
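+// If not set, the interval defaults to one second (see Register).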
+func Interval(interval time.Duration) Opt {
+	return func(_ *Task, opts *taskOpts) { opts.interval = interval }
+}
+
+// Skipper sets the task's skipper function.
+func Skipper(skipper func() bool) Opt {
+	return func(t *Task, _ *taskOpts) { t.skipper = skipper }
+}
+
+// IntervalUpdater sets the interval updater function.
+func IntervalUpdater(intervalUpdater func() time.Duration) Opt {
+	return func(_ *Task, opts *taskOpts) { opts.intervalUpdater = intervalUpdater }
+}
+
+// BlockTicker makes the Task listen for new Zeta blocks
+// instead of using an interval ticker. IntervalUpdater is ignored.
+func BlockTicker(blocks <-chan cometbft.EventDataNewBlock) Opt {
+	return func(_ *Task, opts *taskOpts) { opts.blockChan = blocks }
+}
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
new file mode 100644
index 0000000000..2328cbddd7
--- /dev/null
+++ b/pkg/scheduler/scheduler.go
@@ -0,0 +1,211 @@
+// Package scheduler provides a background task scheduler that allows for the registration,
+// execution, and management of periodic tasks. Tasks can be grouped, named, and configured
+// with various options such as custom intervals, log fields, and skip conditions.
+//
+// The scheduler supports dynamic interval updates and can gracefully stop tasks either
+// individually or by group.
+package scheduler
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	cometbft "github.com/cometbft/cometbft/types"
+	"github.com/google/uuid"
+	"github.com/rs/zerolog"
+
+	"github.com/zeta-chain/node/pkg/bg"
+)
+
+// Scheduler represents a background task scheduler.
+type Scheduler struct {
+	tasks  map[uuid.UUID]*Task
+	mu     sync.RWMutex
+	logger zerolog.Logger
+}
+
+// Executable is an arbitrary function that can be executed.
+type Executable func(ctx context.Context) error
+
+// Group represents a Task group. Tasks can be grouped for easier management.
+type Group string
+
+// DefaultGroup is the default task group.
+const DefaultGroup = Group("default")
+
+// tickable is a ticker abstraction that supports different implementations.
+type tickable interface {
+	Start(ctx context.Context) error
+	Stop()
+}
+
+// Task represents a scheduler task.
+type Task struct {
+	// ref to the Scheduler is required
+	scheduler *Scheduler
+
+	id    uuid.UUID
+	group Group
+	name  string
+
+	exec Executable
+
+	// ticker abstraction to support different implementations
+	ticker  tickable
+	skipper func() bool
+
+	logger zerolog.Logger
+}
+
+type taskOpts struct {
+	interval        time.Duration
+	intervalUpdater func() time.Duration
+
+	blockChan <-chan cometbft.EventDataNewBlock
+
+	logFields map[string]any
+}
+
+// New constructs a new Scheduler instance.
+func New(logger zerolog.Logger) *Scheduler {
+	return &Scheduler{
+		tasks:  make(map[uuid.UUID]*Task),
+		logger: logger.With().Str("module", "scheduler").Logger(),
+	}
+}
+
+// Register registers and starts a new Task in the background.
+func (s *Scheduler) Register(ctx context.Context, exec Executable, opts ...Opt) *Task {
+	id := uuid.New()
+	task := &Task{
+		scheduler: s,
+		id:        id,
+		group:     DefaultGroup,
+		name:      id.String(),
+		exec:      exec,
+	}
+
+	config := &taskOpts{
+		interval: time.Second,
+	}
+
+	for _, opt := range opts {
+		opt(task, config)
+	}
+
+	task.logger = newTaskLogger(task, config, s.logger)
+	task.ticker = newTickable(task, config)
+
+	task.logger.Info().Msg("Starting scheduler task")
+	bg.Work(ctx, task.ticker.Start, bg.WithLogger(task.logger))
+
+	s.mu.Lock()
+	s.tasks[id] = task
+	s.mu.Unlock()
+
+	return task
+}
+
+// Stop stops all tasks.
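+// It is equivalent to StopGroup(""), since an empty group matches all groups.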
+func (s *Scheduler) Stop() {
+	s.StopGroup("")
+}
+
+// StopGroup stops all tasks in the group.
+func (s *Scheduler) StopGroup(group Group) {
+	var selectedTasks []*Task
+
+	s.mu.RLock()
+
+	// Filter desired tasks
+	for _, task := range s.tasks {
+		// an empty group is a wildcard that matches all groups
+		if group == "" || task.group == group {
+			selectedTasks = append(selectedTasks, task)
+		}
+	}
+
+	s.mu.RUnlock()
+
+	if len(selectedTasks) == 0 {
+		return
+	}
+
+	// Stop all selected tasks concurrently
+	var wg sync.WaitGroup
+	wg.Add(len(selectedTasks))
+
+	for _, task := range selectedTasks {
+		go func(task *Task) {
+			defer wg.Done()
+			task.Stop()
+		}(task)
+	}
+
+	wg.Wait()
+}
+
+// Stop stops the task and removes it from the scheduler.
+func (t *Task) Stop() {
+	t.logger.Info().Msg("Stopping scheduler task")
+	start := time.Now()
+
+	t.ticker.Stop()
+
+	t.scheduler.mu.Lock()
+	delete(t.scheduler.tasks, t.id)
+	t.scheduler.mu.Unlock()
+
+	timeTakenMS := time.Since(start).Milliseconds()
+	t.logger.Info().Int64("time_taken_ms", timeTakenMS).Msg("Stopped scheduler task")
+}
+
+// execute executes the Task with additional logging and metrics.
+func (t *Task) execute(ctx context.Context) error {
+	startedAt := time.Now().UTC()
+
+	// skip the tick if the skipper says so
+	if t.skipper != nil && t.skipper() {
+		recordMetrics(t, startedAt, nil, true)
+		return nil
+	}
+
+	err := t.exec(ctx)
+
+	recordMetrics(t, startedAt, err, false)
+
+	return err
+}
+
+func newTaskLogger(task *Task, opts *taskOpts, logger zerolog.Logger) zerolog.Logger {
+	logOpts := logger.With().
+		Str("task.name", task.name).
+		Str("task.group", string(task.group))
+
+	if len(opts.logFields) > 0 {
+		logOpts = logOpts.Fields(opts.logFields)
+	}
+
+	taskType := "interval_ticker"
+	if opts.blockChan != nil {
+		taskType = "block_ticker"
+	}
+
+	return logOpts.Str("task.type", taskType).Logger()
+}
+
+func newTickable(task *Task, opts *taskOpts) tickable {
+	// Block-based ticker
+	if opts.blockChan != nil {
+		return newBlockTicker(task.execute, opts.blockChan, task.logger)
+	}
+
+	return newIntervalTicker(
+		task.execute,
+		opts.interval,
+		opts.intervalUpdater,
+		task.name,
+		task.logger,
+	)
+}
diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go
new file mode 100644
index 0000000000..a993bc875a
--- /dev/null
+++ b/pkg/scheduler/scheduler_test.go
@@ -0,0 +1,403 @@
+package scheduler
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	cometbft "github.com/cometbft/cometbft/types"
+	"github.com/rs/zerolog"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestScheduler(t *testing.T) {
+	t.Run("Basic case", func(t *testing.T) {
+		t.Parallel()
+
+		// ARRANGE
+		ts := newTestSuite(t)
+
+		var counter int32
+
+		exec := func(ctx context.Context) error {
+			atomic.AddInt32(&counter, 1)
+			return nil
+		}
+
+		// ACT
+		// Register the task and stop it after 1.5x the default interval.
+		ts.scheduler.Register(ts.ctx, exec)
+		time.Sleep(1500 * time.Millisecond)
+		ts.scheduler.Stop()
+
+		// ASSERT
+		// Counter should be 2 because the task is invoked once on start,
+		// once after 1 second (the default interval),
+		// and then at T=1.5s we stop the scheduler.
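+		// (A third invocation would occur only at T=2s, after the stop.)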
+		assert.Equal(t, int32(2), counter)
+
+		// Check logs
+		assert.Contains(t, ts.logBuffer.String(), "Stopped scheduler task")
+		assert.Contains(t, ts.logBuffer.String(), `"task.group":"default"`)
+	})
+
+	t.Run("More opts", func(t *testing.T) {
+		t.Parallel()
+
+		// ARRANGE
+		ts := newTestSuite(t)
+
+		var counter int32
+
+		exec := func(ctx context.Context) error {
+			atomic.AddInt32(&counter, 1)
+			return nil
+		}
+
+		// ACT
+		// Register the task with custom options, then stop it after 1s (~3 ticks of 300ms).
+		ts.scheduler.Register(
+			ts.ctx,
+			exec,
+			Name("counter-inc"),
+			GroupName("my-custom-group"),
+			Interval(300*time.Millisecond),
+			LogFields(map[string]any{
+				"blockchain": "doge",
+				"validators": []string{"alice", "bob"},
+			}),
+		)
+
+		time.Sleep(time.Second)
+		ts.scheduler.Stop()
+
+		// ASSERT
+		// Counter should be 1 + 1000/300 = 4 (first run + interval runs)
+		assert.Equal(t, int32(4), counter)
+
+		// Also check that log fields are present
+		assert.Contains(t, ts.logBuffer.String(), `"task.name":"counter-inc","task.group":"my-custom-group"`)
+		assert.Contains(t, ts.logBuffer.String(), `"blockchain":"doge","validators":["alice","bob"]`)
+	})
+
+	t.Run("Task can stop itself", func(t *testing.T) {
+		t.Parallel()
+
+		// ARRANGE
+		ts := newTestSuite(t)
+
+		var counter int32
+
+		exec := func(ctx context.Context) error {
+			atomic.AddInt32(&counter, 1)
+			return nil
+		}
+
+		// ACT
+		// Register the task, then stop it via its own reference after 1s (~3 ticks of 300ms).
+		task := ts.scheduler.Register(ts.ctx, exec, Interval(300*time.Millisecond))
+
+		time.Sleep(time.Second)
+		task.Stop()
+
+		// ASSERT
+		// Counter should be 1 + 1000/300 = 4 (first run + interval runs)
+		assert.Equal(t, int32(4), counter)
+	})
+
+	t.Run("Skipper option", func(t *testing.T) {
+		t.Parallel()
+
+		// ARRANGE
+		ts := newTestSuite(t)
+
+		var counter int32
+
+		exec := func(ctx context.Context) error {
+			atomic.AddInt32(&counter, 1)
+			return nil
+		}
+
+		const maxValue = 5
+
+		// Skipper function that skips further runs once the counter reaches maxValue.
+		skipper := func() bool {
+			allowed := atomic.LoadInt32(&counter) < maxValue
+			return !allowed
+		}
+
+		// ACT
+		// Register the task and give the skipper enough time to kick in before stopping.
+		task := ts.scheduler.Register(ts.ctx, exec, Interval(50*time.Millisecond), Skipper(skipper))
+
+		time.Sleep(time.Second)
+		task.Stop()
+
+		// ASSERT
+		assert.Equal(t, int32(maxValue), counter)
+	})
+
+	t.Run("IntervalUpdater option", func(t *testing.T) {
+		t.Parallel()
+
+		// ARRANGE
+		ts := newTestSuite(t)
+
+		var counter int32
+
+		exec := func(ctx context.Context) error {
+			atomic.AddInt32(&counter, 1)
+			return nil
+		}
+
+		// Interval updater that increases the interval by 50ms on each counter increment.
+		intervalUpdater := func() time.Duration {
+			return time.Duration(atomic.LoadInt32(&counter)) * 50 * time.Millisecond
+		}
+
+		// ACT
+		// Register the task with a tiny initial interval and a growing interval, then stop after 1s.
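+		// Expected timeline (approximate):
+		//   T=0      run #1 -> interval becomes 50ms
+		//   T=50ms   run #2 -> 100ms
+		//   T=150ms  run #3 -> 150ms
+		//   T=300ms  run #4 -> 200ms
+		//   T=500ms  run #5 -> 250ms
+		//   T=750ms  run #6 -> 300ms
+		//   run #7 would land at T=1050ms, after the stop at T=1s.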
+		task := ts.scheduler.Register(ts.ctx, exec, Interval(time.Millisecond), IntervalUpdater(intervalUpdater))
+
+		time.Sleep(time.Second)
+		task.Stop()
+
+		// ASSERT
+		assert.Equal(t, int32(6), counter)
+
+		assert.Contains(t, ts.logBuffer.String(), `"ticker.old_interval":1,"ticker.new_interval":50`)
+		assert.Contains(t, ts.logBuffer.String(), `"ticker.old_interval":50,"ticker.new_interval":100`)
+		assert.Contains(t, ts.logBuffer.String(), `"ticker.old_interval":100,"ticker.new_interval":150`)
+		assert.Contains(t, ts.logBuffer.String(), `"ticker.old_interval":150,"ticker.new_interval":200`)
+		assert.Contains(t, ts.logBuffer.String(), `"ticker.old_interval":200,"ticker.new_interval":250`)
+		assert.Contains(t, ts.logBuffer.String(), `"ticker.old_interval":250,"ticker.new_interval":300`)
+	})
+
+	t.Run("Multiple tasks in different groups", func(t *testing.T) {
+		t.Parallel()
+
+		// ARRANGE
+		ts := newTestSuite(t)
+
+		// Given multiple tasks
+		var counterA, counterB, counterC int32
+
+		// Two tasks for Alice
+		taskAliceA := func(ctx context.Context) error {
+			atomic.AddInt32(&counterA, 1)
+			time.Sleep(60 * time.Millisecond)
+			return nil
+		}
+
+		taskAliceB := func(ctx context.Context) error {
+			atomic.AddInt32(&counterB, 1)
+			time.Sleep(70 * time.Millisecond)
+			return nil
+		}
+
+		// One task for Bob
+		taskBobC := func(ctx context.Context) error {
+			atomic.AddInt32(&counterC, 1)
+			time.Sleep(80 * time.Millisecond)
+			return nil
+		}
+
+		// ACT
+		// Register all tasks with different intervals and groups
+		ts.scheduler.Register(ts.ctx, taskAliceA, Interval(50*time.Millisecond), GroupName("alice"), Name("a"))
+		ts.scheduler.Register(ts.ctx, taskAliceB, Interval(100*time.Millisecond), GroupName("alice"), Name("b"))
+		ts.scheduler.Register(ts.ctx, taskBobC, Interval(200*time.Millisecond), GroupName("bob"), Name("c"))
+
+		// Wait and then stop Alice's tasks
+		time.Sleep(time.Second)
+		ts.scheduler.StopGroup("alice")
+
+		// ASSERT #1
+		shutdownLogPattern := func(group, name string) string {
+			const pattern = `"task\.name":"%s","task\.group":"%s",.*"message":"Stopped scheduler task"`
+			return fmt.Sprintf(pattern, name, group)
+		}
+
+		// Make sure Alice.A and Alice.B are stopped
+		assert.Regexp(t, shutdownLogPattern("alice", "a"), ts.logBuffer.String())
+		assert.Regexp(t, shutdownLogPattern("alice", "b"), ts.logBuffer.String())
+
+		// But Bob.C is still running
+		assert.NotRegexp(t, shutdownLogPattern("bob", "c"), ts.logBuffer.String())
+
+		// ACT #2
+		time.Sleep(200 * time.Millisecond)
+		ts.scheduler.StopGroup("bob")
+
+		// ASSERT #2
+		// Now Bob.C is stopped as well
+		assert.Regexp(t, shutdownLogPattern("bob", "c"), ts.logBuffer.String())
+	})
+
+	t.Run("Block tick: tick is faster than the block", func(t *testing.T) {
+		t.Parallel()
+
+		// ARRANGE
+		ts := newTestSuite(t)
+
+		// Given a task that increments a counter by block height
+		var counter int64
+
+		task := func(ctx context.Context) error {
+			// Note that ctx contains the block event
+			blockEvent, ok := BlockFromContext(ctx)
+			require.True(t, ok)
+
+			atomic.AddInt64(&counter, blockEvent.Block.Height)
+			time.Sleep(100 * time.Millisecond)
+			return nil
+		}
+
+		// Given a block ticker producing a block every 200ms
+		blockChan := ts.mockBlockChan(200*time.Millisecond, 0)
+
+		// ACT
+		// Register the block-ticker task
+		ts.scheduler.Register(ts.ctx, task, BlockTicker(blockChan))
+		time.Sleep(1200 * time.Millisecond)
+		ts.scheduler.Stop()
+
+		// ASSERT
+		// Blocks 1..6 are consumed, so the counter is 1+2+3+4+5+6 = 21
+		assert.Equal(t, int64(21), counter)
+		assert.Contains(t, ts.logBuffer.String(), "Stopped scheduler task")
+		assert.Contains(t, ts.logBuffer.String(), `"task.type":"block_ticker"`)
`"task.type":"block_ticker"`) + }) + + t.Run("Block tick: tick is slower than the block", func(t *testing.T) { + t.Parallel() + + // ARRANGE + ts := newTestSuite(t) + + // Given a task that increments a counter on start + // and then decrements before finish + var counter int64 + + exec := func(ctx context.Context) error { + _, ok := BlockFromContext(ctx) + require.True(t, ok) + + atomic.AddInt64(&counter, 1) + time.Sleep(256 * time.Millisecond) + atomic.AddInt64(&counter, -1) + return nil + } + + // Given block ticker + blockChan := ts.mockBlockChan(100*time.Millisecond, 0) + + // ACT + // Register block + ts.scheduler.Register(ts.ctx, exec, BlockTicker(blockChan)) + time.Sleep(1200 * time.Millisecond) + ts.scheduler.Stop() + + // ASSERT + // zero indicates that Stop() waits for current iteration to finish (graceful shutdown) + assert.Equal(t, int64(0), counter) + }) + + t.Run("Block tick: chan closes unexpectedly", func(t *testing.T) { + t.Parallel() + + // ARRANGE + ts := newTestSuite(t) + + // Given a task that increments a counter on start + // and then decrements before finish + var counter int64 + + exec := func(ctx context.Context) error { + _, ok := BlockFromContext(ctx) + require.True(t, ok) + + atomic.AddInt64(&counter, 1) + time.Sleep(200 * time.Millisecond) + atomic.AddInt64(&counter, -1) + return nil + } + + // Given block ticker that closes after 3 blocks + blockChan := ts.mockBlockChan(100*time.Millisecond, 3) + + // ACT + // Register block + ts.scheduler.Register(ts.ctx, exec, BlockTicker(blockChan), Name("block-tick")) + + // Wait for a while + time.Sleep(1000 * time.Millisecond) + + // Stop the scheduler. + // Note that actually the ticker is already stopped. + ts.scheduler.Stop() + + // ASSERT + // zero indicates that Stop() waits for current iteration to finish (graceful shutdown) + assert.Equal(t, int64(0), counter) + assert.Contains(t, ts.logBuffer.String(), "Block channel closed") + }) +} + +type testSuite struct { + ctx context.Context + scheduler *Scheduler + + logger zerolog.Logger + logBuffer *bytes.Buffer +} + +func newTestSuite(t *testing.T) *testSuite { + logBuffer := &bytes.Buffer{} + logger := zerolog.New(io.MultiWriter(zerolog.NewTestWriter(t), logBuffer)) + + return &testSuite{ + ctx: context.Background(), + scheduler: New(logger), + logger: logger, + logBuffer: logBuffer, + } +} + +// mockBlockChan mocks websocket blocks. Optionally halts after lastBlock. 
+func (ts *testSuite) mockBlockChan(interval time.Duration, lastBlock int64) chan cometbft.EventDataNewBlock {
+	producer := make(chan cometbft.EventDataNewBlock)
+
+	go func() {
+		var blockNumber int64
+
+		for {
+			blockNumber++
+			ts.logger.Info().Int64("block_number", blockNumber).Msg("Producing new block")
+
+			header := cometbft.Header{
+				ChainID: "zeta",
+				Height:  blockNumber,
+				Time:    time.Now(),
+			}
+
+			producer <- cometbft.EventDataNewBlock{
+				Block: &cometbft.Block{Header: header},
+			}
+
+			if blockNumber > 0 && blockNumber == lastBlock {
+				ts.logger.Info().Int64("block_number", blockNumber).Msg("Halting block producer")
+				close(producer)
+				return
+			}
+
+			time.Sleep(interval)
+		}
+	}()
+
+	return producer
+}
diff --git a/pkg/scheduler/tickers.go b/pkg/scheduler/tickers.go
new file mode 100644
index 0000000000..613194c44b
--- /dev/null
+++ b/pkg/scheduler/tickers.go
@@ -0,0 +1,172 @@
+package scheduler
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	cometbft "github.com/cometbft/cometbft/types"
+	"github.com/rs/zerolog"
+
+	"github.com/zeta-chain/node/pkg/ticker"
+)
+
+// intervalTicker is a wrapper for ticker.Ticker.
+type intervalTicker struct {
+	ticker *ticker.Ticker
+}
+
+func newIntervalTicker(
+	task Executable,
+	interval time.Duration,
+	intervalUpdater func() time.Duration,
+	taskName string,
+	logger zerolog.Logger,
+) *intervalTicker {
+	wrapper := func(ctx context.Context, t *ticker.Ticker) error {
+		if err := task(ctx); err != nil {
+			logger.Error().Err(err).Msg("task failed")
+		}
+
+		if intervalUpdater != nil {
+			// noop if the interval hasn't changed
+			t.SetInterval(intervalUpdater())
+		}
+
+		return nil
+	}
+
+	tt := ticker.New(interval, wrapper, ticker.WithLogger(logger, taskName))
+
+	return &intervalTicker{ticker: tt}
+}
+
+func (t *intervalTicker) Start(ctx context.Context) error {
+	return t.ticker.Start(ctx)
+}
+
+func (t *intervalTicker) Stop() {
+	t.ticker.StopBlocking()
+}
+
+// blockTicker is a custom ticker implementation that ticks on new Zeta block events.
+// Pass blockTicker ONLY by pointer.
+type blockTicker struct {
+	exec Executable
+
+	// block channel that will be used to receive new blocks
+	blockChan <-chan cometbft.EventDataNewBlock
+
+	// stopChan is used to stop the ticker
+	stopChan chan struct{}
+
+	// doneChan is used to signal that the ticker has stopped (i.e. "blocking stop")
+	doneChan chan struct{}
+
+	isRunning bool
+	mu        sync.Mutex
+
+	logger zerolog.Logger
+}
+
+type blockCtxKey struct{}
+
+func newBlockTicker(task Executable, blockChan <-chan cometbft.EventDataNewBlock, logger zerolog.Logger) *blockTicker {
+	return &blockTicker{
+		exec:      task,
+		blockChan: blockChan,
+		logger:    logger,
+	}
+}
+
+func withBlockEvent(ctx context.Context, event cometbft.EventDataNewBlock) context.Context {
+	return context.WithValue(ctx, blockCtxKey{}, event)
+}
+
+// BlockFromContext returns the cometbft.EventDataNewBlock from the context,
+// along with a boolean reporting whether it is present.
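+//
+// Usage inside a task (illustrative sketch):
+//
+//	func task(ctx context.Context) error {
+//		if event, ok := scheduler.BlockFromContext(ctx); ok {
+//			_ = event.Block.Height // react to the new block
+//		}
+//		return nil
+//	}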
+func BlockFromContext(ctx context.Context) (cometbft.EventDataNewBlock, bool) {
+	blockEvent, ok := ctx.Value(blockCtxKey{}).(cometbft.EventDataNewBlock)
+	return blockEvent, ok
+}
+
+func (t *blockTicker) Start(ctx context.Context) error {
+	if err := t.init(); err != nil {
+		return err
+	}
+
+	defer t.cleanup()
+
+	// release Stop() blocking
+	defer func() { close(t.doneChan) }()
+
+	for {
+		select {
+		case block, ok := <-t.blockChan:
+			// channel closed
+			if !ok {
+				t.logger.Warn().Msg("Block channel closed")
+				return nil
+			}
+
+			ctx := withBlockEvent(ctx, block)
+
+			if err := t.exec(ctx); err != nil {
+				t.logger.Error().Err(err).Msg("Task error")
+			}
+		case <-ctx.Done():
+			t.logger.Warn().Err(ctx.Err()).Msg("Context error")
+			return nil
+		case <-t.stopChan:
+			// caller invoked t.Stop()
+			return nil
+		}
+	}
+}
+
+func (t *blockTicker) Stop() {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// noop
+	if !t.isRunning {
+		return
+	}
+
+	// notify the async loop to stop
+	close(t.stopChan)
+
+	// wait for the loop to stop
+	<-t.doneChan
+
+	t.isRunning = false
+}
+
+func (t *blockTicker) init() error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	if t.isRunning {
+		return fmt.Errorf("ticker already started")
+	}
+
+	t.stopChan = make(chan struct{})
+	t.doneChan = make(chan struct{})
+	t.isRunning = true
+
+	return nil
+}
+
+// if the ticker was stopped NOT via the Stop() method, we want to make a cleanup
+func (t *blockTicker) cleanup() {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	// noop
+	if !t.isRunning {
+		return
+	}
+
+	t.isRunning = false
+	close(t.stopChan)
+}
diff --git a/pkg/ticker/ticker.go b/pkg/ticker/ticker.go
index ebac3e1a96..9ec0d4cb06 100644
--- a/pkg/ticker/ticker.go
+++ b/pkg/ticker/ticker.go
@@ -56,6 +56,7 @@ type Ticker struct {
 	stopped bool
 
 	ctxCancel        context.CancelFunc
+	runCompleteChan  chan struct{}
 	externalStopChan <-chan struct{}
 	logger           zerolog.Logger
 }
@@ -94,7 +95,7 @@ func New(interval time.Duration, task Task, opts ...Opt) *Ticker {
 
 // Run creates and runs a new Ticker.
 func Run(ctx context.Context, interval time.Duration, task Task, opts ...Opt) error {
-	return New(interval, task, opts...).Run(ctx)
+	return New(interval, task, opts...).Start(ctx)
 }
 
 // Run runs the ticker by blocking current goroutine. It also invokes BEFORE ticker starts.
@@ -102,8 +103,17 @@ func Run(ctx context.Context, interval time.Duration, task Task, opts ...Opt) er
 //   - context is done (returns ctx.Err())
 //   - task returns an error or panics
 //   - shutdown signal is received
-func (t *Ticker) Run(ctx context.Context) (err error) {
+func (t *Ticker) Start(ctx context.Context) (err error) {
+	// prevent concurrent runs
+	t.runnerMu.Lock()
+	defer t.runnerMu.Unlock()
+
+	ctx = t.setStartState(ctx)
+
 	defer func() {
+		// used in StopBlocking()
+		close(t.runCompleteChan)
+
 		if r := recover(); r != nil {
 			stack := string(debug.Stack())
 			lines := strings.Split(stack, "\n")
@@ -116,21 +126,14 @@ func (t *Ticker) Run(ctx context.Context) (err error) {
 		}
 	}()
 
-	// prevent concurrent runs
-	t.runnerMu.Lock()
-	defer t.runnerMu.Unlock()
-
-	// setup
-	ctx, t.ctxCancel = context.WithCancel(ctx)
-	t.ticker = time.NewTicker(t.interval)
-	t.stopped = false
-
 	// initial run
-	if err := t.task(ctx, t); err != nil {
+	if err = t.task(ctx, t); err != nil {
 		t.Stop()
 		return fmt.Errorf("ticker task failed (initial run): %w", err)
 	}
 
+	defer t.setStopState()
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -172,8 +175,35 @@ func (t *Ticker) SetInterval(interval time.Duration) {
 	t.ticker.Reset(interval)
 }
 
-// Stop stops the ticker. Safe to call concurrently or multiple times.
+// Stop stops the ticker in a NON-blocking way. If the task is running in a separate goroutine,
+// this call *might* not wait for it to finish. To wait for the task to finish, use StopBlocking().
+// It's safe to call Stop() multiple times / concurrently / within the task.
 func (t *Ticker) Stop() {
+	t.setStopState()
+}
+
+// StopBlocking stops the ticker in a blocking way, i.e. it waits for the task to finish.
+// DO NOT call this within the task.
+func (t *Ticker) StopBlocking() {
+	t.setStopState()
+	<-t.runCompleteChan
+}
+
+func (t *Ticker) setStartState(ctx context.Context) context.Context {
+	t.stateMu.Lock()
+	defer t.stateMu.Unlock()
+
+	ctx, t.ctxCancel = context.WithCancel(ctx)
+	t.ticker = time.NewTicker(t.interval)
+	t.stopped = false
+
+	// this signals that Start() is about to return
+	t.runCompleteChan = make(chan struct{})
+
+	return ctx
+}
+
+func (t *Ticker) setStopState() {
 	t.stateMu.Lock()
 	defer t.stateMu.Unlock()
diff --git a/pkg/ticker/ticker_test.go b/pkg/ticker/ticker_test.go
index 4e03a28d4f..276eb9457f 100644
--- a/pkg/ticker/ticker_test.go
+++ b/pkg/ticker/ticker_test.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"context"
 	"fmt"
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -19,6 +20,8 @@ func TestTicker(t *testing.T) {
 	)
 
 	t.Run("Basic case with context", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		// Given a counter
 		var counter int
@@ -35,7 +38,7 @@
 		})
 
 		// ACT
-		err := ticker.Run(ctx)
+		err := ticker.Start(ctx)
 
 		// ASSERT
 		assert.ErrorIs(t, err, context.DeadlineExceeded)
@@ -45,6 +48,8 @@
 	})
 
 	t.Run("Halts when error occurred", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		// Given a counter
 		var counter int
@@ -62,7 +67,7 @@
 		})
 
 		// ACT
-		err := ticker.Run(ctx)
+		err := ticker.Start(ctx)
 
 		// ASSERT
 		assert.ErrorContains(t, err, "oops")
@@ -70,6 +75,8 @@
 	})
 
 	t.Run("Dynamic interval update", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		// Given a counter
 		var counter int
@@ -93,7 +100,7 @@
 		})
 
 		// ACT
-		err := ticker.Run(ctx)
+		err := ticker.Start(ctx)
 
 		// ASSERT
 		assert.ErrorIs(t, err, context.DeadlineExceeded)
@@ -104,6 +111,8 @@
 	})
 
 	t.Run("Stop ticker", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		// Given a counter
 		var counter int
@@ -124,7 +133,7 @@
 		}()
 
 		// ACT
-		err := ticker.Run(ctx)
+		err := ticker.Start(ctx)
 
 		// ASSERT
 		assert.NoError(t, err)
@@ -135,7 +144,96 @@
 		})
 	})
 
+	t.Run("Stop ticker in a blocking fashion", func(t *testing.T) {
+		t.Parallel()
+
+		const (
+			tickerInterval = 100 * time.Millisecond
+			workDuration   = 600 * time.Millisecond
+			stopAfterStart = workDuration + tickerInterval/2
+		)
+
+		newLogger := func(t *testing.T) zerolog.Logger {
+			return zerolog.New(zerolog.NewTestWriter(t)).With().Timestamp().Logger()
+		}
+
+		// test task that imitates some work
+		newTask := func(counter *int32, logger zerolog.Logger) Task {
+			return func(ctx context.Context, _ *Ticker) error {
+				logger.Info().Msg("Tick start")
+				atomic.AddInt32(counter, 1)
+
+				time.Sleep(workDuration)
+
+				logger.Info().Msgf("Tick end")
+				atomic.AddInt32(counter, -1)
+
+				return nil
+			}
+		}
+
+		t.Run("Non-blocking stop fails to finish the work", func(t *testing.T) {
+			t.Parallel()
+
+			// ARRANGE
+			// Given a test task that imitates some work
+			testLogger := newLogger(t)
+			counter := int32(0)
+			task := newTask(&counter, testLogger)
+
+			// Given a ticker
+			ticker := New(tickerInterval, task, WithLogger(testLogger, "test-non-blocking-ticker"))
+
+			// ACT
+			// Imitate the ticker run in the background
+			go func() {
+				err := ticker.Start(context.Background())
+				require.NoError(t, err)
+			}()
+
+			// Then stop the ticker after some delay
+			time.Sleep(stopAfterStart)
+			testLogger.Info().Msg("Stopping ticker")
+			ticker.Stop()
+			testLogger.Info().Msg("Stopped ticker")
+
+			// ASSERT
+			// The ticker is stopped BEFORE the work is done, i.e. "in the middle of work",
+			// thus the counter is still `1`. You can also check the logs.
+			assert.Equal(t, int32(1), counter)
+		})
+
+		t.Run("Blocking stop works as expected", func(t *testing.T) {
+			t.Parallel()
+
+			// ARRANGE
+			// Now run the SAME test but with a blocking stop
+			testLogger := newLogger(t)
+			counter := int32(0)
+			task := newTask(&counter, testLogger)
+
+			ticker := New(tickerInterval, task, WithLogger(testLogger, "test-blocking-ticker"))
+
+			// ACT
+			go func() {
+				err := ticker.Start(context.Background())
+				require.NoError(t, err)
+			}()
+
+			time.Sleep(stopAfterStart)
+			testLogger.Info().Msg("Stopping ticker")
+			ticker.StopBlocking()
+			testLogger.Info().Msg("Stopped ticker")
+
+			// ASSERT
+			// The ticker is stopped only AFTER the work is done, so the counter is back to `0`
+			assert.Equal(t, int32(0), counter)
+		})
+	})
+
 	t.Run("Panic", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		// Given a context
 		ctx := context.Background()
@@ -146,15 +244,17 @@
 		})
 
 		// ACT
-		err := ticker.Run(ctx)
+		err := ticker.Start(ctx)
 
 		// ASSERT
 		assert.ErrorContains(t, err, "panic during ticker run: oops")
 		// assert that we get error with the correct line number
-		assert.ErrorContains(t, err, "ticker_test.go:145")
+		assert.ErrorContains(t, err, "ticker_test.go:243")
 	})
 
 	t.Run("Nil panic", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		// Given a context
 		ctx := context.Background()
@@ -167,7 +267,7 @@
 		})
 
 		// ACT
-		err := ticker.Run(ctx)
+		err := ticker.Start(ctx)
 
 		// ASSERT
 		assert.ErrorContains(
@@ -176,10 +276,12 @@
 			"panic during ticker run: runtime error: invalid memory address or nil pointer dereference",
 		)
 		// assert that we get error with the correct line number
-		assert.ErrorContains(t, err, "ticker_test.go:165")
+		assert.ErrorContains(t, err, "ticker_test.go:265")
 	})
 
 	t.Run("Run as a single call", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		// Given a counter
 		var counter int
@@ -202,6 +304,8 @@
 	})
 
 	t.Run("With stop channel", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		var (
 			tickerInterval = 100 * time.Millisecond
@@ -232,6 +336,8 @@
 	})
 
 	t.Run("With logger", func(t *testing.T) {
+		t.Parallel()
+
 		// ARRANGE
 		out := &bytes.Buffer{}
 		logger := zerolog.New(out)
diff --git a/zetaclient/metrics/metrics.go b/zetaclient/metrics/metrics.go
index 28fe897504..36dc5ad813 100644
--- a/zetaclient/metrics/metrics.go
+++ b/zetaclient/metrics/metrics.go
@@ -170,6 +170,27 @@ var (
 			Name: "num_connected_peers",
 			Help: "The number of connected peers (authenticated keygen peers)",
 		})
+
+	// SchedulerTaskInvocationCounter tracks invocations categorized by status, group, and name
+	SchedulerTaskInvocationCounter = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: ZetaClientNamespace,
+			Name:      "scheduler_task_invocations_total",
+			Help:      "Total number of task invocations",
+		},
+		[]string{"status", "task_group", "task_name"},
+	)
+
+	// SchedulerTaskExecutionDuration measures the execution duration of tasks
+	SchedulerTaskExecutionDuration = promauto.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: ZetaClientNamespace,
+			Name:      "scheduler_task_duration_seconds",
+			Help:      "Histogram of task execution duration in seconds",
+			Buckets:   []float64{0.05, 0.1, 0.2, 0.3, 0.5, 1, 1.5, 2, 3, 5, 7.5, 10, 15}, // 50ms to 15s
+		},
+		[]string{"status", "task_group", "task_name"},
+	)
 )
 
 // NewMetrics creates a new Metrics instance