From 26bd4a59adb35e671b3f89eb8a1b9e55f9081b16 Mon Sep 17 00:00:00 2001
From: David Chisnall <github@theravensnest.org>
Date: Fri, 12 Apr 2024 14:10:31 +0100
Subject: [PATCH] Tickless scheduler

This commit incorporates several changes:

Timers are now not set for a regular tick, they are set when a thread
may be preempted.  Specifically, the timer is set to the next timeout
that will trigger a scheduling operation.  This avoids timers
triggering a switch to the scheduler to do nothing (resume the currently
running thread).

This means that if a thread sleeps for ten ticks while another runs, we
will get one timer interrupt ten ticks in the future, rather than ten
interrupts one tick apart.

This means that ticks are now calculated retroactively based on elapsed
time, rather than counted on each context switch.

This, in turn, necessitates some small API changes.  We previously
conflated two things:

 - Sleep for N * (tick duration)
 - Yield and allow lower-priority threads to run for, at most, N * (tick
   duration)

These are now deconflated by adding a second parameter to thread_sleep.
Most sleeps are of the second form and so this is the default.

This reduces the time taken to run the test suite on Sonata by around
30% and in the Ibex SAFE simulator by 13%.
---
 sdk/core/allocator/main.cc                    |  2 +-
 sdk/core/scheduler/main.cc                    | 33 +++++--
 sdk/core/scheduler/thread.h                   | 64 ++++++++++++--
 sdk/core/scheduler/timer.h                    | 85 +++++++++++++++++--
 sdk/core/switcher/entry.S                     |  6 ++
 sdk/include/FreeRTOS-Compat/task.h            |  2 +-
 .../platform/generic-riscv/platform-timer.hh  | 39 +++++----
 sdk/include/thread.h                          | 25 +++++-
 sdk/lib/debug/debug.cc                        |  7 ++
 tests/locks-test.cc                           |  2 +
 10 files changed, 223 insertions(+), 42 deletions(-)

diff --git a/sdk/core/allocator/main.cc b/sdk/core/allocator/main.cc
index 29a652b1b..7a47e8212 100644
--- a/sdk/core/allocator/main.cc
+++ b/sdk/core/allocator/main.cc
@@ -291,7 +291,7 @@ namespace
 						// Drop and reacquire the lock while yielding.
 						// Sleep for a single tick.
 						g.unlock();
-						Timeout smallSleep{0};
+						Timeout smallSleep{1};
 						thread_sleep(&smallSleep);
 						if (!reacquire_lock(timeout, g, smallSleep.elapsed))
 						{
diff --git a/sdk/core/scheduler/main.cc b/sdk/core/scheduler/main.cc
index 88442be36..f563d102b 100644
--- a/sdk/core/scheduler/main.cc
+++ b/sdk/core/scheduler/main.cc
@@ -270,15 +270,20 @@ namespace sched
 
 		ExceptionGuard g{[=]() { sched_panic(mcause, mepc, mtval); }};
 
+		bool tick = false;
 		switch (mcause)
 		{
 			// Explicit yield call
 			case MCAUSE_ECALL_MACHINE:
-				schedNeeded = true;
+			{
+				schedNeeded           = true;
+				Thread *currentThread = Thread::current_get();
+				tick = currentThread && currentThread->is_ready();
 				break;
+			}
 			case MCAUSE_INTR | MCAUSE_MTIME:
-				Timer::do_interrupt();
 				schedNeeded = true;
+				tick        = true;
 				break;
 			case MCAUSE_INTR | MCAUSE_MEXTERN:
 				schedNeeded = false;
@@ -293,6 +298,7 @@ namespace sched
 					  std::tie(schedNeeded, std::ignore, std::ignore) =
 					    futex_wake(Capability{&word}.address());
 				  });
+				tick = schedNeeded;
 				break;
 			case MCAUSE_THREAD_EXIT:
 				// Make the current thread non-runnable.
@@ -305,13 +311,23 @@ namespace sched
 				// We cannot continue exiting this thread, make sure we will
 				// pick a new one.
 				schedNeeded  = true;
+				tick         = true;
 				sealedTStack = nullptr;
 				break;
 			default:
 				sched_panic(mcause, mepc, mtval);
 		}
+		if (tick || !Thread::any_ready())
+		{
+			Timer::expiretimers();
+		}
 		auto newContext =
 		  schedNeeded ? Thread::schedule(sealedTStack) : sealedTStack;
+#if 0
+		Debug::log("Thread: {}",
+		           Thread::current_get() ? Thread::current_get()->id_get() : 0);
+#endif
+		Timer::update();
 
 		if constexpr (Accounting)
 		{
@@ -419,14 +435,17 @@ SystickReturn __cheri_compartment("sched") thread_systemtick_get()
 }
 
 __cheriot_minimum_stack(0x80) int __cheri_compartment("sched")
-  thread_sleep(Timeout *timeout)
+  thread_sleep(Timeout *timeout, uint32_t flags)
 {
 	STACK_CHECK(0x80);
 	if (!check_timeout_pointer(timeout))
 	{
 		return -EINVAL;
 	}
-	Thread::current_get()->suspend(timeout, nullptr, true);
+	////Debug::log("Thread {} sleeping for {} ticks",
+	/// Thread::current_get()->id_get(), timeout->remaining);
+	Thread *current = Thread::current_get();
+	current->suspend(timeout, nullptr, true, !(flags & ThreadSleepNoEarlyWake));
 	return 0;
 }
 
@@ -468,8 +487,10 @@ __cheriot_minimum_stack(0xa0) int futex_timed_wait(Timeout        *timeout,
 		// If we try to block ourself, that's a mistake.
 		if ((owningThread == currentThread) || (owningThread == nullptr))
 		{
-			Debug::log("futex_timed_wait: invalid owning thread {}",
-			           owningThread);
+			Debug::log("futex_timed_wait: thread {} acquiring PI futex with "
+			           "invalid owning thread {}",
+			           currentThread->id_get(),
+			           owningThreadID);
 			return -EINVAL;
 		}
 		Debug::log("Thread {} boosting priority of {} for futex {}",
diff --git a/sdk/core/scheduler/thread.h b/sdk/core/scheduler/thread.h
index e5b296143..fc1409048 100644
--- a/sdk/core/scheduler/thread.h
+++ b/sdk/core/scheduler/thread.h
@@ -18,6 +18,8 @@ namespace
 	// thread structures.
 	class MultiWaiterInternal;
 
+	uint64_t expiry_time_for_timeout(uint32_t timeout);
+
 	template<size_t NPrios>
 	class ThreadImpl final : private utils::NoCopyNoMove
 	{
@@ -114,6 +116,11 @@ namespace
 			return schedTStack;
 		}
 
+		static bool any_ready()
+		{
+			return priorityMap != 0;
+		}
+
 		/**
 		 * When yielding inside the scheduler compartment, we almost always want
 		 * to re-enable interrupts before ecall. If we don't, then a thread with
@@ -152,11 +159,12 @@ namespace
 		 */
 		bool suspend(Timeout     *t,
 		             ThreadImpl **newSleepQueue,
-		             bool         yieldUnconditionally = false)
+		             bool         yieldUnconditionally = false,
+		             bool         yieldNotSleep        = false)
 		{
 			if (t->remaining != 0)
 			{
-				suspend(t->remaining, newSleepQueue);
+				suspend(t->remaining, newSleepQueue, yieldNotSleep);
 			}
 			if ((t->remaining != 0) || yieldUnconditionally)
 			{
@@ -188,6 +196,7 @@ namespace
 		    OriginalPriority(priority),
 		    expiryTime(-1),
 		    state(ThreadState::Suspended),
+		    isYielding(false),
 		    sleepQueue(nullptr),
 		    tStackPtr(tstack)
 		{
@@ -212,7 +221,7 @@ namespace
 			// We must be suspended.
 			Debug::Assert(state == ThreadState::Suspended,
 			              "Waking thread that is in state {}, not suspended",
-			              state);
+			              static_cast<ThreadState>(state));
 			// First, remove self from the timer waiting list.
 			timer_list_remove(&waitingList);
 			if (sleepQueue != nullptr)
@@ -233,11 +242,18 @@ namespace
 					schedule        = true;
 				}
 			}
+			// If this is the same priority as the current thread, we may need
+			// to update the timer.
+			if (priority >= highestPriority)
+			{
+				schedule = true;
+			}
 			if (reason == WakeReason::Timer || reason == WakeReason::Delete)
 			{
 				multiWaiter = nullptr;
 			}
 			list_insert(&priorityList[priority]);
+			isYielding = false;
 
 			return schedule;
 		}
@@ -278,11 +294,14 @@ namespace
 		 * waiting on a resource, add it to the list of that resource. No
 		 * matter what, it has to be added to the timer list.
 		 */
-		void suspend(uint32_t waitTicks, ThreadImpl **newSleepQueue)
+		void suspend(uint32_t     waitTicks,
+		             ThreadImpl **newSleepQueue,
+		             bool         yieldNotSleep = false)
 		{
+			isYielding = yieldNotSleep;
 			Debug::Assert(state == ThreadState::Ready,
 			              "Suspending thread that is in state {}, not ready",
-			              state);
+			              static_cast<ThreadState>(state));
 			list_remove(&priorityList[priority]);
 			state = ThreadState::Suspended;
 			priority_map_remove();
@@ -291,8 +310,7 @@ namespace
 				list_insert(newSleepQueue);
 				sleepQueue = newSleepQueue;
 			}
-			expiryTime =
-			  (waitTicks == UINT32_MAX ? -1 : ticksSinceBoot + waitTicks);
+			expiryTime = expiry_time_for_timeout(waitTicks);
 
 			timer_list_insert(&waitingList);
 		}
@@ -407,7 +425,7 @@ namespace
 			Debug::Assert(state == ThreadState::Suspended,
 			              "Inserting thread into timer list that is in state "
 			              "{}, not suspended",
-			              state);
+			              static_cast<ThreadState>(state));
 			if (head == nullptr)
 			{
 				timerNext = timerPrev = *headPtr = this;
@@ -511,6 +529,29 @@ namespace
 			return priority;
 		}
 
+		bool is_ready()
+		{
+			return state == ThreadState::Ready;
+		}
+
+		bool is_yielding()
+		{
+			return isYielding;
+		}
+
+		/**
+		 * Returns true if there are other runnable threads with the same
+		 * priority as this thread.
+		 */
+		bool has_priority_peers()
+		{
+			Debug::Assert(state == ThreadState::Ready,
+			              "Checking for peers on thread that is in state {}, "
+			              "not ready",
+			              static_cast<ThreadState>(state));
+			return next != this;
+		}
+
 		~ThreadImpl()
 		{
 			// We have static definition of threads. We only create threads in
@@ -616,7 +657,12 @@ namespace
 		uint8_t priority;
 		/// The original priority level for this thread.  This never changes.
 		const uint8_t OriginalPriority;
-		ThreadState   state;
+		ThreadState   state : 2;
+		/**
+		 * If the thread is yielding, it may be scheduled before its timeout
+		 * expires, as long as no other threads are runnable.
+		 */
+		bool isYielding : 1;
 	};
 
 	using Thread = ThreadImpl<ThreadPrioNum>;
diff --git a/sdk/core/scheduler/timer.h b/sdk/core/scheduler/timer.h
index a86120b51..b56b8f0ec 100644
--- a/sdk/core/scheduler/timer.h
+++ b/sdk/core/scheduler/timer.h
@@ -28,6 +28,9 @@ namespace
 
 	class Timer final : private TimerCore
 	{
+		inline static uint64_t lastTickTime         = 0;
+		inline static uint32_t accumulatedTickError = 0;
+
 		public:
 		static void interrupt_setup()
 		{
@@ -35,27 +38,62 @@ namespace
 			              "Cycles per tick can't be represented in 32 bits. "
 			              "Double check your platform config");
 			init();
-			setnext(TIMERCYCLES_PER_TICK);
 		}
 
-		static void do_interrupt()
+		using TimerCore::time;
+
+		static void update()
 		{
-			++Thread::ticksSinceBoot;
+			auto *thread             = Thread::current_get();
+			bool  waitingListIsEmpty = ((Thread::waitingList == nullptr) ||
+                                       (Thread::waitingList->expiryTime == -1));
+			bool  threadHasNoPeers =
+			  (thread == nullptr) || (!thread->has_priority_peers());
+			if (waitingListIsEmpty && threadHasNoPeers)
+			{
+				clear();
+			}
+			else
+			{
+				uint64_t nextTimer = waitingListIsEmpty
+				                       ? time() + TIMERCYCLES_PER_TICK
+				                       : Thread::waitingList->expiryTime;
+				setnext(nextTimer);
+			}
+		}
 
-			expiretimers();
-			setnext(TIMERCYCLES_PER_TICK);
+		static uint64_t update_tick()
+		{
+			uint64_t now     = time();
+			uint32_t elapsed = now - lastTickTime;
+			int32_t  error   = elapsed % TIMERCYCLES_PER_TICK;
+			if (elapsed < TIMERCYCLES_PER_TICK)
+			{
+				error = TIMERCYCLES_PER_TICK - error;
+			}
+			accumulatedTickError += error;
+			int32_t errorDirection = accumulatedTickError < 0 ? -1 : 1;
+			int32_t absoluteError  = accumulatedTickError * errorDirection;
+			if (absoluteError >= TIMERCYCLES_PER_TICK)
+			{
+				Thread::ticksSinceBoot += errorDirection;
+				accumulatedTickError += TIMERCYCLES_PER_TICK * -errorDirection;
+			}
+			lastTickTime = now;
+			Thread::ticksSinceBoot += elapsed / TIMERCYCLES_PER_TICK;
+			return now;
 		}
 
-		private:
 		static void expiretimers()
 		{
+			uint64_t now = update_tick();
 			if (Thread::waitingList == nullptr)
 			{
 				return;
 			}
 			for (Thread *iter = Thread::waitingList;;)
 			{
-				if (iter->expiryTime <= Thread::ticksSinceBoot)
+				if (iter->expiryTime <= now)
 				{
 					Thread *iterNext = iter->timerNext;
 
@@ -72,6 +110,39 @@ namespace
 					break;
 				}
 			}
+			if (!Thread::any_ready())
+			{
+				for (Thread *iter = Thread::waitingList; iter;)
+				{
+					if (iter->is_yielding())
+					{
+						Debug::log("Woke thread {} {} cycles early",
+						           iter->id_get(),
+						           int64_t(iter->expiryTime) - now);
+						Thread *iterNext = iter->timerNext;
+						iter->ready(Thread::WakeReason::Timer);
+						iter = iterNext;
+						if (Thread::waitingList == nullptr ||
+						    iter == Thread::waitingList)
+						{
+							break;
+						}
+					}
+					else
+					{
+						break;
+					}
+				}
+			}
 		}
 	};
+
+	uint64_t expiry_time_for_timeout(uint32_t timeout)
+	{
+		if (timeout == -1)
+		{
+			return -1;
+		}
+		return Timer::time() + (timeout * TIMERCYCLES_PER_TICK);
+	}
 } // namespace
diff --git a/sdk/core/switcher/entry.S b/sdk/core/switcher/entry.S
index 8836d088f..30f946591 100644
--- a/sdk/core/switcher/entry.S
+++ b/sdk/core/switcher/entry.S
@@ -970,7 +970,13 @@ __Z13thread_id_getv:
 	// Load the trusted stack pointer into a register that we will clobber in
 	// the next instruction when we load the thread ID.
 	cspecialr          ca0, mtdc
+	//cgettag            a1, ca0
+	// If this is a null pointer, don't try to dereference it and report that
+	// we are thread 0.  This permits the debug code to work even from things
+	// that are not real threads.
+	//beqz               a1, .Lend
 	clh                a0, TrustedStack_offset_threadID(ca0)
+.Lend:
 	cret
 
 
diff --git a/sdk/include/FreeRTOS-Compat/task.h b/sdk/include/FreeRTOS-Compat/task.h
index 5aac58788..2517c0277 100644
--- a/sdk/include/FreeRTOS-Compat/task.h
+++ b/sdk/include/FreeRTOS-Compat/task.h
@@ -55,7 +55,7 @@ static inline BaseType_t xTaskCheckForTimeOut(TimeOut_t  *pxTimeOut,
 static inline void vTaskDelay(const TickType_t xTicksToDelay)
 {
 	struct Timeout timeout = {0, xTicksToDelay};
-	thread_sleep(&timeout);
+	thread_sleep(&timeout, 0);
 }
 
 /**
diff --git a/sdk/include/platform/generic-riscv/platform-timer.hh b/sdk/include/platform/generic-riscv/platform-timer.hh
index 377f32332..91461c0cb 100644
--- a/sdk/include/platform/generic-riscv/platform-timer.hh
+++ b/sdk/include/platform/generic-riscv/platform-timer.hh
@@ -38,6 +38,21 @@ class StandardClint : private utils::NoCopyNoMove
 		         2 * sizeof(uint32_t));
 	}
 
+	static uint64_t time()
+	{
+		/// the low 32 bits
+		volatile uint32_t *timerHigh = pmtimer + 1;
+		uint32_t           timeLow, timeHigh;
+
+		// Read the current time. Loop until the high 32 bits are stable.
+		do
+		{
+			timeHigh = *timerHigh;
+			timeLow  = *pmtimer;
+		} while (timeHigh != *timerHigh);
+		return (uint64_t(timeHigh) << 32) | timeLow;
+	}
+
 	/**
 	 * Set the timer up for the next timer interrupt. We need to:
 	 * 1. read the current MTIME,
@@ -51,28 +66,22 @@ class StandardClint : private utils::NoCopyNoMove
 	 * interrupt. Writing to any half is enough to clear the interrupt,
 	 * which is also why 2. is important.
 	 */
-	static void setnext(uint32_t cycles)
+	static void setnext(uint64_t nextTime)
 	{
 		/// the high 32 bits of the 64-bit MTIME register
 		volatile uint32_t *pmtimercmphigh = pmtimercmp + 1;
-		/// the low 32 bits
-		volatile uint32_t *pmtimerhigh = pmtimer + 1;
 		uint32_t           curmtimehigh, curmtime, curmtimenew;
 
-		// Read the current time. Loop until the high 32 bits are stable.
-		do
-		{
-			curmtimehigh = *pmtimerhigh;
-			curmtime     = *pmtimer;
-		} while (curmtimehigh != *pmtimerhigh);
-
-		// Add tick cycles to current time. Handle carry bit.
-		curmtimehigh += __builtin_add_overflow(curmtime, cycles, &curmtimenew);
-
 		// Write the new MTIMECMP value, at which the next interrupt fires.
 		*pmtimercmphigh = -1; // Prevent spurious interrupts.
-		*pmtimercmp     = curmtimenew;
-		*pmtimercmphigh = curmtimehigh;
+		*pmtimercmp     = nextTime;
+		*pmtimercmphigh = nextTime >> 32;
+	}
+
+	static void clear()
+	{
+		volatile uint32_t *pmtimercmphigh = pmtimercmp + 1;
+		*pmtimercmphigh                   = -1; // Prevent spurious interrupts.
 	}
 
 	private:
diff --git a/sdk/include/thread.h b/sdk/include/thread.h
index 3c5430a15..5e8d03aa5 100644
--- a/sdk/include/thread.h
+++ b/sdk/include/thread.h
@@ -23,6 +23,17 @@ typedef struct
 [[cheri::interrupt_state(disabled)]] SystickReturn __cheri_compartment("sched")
   thread_systemtick_get(void);
 
+enum ThreadSleepFlags : uint32_t
+{
+	/**
+	 * Sleep for up to the specified timeout, but wake early if there are no
+	 * other runnable threads.  This allows a high-priority thread to yield for
+	 * a fixed number of ticks for lower-priority threads to run, but does not
+	 * prevent it from resuming early.
+	 */
+	ThreadSleepNoEarlyWake = 1 << 0,
+};
+
 /**
  * Sleep for at most the specified timeout (see `timeout.h`).
  *
@@ -34,9 +45,17 @@ typedef struct
  * but reports the time spent sleeping.  This requires a cross-domain call and
  * return in addition to the overheads of `yield` and so `yield` should be
  * preferred in contexts where the elapsed time is not required.
+ *
+ * The `flags` parameter is a bitwise OR of `ThreadSleepFlags`.
+ *
+ * A sleeping thread may be woken early if no other threads are runnable.  The
+ * thread with the earliest timeout will be woken first.  If you are using
+ * `thread_sleep` to elapse real time, pass `ThreadSleepNoEarlyWake` as the
+ * flags argument to prevent early wakeups.
+ *
  */
 [[cheri::interrupt_state(disabled)]] int __cheri_compartment("sched")
-  thread_sleep(struct Timeout *timeout);
+  thread_sleep(struct Timeout *timeout, uint32_t flags __if_cxx(= 0));
 
 /**
  * Return the thread ID of the current running thread.
@@ -119,7 +138,7 @@ static inline uint64_t thread_millisecond_wait(uint32_t milliseconds)
 	// In simulation builds, just yield once but don't bother trying to do
 	// anything sensible with time.
 	Timeout t = {0, 1};
-	thread_sleep(&t);
+	thread_sleep(&t, 0);
 	return milliseconds;
 #else
 	static const uint32_t CyclesPerMillisecond = CPU_TIMER_HZ / 1'000;
@@ -133,7 +152,7 @@ static inline uint64_t thread_millisecond_wait(uint32_t milliseconds)
 	while ((end > current) && (end - current > MS_PER_TICK))
 	{
 		Timeout t = {0, ((uint32_t)(end - current)) / CyclesPerTick};
-		thread_sleep(&t);
+		thread_sleep(&t, ThreadSleepNoEarlyWake);
 		current = rdcycle64();
 	}
 	// Spin for the remaining time.
diff --git a/sdk/lib/debug/debug.cc b/sdk/lib/debug/debug.cc
index 075616433..2642d9fbf 100644
--- a/sdk/lib/debug/debug.cc
+++ b/sdk/lib/debug/debug.cc
@@ -2,6 +2,7 @@
 // SPDX-License-Identifier: MIT
 
 #include <debug.hh>
+#include <thread.h>
 
 using namespace CHERI;
 
@@ -335,7 +336,13 @@ debug_log_message_write(const char          *context,
 	DebugPrinter printer;
 	printer.write("\x1b[35m");
 	printer.write(context);
+#if 0
+	printer.write(" [Thread ");
+	printer.write(thread_id_get());
+	printer.write("]\033[0m: ");
+#else
 	printer.write("\033[0m: ");
+#endif
 	printer.format(format, messages, messageCount);
 	printer.write("\n");
 }
diff --git a/tests/locks-test.cc b/tests/locks-test.cc
index ffd8da271..20798c813 100644
--- a/tests/locks-test.cc
+++ b/tests/locks-test.cc
@@ -59,7 +59,9 @@ namespace
 		     "Trying to acquire lock spuriously succeeded");
 		if constexpr (!std::is_same_v<Lock, FlagLockPriorityInherited>)
 		{
+#ifndef SIMULATION
 			TEST(t.elapsed >= 1, "Sleep slept for {} ticks", t.elapsed);
+#endif
 		}
 	}