| // SPDX-License-Identifier: GPL-2.0-or-later | 
 | /* | 
 |  * Copyright (c) 2013, 2014 The Regents of the University of California | 
 |  * Copyright (c) 2020 Google Inc | 
 |  * | 
 |  * Barret Rhoden <brho@cs.berkeley.edu> | 
 |  * | 
 |  * Sorry, but you'll need to change your linux source to expose this function: | 
 |  | 
 |  EXPORT_SYMBOL_GPL(kthread_create_on_cpu); | 
 |  | 
 |  * | 
 |  */ | 
 |  | 
 | #include <linux/module.h> | 
 | #include <linux/moduleparam.h> | 
 | #include <linux/kobject.h> | 
 | #include <linux/sysfs.h> | 
 | #include <linux/slab.h> | 
 |  | 
 | #include <linux/sched/task.h> | 
 | #include <linux/sched/mm.h> | 
 | #include <linux/delay.h> | 
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/mutex.h>
#include <linux/mm.h>
#include <linux/overflow.h>
#include <linux/spinlock.h>
#include <linux/kernfs.h>
#include <asm/msr.h>
#include <asm/tsc.h>
 |  | 
/* Either version works.  Userspace uses lfence; rdtsc, so use this one if
 * you're paranoid about comparing user and kernel measurements. */
 | #if 1 | 
 | static inline u64 read_tsc_serialized(void) | 
 | { | 
 | 	u32 lo, hi; | 
 |  | 
 | 	asm volatile("lfence; rdtsc" : "=a" (lo), "=d" (hi)); | 
 | 	return (u64)hi << 32 | lo; | 
 | } | 
 |  | 
 | #else | 
 | #define read_tsc_serialized rdtsc_ordered | 
 | #endif | 
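
/*
 * simple_ndelay() below busy-waits using the TSC.  tsc_khz is the TSC rate in
 * kHz, i.e. cycles per millisecond, so cycles = tsc_khz * nsec / 1e6.  E.g.
 * with tsc_khz == 2000000 (a 2 GHz TSC), a 500ns delay is about 1000 cycles.
 */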
 |  | 
 | static void simple_ndelay(u64 nsec) | 
 | { | 
 | 	u64 end; | 
 |  | 
 | 	end = rdtsc() + (tsc_khz * nsec) / 1000000; | 
 | 	do { | 
 | 		cpu_relax(); | 
 | 	} while (rdtsc() < end); | 
 | } | 
 |  | 
 | struct lock_sample { | 
 | 	u64 pre; | 
 | 	u64 acq; | 
 | 	u64 un; | 
 | 	bool valid; | 
 | }; | 
 |  | 
/* Consider using 128 if you're worried about Adjacent Cache Line Prefetching */
 | #define CL_SZ 64 | 
 |  | 
 | /* How long we'll run without IRQs enabled */ | 
 | #define MSEC_WITHOUT_IRQ 100 | 
 |  | 
 | /*********************** MCS ******************************/ | 
 |  | 
 | #define MCS_LOCK_INIT {0} | 
 | #define MCS_QNODE_INIT {0, 0} | 
 |  | 
 | typedef struct mcs_lock_qnode | 
 | { | 
 | 	struct mcs_lock_qnode *next; | 
 | 	int locked; | 
 | } __attribute__((aligned(CL_SZ))) mcs_lock_qnode_t; | 
 |  | 
 | typedef struct mcs_lock | 
 | { | 
 | 	mcs_lock_qnode_t *lock; | 
 | } mcs_lock_t; | 
 |  | 
 | /* Dirty trick to get an isolated cache line; need the attrib on the type. */ | 
 | struct { | 
 | 	struct mcs_lock ___mcs_l; | 
 | } __attribute__((aligned(CL_SZ))) __mcs_l = {MCS_LOCK_INIT}; | 
 | #define mcs_l __mcs_l.___mcs_l | 
 |  | 
 | void mcs_lock_init(struct mcs_lock *lock) | 
 | { | 
 | 	memset(lock, 0, sizeof(mcs_lock_t)); | 
 | } | 
 |  | 
 | static inline mcs_lock_qnode_t *mcs_qnode_swap(mcs_lock_qnode_t **addr, | 
 |                                                mcs_lock_qnode_t *val) | 
 | { | 
 | 	return (mcs_lock_qnode_t*) __sync_lock_test_and_set((void**)addr, val); | 
 | } | 
 |  | 
 | void notrace mcs_lock_lock(struct mcs_lock *lock, struct mcs_lock_qnode *qnode) | 
 | { | 
 | 	mcs_lock_qnode_t *predecessor; | 
 |  | 
 | 	qnode->next = 0; | 
 | 	barrier();	/* swap provides a CPU mb() */ | 
 | 	predecessor = mcs_qnode_swap(&lock->lock, qnode); | 
 | 	if (predecessor) { | 
 | 		qnode->locked = 1; | 
 | 		smp_wmb(); | 
 | 		predecessor->next = qnode; | 
 | 		/* no need for a wrmb(), since this will only get unlocked | 
 | 		 * after they read our previous write */ | 
 | 		while (qnode->locked) | 
 | 			cpu_relax(); | 
 | 	} | 
	barrier();	/* just need a compiler barrier; the swap handles the CPU wmb/wrmb() */
 | } | 
 |  | 
 | /* CAS version (no usurper) */ | 
 | void notrace mcs_lock_unlock(struct mcs_lock *lock, struct mcs_lock_qnode *qnode) | 
 | { | 
 | 	/* Check if someone is already waiting on us to unlock */ | 
 | 	if (qnode->next == 0) { | 
 | 		/* no need for CPU mbs, since there's an atomic_cas() */ | 
 | 		barrier(); | 
 | 		/* If we're still the lock, just swap it with 0 (unlock) and | 
 | 		 * return */ | 
 | 		if (__sync_bool_compare_and_swap((void**)&lock->lock, qnode, 0)) | 
 | 			return; | 
 | 		/* We failed, someone is there and we are some (maybe a | 
 | 		 * different) thread's pred.  Since someone else was waiting, | 
 | 		 * they should have made themselves our next.  Spin (very | 
 | 		 * briefly!) til it happens. */ | 
 | 		while (qnode->next == 0) | 
 | 			cpu_relax(); | 
 | 		/* Alpha wants a read_barrier_depends() here */ | 
 | 		/* Now that we have a next, unlock them */ | 
 | 		qnode->next->locked = 0; | 
 | 	} else { | 
 | 		/* mb()s necessary since we didn't call an atomic_swap() */ | 
 | 		/* need to make sure any previous writes don't pass unlocking */ | 
 | 		smp_wmb(); | 
 | 		/* need to make sure any reads happen before the unlocking */ | 
 | 		barrier(); //rwmb(); | 
 | 		/* simply unlock whoever is next */ | 
 | 		qnode->next->locked = 0; | 
 | 	} | 
 | } | 
 |  | 
 | /*********************** QUEUE ******************************/ | 
 | struct { | 
 | 	struct qspinlock ___qsl; | 
 | } __attribute__((aligned(CL_SZ))) __qsl = {__ARCH_SPIN_LOCK_UNLOCKED}; | 
 | #define qsl __qsl.___qsl | 
 |  | 
 | /*********************** SPIN ******************************/ | 
 | struct { | 
	arch_spinlock_t ___asl;
 | } __attribute__((aligned(CL_SZ))) __asl = {__ARCH_SPIN_LOCK_UNLOCKED}; | 
 | #define asl __asl.___asl | 
 |  | 
 |  | 
 | /* mtx protects all variables and the test run */ | 
 | static struct mutex mtx; | 
 |  | 
 | struct lock_test; | 
 | static struct lock_test *test; | 
 | static DECLARE_COMPLETION(test_done); | 
 |  | 
 | static unsigned int nr_threads; | 
 | static unsigned int nr_loops; | 
 | static unsigned int hold_time; | 
 | static unsigned int delay_time; | 
 |  | 
/* array[nr_threads] of pointers to lock_sample[nr_loops] arrays */
static struct lock_sample **times;
/* array[nr_threads] of task_struct pointers */
static struct task_struct **threads;
/* array[nr_threads] of void pointers (per-thread return values) */
 | static void **retvals; | 
 | static void *results; | 
 | static size_t results_sz; | 
 |  | 
 | static bool run_locktest; | 
 | static atomic_t horses; | 
 |  | 
/* Every lock type has its own function, named __lock_name_thread(). */
 | #define lock_func(lock_name, pre_cmd, lock_cmd, unlock_cmd)                    \ | 
 | static int __##lock_name##_thread(void *arg)                                   \ | 
 | {                                                                              \ | 
 | 	long thread_id = (long)arg;                                            \ | 
 | 	u64 pre_lock, acq_lock, un_lock;                                       \ | 
 | 	struct lock_sample *this_time;                                         \ | 
 | 	int i;                                                                 \ | 
 | 	u64 next_irq;                                                          \ | 
 |                                                                                \ | 
 | 	/*                                                                      | 
 | 	 * hold_time is the important one to have locally.  o/w we might cache | 
 | 	 * miss during the critical section.  Why would we miss?  Perhaps | 
 | 	 * because hold_time is on the adjacent cache line to the spinlock, and | 
	 * !(MSR 0x1A4 & 2), though that'd only make sense if Adjacent Cache Line
	 * Prefetching prefetched in exclusive mode (and thus invalidating).
 | 	 * The others are important too, though less so.  Their miss would be | 
 | 	 * outside the critical section, but if you happen to rearrange the | 
 | 	 * file, they could falsely share with the lock. | 
 | 	 */                                                                    \ | 
 | 	unsigned int hold_time_l = READ_ONCE(hold_time);                       \ | 
 | 	unsigned int delay_time_l = READ_ONCE(delay_time);                     \ | 
 | 	unsigned int nr_loops_l = READ_ONCE(nr_loops);                         \ | 
 |                                                                                \ | 
 | 	pre_cmd                                                                \ | 
 |                                                                                \ | 
 | 	atomic_dec(&horses);                                                   \ | 
 | 	while (atomic_read(&horses))                                           \ | 
 | 		cpu_relax();                                                   \ | 
 |                                                                                \ | 
 | 	/*                                                                      | 
 | 	 * I'd like to enable/disable IRQs in the loop, but that affects the | 
 | 	 * test, even if they are outside the timestamps and the critical | 
	 * section.  Instead, just turn them on periodically.  100ms was a period
	 * that didn't noticeably affect the test's throughput (on Haswell).
 | 	 */                                                                    \ | 
 | 	local_irq_disable();                                                   \ | 
 | 	next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ;                       \ | 
 |                                                                                \ | 
 | 	for (i = 0; i < nr_loops_l; i++) {                                     \ | 
 | 		/*                                                              | 
 | 		 * might be able to replace this with post-processing.  let the | 
 | 		 * test run, and discard all entries after the first finisher   | 
 | 		 */                                                            \ | 
 | 		if (!READ_ONCE(run_locktest))                                  \ | 
 | 			break;                                                 \ | 
 |                                                                                \ | 
 | 		pre_lock = read_tsc_serialized();                              \ | 
 |                                                                                \ | 
 | 		lock_cmd                                                       \ | 
 |                                                                                \ | 
 | 		acq_lock = read_tsc_serialized();                              \ | 
 |                                                                                \ | 
 | 		if (hold_time_l)                                               \ | 
 | 			simple_ndelay(hold_time_l);                            \ | 
 |                                                                                \ | 
 | 		unlock_cmd                                                     \ | 
 |                                                                                \ | 
 | 		un_lock = read_tsc_serialized();                               \ | 
 |                                                                                \ | 
		this_time = &times[thread_id][i];                              \
 | 		this_time->pre = pre_lock;                                     \ | 
 | 		this_time->acq = acq_lock;                                     \ | 
 | 		this_time->un = un_lock;                                       \ | 
 | 		/* Can turn these on/off to control which samples we gather */ \ | 
 | 		this_time->valid = true;                                       \ | 
 | 		if (delay_time_l)                                              \ | 
 | 			simple_ndelay(delay_time_l);                           \ | 
 | 		/*                                                              | 
 | 		 * This can throw off your delay_time.  Think of delay_time as  | 
 | 		 * the least amount of time we'll wait between reacquiring the  | 
 | 		 * lock.                                                        | 
 | 		 */                                                            \ | 
 | 		if (next_irq < un_lock) {                                      \ | 
 | 			local_irq_enable();                                    \ | 
 | 			cond_resched();		/* since we're here. */        \ | 
 | 			local_irq_disable();                                   \ | 
 | 			next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ;       \ | 
 | 		}                                                              \ | 
 | 	}                                                                      \ | 
 |                                                                                \ | 
 | 	local_irq_enable();                                                    \ | 
 |                                                                                \ | 
 | 	/* First thread to finish stops the test */                            \ | 
 | 	WRITE_ONCE(run_locktest, false);                                       \ | 
 | 	/*                                                                      | 
 | 	 * Wakes the controller thread.  The others will be done soon, to       | 
 | 	 * complete the hokey thread join.                                      | 
 | 	 */                                                                    \ | 
 | 	complete(&test_done);                                                  \ | 
 |                                                                                \ | 
 | 	WRITE_ONCE(retvals[thread_id], (void*)(long)i);                        \ | 
 |                                                                                \ | 
 | 	return 0;                                                              \ | 
 | } | 
 |  | 
 | lock_func(mcs, | 
 | 	  struct mcs_lock_qnode qn = MCS_QNODE_INIT;, | 
 | 	  mcs_lock_lock(&mcs_l, &qn);, | 
 | 	  mcs_lock_unlock(&mcs_l, &qn);); | 
 | lock_func(queue, | 
 | 	  ;, | 
 | 	  queued_spin_lock(&qsl);, | 
 | 	  queued_spin_unlock(&qsl);); | 
 | lock_func(spin, | 
 | 	  ;, | 
 | 	  arch_spin_lock(&asl);, | 
 | 	  arch_spin_unlock(&asl);); | 
 |  | 
 | /* ID is for userspace, name is for the kthread, func is what runs */ | 
 | struct lock_test { | 
 | 	unsigned int id; | 
 | 	const char *name; | 
 | 	int (*func)(void *); | 
 | }; | 
 |  | 
 | #define LOCKTEST_MCS 		1 | 
 | #define LOCKTEST_QUEUE 		2 | 
 | #define LOCKTEST_SPIN 		3 | 
 |  | 
 | static struct lock_test tests[] = { | 
 | 	{LOCKTEST_MCS,          "mcs", __mcs_thread}, | 
 | 	{LOCKTEST_QUEUE,      "queue", __queue_thread}, | 
 | 	{LOCKTEST_SPIN,        "spin", __spin_thread}, | 
 | 	{} | 
 | }; | 
 |  | 
 | static struct lock_test *get_test(unsigned int id) | 
 | { | 
 | 	struct lock_test *ret; | 
 |  | 
 | 	for (ret = tests; ret->id; ret++) | 
 | 		if (ret->id == id) | 
 | 			return ret; | 
 | 	return NULL; | 
 | } | 
 |  | 
 | /* | 
 |  * This consolidates the results in a format we will export to userspace.  We | 
 |  * could have just used this format for the test itself, but then the times | 
 |  * arrays wouldn't be NUMA local. | 
 |  */ | 
 | static int mcs_build_output(struct lock_sample **times, void **retvals) | 
 | { | 
 | 	int i; | 
 | 	size_t sz_rets = nr_threads * sizeof(void*); | 
 | 	size_t sz_times_per = nr_loops * sizeof(struct lock_sample); | 
 |  | 
 | 	results_sz = sz_rets + nr_threads * sz_times_per; | 
 |  | 
 | 	kvfree(results); | 
 |  | 
 | 	results = kvzalloc(results_sz, GFP_KERNEL); | 
	if (!results) {
		pr_err("mcs: failed to allocate results buffer (line %d)\n",
		       __LINE__);
		return -ENOMEM;
	}
 |  | 
 | 	memcpy(results, retvals, sz_rets); | 
 | 	for (i = 0; i < nr_threads; i++) { | 
 | 		memcpy(results + sz_rets + i * sz_times_per, | 
 | 		       times[i], sz_times_per); | 
 | 	} | 
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static int mcs_lock_test(void) | 
 | { | 
 | 	int i; | 
	int ret = -ENOMEM;
 | 	size_t amt; | 
 |  | 
	atomic_set(&horses, nr_threads);
	/* Completions left over from a prior run would end the wait early. */
	reinit_completion(&test_done);
	WRITE_ONCE(run_locktest, true);
 |  | 
 | 	times = kcalloc(nr_threads, sizeof(struct lock_sample *), GFP_KERNEL); | 
 | 	if (!times) { | 
 | 		pr_err("fucked %d", __LINE__); | 
 | 		return ret; | 
 | 	} | 
 |  | 
 | 	if (check_mul_overflow((size_t)nr_loops, sizeof(struct lock_sample), | 
 | 			       &amt)) { | 
 | 		pr_err("fucked %d", __LINE__); | 
 | 		goto out_times; | 
 | 	} | 
 | 	for (i = 0; i < nr_threads; i++) { | 
 | 		times[i] = kvzalloc_node(amt, GFP_KERNEL, cpu_to_node(i)); | 
 | 		if (!times[i]) { | 
 | 			/* we clean up the times[i]s below */ | 
 | 			pr_err("fucked %d", __LINE__); | 
 | 			goto out_times; | 
 | 		} | 
 | 	} | 
 |  | 
 | 	retvals = kcalloc(nr_threads, sizeof(void *), GFP_KERNEL); | 
 | 	if (!retvals) { | 
 | 		pr_err("fucked %d", __LINE__); | 
 | 		goto out_times; | 
 | 	} | 
 | 	for (i = 0; i < nr_threads; i++) | 
 | 		retvals[i] = (void*)-1; | 
 |  | 
 | 	threads = kcalloc(nr_threads, sizeof(struct task_struct *), | 
 | 			  GFP_KERNEL); | 
 | 	if (!threads) { | 
 | 		pr_err("fucked %d", __LINE__); | 
 | 		goto out_retvals; | 
 | 	} | 
 |  | 
 | 	for (i = 0; i < nr_threads; i++) { | 
 | 		threads[i] = kthread_create_on_cpu(test->func, | 
						   (void*)(long)i, i, "mcs-%u");
 | 		if (IS_ERR(threads[i])) { | 
 | 			while (--i >= 0) { | 
 | 				/* | 
 | 				 * We could recover, perhaps with something like | 
 | 				 * kthread_stop(threads[i]), but we'd need those | 
 | 				 * threads to check kthread_should_stop(), | 
 | 				 * perhaps in their hokey barrier.  I've never | 
 | 				 * had this fail, so I haven't tested it. | 
 | 				 */ | 
 | 			} | 
 | 			pr_err("fucked %d", __LINE__); | 
 | 			goto out_threads; | 
 | 		} | 
 | 	} | 
 | 	for (i = 0; i < nr_threads; i++) { | 
		/* Refcounting looks dicey: kthread_create() appears to hand
		 * back an uncounted reference (create->result = current), so
		 * once we start them, we probably can't touch this again. */
 | 		wake_up_process(threads[i]); | 
 | 	} | 
 |  | 
	/* Hokey join: the completion fires when the first thread finishes (the
	 * test is over); then wait for the rest to post their retvals. */
 | 	wait_for_completion(&test_done); | 
 | 	for (i = 0; i < nr_threads; i++) { | 
 | 		while (READ_ONCE(retvals[i]) == (void*)-1) | 
 | 			cond_resched(); | 
 | 	} | 
 |  | 
 | 	ret = mcs_build_output(times, retvals); | 
 |  | 
 | out_threads: | 
 | 	kfree(threads); | 
 | out_retvals: | 
 | 	kfree(retvals); | 
 | out_times: | 
 | 	for (i = 0; i < nr_threads; i++) | 
 | 		kvfree(times[i]); | 
 | 	kfree(times); | 
 | 	return ret; | 
 | } | 
 |  | 
 | static ssize_t mcs_read(struct file *filp, struct kobject *kobj, | 
 | 			struct bin_attribute *bin_attr, | 
 | 			char *buf, loff_t off, size_t count) | 
 | { | 
 | 	mutex_lock(&mtx); | 
 |  | 
 | 	if (!test) { | 
 | 		mutex_unlock(&mtx); | 
 | 		return -ENOLCK;	/* i'd kill for errstr */ | 
 | 	} | 
 | 	if (!off) { | 
 | 		if (mcs_lock_test()) { | 
 | 			mutex_unlock(&mtx); | 
			return -ENOMEM;
 | 		} | 
 | 	} | 
	if (!results) {
		pr_err("mcs: no results buffer (line %d)\n", __LINE__);
		mutex_unlock(&mtx);
		return -EINVAL;
	}
	/* Clamp rather than trusting the caller; also avoids count + off overflow */
	if (off >= results_sz) {
		mutex_unlock(&mtx);
		return 0;
	}
	if (count > results_sz - off) {
		pr_err("mcs: clamping read: off %lld count %zu sz %zu\n", off,
		       count, results_sz);
		count = results_sz - off;
	}
 | 	memcpy(buf, results + off, count); | 
 |  | 
 | 	mutex_unlock(&mtx); | 
 |  | 
 | 	return count; | 
 | } | 
 |  | 
 | static loff_t __mcs_get_results_size(void) | 
 | { | 
 | 	return nr_threads * | 
 | 		(sizeof(void*) + nr_loops * sizeof(struct lock_sample)); | 
 | } | 
 |  | 
 | /* | 
 |  * Unfortunately, this doesn't update the file live.  It'll only take effect the | 
 |  * next time you open it.  So users need to write, close, open, read. | 
 |  */ | 
 | static void __mcs_update_size(void) | 
 | { | 
 | 	struct kernfs_node *kn = kernfs_find_and_get(kernel_kobj->sd, "mcs"); | 
 |  | 
	if (!kn) {
		pr_err("mcs: can't find the 'mcs' sysfs node (line %d)\n",
		       __LINE__);
		return;
	}
	kn->attr.size = __mcs_get_results_size();
	kernfs_put(kn);
 |  | 
 | static ssize_t mcs_write(struct file *filp, struct kobject *kobj, | 
 | 			 struct bin_attribute *bin_attr, | 
 | 			 char *buf, loff_t off, size_t count) | 
 | { | 
 | 	unsigned int id, threads, loops, hold, delay; | 
 | 	ssize_t ret; | 
 | 	struct lock_test *t; | 
 |  | 
 | 	/* TODO: check_mul_overflow and whatnot, esp for the result_sz buffer */ | 
 | 	ret = sscanf(buf, "%u %u %u %u %u", &id, &threads, &loops, &hold, | 
 | 		     &delay); | 
 | 	if (ret != 5) | 
 | 		return -EINVAL; | 
 | 	 | 
 | 	t = get_test(id); | 
 | 	if (!t) | 
 | 		return -ENOLCK; | 
 | 	if (threads > num_online_cpus()) | 
 | 		return -ENODEV; | 
 | 	if (threads == 0) | 
 | 		threads = num_online_cpus(); | 
 | 	mutex_lock(&mtx); | 
 | 	test = t; | 
 | 	nr_threads = threads; | 
 | 	nr_loops = loops; | 
 | 	hold_time = hold; | 
 | 	delay_time = delay; | 
 | 	__mcs_update_size(); | 
 | 	mutex_unlock(&mtx); | 
 | 	return count; | 
 | } | 
 |  | 
static struct bin_attribute mcs_attr = {
 | 	.attr = { | 
 | 		.name = "mcs", | 
 | 		.mode = 0666, | 
 | 	}, | 
 | 	.size = 0, | 
 | 	.private = NULL, | 
 | 	.read = mcs_read, | 
 | 	.write = mcs_write, | 
 | }; | 
 |  | 
 | static int __init mcs_init(void) | 
 | { | 
 | 	mutex_init(&mtx); | 
 |  | 
 | 	/* | 
 | 	 * The user needs to set these, but start with sensible defaults in case | 
 | 	 * they read without writing. | 
 | 	 */ | 
 | 	nr_threads = num_online_cpus(); | 
 | 	nr_loops = 10000; | 
 | 	mcs_attr.size = __mcs_get_results_size(); | 
 |  | 
	ret = sysfs_create_bin_file(kernel_kobj, &mcs_attr);
	if (ret) {
		pr_err("mcs: sysfs_create_bin_file failed: %d\n", ret);
		return ret;
	}
 |  | 
 | 	return 0; | 
 | } | 
 |  | 
 | static void __exit mcs_exit(void) | 
 | { | 
 | 	sysfs_remove_bin_file(kernel_kobj, &mcs_attr); | 
 | } | 
 |  | 
 | module_init(mcs_init); | 
 | module_exit(mcs_exit); | 
 |  | 
 | MODULE_LICENSE("GPL"); | 
 | MODULE_AUTHOR("Barret Rhoden <brho@google.com>"); | 
 | MODULE_DESCRIPTION("MCS lock test"); |