// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2013, 2014 The Regents of the University of California
 * Copyright (c) 2020 Google Inc
 *
 * Barret Rhoden <brho@cs.berkeley.edu>
 *
 * Sorry, but you'll need to change your linux source to expose this function:

EXPORT_SYMBOL_GPL(kthread_create_on_cpu);

 *
 */
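
/*
 * Rough usage sketch (illustrative values, not requirements): write the test
 * parameters to /sys/kernel/mcs as "id nr_threads nr_loops hold_nsec
 * delay_nsec", then reopen and read the file to run the test and pull out the
 * binary results, e.g.:
 *
 *	echo "2 0 10000 0 0" > /sys/kernel/mcs	# queued spinlock, all CPUs
 *	dd if=/sys/kernel/mcs of=results.bin bs=1M
 *
 * id comes from the LOCKTEST_* values below; nr_threads == 0 means "all
 * online CPUs".  See mcs_write() and mcs_build_output() for the details.
 */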

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/slab.h>

#include <linux/sched/task.h>
#include <linux/sched/mm.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <asm/msr.h>

/* Seems fine either way. Userspace uses lfence; rdtsc. So use this if you're
 * paranoid about comparisons between user and kernel. */
#if 1
static inline u64 read_tsc_serialized(void)
{
	u32 lo, hi;

	asm volatile("lfence; rdtsc" : "=a" (lo), "=d" (hi));
	return (u64)hi << 32 | lo;
}

#else
#define read_tsc_serialized rdtsc_ordered
#endif

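/*
 * Busy-wait for roughly nsec nanoseconds. tsc_khz is the TSC frequency in
 * kHz, i.e. TSC ticks per millisecond, so ticks = tsc_khz * nsec / 10^6.
 */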
static void simple_ndelay(u64 nsec)
{
	u64 end;

	end = rdtsc() + (tsc_khz * nsec) / 1000000;
	do {
		cpu_relax();
	} while (rdtsc() < end);
}

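/*
 * One sample per lock/unlock iteration: TSC right before the lock attempt,
 * right after acquisition, and right after the unlock. 'valid' marks entries
 * that were actually filled in during the run.
 */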
struct lock_sample {
	u64 pre;
	u64 acq;
	u64 un;
	bool valid;
};

/* Consider using 128 if you're worried about Adjacent Cacheline Prefetching */
#define CL_SZ 64

/* How long we'll run without IRQs enabled */
#define MSEC_WITHOUT_IRQ 100

/*********************** MCS ******************************/

#define MCS_LOCK_INIT {0}
#define MCS_QNODE_INIT {0, 0}

typedef struct mcs_lock_qnode
{
	struct mcs_lock_qnode *next;
	int locked;
} __attribute__((aligned(CL_SZ))) mcs_lock_qnode_t;

typedef struct mcs_lock
{
	mcs_lock_qnode_t *lock;
} mcs_lock_t;

/* Dirty trick to get an isolated cache line; need the attrib on the type. */
struct {
	struct mcs_lock ___mcs_l;
} __attribute__((aligned(CL_SZ))) __mcs_l = {MCS_LOCK_INIT};
#define mcs_l __mcs_l.___mcs_l

void mcs_lock_init(struct mcs_lock *lock)
{
	memset(lock, 0, sizeof(mcs_lock_t));
}

static inline mcs_lock_qnode_t *mcs_qnode_swap(mcs_lock_qnode_t **addr,
					       mcs_lock_qnode_t *val)
{
	return (mcs_lock_qnode_t*) __sync_lock_test_and_set((void**)addr, val);
}

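/*
 * MCS acquire: atomically swap our qnode into the lock's tail pointer. If
 * there was a predecessor, link in behind it and spin on our own
 * cacheline-aligned qnode->locked until the previous holder clears it.
 */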
void notrace mcs_lock_lock(struct mcs_lock *lock, struct mcs_lock_qnode *qnode)
{
	mcs_lock_qnode_t *predecessor;

	qnode->next = 0;
	barrier(); /* swap provides a CPU mb() */
	predecessor = mcs_qnode_swap(&lock->lock, qnode);
	if (predecessor) {
		qnode->locked = 1;
		smp_wmb();
		predecessor->next = qnode;
		/* no need for a wrmb(), since this will only get unlocked
		 * after they read our previous write */
		while (qnode->locked)
			cpu_relax();
	}
	barrier(); /* just need a cmb, the swap handles the CPU wmb/wrmb() */
}

/* CAS version (no usurper) */
void notrace mcs_lock_unlock(struct mcs_lock *lock, struct mcs_lock_qnode *qnode)
{
	/* Check if someone is already waiting on us to unlock */
	if (qnode->next == 0) {
		/* no need for CPU mbs, since there's an atomic_cas() */
		barrier();
		/* If we're still the tail of the queue, just CAS the lock
		 * to 0 (unlocked) and return */
		if (__sync_bool_compare_and_swap((void**)&lock->lock, qnode, 0))
			return;
		/* We failed, someone is there and we are some (maybe a
		 * different) thread's pred. Since someone else was waiting,
		 * they should have made themselves our next. Spin (very
		 * briefly!) til it happens. */
		while (qnode->next == 0)
			cpu_relax();
		/* Alpha wants a read_barrier_depends() here */
		/* Now that we have a next, unlock them */
		qnode->next->locked = 0;
	} else {
		/* mb()s necessary since we didn't call an atomic_swap() */
		/* need to make sure any previous writes don't pass unlocking */
		smp_wmb();
		/* need to make sure any reads happen before the unlocking */
		barrier(); /* rwmb() */
		/* simply unlock whoever is next */
		qnode->next->locked = 0;
	}
}

/*********************** QUEUE ******************************/
struct {
	struct qspinlock ___qsl;
} __attribute__((aligned(CL_SZ))) __qsl = {__ARCH_SPIN_LOCK_UNLOCKED};
#define qsl __qsl.___qsl

/*********************** SPIN ******************************/
struct {
	arch_spinlock_t ___asl;
} __attribute__((aligned(CL_SZ))) __asl = {__ARCH_SPIN_LOCK_UNLOCKED};
#define asl __asl.___asl


/* mtx protects all variables and the test run */
static struct mutex mtx;

struct lock_test;
static struct lock_test *test;
static DECLARE_COMPLETION(test_done);

static unsigned int nr_threads;
static unsigned int nr_loops;
static unsigned int hold_time;
static unsigned int delay_time;

/* array[nr_threads] of pointers to lock_sample[nr_loops] */
static struct lock_sample **times;
/* array[nr_threads] of task* */
static struct task_struct **threads;
/* array[nr_threads] of void* */
static void **retvals;
static void *results;
static size_t results_sz;

static bool run_locktest;
static atomic_t horses;

/* Every lock type has its own thread function, named __lock_name_thread();
 * pre_cmd, lock_cmd, and unlock_cmd supply the per-lock setup, acquire, and
 * release statements. */
#define lock_func(lock_name, pre_cmd, lock_cmd, unlock_cmd) \
static int __##lock_name##_thread(void *arg) \
{ \
	long thread_id = (long)arg; \
	u64 pre_lock, acq_lock, un_lock; \
	struct lock_sample *this_time; \
	int i; \
	u64 next_irq; \
	\
	/*
	 * hold_time is the important one to have locally. o/w we might cache
	 * miss during the critical section. Why would we miss? Perhaps
	 * because hold_time is on the adjacent cache line to the spinlock, and
	 * !(MSR 0x1A4 & 2), though that'd only make sense if Adjacent Cacheline
	 * Prefetching prefetched in exclusive mode (and thus invalidating).
	 * The others are important too, though less so. Their miss would be
	 * outside the critical section, but if you happen to rearrange the
	 * file, they could falsely share with the lock.
	 */ \
	unsigned int hold_time_l = READ_ONCE(hold_time); \
	unsigned int delay_time_l = READ_ONCE(delay_time); \
	unsigned int nr_loops_l = READ_ONCE(nr_loops); \
	\
	pre_cmd \
	\
	atomic_dec(&horses); \
	while (atomic_read(&horses)) \
		cpu_relax(); \
	\
	/*
	 * I'd like to enable/disable IRQs in the loop, but that affects the
	 * test, even if they are outside the timestamps and the critical
	 * section. Instead, just turn them on periodically. 100ms was what I
	 * noticed didn't affect the test's throughput (Haswell).
	 */ \
	local_irq_disable(); \
	next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ; \
	\
	for (i = 0; i < nr_loops_l; i++) { \
		/*
		 * might be able to replace this with post-processing. let the
		 * test run, and discard all entries after the first finisher
		 */ \
		if (!READ_ONCE(run_locktest)) \
			break; \
		\
		pre_lock = read_tsc_serialized(); \
		\
		lock_cmd \
		\
		acq_lock = read_tsc_serialized(); \
		\
		if (hold_time_l) \
			simple_ndelay(hold_time_l); \
		\
		unlock_cmd \
		\
		un_lock = read_tsc_serialized(); \
		\
		this_time = &times[thread_id][i]; \
		this_time->pre = pre_lock; \
		this_time->acq = acq_lock; \
		this_time->un = un_lock; \
		/* Can turn these on/off to control which samples we gather */ \
		this_time->valid = true; \
		if (delay_time_l) \
			simple_ndelay(delay_time_l); \
		/*
		 * This can throw off your delay_time. Think of delay_time as
		 * the least amount of time we'll wait before reacquiring the
		 * lock.
		 */ \
		if (next_irq < un_lock) { \
			local_irq_enable(); \
			cond_resched(); /* since we're here. */ \
			local_irq_disable(); \
			next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ; \
		} \
	} \
	\
	local_irq_enable(); \
	\
	/* First thread to finish stops the test */ \
	WRITE_ONCE(run_locktest, false); \
	/*
	 * Wakes the controller thread. The others will be done soon, to
	 * complete the hokey thread join.
	 */ \
	complete(&test_done); \
	\
	WRITE_ONCE(retvals[thread_id], (void*)(long)i); \
	\
	return 0; \
}

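/* Instantiate one __<name>_thread() per lock flavor under test. */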
lock_func(mcs,
	  struct mcs_lock_qnode qn = MCS_QNODE_INIT;,
	  mcs_lock_lock(&mcs_l, &qn);,
	  mcs_lock_unlock(&mcs_l, &qn););
lock_func(queue,
	  ;,
	  queued_spin_lock(&qsl);,
	  queued_spin_unlock(&qsl););
lock_func(spin,
	  ;,
	  arch_spin_lock(&asl);,
	  arch_spin_unlock(&asl););

/* ID is for userspace, name is for the kthread, func is what runs */
struct lock_test {
	unsigned int id;
	const char *name;
	int (*func)(void *);
};

#define LOCKTEST_MCS 1
#define LOCKTEST_QUEUE 2
#define LOCKTEST_SPIN 3

static struct lock_test tests[] = {
	{LOCKTEST_MCS, "mcs", __mcs_thread},
	{LOCKTEST_QUEUE, "queue", __queue_thread},
	{LOCKTEST_SPIN, "spin", __spin_thread},
	{}
};

static struct lock_test *get_test(unsigned int id)
{
	struct lock_test *ret;

	for (ret = tests; ret->id; ret++)
		if (ret->id == id)
			return ret;
	return NULL;
}

/*
 * This consolidates the results in a format we will export to userspace. We
 * could have just used this format for the test itself, but then the times
 * arrays wouldn't be NUMA local.
 */
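/*
 * Layout of the exported blob, in order:
 *	void *retvals[nr_threads];	(loops completed by each thread)
 *	struct lock_sample times[nr_threads][nr_loops];
 */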
static int mcs_build_output(struct lock_sample **times, void **retvals)
{
	int i;
	size_t sz_rets = nr_threads * sizeof(void*);
	size_t sz_times_per = nr_loops * sizeof(struct lock_sample);

	results_sz = sz_rets + nr_threads * sz_times_per;

	kvfree(results);

	results = kvzalloc(results_sz, GFP_KERNEL);
	if (!results) {
		pr_err("fucked %d", __LINE__);
		return -1;
	}

	memcpy(results, retvals, sz_rets);
	for (i = 0; i < nr_threads; i++) {
		memcpy(results + sz_rets + i * sz_times_per,
		       times[i], sz_times_per);
	}

	return 0;
}

static int mcs_lock_test(void)
{
	int i;
	int ret = -1;
	size_t amt;

	atomic_set(&horses, nr_threads);
	WRITE_ONCE(run_locktest, true);

	times = kcalloc(nr_threads, sizeof(struct lock_sample *), GFP_KERNEL);
	if (!times) {
		pr_err("fucked %d", __LINE__);
		return ret;
	}

	if (check_mul_overflow((size_t)nr_loops, sizeof(struct lock_sample),
			       &amt)) {
		pr_err("fucked %d", __LINE__);
		goto out_times;
	}
	for (i = 0; i < nr_threads; i++) {
		times[i] = kvzalloc_node(amt, GFP_KERNEL, cpu_to_node(i));
		if (!times[i]) {
			/* we clean up the times[i]s below */
			pr_err("fucked %d", __LINE__);
			goto out_times;
		}
	}

	retvals = kcalloc(nr_threads, sizeof(void *), GFP_KERNEL);
	if (!retvals) {
		pr_err("fucked %d", __LINE__);
		goto out_times;
	}
	for (i = 0; i < nr_threads; i++)
		retvals[i] = (void*)-1;

	threads = kcalloc(nr_threads, sizeof(struct task_struct *),
			  GFP_KERNEL);
	if (!threads) {
		pr_err("fucked %d", __LINE__);
		goto out_retvals;
	}

	for (i = 0; i < nr_threads; i++) {
		threads[i] = kthread_create_on_cpu(test->func,
						   (void*)(long)i, i, "mcs-%u");
		if (IS_ERR(threads[i])) {
			while (--i >= 0) {
				/*
				 * We could recover, perhaps with something like
				 * kthread_stop(threads[i]), but we'd need those
				 * threads to check kthread_should_stop(),
				 * perhaps in their hokey barrier. I've never
				 * had this fail, so I haven't tested it.
				 */
			}
			pr_err("fucked %d", __LINE__);
			goto out_threads;
		}
	}
	for (i = 0; i < nr_threads; i++) {
		/* what's the deal with refcnting here? it looks like an
		 * uncounted ref: create->result = current. so once we start
		 * them, we probably can't touch this again. */
		wake_up_process(threads[i]);
	}

	/* Hokey join. We know when the test is done but wait for the others */
	wait_for_completion(&test_done);
	for (i = 0; i < nr_threads; i++) {
		while (READ_ONCE(retvals[i]) == (void*)-1)
			cond_resched();
	}

	ret = mcs_build_output(times, retvals);

out_threads:
	kfree(threads);
out_retvals:
	kfree(retvals);
out_times:
	for (i = 0; i < nr_threads; i++)
		kvfree(times[i]);
	kfree(times);
	return ret;
}

static ssize_t mcs_read(struct file *filp, struct kobject *kobj,
			struct bin_attribute *bin_attr,
			char *buf, loff_t off, size_t count)
{
	mutex_lock(&mtx);

	if (!test) {
		mutex_unlock(&mtx);
		return -ENOLCK; /* i'd kill for errstr */
	}
	if (!off) {
		if (mcs_lock_test()) {
			mutex_unlock(&mtx);
			return -1;
		}
	}
	if (!results) {
		pr_err("fucked %d", __LINE__);
		mutex_unlock(&mtx);
		return -1;
	}
	/* mildly concerned about addition overflow. caller's job? */
	if (count + off > results_sz) {
		pr_err("fucked off %lld count %lu sz %lu\n", off, count,
		       results_sz);
		count = results_sz - off;
	}
	memcpy(buf, results + off, count);

	mutex_unlock(&mtx);

	return count;
}

static loff_t __mcs_get_results_size(void)
{
	return nr_threads *
		(sizeof(void*) + nr_loops * sizeof(struct lock_sample));
}

/*
 * Unfortunately, this doesn't update the file live. It'll only take effect the
 * next time you open it. So users need to write, close, open, read.
 */
static void __mcs_update_size(void)
{
	struct kernfs_node *kn = kernfs_find_and_get(kernel_kobj->sd, "mcs");

	if (!kn) {
		pr_err("fucked %d", __LINE__);
		return;
	}
	kn->attr.size = __mcs_get_results_size();
	kernfs_put(kn);	/* drop the ref from kernfs_find_and_get() */
}

static ssize_t mcs_write(struct file *filp, struct kobject *kobj,
			 struct bin_attribute *bin_attr,
			 char *buf, loff_t off, size_t count)
{
	unsigned int id, threads, loops, hold, delay;
	ssize_t ret;
	struct lock_test *t;

	/* TODO: check_mul_overflow and whatnot, esp for the result_sz buffer */
	ret = sscanf(buf, "%u %u %u %u %u", &id, &threads, &loops, &hold,
		     &delay);
	if (ret != 5)
		return -EINVAL;

	t = get_test(id);
	if (!t)
		return -ENOLCK;
	if (threads > num_online_cpus())
		return -ENODEV;
	if (threads == 0)
		threads = num_online_cpus();
	mutex_lock(&mtx);
	test = t;
	nr_threads = threads;
	nr_loops = loops;
	hold_time = hold;
	delay_time = delay;
	__mcs_update_size();
	mutex_unlock(&mtx);
	return count;
}

struct bin_attribute mcs_attr = {
	.attr = {
		.name = "mcs",
		.mode = 0666,
	},
	.size = 0,
	.private = NULL,
	.read = mcs_read,
	.write = mcs_write,
};

static int __init mcs_init(void)
{
	mutex_init(&mtx);

	/*
	 * The user needs to set these, but start with sensible defaults in case
	 * they read without writing.
	 */
	nr_threads = num_online_cpus();
	nr_loops = 10000;
	mcs_attr.size = __mcs_get_results_size();

	if (sysfs_create_bin_file(kernel_kobj, &mcs_attr)) {
		pr_err("\n\nfucked %d !!!\n\n\n", __LINE__);
		return -1;
	}

	return 0;
}

static void __exit mcs_exit(void)
{
	sysfs_remove_bin_file(kernel_kobj, &mcs_attr);
}

module_init(mcs_init);
module_exit(mcs_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Barret Rhoden <brho@google.com>");
MODULE_DESCRIPTION("MCS lock test");