// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2013, 2014 The Regents of the University of California
* Copyright (c) 2020 Google Inc
*
* Barret Rhoden <brho@cs.berkeley.edu>
*
* Sorry, but you'll need to patch your Linux source to export this function:
*
*	EXPORT_SYMBOL_GPL(kthread_create_on_cpu);
*
*/
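/*
* A minimal sketch of that change, assuming a mainline tree where
* kthread_create_on_cpu() is defined in kernel/kthread.c: add
*
*	EXPORT_SYMBOL_GPL(kthread_create_on_cpu);
*
* right after the function's definition, rebuild, and boot that kernel before
* loading this module.
*/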
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/task.h>
#include <linux/sched/mm.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <asm/msr.h>
/* Either works. Userspace uses lfence; rdtsc, so use this version if you're
* paranoid about comparing user-space and kernel measurements. */
#if 1
static inline u64 read_tsc_serialized(void)
{
u32 lo, hi;
asm volatile("lfence; rdtsc" : "=a" (lo), "=d" (hi));
return (u64)hi << 32 | lo;
}
#else
#define read_tsc_serialized rdtsc_ordered
#endif
static void simple_ndelay(u64 nsec)
{
u64 end;
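/* tsc_khz is TSC cycles per msec, so cycles = tsc_khz * nsec / 10^6 */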
end = rdtsc() + (tsc_khz * nsec) / 1000000;
do {
cpu_relax();
} while (rdtsc() < end);
}
struct lock_sample {
u64 pre;
u64 acq;
u64 un;
bool valid;
};
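/*
* For interpretation (the module itself doesn't compute this): pre is read just
* before the lock attempt, acq just after acquisition, and un just after the
* unlock, all in raw TSC ticks. Post-processing can derive acquisition latency
* as (acq - pre) and hold time as (un - acq), converting to time via tsc_khz.
*/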
/* Consider using 128, if you're worried about Advanced Cacheline Prefetching */
#define CL_SZ 64
/* How long we'll run without IRQs enabled, in msec */
#define MSEC_WITHOUT_IRQ 100
/*********************** MCS ******************************/
#define MCS_LOCK_INIT {0}
#define MCS_QNODE_INIT {0, 0}
typedef struct mcs_lock_qnode
{
struct mcs_lock_qnode *next;
int locked;
} __attribute__((aligned(CL_SZ))) mcs_lock_qnode_t;
typedef struct mcs_lock
{
mcs_lock_qnode_t *lock;
} mcs_lock_t;
/* Dirty trick to get an isolated cache line; need the attrib on the type. */
struct {
struct mcs_lock ___mcs_l;
} __attribute__((aligned(CL_SZ))) __mcs_l = {MCS_LOCK_INIT};
#define mcs_l __mcs_l.___mcs_l
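/*
* Putting the attribute on the wrapper type both aligns __mcs_l to a cache line
* boundary and pads its sizeof to a multiple of CL_SZ, so no other global can
* land in the remainder of that line.
*/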
void mcs_lock_init(struct mcs_lock *lock)
{
memset(lock, 0, sizeof(mcs_lock_t));
}
static inline mcs_lock_qnode_t *mcs_qnode_swap(mcs_lock_qnode_t **addr,
mcs_lock_qnode_t *val)
{
return (mcs_lock_qnode_t*) __sync_lock_test_and_set((void**)addr, val);
}
void notrace mcs_lock_lock(struct mcs_lock *lock, struct mcs_lock_qnode *qnode)
{
mcs_lock_qnode_t *predecessor;
qnode->next = 0;
barrier(); /* swap provides a CPU mb() */
predecessor = mcs_qnode_swap(&lock->lock, qnode);
if (predecessor) {
qnode->locked = 1;
smp_wmb();
predecessor->next = qnode;
/* no need for a wrmb(), since this will only get unlocked
* after they read our previous write */
while (qnode->locked)
cpu_relax();
}
barrier(); /* just need a cmb; the swap handles the CPU wmb/wrmb() */
}
/* CAS version (no usurper) */
void notrace mcs_lock_unlock(struct mcs_lock *lock, struct mcs_lock_qnode *qnode)
{
/* Check if someone is already waiting on us to unlock */
if (qnode->next == 0) {
/* no need for CPU mbs, since there's an atomic_cas() */
barrier();
/* If the lock still points at us, just CAS it to 0 (unlocked) and
* return */
if (__sync_bool_compare_and_swap((void**)&lock->lock, qnode, 0))
return;
/* We failed, someone is there and we are some (maybe a
* different) thread's pred. Since someone else was waiting,
* they should have made themselves our next. Spin (very
* briefly!) til it happens. */
while (qnode->next == 0)
cpu_relax();
/* Alpha wants a read_barrier_depends() here */
/* Now that we have a next, unlock them */
qnode->next->locked = 0;
} else {
/* mb()s necessary since we didn't call an atomic_swap() */
/* need to make sure any previous writes don't pass unlocking */
smp_wmb();
/* need to make sure any reads happen before the unlocking */
barrier(); /* rwmb() */
/* simply unlock whoever is next */
qnode->next->locked = 0;
}
}
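/*
* Usage sketch: each acquisition needs its own qnode, which gets linked into
* the waiter queue and must stay live (and in place) until the matching
* unlock:
*
*	struct mcs_lock_qnode qn = MCS_QNODE_INIT;
*
*	mcs_lock_lock(&mcs_l, &qn);
*	... critical section ...
*	mcs_lock_unlock(&mcs_l, &qn);
*
* This is exactly what lock_func(mcs, ...) below does with an on-stack qnode.
*/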
/*********************** QUEUE ******************************/
struct {
struct qspinlock ___qsl;
} __attribute__((aligned(CL_SZ))) __qsl = {__ARCH_SPIN_LOCK_UNLOCKED};
#define qsl __qsl.___qsl
/*********************** SPIN ******************************/
struct {
arch_spinlock_t ___asl;
} __attribute__((aligned(CL_SZ))) __asl = {__ARCH_SPIN_LOCK_UNLOCKED};
#define asl __asl.___asl
/* mtx protects all variables and the test run */
static struct mutex mtx;
struct lock_test;
static struct lock_test *test;
static DECLARE_COMPLETION(test_done);
static unsigned int nr_threads;
static unsigned int nr_loops;
static unsigned int hold_time;
static unsigned int delay_time;
/* array[nr_threads] of pointers to lock_sample[nr_loops] arrays */
static struct lock_sample **times;
/* array[nr_threads] of task_struct pointers */
static struct task_struct **threads;
/* array[nr_threads] of void * return values */
static void **retvals;
static void *results;
static size_t results_sz;
static bool run_locktest;
static atomic_t horses;
/* Each lock type has its own function, named __lock_name_thread(). */
#define lock_func(lock_name, pre_cmd, lock_cmd, unlock_cmd) \
static int __##lock_name##_thread(void *arg) \
{ \
long thread_id = (long)arg; \
u64 pre_lock, acq_lock, un_lock; \
struct lock_sample *this_time; \
int i; \
u64 next_irq; \
\
/*
* hold_time is the important one to have locally; otherwise we might cache
* miss during the critical section. Why would we miss? Perhaps because
* hold_time is on the cache line adjacent to the spinlock and
* !(MSR 0x1A4 & 2), though that'd only make sense if Adjacent Cacheline
* Prefetching prefetched in exclusive mode (and thus invalidated our line).
* The others are important too, though less so. Their misses would be
* outside the critical section, but if you happen to rearrange the file,
* they could falsely share with the lock.
*/ \
unsigned int hold_time_l = READ_ONCE(hold_time); \
unsigned int delay_time_l = READ_ONCE(delay_time); \
unsigned int nr_loops_l = READ_ONCE(nr_loops); \
\
pre_cmd \
\
atomic_dec(&horses); \
while (atomic_read(&horses)) \
cpu_relax(); \
\
/*
* I'd like to enable/disable IRQs in the loop, but that affects the
* test, even if they are outside the timestamps and the critical
* section. Instead, just turn them on periodically; enabling them every
* 100ms didn't noticeably affect the test's throughput (on Haswell).
*/ \
local_irq_disable(); \
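/* tsc_khz is TSC cycles per msec, so the deadline is MSEC_WITHOUT_IRQ ms out */ \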
next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ; \
\
for (i = 0; i < nr_loops_l; i++) { \
/*
* We might be able to replace this with post-processing: let the
* test run, then discard all entries recorded after the first finisher.
*/ \
if (!READ_ONCE(run_locktest)) \
break; \
\
pre_lock = read_tsc_serialized(); \
\
lock_cmd \
\
acq_lock = read_tsc_serialized(); \
\
if (hold_time_l) \
simple_ndelay(hold_time_l); \
\
unlock_cmd \
\
un_lock = read_tsc_serialized(); \
\
this_time = &times[thread_id][i]; \
this_time->pre = pre_lock; \
this_time->acq = acq_lock; \
this_time->un = un_lock; \
/* Can turn these on/off to control which samples we gather */ \
this_time->valid = true; \
if (delay_time_l) \
simple_ndelay(delay_time_l); \
/*
* This can throw off your delay_time. Think of delay_time as
* the minimum amount of time we'll wait before reacquiring the
* lock.
*/ \
if (next_irq < un_lock) { \
local_irq_enable(); \
cond_resched(); /* since we're here. */ \
local_irq_disable(); \
next_irq = rdtsc() + tsc_khz * MSEC_WITHOUT_IRQ; \
} \
} \
\
local_irq_enable(); \
\
/* First thread to finish stops the test */ \
WRITE_ONCE(run_locktest, false); \
/*
* Wake the controller thread. The other threads will finish soon,
* completing the hokey thread join.
*/ \
complete(&test_done); \
\
WRITE_ONCE(retvals[thread_id], (void*)(long)i); \
\
return 0; \
}
lock_func(mcs,
struct mcs_lock_qnode qn = MCS_QNODE_INIT;,
mcs_lock_lock(&mcs_l, &qn);,
mcs_lock_unlock(&mcs_l, &qn););
lock_func(queue,
;,
queued_spin_lock(&qsl);,
queued_spin_unlock(&qsl););
lock_func(spin,
;,
arch_spin_lock(&asl);,
arch_spin_unlock(&asl););
/* ID is for userspace, name is for the kthread, func is what runs */
struct lock_test {
unsigned int id;
const char *name;
int (*func)(void *);
};
#define LOCKTEST_MCS 1
#define LOCKTEST_QUEUE 2
#define LOCKTEST_SPIN 3
static struct lock_test tests[] = {
{LOCKTEST_MCS, "mcs", __mcs_thread},
{LOCKTEST_QUEUE, "queue", __queue_thread},
{LOCKTEST_SPIN, "spin", __spin_thread},
{}
};
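/*
* Adding another lock type follows the same pattern (the "foo" names below are
* purely illustrative): define its per-thread function with lock_func(), give
* it an ID, and add a row to tests[]:
*
*	lock_func(foo,
*	          ;,
*	          foo_lock(&foo_l);,
*	          foo_unlock(&foo_l););
*
*	#define LOCKTEST_FOO 4
*
*	{LOCKTEST_FOO, "foo", __foo_thread},
*/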
static struct lock_test *get_test(unsigned int id)
{
struct lock_test *ret;
for (ret = tests; ret->id; ret++)
if (ret->id == id)
return ret;
return NULL;
}
/*
* This consolidates the results in a format we will export to userspace. We
* could have just used this format for the test itself, but then the times
* arrays wouldn't be NUMA local.
*/
static int mcs_build_output(struct lock_sample **times, void **retvals)
{
int i;
size_t sz_rets = nr_threads * sizeof(void*);
size_t sz_times_per = nr_loops * sizeof(struct lock_sample);
results_sz = sz_rets + nr_threads * sz_times_per;
kvfree(results);
results = kvzalloc(results_sz, GFP_KERNEL);
if (!results) {
pr_err("fucked %d", __LINE__);
return -1;
}
memcpy(results, retvals, sz_rets);
for (i = 0; i < nr_threads; i++) {
memcpy(results + sz_rets + i * sz_times_per,
times[i], sz_times_per);
}
return 0;
}
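/*
* The resulting layout, which mcs_read() hands to userspace byte-for-byte:
*
*	void *retvals[nr_threads]
*	struct lock_sample times[0][nr_loops]
*	struct lock_sample times[1][nr_loops]
*	...
*	struct lock_sample times[nr_threads - 1][nr_loops]
*/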
static int mcs_lock_test(void)
{
int i;
int ret = -1;
size_t amt;
atomic_set(&horses, nr_threads);
WRITE_ONCE(run_locktest, true);
times = kcalloc(nr_threads, sizeof(struct lock_sample *), GFP_KERNEL);
if (!times) {
pr_err("fucked %d", __LINE__);
return ret;
}
if (check_mul_overflow((size_t)nr_loops, sizeof(struct lock_sample),
&amt)) {
pr_err("fucked %d", __LINE__);
goto out_times;
}
for (i = 0; i < nr_threads; i++) {
times[i] = kvzalloc_node(amt, GFP_KERNEL, cpu_to_node(i));
if (!times[i]) {
/* we clean up the times[i]s below */
pr_err("fucked %d", __LINE__);
goto out_times;
}
}
retvals = kcalloc(nr_threads, sizeof(void *), GFP_KERNEL);
if (!retvals) {
pr_err("fucked %d", __LINE__);
goto out_times;
}
for (i = 0; i < nr_threads; i++)
retvals[i] = (void*)-1;
threads = kcalloc(nr_threads, sizeof(struct task_struct *),
GFP_KERNEL);
if (!threads) {
pr_err("fucked %d", __LINE__);
goto out_retvals;
}
for (i = 0; i < nr_threads; i++) {
threads[i] = kthread_create_on_cpu(test->func,
(void*)(long)i, i, "mcs-""%u");
if (IS_ERR(threads[i])) {
while (--i >= 0) {
/*
* We could recover, perhaps with something like
* kthread_stop(threads[i]), but we'd need those
* threads to check kthread_should_stop(),
* perhaps in their hokey barrier. I've never
* had this fail, so I haven't tested it.
*/
}
pr_err("fucked %d", __LINE__);
goto out_threads;
}
}
for (i = 0; i < nr_threads; i++) {
/* What's the deal with refcounting here? It looks like an
* uncounted ref: create->result = current, so once we start
* them, we probably can't safely touch the task_struct again. */
wake_up_process(threads[i]);
}
/* Hokey join. We know when the test is done but wait for the others */
wait_for_completion(&test_done);
for (i = 0; i < nr_threads; i++) {
while (READ_ONCE(retvals[i]) == (void*)-1)
cond_resched();
}
ret = mcs_build_output(times, retvals);
out_threads:
kfree(threads);
out_retvals:
kfree(retvals);
out_times:
for (i = 0; i < nr_threads; i++)
kvfree(times[i]);
kfree(times);
return ret;
}
static ssize_t mcs_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
mutex_lock(&mtx);
if (!test) {
mutex_unlock(&mtx);
return -ENOLCK; /* I'd kill for errstr */
}
if (!off) {
if (mcs_lock_test()) {
mutex_unlock(&mtx);
return -1;
}
}
if (!results) {
pr_err("fucked %d", __LINE__);
mutex_unlock(&mtx);
return -1;
}
/* Mildly concerned about addition overflow; the caller's job? */
if (count + off > results_sz) {
pr_err("fucked off %lld count %zu sz %zu\n", off, count,
results_sz);
count = results_sz - off;
}
memcpy(buf, results + off, count);
mutex_unlock(&mtx);
return count;
}
static loff_t __mcs_get_results_size(void)
{
return nr_threads *
(sizeof(void*) + nr_loops * sizeof(struct lock_sample));
}
/*
* Unfortunately, this doesn't update the file live. It'll only take effect the
* next time you open it. So users need to write, close, open, read.
*/
static void __mcs_update_size(void)
{
struct kernfs_node *kn = kernfs_find_and_get(kernel_kobj->sd, "mcs");
if (!kn) {
pr_err("fucked %d", __LINE__);
return;
}
kn->attr.size = __mcs_get_results_size();
}
static ssize_t mcs_write(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
unsigned int id, threads, loops, hold, delay;
ssize_t ret;
struct lock_test *t;
/* TODO: check_mul_overflow and whatnot, especially for the results_sz buffer */
ret = sscanf(buf, "%u %u %u %u %u", &id, &threads, &loops, &hold,
&delay);
if (ret != 5)
return -EINVAL;
t = get_test(id);
if (!t)
return -ENOLCK;
if (threads > num_online_cpus())
return -ENODEV;
if (threads == 0)
threads = num_online_cpus();
mutex_lock(&mtx);
test = t;
nr_threads = threads;
nr_loops = loops;
hold_time = hold;
delay_time = delay;
__mcs_update_size();
mutex_unlock(&mtx);
return count;
}
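/*
* Usage sketch, assuming the default sysfs mount at /sys: the control string is
* "id threads loops hold delay", where id selects the test (1 mcs, 2 queue,
* 3 spin), threads == 0 means one per online CPU, and hold/delay are the
* nanosecond arguments fed to simple_ndelay(). For example, from a shell:
*
*	echo "1 0 10000 100 200" > /sys/kernel/mcs
*	dd if=/sys/kernel/mcs of=results.bin bs=1M
*
* The read at offset 0 runs the test and returns the binary results blob built
* by mcs_build_output().
*/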
static struct bin_attribute mcs_attr = {
.attr = {
.name = "mcs",
.mode = 0666,
},
.size = 0,
.private = NULL,
.read = mcs_read,
.write = mcs_write,
};
static int __init mcs_init(void)
{
mutex_init(&mtx);
/*
* The user needs to set these, but start with sensible defaults in case
* they read without writing.
*/
nr_threads = num_online_cpus();
nr_loops = 10000;
mcs_attr.size = __mcs_get_results_size();
if (sysfs_create_bin_file(kernel_kobj, &mcs_attr)) {
pr_err("\n\nfucked %d !!!\n\n\n", __LINE__);
return -1;
}
return 0;
}
static void __exit mcs_exit(void)
{
sysfs_remove_bin_file(kernel_kobj, &mcs_attr);
}
module_init(mcs_init);
module_exit(mcs_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Barret Rhoden <brho@google.com>");
MODULE_DESCRIPTION("MCS lock test");