/* Copyright (c) 2015 The Regents of the University of California
* Valmon Leymarie <leymariv@berkeley.edu>
* Kevin Klues <klueska@cs.berkeley.edu>
* See LICENSE for details.
*/

#include <arch/topology.h>
#include <sys/queue.h>
#include <env.h>
#include <corerequest.h>
#include <kmalloc.h>

enum pnode_type { CORE, CPU, SOCKET, NUMA, MACHINE, NUM_NODE_TYPES };

static const char * const pnode_label[] = {
[CORE] = "CORE",
[CPU] = "CPU",
[SOCKET] = "SOCKET",
[NUMA] = "NUMA",
[MACHINE] = "MACHINE"
};

/* Internal representation of a node in the hierarchy of elements in the cpu
* topology of the machine (i.e. numa domain, socket, cpu, core, etc.). */
struct sched_pnode {
int id;
enum pnode_type type;
struct sched_pnode *parent;
/* Refcount is used to track how many cores have been allocated beneath
* the current node in the hierarchy. */
int refcount;
/* All nodes except cores have children. Cores have a sched_pcore. */
union {
struct sched_pnode *children;
struct sched_pcore *sched_pcore;
};
};

#define num_cpus (cpu_topology_info.num_cpus)
#define num_sockets (cpu_topology_info.num_sockets)
#define num_numa (cpu_topology_info.num_numa)
#define cores_per_numa (cpu_topology_info.cores_per_numa)
#define cores_per_socket (cpu_topology_info.cores_per_socket)
#define cores_per_cpu (cpu_topology_info.cores_per_cpu)
#define cpus_per_socket (cpu_topology_info.cpus_per_socket)
#define cpus_per_numa (cpu_topology_info.cpus_per_numa)
#define sockets_per_numa (cpu_topology_info.sockets_per_numa)

/* Nodes of a given type have children of the next type down in the enum
 * above, e.g. child_node_type(SOCKET) is CPU and child_node_type(CPU) is
 * CORE. */
#define child_node_type(t) ((t) - 1)

/* The pcores in the system. (array gets alloced in corealloc_init()). */
struct sched_pcore *all_pcores;

/* TAILQ of all unallocated, idle (CG) cores */
struct sched_pcore_tailq idlecores = TAILQ_HEAD_INITIALIZER(idlecores);

/* An array containing the number of nodes at each level. */
static int num_nodes[NUM_NODE_TYPES];

/* A 2D array where core_distance[i][j] is the distance from core i to core
 * j. */
static int **core_distance;

/* A 2D array where num_descendants[i][j] is the number of nodes of type j
 * below a single node of type i. */
static int num_descendants[NUM_NODE_TYPES][NUM_NODE_TYPES];

/* The flat array of all nodes in the hierarchy, plus lookup tables to find
 * the first node of a given type within it. */
static int total_nodes;
static struct sched_pnode *all_nodes;
static struct sched_pnode *node_lookup[NUM_NODE_TYPES];

/* Walk up the hierarchy from the node passed in, incrementing the refcount of
 * that node and each of its ancestors. */
static void incref_nodes(struct sched_pnode *n)
{
while (n != NULL) {
n->refcount++;
n = n->parent;
}
}

/* Walk up the hierarchy from the node passed in, decrementing the refcount of
 * that node and each of its ancestors. */
static void decref_nodes(struct sched_pnode *n)
{
while (n != NULL) {
n->refcount--;
n = n->parent;
}
}

/* Create a node and initialize it. This code assumes that child nodes are
 * created before their parent nodes.
*
* all_nodes is laid out such that all of the nodes are: CORES, then the CPUS,
* then the SOCKETS, then NUMA, then one for MACHINE. The nodes at any given
* layer are later broken up into chunks of n and assigned to their parents
* (down below). */
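/* As a purely illustrative example: on a machine with 2 NUMA domains, 1
 * socket per NUMA domain, 2 cpus per socket, and 2 cores per cpu, all_nodes
 * would hold 8 COREs, then 4 CPUs, then 2 SOCKETs, then 2 NUMAs, then 1
 * MACHINE, and init_nodes(CPU, 4, 2) would parent cores 0-1 under cpu 0,
 * cores 2-3 under cpu 1, and so on. */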
static void init_nodes(int type, int num, int nchildren)
{
/* Initialize the lookup table for this node type. */
num_nodes[type] = num;
node_lookup[type] = all_nodes;
for (int i = CORE; i < type; i++)
node_lookup[type] += num_nodes[i];
/* Initialize all fields of each node of this type. */
for (int i = 0; i < num; i++) {
struct sched_pnode *n = &node_lookup[type][i];
/* Initialize the common fields. */
n->id = i;
n->type = type;
n->refcount = 0;
n->parent = NULL;
/* If we are a core node, initialize the sched_pcore field. */
if (n->type == CORE) {
n->sched_pcore = &all_pcores[n->id];
n->sched_pcore->sched_pnode = n;
n->sched_pcore->core_info =
&cpu_topology_info.core_list[n->id];
n->sched_pcore->alloc_proc = NULL;
n->sched_pcore->prov_proc = NULL;
}
/* Otherwise initialize the children field, updating the parent
* of all children to the current node. This assumes the
* children have already been initialized (i.e. init_nodes()
* must be called iteratively from the bottom-up). */
if (n->type != CORE) {
n->children =
&node_lookup[child_node_type(type)][i * nchildren];
for (int j = 0; j < nchildren; j++)
n->children[j].parent = n;
}
}
}

/* Helper: Two cores have the same level id (and thus are at distance k,
 * below) if their core ids divided by the number of cores per node at that
 * level are the same, due to how we number our cores. */
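/* For example (illustrative numbers only): with 4 cores per socket,
 * get_level_id(5, SOCKET) is 5 / 4 == 1, so cores 4-7 share socket-level id 1
 * while cores 0-3 share socket-level id 0. */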
static int get_level_id(int coreid, int level)
{
return coreid / num_descendants[level][CORE];
}

/* Allocate a table of distances from one core to another. Cores on the same
 * cpu have a distance of 1; same socket have a distance of 2; same numa -> 3;
 * same machine -> 4. */
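/* For instance (hypothetical core ids): if cores 0 and 1 share a cpu,
 * core_distance[0][1] == CPU (1); if cores 0 and 6 share nothing below the
 * machine level, core_distance[0][6] == MACHINE (4). A core's distance to
 * itself is CPU (1) as well, since the loop below starts at CPU. */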
static void init_core_distances(void)
{
core_distance = kzmalloc(num_cores * sizeof(int*), MEM_WAIT);
for (int i = 0; i < num_cores; i++)
core_distance[i] = kzmalloc(num_cores * sizeof(int), MEM_WAIT);
for (int i = 0; i < num_cores; i++) {
for (int j = 0; j < num_cores; j++) {
for (int k = CPU; k <= MACHINE; k++) {
if (get_level_id(i, k) == get_level_id(j, k)) {
core_distance[i][j] = k;
break;
}
}
}
}
}

/* Initialize any data associated with doing core allocation. */
void corealloc_init(void)
{
void *nodes_and_cores;
/* Allocate a flat array of nodes. */
total_nodes = num_cores + num_cpus + num_sockets + num_numa + 1;
nodes_and_cores = kmalloc(total_nodes * sizeof(struct sched_pnode) +
num_cores * sizeof(struct sched_pcore),
MEM_WAIT);
all_nodes = nodes_and_cores;
all_pcores = nodes_and_cores + total_nodes * sizeof(struct sched_pnode);
/* Initialize the number of descendants from our cpu_topology info. */
num_descendants[CPU][CORE] = cores_per_cpu;
num_descendants[SOCKET][CORE] = cores_per_socket;
num_descendants[SOCKET][CPU] = cpus_per_socket;
num_descendants[NUMA][CORE] = cores_per_numa;
num_descendants[NUMA][CPU] = cpus_per_numa;
num_descendants[NUMA][SOCKET] = sockets_per_numa;
num_descendants[MACHINE][CORE] = num_cores;
num_descendants[MACHINE][CPU] = num_cpus;
num_descendants[MACHINE][SOCKET] = num_sockets;
num_descendants[MACHINE][NUMA] = num_numa;
/* Initialize the nodes at each level in our hierarchy. */
init_nodes(CORE, num_cores, 0);
init_nodes(CPU, num_cpus, cores_per_cpu);
init_nodes(SOCKET, num_sockets, cpus_per_socket);
init_nodes(NUMA, num_numa, sockets_per_numa);
init_nodes(MACHINE, 1, num_numa);
/* Initialize our table of core_distances */
init_core_distances();
for (int i = 0; i < num_cores; i++) {
/* Remove all ll_cores from consideration for allocation. */
if (is_ll_core(i)) {
incref_nodes(all_pcores[i].sched_pnode);
continue;
}
#ifdef CONFIG_DISABLE_SMT
/* Remove all odd cores from consideration for allocation. */
if (i % 2 == 1) {
incref_nodes(all_pcores[i].sched_pnode);
continue;
}
#endif /* CONFIG_DISABLE_SMT */
		/* Fill the idlecores list. */
TAILQ_INSERT_HEAD(&idlecores, &all_pcores[i], alloc_next);
}
}

/* Initialize any data associated with allocating cores to a process. */
void corealloc_proc_init(struct proc *p)
{
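	/* alloc_me: cores currently allocated to p.  prov_alloc_me: cores
	 * provisioned to p that are also allocated to it.  prov_not_alloc_me:
	 * cores provisioned to p but allocated elsewhere or idle. */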
TAILQ_INIT(&p->ksched_data.crd.alloc_me);
TAILQ_INIT(&p->ksched_data.crd.prov_alloc_me);
TAILQ_INIT(&p->ksched_data.crd.prov_not_alloc_me);
}

/* Returns the sum of the distances from one core to all cores in a list.
*
* This assumes cl is the allocated list, or possibly the idlecores, since it
* uses alloc_next. */
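/* For example (hypothetical ids): if cl holds cores 2 and 3, and core c shares
 * a cpu with core 2 but only a socket with core 3, the result would be
 * CPU (1) + SOCKET (2) == 3. */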
static int cumulative_core_distance(struct sched_pcore *c,
struct sched_pcore_tailq cl)
{
int d = 0;
struct sched_pcore *temp = NULL;
TAILQ_FOREACH(temp, &cl, alloc_next) {
d += core_distance[c->core_info->core_id]
[temp->core_info->core_id];
}
return d;
}

/* Returns the first core in the hierarchy under node n. */
static struct sched_pcore *first_core_in_node(struct sched_pnode *n)
{
struct sched_pnode *first_child = n;
while (first_child->type != CORE)
first_child = &first_child->children[0];
return first_child->sched_pcore;
}

/* Return the first core provisioned to p that is not yet allocated to it, or
 * NULL if there are none. */
static struct sched_pcore *find_first_provisioned_core(struct proc *p)
{
return TAILQ_FIRST(&(p->ksched_data.crd.prov_not_alloc_me));
}

/* Returns the best first core to allocate for a proc which owns no cores.
 * Return the core that is farthest from the cores allocated to other procs. */
static struct sched_pcore *find_first_core(struct proc *p)
{
struct sched_pcore *c;
struct sched_pnode *n;
struct sched_pnode *nodes;
struct sched_pnode *bestn;
int best_refcount;
/* Find the best, first provisioned core if there are any. Even if the
* whole machine is allocated, we still give out provisioned cores,
* because they will be revoked from their current owner if necessary.
*/
c = find_first_provisioned_core(p);
if (c != NULL)
return c;
/* Otherwise, if the whole machine is already allocated, there are no
* cores left to allocate, and we are done. */
bestn = &node_lookup[MACHINE][0];
if (bestn->refcount == num_descendants[MACHINE][CORE])
return NULL;
/* Otherwise, we know at least one core is still available, so let's
* find the best one to allocate first. We start at NUMA, and loop
* through the topology to find it. */
for (int i = NUMA; i >= CORE; i--) {
nodes = bestn->children;
best_refcount = total_nodes;
bestn = NULL;
for (int j = 0; j < num_nodes[i]; j++) {
n = &nodes[j];
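			/* A refcount of 0 means nothing is allocated in this
			 * subtree, so any of its cores will do; take the
			 * first one. */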
if (n->refcount == 0)
return first_core_in_node(n);
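			/* Skip subtrees whose cores are all allocated. */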
if (n->refcount == num_descendants[i][CORE])
continue;
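			/* Otherwise, remember the least-allocated subtree
			 * seen so far. */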
if (n->refcount < best_refcount) {
best_refcount = n->refcount;
bestn = n;
}
}
}
return bestn->sched_pcore;
}

/* Return the core, from the list of cores provisioned to p, that is closest
 * to the cores p already owns. If no such cores are available, return NULL. */
static struct sched_pcore *find_closest_provisioned_core(struct proc *p)
{
struct sched_pcore_tailq provisioned =
p->ksched_data.crd.prov_not_alloc_me;
struct sched_pcore_tailq allocated = p->ksched_data.crd.alloc_me;
struct sched_pcore *bestc = NULL;
struct sched_pcore *c = NULL;
int bestd = 0;
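	/* Scan the cores provisioned to us but not allocated to us, keeping the
	 * one with the smallest cumulative distance to the cores we already own
	 * (the first candidate is always accepted, since bestd starts at 0). */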
TAILQ_FOREACH(c, &provisioned, prov_next) {
int currd = cumulative_core_distance(c, allocated);
if ((bestd == 0) || (currd < bestd)) {
bestd = currd;
bestc = c;
}
}
return bestc;
}

/* Return the core from the idlecores list that is closest to the cores p
 * already owns. If no cores are available, return NULL. */
static struct sched_pcore *find_closest_idle_core(struct proc *p)
{
struct sched_pcore_tailq allocated = p->ksched_data.crd.alloc_me;
struct sched_pcore *bestc = NULL;
struct sched_pcore *c = NULL;
int bestd = 0;
/* TODO: Add optimization to hand out core at equivalent distance if the
* best core found is provisioned to someone else. */
TAILQ_FOREACH(c, &idlecores, alloc_next) {
int currd = cumulative_core_distance(c, allocated);
if ((bestd == 0) || (currd < bestd)) {
bestd = currd;
bestc = c;
}
}
return bestc;
}

/* Find the best core to hand out next: first consider the cores provisioned
 * to the proc by calling find_closest_provisioned_core(), and if none are
 * available, fall back to the idle cores via find_closest_idle_core(). Both
 * helpers pick the candidate with the lowest cumulative_core_distance() (the
 * sum of the distances from that core to all of the cores the proc already
 * owns). This code assumes that the scheduler that uses it holds a lock for
 * the duration of the call. */
static struct sched_pcore *find_closest_core(struct proc *p)
{
struct sched_pcore *bestc;
bestc = find_closest_provisioned_core(p);
if (bestc)
return bestc;
return find_closest_idle_core(p);
}

/* Find the best core to allocate. If no cores are allocated yet, find one that
* is as far from the cores allocated to other processes as possible.
* Otherwise, find a core that is as close as possible to one of the other
* cores we already own. */
uint32_t __find_best_core_to_alloc(struct proc *p)
{
struct sched_pcore *c = NULL;
if (TAILQ_FIRST(&(p->ksched_data.crd.alloc_me)) == NULL)
c = find_first_core(p);
else
c = find_closest_core(p);
if (c == NULL)
return -1;
return spc2pcoreid(c);
}

/* Track the pcore properly when it is allocated to p. This code assumes that
* the scheduler that uses it holds a lock for the duration of the call. */
void __track_core_alloc(struct proc *p, uint32_t pcoreid)
{
struct sched_pcore *spc;
assert(pcoreid < num_cores); /* catch bugs */
spc = pcoreid2spc(pcoreid);
assert(spc->alloc_proc != p); /* corruption or double-alloc */
spc->alloc_proc = p;
/* if the pcore is prov to them and now allocated, move lists */
if (spc->prov_proc == p) {
TAILQ_REMOVE(&p->ksched_data.crd.prov_not_alloc_me, spc,
prov_next);
TAILQ_INSERT_TAIL(&p->ksched_data.crd.prov_alloc_me, spc,
prov_next);
}
/* Actually allocate the core, removing it from the idle core list. */
TAILQ_REMOVE(&idlecores, spc, alloc_next);
TAILQ_INSERT_TAIL(&p->ksched_data.crd.alloc_me, spc, alloc_next);
incref_nodes(spc->sched_pnode);
}

/* Track the pcore properly when it is deallocated from p. This code assumes
 * that the scheduler that uses it holds a lock for the duration of the call.
 */
void __track_core_dealloc(struct proc *p, uint32_t pcoreid)
{
struct sched_pcore *spc;
assert(pcoreid < num_cores); /* catch bugs */
spc = pcoreid2spc(pcoreid);
spc->alloc_proc = NULL;
/* if the pcore is prov to them and now deallocated, move lists */
if (spc->prov_proc == p) {
TAILQ_REMOVE(&p->ksched_data.crd.prov_alloc_me, spc, prov_next);
/* this is the victim list, which can be sorted so that we pick
* the right victim (sort by alloc_proc reverse priority, etc).
* In this case, the core isn't alloc'd by anyone, so it should
* be the first victim. */
TAILQ_INSERT_HEAD(&p->ksched_data.crd.prov_not_alloc_me, spc,
prov_next);
}
/* Actually dealloc the core, putting it back on the idle core list. */
TAILQ_REMOVE(&(p->ksched_data.crd.alloc_me), spc, alloc_next);
TAILQ_INSERT_HEAD(&idlecores, spc, alloc_next);
decref_nodes(spc->sched_pnode);
}

/* Bulk interface for __track_core_dealloc */
void __track_core_dealloc_bulk(struct proc *p, uint32_t *pc_arr,
uint32_t nr_cores)
{
for (int i = 0; i < nr_cores; i++)
__track_core_dealloc(p, pc_arr[i]);
}

/* One-off function to make 'pcoreid' the next core chosen by the core
* allocation algorithm (so long as no provisioned cores are still idle).
* This code assumes that the scheduler that uses it holds a lock for the
* duration of the call. */
void __next_core_to_alloc(uint32_t pcoreid)
{
printk("%s is not supported by this core allocation policy!\n",
__func__);
}

/* One-off function to sort the idle core list for debugging in the kernel
* monitor. This code assumes that the scheduler that uses it holds a lock
* for the duration of the call. */
void __sort_idle_cores(void)
{
printk("%s is not supported by this core allocation policy!\n",
__func__);
}

/* Print the map of idle cores that are still allocatable through our core
* allocation algorithm. */
void print_idle_core_map(void)
{
printk("Idle cores (unlocked!):\n");
for (int i = 0; i < num_cores; i++) {
struct sched_pcore *spc_i = &all_pcores[i];
if (spc_i->alloc_proc == NULL)
printk("Core %d, prov to %d (%p)\n",
spc_i->core_info->core_id,
spc_i->prov_proc ? spc_i->prov_proc->pid : 0,
spc_i->prov_proc);
}
}