/* Copyright (c) 2015 The Regents of the University of California
 * Valmon Leymarie <leymariv@berkeley.edu>
 * Kevin Klues <klueska@cs.berkeley.edu>
 * See LICENSE for details.
 */

#include <arch/topology.h>
#include <sys/queue.h>
#include <env.h>
#include <corerequest.h>
#include <kmalloc.h>

enum pnode_type { CORE, CPU, SOCKET, NUMA, MACHINE, NUM_NODE_TYPES };

static const char * const pnode_label[] = {
	[CORE] = "CORE",
	[CPU] = "CPU",
	[SOCKET] = "SOCKET",
	[NUMA] = "NUMA",
	[MACHINE] = "MACHINE"
};

/* Internal representation of a node in the hierarchy of elements in the cpu
 * topology of the machine (i.e. numa domain, socket, cpu, core, etc.). */
struct sched_pnode {
	int id;
	enum pnode_type type;
	struct sched_pnode *parent;

	/* Refcount is used to track how many cores have been allocated beneath
	 * the current node in the hierarchy. */
	int refcount;

	/* All nodes except cores have children. Cores have a sched_pcore. */
	union {
		struct sched_pnode *children;
		struct sched_pcore *sched_pcore;
	};
};
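
/* For illustration, a hypothetical machine with 2 numa domains, 2 sockets per
 * numa domain, 2 cpus per socket, and 2 cores per cpu (16 cores total) yields
 * this hierarchy (the same example is reused in comments below):
 *
 *                          MACHINE 0
 *                         /         \
 *                    NUMA 0          NUMA 1
 *                   /      \        /      \
 *             SOCKET 0  SOCKET 1  SOCKET 2  SOCKET 3
 *              /     \      ...       ...
 *           CPU 0   CPU 1
 *           /   \
 *       CORE 0  CORE 1
 */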

#define num_cpus            (cpu_topology_info.num_cpus)
#define num_sockets         (cpu_topology_info.num_sockets)
#define num_numa            (cpu_topology_info.num_numa)
#define cores_per_numa      (cpu_topology_info.cores_per_numa)
#define cores_per_socket    (cpu_topology_info.cores_per_socket)
#define cores_per_cpu       (cpu_topology_info.cores_per_cpu)
#define cpus_per_socket     (cpu_topology_info.cpus_per_socket)
#define cpus_per_numa       (cpu_topology_info.cpus_per_numa)
#define sockets_per_numa    (cpu_topology_info.sockets_per_numa)

#define child_node_type(t) ((t) - 1)
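
/* child_node_type() relies on the ordering of enum pnode_type above, e.g.
 * child_node_type(SOCKET) == CPU and child_node_type(CPU) == CORE. */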

/* The pcores in the system. (Array gets allocated in corealloc_init().) */
struct sched_pcore *all_pcores;

/* TAILQ of all unallocated, idle (CG) cores */
struct sched_pcore_tailq idlecores = TAILQ_HEAD_INITIALIZER(idlecores);

/* An array containing the number of nodes at each level. */
static int num_nodes[NUM_NODE_TYPES];

/* A 2D array containing the distance between any two cores i and j. */
static int **core_distance;

/* A 2D array containing the number of descendants of each type beneath a
 * node at a given level (e.g. num_descendants[SOCKET][CORE] is the number of
 * cores below each socket). */
static int num_descendants[NUM_NODE_TYPES][NUM_NODE_TYPES];

/* Lookup tables to find specific nodes by type and id. */
static int total_nodes;
static struct sched_pnode *all_nodes;
static struct sched_pnode *node_lookup[NUM_NODE_TYPES];

/* Walk up the hierarchy from the node passed in, incrementing its refcount
 * and the refcount of each of its ancestors. */
static void incref_nodes(struct sched_pnode *n)
{
	while (n != NULL) {
		n->refcount++;
		n = n->parent;
	}
}

/* Walk up the hierarchy from the node passed in, decrementing its refcount
 * and the refcount of each of its ancestors. */
static void decref_nodes(struct sched_pnode *n)
{
	while (n != NULL) {
		n->refcount--;
		n = n->parent;
	}
}
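
/* E.g., on the hypothetical 16-core machine above, allocating CORE 0 bumps
 * the refcounts of CORE 0, CPU 0, SOCKET 0, NUMA 0, and MACHINE 0, so a
 * node's refcount always equals the number of allocated cores beneath it. */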

/* Create the nodes of a given type and initialize them. This code assumes
 * that children are created before their parent nodes.
 *
 * all_nodes is laid out such that all of the nodes are: COREs, then the CPUs,
 * then the SOCKETs, then the NUMA domains, then one MACHINE node.  The nodes
 * at any given layer are later broken up into chunks of nchildren and
 * assigned to their parents (down below). */
static void init_nodes(int type, int num, int nchildren)
{
	/* Initialize the lookup table for this node type. */
	num_nodes[type] = num;
	node_lookup[type] = all_nodes;
	for (int i = CORE; i < type; i++)
		node_lookup[type] += num_nodes[i];

	/* Initialize all fields of each node of this type. */
	for (int i = 0; i < num; i++) {
		struct sched_pnode *n = &node_lookup[type][i];

		/* Initialize the common fields. */
		n->id = i;
		n->type = type;
		n->refcount = 0;
		n->parent = NULL;

		/* If we are a core node, initialize the sched_pcore field. */
		if (n->type == CORE) {
			n->sched_pcore = &all_pcores[n->id];
			n->sched_pcore->sched_pnode = n;
			n->sched_pcore->core_info =
				&cpu_topology_info.core_list[n->id];
			n->sched_pcore->alloc_proc = NULL;
			n->sched_pcore->prov_proc = NULL;
		} else {
			/* Otherwise, initialize the children field, updating
			 * the parent of all children to the current node.
			 * This assumes the children have already been
			 * initialized (i.e. init_nodes() must be called
			 * iteratively from the bottom-up). */
			n->children =
			    &node_lookup[child_node_type(type)][i * nchildren];
			for (int j = 0; j < nchildren; j++)
				n->children[j].parent = n;
		}
	}
}
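
/* On the hypothetical 16-core machine, the init_nodes() calls in
 * corealloc_init() below lay out all_nodes as:
 *
 *   [16 COREs][8 CPUs][4 SOCKETs][2 NUMAs][1 MACHINE]
 *
 * so node_lookup[CPU] points at all_nodes[16], node_lookup[SOCKET] at
 * all_nodes[24], node_lookup[NUMA] at all_nodes[28], and node_lookup[MACHINE]
 * at all_nodes[30]. */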

/* Helper: Two cores have the same id at a given level (and thus are at
 * distance 'level', below) if their core ids divided by the number of cores
 * per node at that level are the same, due to how we number our cores. */
static int get_level_id(int coreid, int level)
{
	return coreid / num_descendants[level][CORE];
}
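
/* E.g., on the hypothetical machine (cores_per_cpu == 2, cores_per_socket ==
 * 4), cores 6 and 7 share CPU-level id 3 (6/2 == 7/2 == 3), and cores 4-7
 * all share SOCKET-level id 1 (4/4 == ... == 7/4 == 1). */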

/* Allocate and fill a table of distances from each core to every other core.
 * Cores on the same cpu have a distance of 1; on the same socket, 2; in the
 * same numa domain, 3; in the same machine, 4. These values are just the
 * pnode_type of the lowest level of the hierarchy that the two cores share.
 */
static void init_core_distances(void)
{
	core_distance = kzmalloc(num_cores * sizeof(int*), MEM_WAIT);
	for (int i = 0; i < num_cores; i++)
		core_distance[i] = kzmalloc(num_cores * sizeof(int), MEM_WAIT);
	for (int i = 0; i < num_cores; i++) {
		for (int j = 0; j < num_cores; j++) {
			for (int k = CPU; k <= MACHINE; k++) {
				if (get_level_id(i, k) == get_level_id(j, k)) {
					core_distance[i][j] = k;
					break;
				}
			}
		}
	}
}
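
/* On the hypothetical machine, core_distance[0][1] == CPU (1), since cores 0
 * and 1 share a cpu; core_distance[0][2] == SOCKET (2); core_distance[0][4]
 * == NUMA (3); and core_distance[0][8] == MACHINE (4).  Note the distance
 * from a core to itself is CPU (1), not 0, which is harmless when comparing
 * cumulative distances. */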

/* Initialize any data associated with doing core allocation. */
void corealloc_init(void)
{
	void *nodes_and_cores;

	/* Allocate a flat array of nodes, with the pcores tacked on the end.
	 */
	total_nodes = num_cores + num_cpus + num_sockets + num_numa + 1;
	nodes_and_cores = kmalloc(total_nodes * sizeof(struct sched_pnode) +
	                          num_cores * sizeof(struct sched_pcore),
	                          MEM_WAIT);
	all_nodes = nodes_and_cores;
	all_pcores = nodes_and_cores + total_nodes * sizeof(struct sched_pnode);

	/* Initialize the number of descendants from our cpu_topology info. */
	num_descendants[CPU][CORE] = cores_per_cpu;
	num_descendants[SOCKET][CORE] = cores_per_socket;
	num_descendants[SOCKET][CPU] = cpus_per_socket;
	num_descendants[NUMA][CORE] = cores_per_numa;
	num_descendants[NUMA][CPU] = cpus_per_numa;
	num_descendants[NUMA][SOCKET] = sockets_per_numa;
	num_descendants[MACHINE][CORE] = num_cores;
	num_descendants[MACHINE][CPU] = num_cpus;
	num_descendants[MACHINE][SOCKET] = num_sockets;
	num_descendants[MACHINE][NUMA] = num_numa;

	/* Initialize the nodes at each level in our hierarchy, bottom-up. */
	init_nodes(CORE, num_cores, 0);
	init_nodes(CPU, num_cpus, cores_per_cpu);
	init_nodes(SOCKET, num_sockets, cpus_per_socket);
	init_nodes(NUMA, num_numa, sockets_per_numa);
	init_nodes(MACHINE, 1, num_numa);

	/* Initialize our table of core_distances. */
	init_core_distances();

	for (int i = 0; i < num_cores; i++) {
		/* Remove all ll_cores from consideration for allocation. */
		if (is_ll_core(i)) {
			incref_nodes(all_pcores[i].sched_pnode);
			continue;
		}
#ifdef CONFIG_DISABLE_SMT
		/* Remove all odd cores from consideration for allocation. */
		if (i % 2 == 1) {
			incref_nodes(all_pcores[i].sched_pnode);
			continue;
		}
#endif /* CONFIG_DISABLE_SMT */
		/* Fill the idlecores list. */
		TAILQ_INSERT_HEAD(&idlecores, &all_pcores[i], alloc_next);
	}
}

/* Initialize any data associated with allocating cores to a process. Each
 * proc tracks the cores allocated to it (alloc_me), as well as the cores
 * provisioned to it that it is (prov_alloc_me) or is not (prov_not_alloc_me)
 * currently allocated. */
void corealloc_proc_init(struct proc *p)
{
	TAILQ_INIT(&p->ksched_data.crd.alloc_me);
	TAILQ_INIT(&p->ksched_data.crd.prov_alloc_me);
	TAILQ_INIT(&p->ksched_data.crd.prov_not_alloc_me);
}

/* Returns the sum of the distances from one core to all cores in a list.
 *
 * This assumes cl is the allocated list, or possibly the idlecores, since it
 * uses alloc_next. Note the tailq head is passed by value, which is fine
 * here since we only read from the copy. */
static int cumulative_core_distance(struct sched_pcore *c,
                                    struct sched_pcore_tailq cl)
{
	int d = 0;
	struct sched_pcore *temp = NULL;

	TAILQ_FOREACH(temp, &cl, alloc_next) {
		d += core_distance[c->core_info->core_id]
			          [temp->core_info->core_id];
	}
	return d;
}
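
/* E.g., if a proc owns cores 0 and 2 on the hypothetical machine, then for
 * candidate core 1 this returns core_distance[1][0] + core_distance[1][2] ==
 * 1 + 2 == 3. */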

/* Returns the first core in the hierarchy under node n. */
static struct sched_pcore *first_core_in_node(struct sched_pnode *n)
{
	struct sched_pnode *first_child = n;

	while (first_child->type != CORE)
		first_child = &first_child->children[0];
	return first_child->sched_pcore;
}

/* Return the first core provisioned to p but not yet allocated to it.
 * Otherwise, return NULL. */
static struct sched_pcore *find_first_provisioned_core(struct proc *p)
{
	return TAILQ_FIRST(&(p->ksched_data.crd.prov_not_alloc_me));
}

/* Returns the best first core to allocate for a proc which owns no cores:
 * the core that is farthest from the cores allocated to other procs. */
static struct sched_pcore *find_first_core(struct proc *p)
{
	struct sched_pcore *c;
	struct sched_pnode *n;
	struct sched_pnode *nodes;
	struct sched_pnode *bestn;
	int best_refcount;
	int nchildren;

	/* Find the best, first provisioned core if there are any. Even if the
	 * whole machine is allocated, we still give out provisioned cores,
	 * because they will be revoked from their current owner if necessary.
	 */
	c = find_first_provisioned_core(p);
	if (c != NULL)
		return c;

	/* Otherwise, if the whole machine is already allocated, there are no
	 * cores left to allocate, and we are done. */
	bestn = &node_lookup[MACHINE][0];
	if (bestn->refcount == num_descendants[MACHINE][CORE])
		return NULL;

	/* Otherwise, we know at least one core is still available, so let's
	 * find the best one to allocate first. We start at NUMA and walk down
	 * the topology, at each level descending into the least-allocated
	 * child of the best node found so far. */
	for (int i = NUMA; i >= CORE; i--) {
		/* Only scan the children of the node chosen at the previous
		 * level; bestn has type i + 1 here, so it has
		 * num_descendants[i + 1][i] children. */
		nchildren = num_descendants[i + 1][i];
		nodes = bestn->children;
		best_refcount = total_nodes;
		bestn = NULL;

		for (int j = 0; j < nchildren; j++) {
			n = &nodes[j];
			if (n->refcount == 0)
				return first_core_in_node(n);
			if (n->refcount == num_descendants[i][CORE])
				continue;
			if (n->refcount < best_refcount) {
				best_refcount = n->refcount;
				bestn = n;
			}
		}
	}
	return bestn->sched_pcore;
}
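
/* E.g., on the hypothetical machine with NUMA 0 fully allocated and a single
 * core taken on NUMA 1, the search above picks NUMA 1 (the only domain not
 * full) and then immediately returns the first core of its untouched socket,
 * keeping the new allocation as far from other procs' cores as possible. */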

/* Return the core, among those provisioned to p but not yet allocated to it,
 * that is closest to the cores p already owns. If there are none, return
 * NULL. */
static struct sched_pcore *find_closest_provisioned_core(struct proc *p)
{
	struct sched_pcore_tailq provisioned =
		p->ksched_data.crd.prov_not_alloc_me;
	struct sched_pcore_tailq allocated = p->ksched_data.crd.alloc_me;
	struct sched_pcore *bestc = NULL;
	struct sched_pcore *c = NULL;
	int bestd = 0;

	TAILQ_FOREACH(c, &provisioned, prov_next) {
		int currd = cumulative_core_distance(c, allocated);

		if ((bestd == 0) || (currd < bestd)) {
			bestd = currd;
			bestc = c;
		}
	}
	return bestc;
}

/* Return the idle core that is closest to the cores p already owns. If there
 * are none, return NULL. */
static struct sched_pcore *find_closest_idle_core(struct proc *p)
{
	struct sched_pcore_tailq allocated = p->ksched_data.crd.alloc_me;
	struct sched_pcore *bestc = NULL;
	struct sched_pcore *c = NULL;
	int bestd = 0;

	/* TODO: Add optimization to hand out core at equivalent distance if
	 * the best core found is provisioned to someone else. */
	TAILQ_FOREACH(c, &idlecores, alloc_next) {
		int currd = cumulative_core_distance(c, allocated);

		if ((bestd == 0) || (currd < bestd)) {
			bestd = currd;
			bestc = c;
		}
	}
	return bestc;
}

/* Consider first the cores provisioned to the proc, via
 * find_closest_provisioned_core(). Otherwise, consider the idle cores.
 * Either way, compute for every candidate core its
 * cumulative_core_distance() (the sum of the distances from that core to all
 * of the cores the proc owns), and pick the core with the lowest cumulative
 * distance. This code assumes that the scheduler that uses it holds a lock
 * for the duration of the call. */
static struct sched_pcore *find_closest_core(struct proc *p)
{
	struct sched_pcore *bestc;

	bestc = find_closest_provisioned_core(p);
	if (bestc)
		return bestc;
	return find_closest_idle_core(p);
}

/* Find the best core to allocate to proc p. If no cores are allocated to p
 * yet, find one that is as far from the cores allocated to other processes
 * as possible. Otherwise, find a core that is as close as possible to one of
 * the cores we already own. Returns the pcoreid, or -1 if no core is
 * available. */
uint32_t __find_best_core_to_alloc(struct proc *p)
{
	struct sched_pcore *c = NULL;

	if (TAILQ_FIRST(&(p->ksched_data.crd.alloc_me)) == NULL)
		c = find_first_core(p);
	else
		c = find_closest_core(p);

	if (c == NULL)
		return -1;
	return spc2pcoreid(c);
}
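
/* Minimal usage sketch from a hypothetical ksched (assumes the ksched's lock
 * is held, per the comment above; 'p' and the grant step are placeholders):
 *
 *	uint32_t pcoreid = __find_best_core_to_alloc(p);
 *
 *	if (pcoreid != (uint32_t)-1) {
 *		__track_core_alloc(p, pcoreid);
 *		// ...grant pcoreid to p...
 *	}
 */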

/* Track the pcore properly when it is allocated to p. This code assumes that
 * the scheduler that uses it holds a lock for the duration of the call. */
void __track_core_alloc(struct proc *p, uint32_t pcoreid)
{
	struct sched_pcore *spc;

	assert(pcoreid < num_cores);	/* catch bugs */
	spc = pcoreid2spc(pcoreid);
	assert(spc->alloc_proc != p);	/* corruption or double-alloc */
	spc->alloc_proc = p;
	/* if the pcore is prov to them and now allocated, move lists */
	if (spc->prov_proc == p) {
		TAILQ_REMOVE(&p->ksched_data.crd.prov_not_alloc_me, spc,
			     prov_next);
		TAILQ_INSERT_TAIL(&p->ksched_data.crd.prov_alloc_me, spc,
				  prov_next);
	}
	/* Actually allocate the core, removing it from the idle core list. */
	TAILQ_REMOVE(&idlecores, spc, alloc_next);
	TAILQ_INSERT_TAIL(&p->ksched_data.crd.alloc_me, spc, alloc_next);
	incref_nodes(spc->sched_pnode);
}

/* Track the pcore properly when it is deallocated from p. This code assumes
 * that the scheduler that uses it holds a lock for the duration of the
 * call. */
void __track_core_dealloc(struct proc *p, uint32_t pcoreid)
{
	struct sched_pcore *spc;

	assert(pcoreid < num_cores);	/* catch bugs */
	spc = pcoreid2spc(pcoreid);
	spc->alloc_proc = NULL;
	/* if the pcore is prov to them and now deallocated, move lists */
	if (spc->prov_proc == p) {
		TAILQ_REMOVE(&p->ksched_data.crd.prov_alloc_me, spc, prov_next);
		/* this is the victim list, which can be sorted so that we pick
		 * the right victim (sort by alloc_proc reverse priority, etc).
		 * In this case, the core isn't alloc'd by anyone, so it should
		 * be the first victim. */
		TAILQ_INSERT_HEAD(&p->ksched_data.crd.prov_not_alloc_me, spc,
		                  prov_next);
	}
	/* Actually dealloc the core, putting it back on the idle core list. */
	TAILQ_REMOVE(&(p->ksched_data.crd.alloc_me), spc, alloc_next);
	TAILQ_INSERT_HEAD(&idlecores, spc, alloc_next);
	decref_nodes(spc->sched_pnode);
}

/* Bulk interface for __track_core_dealloc */
void __track_core_dealloc_bulk(struct proc *p, uint32_t *pc_arr,
                               uint32_t nr_cores)
{
	for (int i = 0; i < nr_cores; i++)
		__track_core_dealloc(p, pc_arr[i]);
}

/* One-off function to make 'pcoreid' the next core chosen by the core
 * allocation algorithm (so long as no provisioned cores are still idle).
 * This code assumes that the scheduler that uses it holds a lock for the
 * duration of the call. */
void __next_core_to_alloc(uint32_t pcoreid)
{
	printk("%s is not supported by this core allocation policy!\n",
	       __func__);
}

/* One-off function to sort the idle core list for debugging in the kernel
 * monitor. This code assumes that the scheduler that uses it holds a lock
 * for the duration of the call. */
void __sort_idle_cores(void)
{
	printk("%s is not supported by this core allocation policy!\n",
	       __func__);
}

/* Print the map of idle cores that are still allocatable through our core
 * allocation algorithm. */
void print_idle_core_map(void)
{
	printk("Idle cores (unlocked!):\n");
	for (int i = 0; i < num_cores; i++) {
		struct sched_pcore *spc_i = &all_pcores[i];

		if (spc_i->alloc_proc == NULL)
			printk("Core %d, prov to %d (%p)\n",
			       spc_i->core_info->core_id,
			       spc_i->prov_proc ? spc_i->prov_proc->pid : 0,
			       spc_i->prov_proc);
	}
}