/* Copyright (c) 2015 The Regents of the University of California
 * Valmon Leymarie <leymariv@berkeley.edu>
 * Kevin Klues <klueska@cs.berkeley.edu>
 * See LICENSE for details.
 */

#include <arch/topology.h>
#include <sys/queue.h>
#include <env.h>
#include <corerequest.h>
#include <kmalloc.h>

enum pnode_type { CORE, CPU, SOCKET, NUMA, MACHINE, NUM_NODE_TYPES };

static const char * const pnode_label[] = {
	[CORE] = "CORE",
	[CPU] = "CPU",
	[SOCKET] = "SOCKET",
	[NUMA] = "NUMA",
	[MACHINE] = "MACHINE"
};

/* Internal representation of a node in the hierarchy of elements in the cpu
 * topology of the machine (i.e. numa domain, socket, cpu, core, etc.). */
struct sched_pnode {
	int id;
	enum pnode_type type;
	struct sched_pnode *parent;

	/* Refcount is used to track how many cores have been allocated beneath
	 * the current node in the hierarchy. */
	int refcount;

	/* All nodes except cores have children. Cores have a sched_pcore. */
	union {
		struct sched_pnode *children;
		struct sched_pcore *sched_pcore;
	};
};
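
/* For illustration, a hypothetical machine with 2 numa domains, 2 sockets per
 * numa domain, 2 cpus per socket, and 2 cores per cpu (16 cores total) yields
 * this hierarchy (the same example is reused in comments below):
 *
 *                          MACHINE 0
 *                         /         \
 *                    NUMA 0          NUMA 1
 *                   /      \        /      \
 *             SOCKET 0  SOCKET 1  SOCKET 2  SOCKET 3
 *              /     \      ...       ...
 *           CPU 0   CPU 1
 *           /   \
 *       CORE 0  CORE 1
 */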

#define num_cpus            (cpu_topology_info.num_cpus)
#define num_sockets         (cpu_topology_info.num_sockets)
#define num_numa            (cpu_topology_info.num_numa)
#define cores_per_numa      (cpu_topology_info.cores_per_numa)
#define cores_per_socket    (cpu_topology_info.cores_per_socket)
#define cores_per_cpu       (cpu_topology_info.cores_per_cpu)
#define cpus_per_socket     (cpu_topology_info.cpus_per_socket)
#define cpus_per_numa       (cpu_topology_info.cpus_per_numa)
#define sockets_per_numa    (cpu_topology_info.sockets_per_numa)

#define child_node_type(t) ((t) - 1)
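
/* child_node_type() relies on the ordering of enum pnode_type above, e.g.
 * child_node_type(SOCKET) == CPU and child_node_type(CPU) == CORE. */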

/* The pcores in the system. (Array gets allocated in corealloc_init().) */
struct sched_pcore *all_pcores;

/* TAILQ of all unallocated, idle (CG) cores */
struct sched_pcore_tailq idlecores = TAILQ_HEAD_INITIALIZER(idlecores);

/* An array containing the number of nodes at each level. */
static int num_nodes[NUM_NODE_TYPES];

/* A 2D array containing the distance between any two cores i and j. */
static int **core_distance;

/* A 2D array containing the number of descendants of each type beneath a
 * node at a given level (e.g. num_descendants[SOCKET][CORE] is the number of
 * cores below each socket). */
static int num_descendants[NUM_NODE_TYPES][NUM_NODE_TYPES];

/* Lookup tables to find specific nodes by type and id. */
static int total_nodes;
static struct sched_pnode *all_nodes;
static struct sched_pnode *node_lookup[NUM_NODE_TYPES];

/* Walk up the hierarchy from the node passed in, incrementing its refcount
 * and the refcount of each of its ancestors. */
static void incref_nodes(struct sched_pnode *n)
{
	while (n != NULL) {
		n->refcount++;
		n = n->parent;
	}
}

/* Walk up the hierarchy from the node passed in, decrementing its refcount
 * and the refcount of each of its ancestors. */
static void decref_nodes(struct sched_pnode *n)
{
	while (n != NULL) {
		n->refcount--;
		n = n->parent;
	}
}
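
/* E.g., on the hypothetical 16-core machine above, allocating CORE 0 bumps
 * the refcounts of CORE 0, CPU 0, SOCKET 0, NUMA 0, and MACHINE 0, so a
 * node's refcount always equals the number of allocated cores beneath it. */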

/* Create the nodes of a given type and initialize them. This code assumes
 * that children are created before their parent nodes.
 *
 * all_nodes is laid out such that all of the nodes are: COREs, then the CPUs,
 * then the SOCKETs, then the NUMA domains, then one MACHINE node.  The nodes
 * at any given layer are later broken up into chunks of nchildren and
 * assigned to their parents (down below). */
static void init_nodes(int type, int num, int nchildren)
{
	/* Initialize the lookup table for this node type. */
	num_nodes[type] = num;
	node_lookup[type] = all_nodes;
	for (int i = CORE; i < type; i++)
		node_lookup[type] += num_nodes[i];

	/* Initialize all fields of each node of this type. */
	for (int i = 0; i < num; i++) {
		struct sched_pnode *n = &node_lookup[type][i];

		/* Initialize the common fields. */
		n->id = i;
		n->type = type;
		n->refcount = 0;
		n->parent = NULL;

		/* If we are a core node, initialize the sched_pcore field. */
		if (n->type == CORE) {
			n->sched_pcore = &all_pcores[n->id];
			n->sched_pcore->sched_pnode = n;
			n->sched_pcore->core_info =
				&cpu_topology_info.core_list[n->id];
			n->sched_pcore->alloc_proc = NULL;
			n->sched_pcore->prov_proc = NULL;
		} else {
			/* Otherwise, initialize the children field, updating
			 * the parent of all children to the current node.
			 * This assumes the children have already been
			 * initialized (i.e. init_nodes() must be called
			 * iteratively from the bottom-up). */
			n->children =
			    &node_lookup[child_node_type(type)][i * nchildren];
			for (int j = 0; j < nchildren; j++)
				n->children[j].parent = n;
		}
	}
}
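
/* On the hypothetical 16-core machine, the init_nodes() calls in
 * corealloc_init() below lay out all_nodes as:
 *
 *   [16 COREs][8 CPUs][4 SOCKETs][2 NUMAs][1 MACHINE]
 *
 * so node_lookup[CPU] points at all_nodes[16], node_lookup[SOCKET] at
 * all_nodes[24], node_lookup[NUMA] at all_nodes[28], and node_lookup[MACHINE]
 * at all_nodes[30]. */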

/* Helper: Two cores have the same id at a given level (and thus are at
 * distance 'level', below) if their core ids divided by the number of cores
 * per node at that level are the same, due to how we number our cores. */
static int get_level_id(int coreid, int level)
{
	return coreid / num_descendants[level][CORE];
}
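
/* E.g., on the hypothetical machine (cores_per_cpu == 2, cores_per_socket ==
 * 4), cores 6 and 7 share CPU-level id 3 (6/2 == 7/2 == 3), and cores 4-7
 * all share SOCKET-level id 1 (4/4 == ... == 7/4 == 1). */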

/* Allocate and fill a table of distances from each core to every other core.
 * Cores on the same cpu have a distance of 1; on the same socket, 2; in the
 * same numa domain, 3; in the same machine, 4. These values are just the
 * pnode_type of the lowest level of the hierarchy that the two cores share.
 */
static void init_core_distances(void)
{
	core_distance = kzmalloc(num_cores * sizeof(int*), MEM_WAIT);
	for (int i = 0; i < num_cores; i++)
		core_distance[i] = kzmalloc(num_cores * sizeof(int), MEM_WAIT);
	for (int i = 0; i < num_cores; i++) {
		for (int j = 0; j < num_cores; j++) {
			for (int k = CPU; k <= MACHINE; k++) {
				if (get_level_id(i, k) == get_level_id(j, k)) {
					core_distance[i][j] = k;
					break;
				}
			}
		}
	}
}
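
/* On the hypothetical machine, core_distance[0][1] == CPU (1), since cores 0
 * and 1 share a cpu; core_distance[0][2] == SOCKET (2); core_distance[0][4]
 * == NUMA (3); and core_distance[0][8] == MACHINE (4).  Note the distance
 * from a core to itself is CPU (1), not 0, which is harmless when comparing
 * cumulative distances. */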

/* Initialize any data associated with doing core allocation. */
void corealloc_init(void)
{
	void *nodes_and_cores;

	/* Allocate a flat array of nodes, with the pcores tacked on the end.
	 */
	total_nodes = num_cores + num_cpus + num_sockets + num_numa + 1;
	nodes_and_cores = kmalloc(total_nodes * sizeof(struct sched_pnode) +
	                          num_cores * sizeof(struct sched_pcore),
	                          MEM_WAIT);
	all_nodes = nodes_and_cores;
	all_pcores = nodes_and_cores + total_nodes * sizeof(struct sched_pnode);

	/* Initialize the number of descendants from our cpu_topology info. */
	num_descendants[CPU][CORE] = cores_per_cpu;
	num_descendants[SOCKET][CORE] = cores_per_socket;
	num_descendants[SOCKET][CPU] = cpus_per_socket;
	num_descendants[NUMA][CORE] = cores_per_numa;
	num_descendants[NUMA][CPU] = cpus_per_numa;
	num_descendants[NUMA][SOCKET] = sockets_per_numa;
	num_descendants[MACHINE][CORE] = num_cores;
	num_descendants[MACHINE][CPU] = num_cpus;
	num_descendants[MACHINE][SOCKET] = num_sockets;
	num_descendants[MACHINE][NUMA] = num_numa;

	/* Initialize the nodes at each level in our hierarchy, bottom-up. */
	init_nodes(CORE, num_cores, 0);
	init_nodes(CPU, num_cpus, cores_per_cpu);
	init_nodes(SOCKET, num_sockets, cpus_per_socket);
	init_nodes(NUMA, num_numa, sockets_per_numa);
	init_nodes(MACHINE, 1, num_numa);

	/* Initialize our table of core_distances. */
	init_core_distances();

	for (int i = 0; i < num_cores; i++) {
		/* Remove all ll_cores from consideration for allocation. */
		if (is_ll_core(i)) {
			incref_nodes(all_pcores[i].sched_pnode);
			continue;
		}
#ifdef CONFIG_DISABLE_SMT
		/* Remove all odd cores from consideration for allocation. */
		if (i % 2 == 1) {
			incref_nodes(all_pcores[i].sched_pnode);
			continue;
		}
#endif /* CONFIG_DISABLE_SMT */
		/* Fill the idlecores list. */
		TAILQ_INSERT_HEAD(&idlecores, &all_pcores[i], alloc_next);
	}
}

/* Initialize any data associated with allocating cores to a process. Each
 * proc tracks the cores allocated to it (alloc_me), as well as the cores
 * provisioned to it that it is (prov_alloc_me) or is not (prov_not_alloc_me)
 * currently allocated. */
void corealloc_proc_init(struct proc *p)
{
	TAILQ_INIT(&p->ksched_data.crd.alloc_me);
	TAILQ_INIT(&p->ksched_data.crd.prov_alloc_me);
	TAILQ_INIT(&p->ksched_data.crd.prov_not_alloc_me);
}

/* Returns the sum of the distances from one core to all cores in a list.
 *
 * This assumes cl is the allocated list, or possibly the idlecores, since it
 * uses alloc_next. Note the tailq head is passed by value, which is fine
 * here since we only read from the copy. */
static int cumulative_core_distance(struct sched_pcore *c,
                                    struct sched_pcore_tailq cl)
{
	int d = 0;
	struct sched_pcore *temp = NULL;

	TAILQ_FOREACH(temp, &cl, alloc_next) {
		d += core_distance[c->core_info->core_id]
			          [temp->core_info->core_id];
	}
	return d;
}
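
/* E.g., if a proc owns cores 0 and 2 on the hypothetical machine, then for
 * candidate core 1 this returns core_distance[1][0] + core_distance[1][2] ==
 * 1 + 2 == 3. */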

/* Returns the first core in the hierarchy under node n. */
static struct sched_pcore *first_core_in_node(struct sched_pnode *n)
{
	struct sched_pnode *first_child = n;

	while (first_child->type != CORE)
		first_child = &first_child->children[0];
	return first_child->sched_pcore;
}

/* Return the first core provisioned to p but not yet allocated to it.
 * Otherwise, return NULL. */
static struct sched_pcore *find_first_provisioned_core(struct proc *p)
{
	return TAILQ_FIRST(&(p->ksched_data.crd.prov_not_alloc_me));
}

/* Returns the best first core to allocate for a proc which owns no cores:
 * the core that is farthest from the cores allocated to other procs. */
static struct sched_pcore *find_first_core(struct proc *p)
{
	struct sched_pcore *c;
	struct sched_pnode *n;
	struct sched_pnode *nodes;
	struct sched_pnode *bestn;
	int best_refcount;
	int nchildren;

	/* Find the best, first provisioned core if there are any. Even if the
	 * whole machine is allocated, we still give out provisioned cores,
	 * because they will be revoked from their current owner if necessary.
	 */
	c = find_first_provisioned_core(p);
	if (c != NULL)
		return c;

	/* Otherwise, if the whole machine is already allocated, there are no
	 * cores left to allocate, and we are done. */
	bestn = &node_lookup[MACHINE][0];
	if (bestn->refcount == num_descendants[MACHINE][CORE])
		return NULL;

	/* Otherwise, we know at least one core is still available, so let's
	 * find the best one to allocate first. We start at NUMA and walk down
	 * the topology, at each level descending into the least-allocated
	 * child of the best node found so far. */
	for (int i = NUMA; i >= CORE; i--) {
		/* Only scan the children of the node chosen at the previous
		 * level; bestn has type i + 1 here, so it has
		 * num_descendants[i + 1][i] children. */
		nchildren = num_descendants[i + 1][i];
		nodes = bestn->children;
		best_refcount = total_nodes;
		bestn = NULL;

		for (int j = 0; j < nchildren; j++) {
			n = &nodes[j];
			if (n->refcount == 0)
				return first_core_in_node(n);
			if (n->refcount == num_descendants[i][CORE])
				continue;
			if (n->refcount < best_refcount) {
				best_refcount = n->refcount;
				bestn = n;
			}
		}
	}
	return bestn->sched_pcore;
}
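
/* E.g., on the hypothetical machine with NUMA 0 fully allocated and a single
 * core taken on NUMA 1, the search above picks NUMA 1 (the only domain not
 * full) and then immediately returns the first core of its untouched socket,
 * keeping the new allocation as far from other procs' cores as possible. */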

/* Return the core, among those provisioned to p but not yet allocated to it,
 * that is closest to the cores p already owns. If there are none, return
 * NULL. */
static struct sched_pcore *find_closest_provisioned_core(struct proc *p)
{
	struct sched_pcore_tailq provisioned =
		p->ksched_data.crd.prov_not_alloc_me;
	struct sched_pcore_tailq allocated = p->ksched_data.crd.alloc_me;
	struct sched_pcore *bestc = NULL;
	struct sched_pcore *c = NULL;
	int bestd = 0;

	TAILQ_FOREACH(c, &provisioned, prov_next) {
		int currd = cumulative_core_distance(c, allocated);

		if ((bestd == 0) || (currd < bestd)) {
			bestd = currd;
			bestc = c;
		}
	}
	return bestc;
}

/* Return the idle core that is closest to the cores p already owns. If there
 * are none, return NULL. */
static struct sched_pcore *find_closest_idle_core(struct proc *p)
{
	struct sched_pcore_tailq allocated = p->ksched_data.crd.alloc_me;
	struct sched_pcore *bestc = NULL;
	struct sched_pcore *c = NULL;
	int bestd = 0;

	/* TODO: Add optimization to hand out core at equivalent distance if
	 * the best core found is provisioned to someone else. */
	TAILQ_FOREACH(c, &idlecores, alloc_next) {
		int currd = cumulative_core_distance(c, allocated);

		if ((bestd == 0) || (currd < bestd)) {
			bestd = currd;
			bestc = c;
		}
	}
	return bestc;
}

/* Consider first the cores provisioned to the proc, via
 * find_closest_provisioned_core(). Otherwise, consider the idle cores.
 * Either way, compute for every candidate core its
 * cumulative_core_distance() (the sum of the distances from that core to all
 * of the cores the proc owns), and pick the core with the lowest cumulative
 * distance. This code assumes that the scheduler that uses it holds a lock
 * for the duration of the call. */
static struct sched_pcore *find_closest_core(struct proc *p)
{
	struct sched_pcore *bestc;

	bestc = find_closest_provisioned_core(p);
	if (bestc)
		return bestc;
	return find_closest_idle_core(p);
}

/* Find the best core to allocate to proc p. If no cores are allocated to p
 * yet, find one that is as far from the cores allocated to other processes
 * as possible. Otherwise, find a core that is as close as possible to one of
 * the cores we already own. Returns the pcoreid, or -1 if no core is
 * available. */
uint32_t __find_best_core_to_alloc(struct proc *p)
{
	struct sched_pcore *c = NULL;

	if (TAILQ_FIRST(&(p->ksched_data.crd.alloc_me)) == NULL)
		c = find_first_core(p);
	else
		c = find_closest_core(p);

	if (c == NULL)
		return -1;
	return spc2pcoreid(c);
}
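
/* Minimal usage sketch from a hypothetical ksched (assumes the ksched's lock
 * is held, per the comment above; 'p' and the grant step are placeholders):
 *
 *	uint32_t pcoreid = __find_best_core_to_alloc(p);
 *
 *	if (pcoreid != (uint32_t)-1) {
 *		__track_core_alloc(p, pcoreid);
 *		// ...grant pcoreid to p...
 *	}
 */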

/* Track the pcore properly when it is allocated to p. This code assumes that
 * the scheduler that uses it holds a lock for the duration of the call. */
void __track_core_alloc(struct proc *p, uint32_t pcoreid)
{
	struct sched_pcore *spc;

	assert(pcoreid < num_cores);	/* catch bugs */
	spc = pcoreid2spc(pcoreid);
	assert(spc->alloc_proc != p);	/* corruption or double-alloc */
	spc->alloc_proc = p;
	/* if the pcore is prov to them and now allocated, move lists */
	if (spc->prov_proc == p) {
		TAILQ_REMOVE(&p->ksched_data.crd.prov_not_alloc_me, spc,
			     prov_next);
		TAILQ_INSERT_TAIL(&p->ksched_data.crd.prov_alloc_me, spc,
				  prov_next);
	}
	/* Actually allocate the core, removing it from the idle core list. */
	TAILQ_REMOVE(&idlecores, spc, alloc_next);
	TAILQ_INSERT_TAIL(&p->ksched_data.crd.alloc_me, spc, alloc_next);
	incref_nodes(spc->sched_pnode);
}

/* Track the pcore properly when it is deallocated from p. This code assumes
 * that the scheduler that uses it holds a lock for the duration of the
 * call. */
void __track_core_dealloc(struct proc *p, uint32_t pcoreid)
{
	struct sched_pcore *spc;

	assert(pcoreid < num_cores);	/* catch bugs */
	spc = pcoreid2spc(pcoreid);
	spc->alloc_proc = NULL;
	/* if the pcore is prov to them and now deallocated, move lists */
	if (spc->prov_proc == p) {
		TAILQ_REMOVE(&p->ksched_data.crd.prov_alloc_me, spc, prov_next);
		/* this is the victim list, which can be sorted so that we pick
		 * the right victim (sort by alloc_proc reverse priority, etc).
		 * In this case, the core isn't alloc'd by anyone, so it should
		 * be the first victim. */
		TAILQ_INSERT_HEAD(&p->ksched_data.crd.prov_not_alloc_me, spc,
		                  prov_next);
	}
	/* Actually dealloc the core, putting it back on the idle core list. */
	TAILQ_REMOVE(&(p->ksched_data.crd.alloc_me), spc, alloc_next);
	TAILQ_INSERT_HEAD(&idlecores, spc, alloc_next);
	decref_nodes(spc->sched_pnode);
}

/* Bulk interface for __track_core_dealloc */
void __track_core_dealloc_bulk(struct proc *p, uint32_t *pc_arr,
                               uint32_t nr_cores)
{
	for (int i = 0; i < nr_cores; i++)
		__track_core_dealloc(p, pc_arr[i]);
}

/* One-off function to make 'pcoreid' the next core chosen by the core
 * allocation algorithm (so long as no provisioned cores are still idle).
 * This code assumes that the scheduler that uses it holds a lock for the
 * duration of the call. */
void __next_core_to_alloc(uint32_t pcoreid)
{
	printk("%s is not supported by this core allocation policy!\n",
	       __func__);
}

/* One-off function to sort the idle core list for debugging in the kernel
 * monitor. This code assumes that the scheduler that uses it holds a lock
 * for the duration of the call. */
void __sort_idle_cores(void)
{
	printk("%s is not supported by this core allocation policy!\n",
	       __func__);
}

/* Print the map of idle cores that are still allocatable through our core
 * allocation algorithm. */
void print_idle_core_map(void)
{
	printk("Idle cores (unlocked!):\n");
	for (int i = 0; i < num_cores; i++) {
		struct sched_pcore *spc_i = &all_pcores[i];

		if (spc_i->alloc_proc == NULL)
			printk("Core %d, prov to %d (%p)\n",
			       spc_i->core_info->core_id,
			       spc_i->prov_proc ? spc_i->prov_proc->pid : 0,
			       spc_i->prov_proc);
	}
}