| /* Copyright (c) 2015 The Regents of the University of California |
| * Valmon Leymarie <leymariv@berkeley.edu> |
| * Kevin Klues <klueska@cs.berkeley.edu> |
| * See LICENSE for details. |
| */ |
| |
| #include <arch/topology.h> |
| #include <sys/queue.h> |
| #include <env.h> |
| #include <corerequest.h> |
| #include <kmalloc.h> |
| |
| enum pnode_type { CORE, CPU, SOCKET, NUMA, MACHINE, NUM_NODE_TYPES }; |
| |
| static const char * const pnode_label[] = { |
| [CORE] = "CORE", |
| [CPU] = "CPU", |
| [SOCKET] = "SOCKET", |
| [NUMA] = "NUMA", |
| [MACHINE] = "MACHINE" |
| }; |
| |
| /* Internal representation of a node in the hierarchy of elements in the cpu |
| * topology of the machine (e.g. numa domain, socket, cpu, core). */ |
| struct sched_pnode { |
| int id; |
| enum pnode_type type; |
| struct sched_pnode *parent; |
| |
| /* Refcount is used to track how many cores have been allocated beneath |
| * the current node in the hierarchy. */ |
| int refcount; |
| |
| /* All nodes except cores have children. Cores have a sched_pcore. */ |
| union { |
| struct sched_pnode *children; |
| struct sched_pcore *sched_pcore; |
| }; |
| }; |
| |
| #define num_cpus (cpu_topology_info.num_cpus) |
| #define num_sockets (cpu_topology_info.num_sockets) |
| #define num_numa (cpu_topology_info.num_numa) |
| #define cores_per_numa (cpu_topology_info.cores_per_numa) |
| #define cores_per_socket (cpu_topology_info.cores_per_socket) |
| #define cores_per_cpu (cpu_topology_info.cores_per_cpu) |
| #define cpus_per_socket (cpu_topology_info.cpus_per_socket) |
| #define cpus_per_numa (cpu_topology_info.cpus_per_numa) |
| #define sockets_per_numa (cpu_topology_info.sockets_per_numa) |
| |
| #define child_node_type(t) ((t) - 1) |
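| |
| /* For example, given the enum ordering above, child_node_type(MACHINE) is |
| * NUMA, child_node_type(SOCKET) is CPU, and child_node_type(CPU) is CORE. */ |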
| |
| /* The pcores in the system. (array gets allocated in corealloc_init()). */ |
| struct sched_pcore *all_pcores; |
| |
| /* TAILQ of all unallocated, idle (CG) cores */ |
| struct sched_pcore_tailq idlecores = TAILQ_HEAD_INITIALIZER(idlecores); |
| |
| /* An array containing the number of nodes at each level. */ |
| static int num_nodes[NUM_NODE_TYPES]; |
| |
| /* A 2D array containing, for each pair of cores (i, j), the distance from |
| * core i to core j. */ |
| static int **core_distance; |
| |
| /* A 2D array containing, for each node type, the number of descendants it |
| * has of each lower node type. */ |
| static int num_descendants[NUM_NODE_TYPES][NUM_NODE_TYPES]; |
| |
| /* The flat array of all nodes, plus lookup tables to find specific nodes by |
| * type and id. */ |
| static int total_nodes; |
| static struct sched_pnode *all_nodes; |
| static struct sched_pnode *node_lookup[NUM_NODE_TYPES]; |
| |
| /* Walk up the hierarchy from the node passed in, incrementing the refcount |
| * of that node and all of its ancestors. */ |
| static void incref_nodes(struct sched_pnode *n) |
| { |
| while (n != NULL) { |
| n->refcount++; |
| n = n->parent; |
| } |
| } |
| |
| /* Walk up the hierarchy from the node passed in, decrementing the refcount |
| * of that node and all of its ancestors. */ |
| static void decref_nodes(struct sched_pnode *n) |
| { |
| while (n != NULL) { |
| n->refcount--; |
| n = n->parent; |
| } |
| } |
| |
| /* Create the nodes of a given type and initialize them. This code assumes |
| * that child nodes are initialized before their parent nodes. |
| * |
| * all_nodes is laid out such that all of the CORE nodes come first, then the |
| * CPUs, then the SOCKETs, then the NUMA domains, and finally a single |
| * MACHINE node. The nodes at any given layer are later broken up into chunks |
| * of nchildren and assigned to their parents (down below). */ |
| static void init_nodes(int type, int num, int nchildren) |
| { |
| /* Initialize the lookup table for this node type. */ |
| num_nodes[type] = num; |
| node_lookup[type] = all_nodes; |
| for (int i = CORE; i < type; i++) |
| node_lookup[type] += num_nodes[i]; |
| |
| /* Initialize all fields of each node of this type. */ |
| for (int i = 0; i < num; i++) { |
| struct sched_pnode *n = &node_lookup[type][i]; |
| |
| /* Initialize the common fields. */ |
| n->id = i; |
| n->type = type; |
| n->refcount = 0; |
| n->parent = NULL; |
| |
| /* If we are a core node, initialize the sched_pcore field. */ |
| if (n->type == CORE) { |
| n->sched_pcore = &all_pcores[n->id]; |
| n->sched_pcore->sched_pnode = n; |
| n->sched_pcore->core_info = |
| &cpu_topology_info.core_list[n->id]; |
| n->sched_pcore->alloc_proc = NULL; |
| n->sched_pcore->prov_proc = NULL; |
| } |
| /* Otherwise initialize the children field, updating the parent |
| * of all children to the current node. This assumes the |
| * children have already been initialized (i.e. init_nodes() |
| * must be called iteratively from the bottom-up). */ |
| if (n->type != CORE) { |
| n->children = |
| &node_lookup[child_node_type(type)][i * nchildren]; |
| for (int j = 0; j < nchildren; j++) |
| n->children[j].parent = n; |
| } |
| } |
| } |
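| |
| /* As a worked example (hypothetical machine, not a real topology): with 8 |
| * cores, 2 cores per cpu, 2 cpus per socket, 2 sockets per numa domain, and |
| * 1 numa domain, all_nodes holds 8 CORE nodes, then 4 CPU nodes, then 2 |
| * SOCKET nodes, then 1 NUMA node, then 1 MACHINE node (16 total). After |
| * init_nodes(CPU, 4, 2), CPU node 1's children point at |
| * node_lookup[CORE][2..3], i.e. cores 2 and 3. */ |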
| |
| /* Helper: Two cores have the same level id (and thus are at distance k, |
| * below) if their IDs divided by the number of cores per node at that level |
| * are the same, due to how we number our cores. */ |
| static int get_level_id(int coreid, int level) |
| { |
| return coreid / num_descendants[level][CORE]; |
| } |
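| |
| /* Continuing the hypothetical 8-core example above (2 cores per cpu, 4 per |
| * socket, 8 per numa domain): get_level_id(5, CPU) == 2 and |
| * get_level_id(6, CPU) == 3, so cores 5 and 6 are on different cpus, but |
| * get_level_id(5, SOCKET) == get_level_id(6, SOCKET) == 1, so they share a |
| * socket. */ |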
| |
| /* Allocate a table of distances from one core to another. Cores on the same |
| * cpu have a distance of 1; on the same socket, 2; in the same numa domain, |
| * 3; on the same machine, 4. */ |
| static void init_core_distances(void) |
| { |
| core_distance = kzmalloc(num_cores * sizeof(int*), MEM_WAIT); |
| for (int i = 0; i < num_cores; i++) |
| core_distance[i] = kzmalloc(num_cores * sizeof(int), MEM_WAIT); |
| for (int i = 0; i < num_cores; i++) { |
| for (int j = 0; j < num_cores; j++) { |
| for (int k = CPU; k <= MACHINE; k++) { |
| if (get_level_id(i, k) == get_level_id(j, k)) { |
| core_distance[i][j] = k; |
| break; |
| } |
| } |
| } |
| } |
| } |
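| |
| /* With the same hypothetical topology, core_distance[4][5] == 1 (same cpu), |
| * core_distance[4][6] == 2 (same socket, different cpus), and |
| * core_distance[0][6] == 3 (same numa domain, different sockets). A core's |
| * distance to itself comes out as 1, since the inner loop starts at CPU. */ |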
| |
| /* Initialize any data associated with doing core allocation. */ |
| void corealloc_init(void) |
| { |
| void *nodes_and_cores; |
| |
| /* Allocate a flat array of nodes. */ |
| total_nodes = num_cores + num_cpus + num_sockets + num_numa + 1; |
| nodes_and_cores = kmalloc(total_nodes * sizeof(struct sched_pnode) + |
| num_cores * sizeof(struct sched_pcore), |
| MEM_WAIT); |
| all_nodes = nodes_and_cores; |
| all_pcores = nodes_and_cores + total_nodes * sizeof(struct sched_pnode); |
| |
| /* Initialize the number of descendants from our cpu_topology info. */ |
| num_descendants[CPU][CORE] = cores_per_cpu; |
| num_descendants[SOCKET][CORE] = cores_per_socket; |
| num_descendants[SOCKET][CPU] = cpus_per_socket; |
| num_descendants[NUMA][CORE] = cores_per_numa; |
| num_descendants[NUMA][CPU] = cpus_per_numa; |
| num_descendants[NUMA][SOCKET] = sockets_per_numa; |
| num_descendants[MACHINE][CORE] = num_cores; |
| num_descendants[MACHINE][CPU] = num_cpus; |
| num_descendants[MACHINE][SOCKET] = num_sockets; |
| num_descendants[MACHINE][NUMA] = num_numa; |
| |
| /* Initialize the nodes at each level in our hierarchy. */ |
| init_nodes(CORE, num_cores, 0); |
| init_nodes(CPU, num_cpus, cores_per_cpu); |
| init_nodes(SOCKET, num_sockets, cpus_per_socket); |
| init_nodes(NUMA, num_numa, sockets_per_numa); |
| init_nodes(MACHINE, 1, num_numa); |
| |
| /* Initialize our table of core_distances */ |
| init_core_distances(); |
| |
| for (int i = 0; i < num_cores; i++) { |
| /* Remove all ll_cores from consideration for allocation. */ |
| if (is_ll_core(i)) { |
| incref_nodes(all_pcores[i].sched_pnode); |
| continue; |
| } |
| #ifdef CONFIG_DISABLE_SMT |
| /* Remove all odd cores from consideration for allocation. */ |
| if (i % 2 == 1) { |
| incref_nodes(all_pcores[i].sched_pnode); |
| continue; |
| } |
| #endif /* CONFIG_DISABLE_SMT */ |
| /* Fill the idlecores list. */ |
| TAILQ_INSERT_HEAD(&idlecores, &all_pcores[i], alloc_next); |
| } |
| } |
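| |
| /* For instance, on the hypothetical 8-core machine above, assuming core 0 is |
| * the only LL core and CONFIG_DISABLE_SMT is set, only cores 2, 4, and 6 end |
| * up on the idlecores list (inserted at the head, so the list reads 6, 4, |
| * 2). The excluded cores keep a permanent reference on themselves and their |
| * ancestor nodes, so the allocator below never hands them out. */ |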
| |
| /* Initialize any data associated with allocating cores to a process. */ |
| void corealloc_proc_init(struct proc *p) |
| { |
| TAILQ_INIT(&p->ksched_data.crd.alloc_me); |
| TAILQ_INIT(&p->ksched_data.crd.prov_alloc_me); |
| TAILQ_INIT(&p->ksched_data.crd.prov_not_alloc_me); |
| } |
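| |
| /* Each proc tracks its cores on three lists: alloc_me holds the cores |
| * currently allocated to it, prov_alloc_me holds cores that are both |
| * provisioned and allocated to it, and prov_not_alloc_me holds cores |
| * provisioned to it but currently idle or allocated to someone else. |
| * __track_core_alloc() and __track_core_dealloc() below move cores between |
| * these lists. */ |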
| |
| /* Returns the sum of the distances from one core to all cores in a list. |
| * |
| * This assumes cl is the allocated list, or possibly the idlecores, since it |
| * uses alloc_next. */ |
| static int cumulative_core_distance(struct sched_pcore *c, |
| struct sched_pcore_tailq cl) |
| { |
| int d = 0; |
| struct sched_pcore *temp = NULL; |
| |
| TAILQ_FOREACH(temp, &cl, alloc_next) { |
| d += core_distance[c->core_info->core_id] |
| [temp->core_info->core_id]; |
| } |
| return d; |
| } |
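| |
| /* As a worked example on the hypothetical 8-core topology described above: |
| * if a proc owns cores 4 and 6, candidate core 5 has a cumulative distance |
| * of 1 + 2 = 3 (same cpu as 4, same socket as 6), while candidate core 2 has |
| * 3 + 3 = 6 (different socket from both), so core 5 would be preferred by |
| * the callers below. */ |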
| |
| /* Returns the first core in the hierarchy under node n. */ |
| static struct sched_pcore *first_core_in_node(struct sched_pnode *n) |
| { |
| struct sched_pnode *first_child = n; |
| |
| while (first_child->type != CORE) |
| first_child = &first_child->children[0]; |
| return first_child->sched_pcore; |
| } |
| |
| /* Return the first provisioned core available. Otherwise, return NULL. */ |
| static struct sched_pcore *find_first_provisioned_core(struct proc *p) |
| { |
| return TAILQ_FIRST(&(p->ksched_data.crd.prov_not_alloc_me)); |
| } |
| |
| /* Returns the best first core to allocate for a proc which owns no cores |
| * yet: the core that is farthest from the cores allocated to other procs. */ |
| static struct sched_pcore *find_first_core(struct proc *p) |
| { |
| struct sched_pcore *c; |
| struct sched_pnode *n; |
| struct sched_pnode *nodes; |
| struct sched_pnode *bestn; |
| int best_refcount; |
| |
| /* Find the best, first provisioned core if there are any. Even if the |
| * whole machine is allocated, we still give out provisioned cores, |
| * because they will be revoked from their current owner if necessary. |
| */ |
| c = find_first_provisioned_core(p); |
| if (c != NULL) |
| return c; |
| |
| /* Otherwise, if the whole machine is already allocated, there are no |
| * cores left to allocate, and we are done. */ |
| bestn = &node_lookup[MACHINE][0]; |
| if (bestn->refcount == num_descendants[MACHINE][CORE]) |
| return NULL; |
| |
| /* Otherwise, we know at least one core is still available, so let's |
| * find the best one to allocate first. We start at NUMA, and loop |
| * through the topology to find it. */ |
| for (int i = NUMA; i >= CORE; i--) { |
| nodes = bestn->children; |
| best_refcount = total_nodes; |
| bestn = NULL; |
| |
| for (int j = 0; j < num_nodes[i]; j++) { |
| n = &nodes[j]; |
| if (n->refcount == 0) |
| return first_core_in_node(n); |
| if (n->refcount == num_descendants[i][CORE]) |
| continue; |
| if (n->refcount < best_refcount) { |
| best_refcount = n->refcount; |
| bestn = n; |
| } |
| } |
| } |
| return bestn->sched_pcore; |
| } |
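| |
| /* For example, if socket 0 already has cores allocated to other procs and |
| * socket 1 is completely untouched, the walk above reaches the SOCKET level, |
| * finds socket 1 with a refcount of 0, and immediately returns its first |
| * core via first_core_in_node(), placing the new proc as far from the |
| * existing allocations as the topology allows. */ |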
| |
| /* Return the core from the list of provisioned (but not allocated) cores |
| * that is closest to the cores we already own. If none are available, return |
| * NULL. */ |
| static struct sched_pcore *find_closest_provisioned_core(struct proc *p) |
| { |
| struct sched_pcore_tailq provisioned = |
| p->ksched_data.crd.prov_not_alloc_me; |
| struct sched_pcore_tailq allocated = p->ksched_data.crd.alloc_me; |
| struct sched_pcore *bestc = NULL; |
| struct sched_pcore *c = NULL; |
| int bestd = 0; |
| |
| TAILQ_FOREACH(c, &provisioned, prov_next) { |
| int currd = cumulative_core_distance(c, allocated); |
| |
| if ((bestd == 0) || (currd < bestd)) { |
| bestd = currd; |
| bestc = c; |
| } |
| } |
| return bestc; |
| } |
| |
| /* Return the core from the idlecores list that is closest to the cores we |
| * already own. If no idle cores are available, return NULL. */ |
| static struct sched_pcore *find_closest_idle_core(struct proc *p) |
| { |
| struct sched_pcore_tailq allocated = p->ksched_data.crd.alloc_me; |
| struct sched_pcore *bestc = NULL; |
| struct sched_pcore *c = NULL; |
| int bestd = 0; |
| |
| /* TODO: Add optimization to hand out core at equivalent distance if the |
| * best core found is provisioned to someone else. */ |
| TAILQ_FOREACH(c, &idlecores, alloc_next) { |
| int currd = cumulative_core_distance(c, allocated); |
| |
| if ((bestd == 0) || (currd < bestd)) { |
| bestd = currd; |
| bestc = c; |
| } |
| } |
| return bestc; |
| } |
| |
| /* First consider the cores provisioned to the proc but not yet allocated, by |
| * calling find_closest_provisioned_core(). If none are available, fall back |
| * on the idle cores. In either case, compute cumulative_core_distance() (the |
| * sum of the distances from a candidate core to all of the cores the proc |
| * already owns) for each candidate and allocate the one with the lowest |
| * value. This code assumes that the scheduler that uses it holds a lock for |
| * the duration of the call. */ |
| static struct sched_pcore *find_closest_core(struct proc *p) |
| { |
| struct sched_pcore *bestc; |
| |
| bestc = find_closest_provisioned_core(p); |
| if (bestc) |
| return bestc; |
| return find_closest_idle_core(p); |
| } |
| |
| /* Find the best core to allocate. If no cores are allocated yet, find one that |
| * is as far from the cores allocated to other processes as possible. |
| * Otherwise, find a core that is as close as possible to one of the other |
| * cores we already own. */ |
| uint32_t __find_best_core_to_alloc(struct proc *p) |
| { |
| struct sched_pcore *c = NULL; |
| |
| if (TAILQ_FIRST(&(p->ksched_data.crd.alloc_me)) == NULL) |
| c = find_first_core(p); |
| else |
| c = find_closest_core(p); |
| |
| if (c == NULL) |
| return -1; |
| return spc2pcoreid(c); |
| } |
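| |
| /* A hypothetical caller sketch (the lock and variable names below are |
| * illustrative, not the actual ksched code). A scheduler using this policy |
| * is expected to do roughly: |
| * |
| * spin_lock(&sched_lock); |
| * uint32_t pcoreid = __find_best_core_to_alloc(p); |
| * |
| * if (pcoreid != (uint32_t)-1) |
| * __track_core_alloc(p, pcoreid); |
| * spin_unlock(&sched_lock); |
| */ |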
| |
| /* Track the pcore properly when it is allocated to p. This code assumes that |
| * the scheduler that uses it holds a lock for the duration of the call. */ |
| void __track_core_alloc(struct proc *p, uint32_t pcoreid) |
| { |
| struct sched_pcore *spc; |
| |
| assert(pcoreid < num_cores); /* catch bugs */ |
| spc = pcoreid2spc(pcoreid); |
| assert(spc->alloc_proc != p); /* corruption or double-alloc */ |
| spc->alloc_proc = p; |
| /* if the pcore is prov to them and now allocated, move lists */ |
| if (spc->prov_proc == p) { |
| TAILQ_REMOVE(&p->ksched_data.crd.prov_not_alloc_me, spc, |
| prov_next); |
| TAILQ_INSERT_TAIL(&p->ksched_data.crd.prov_alloc_me, spc, |
| prov_next); |
| } |
| /* Actually allocate the core, removing it from the idle core list. */ |
| TAILQ_REMOVE(&idlecores, spc, alloc_next); |
| TAILQ_INSERT_TAIL(&p->ksched_data.crd.alloc_me, spc, alloc_next); |
| incref_nodes(spc->sched_pnode); |
| } |
| |
| /* Track the pcore properly when it is deallocated from p. This code assumes |
| * that the scheduler that uses it holds a lock for the duration of the call. |
| * */ |
| void __track_core_dealloc(struct proc *p, uint32_t pcoreid) |
| { |
| struct sched_pcore *spc; |
| |
| assert(pcoreid < num_cores); /* catch bugs */ |
| spc = pcoreid2spc(pcoreid); |
| spc->alloc_proc = NULL; |
| /* if the pcore is prov to them and now deallocated, move lists */ |
| if (spc->prov_proc == p) { |
| TAILQ_REMOVE(&p->ksched_data.crd.prov_alloc_me, spc, prov_next); |
| /* this is the victim list, which can be sorted so that we pick |
| * the right victim (sort by alloc_proc reverse priority, etc). |
| * In this case, the core isn't alloc'd by anyone, so it should |
| * be the first victim. */ |
| TAILQ_INSERT_HEAD(&p->ksched_data.crd.prov_not_alloc_me, spc, |
| prov_next); |
| } |
| /* Actually dealloc the core, putting it back on the idle core list. */ |
| TAILQ_REMOVE(&(p->ksched_data.crd.alloc_me), spc, alloc_next); |
| TAILQ_INSERT_HEAD(&idlecores, spc, alloc_next); |
| decref_nodes(spc->sched_pnode); |
| } |
| |
| /* Bulk interface for __track_core_dealloc */ |
| void __track_core_dealloc_bulk(struct proc *p, uint32_t *pc_arr, |
| uint32_t nr_cores) |
| { |
| for (int i = 0; i < nr_cores; i++) |
| __track_core_dealloc(p, pc_arr[i]); |
| } |
| |
| /* One-off function to make 'pcoreid' the next core chosen by the core |
| * allocation algorithm (so long as no provisioned cores are still idle). |
| * This code assumes that the scheduler that uses it holds a lock for the |
| * duration of the call. */ |
| void __next_core_to_alloc(uint32_t pcoreid) |
| { |
| printk("%s is not supported by this core allocation policy!\n", |
| __func__); |
| } |
| |
| /* One-off function to sort the idle core list for debugging in the kernel |
| * monitor. This code assumes that the scheduler that uses it holds a lock |
| * for the duration of the call. */ |
| void __sort_idle_cores(void) |
| { |
| printk("%s is not supported by this core allocation policy!\n", |
| __func__); |
| } |
| |
| /* Print the map of idle cores that are still allocatable through our core |
| * allocation algorithm. */ |
| void print_idle_core_map(void) |
| { |
| printk("Idle cores (unlocked!):\n"); |
| for (int i = 0; i < num_cores; i++) { |
| struct sched_pcore *spc_i = &all_pcores[i]; |
| |
| if (spc_i->alloc_proc == NULL) |
| printk("Core %d, prov to %d (%p)\n", |
| spc_i->core_info->core_id, |
| spc_i->prov_proc ? spc_i->prov_proc->pid : 0, |
| spc_i->prov_proc); |
| } |
| } |