Blame - drivers/infiniband/hw/hfi1/affinity.c - hafnium/third_party/linux.git

blob: bedd5fba33b080a075405c9a01fa8006f5b0a3f2 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	/*
				2	* Copyright(c) 2015 - 2018 Intel Corporation.
				3	*
				4	* This file is provided under a dual BSD/GPLv2 license. When using or
				5	* redistributing this file, you may do so under either license.
				6	*
				7	* GPL LICENSE SUMMARY
				8	*
				9	* This program is free software; you can redistribute it and/or modify
				10	* it under the terms of version 2 of the GNU General Public License as
				11	* published by the Free Software Foundation.
				12	*
				13	* This program is distributed in the hope that it will be useful, but
				14	* WITHOUT ANY WARRANTY; without even the implied warranty of
				15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				16	* General Public License for more details.
				17	*
				18	* BSD LICENSE
				19	*
				20	* Redistribution and use in source and binary forms, with or without
				21	* modification, are permitted provided that the following conditions
				22	* are met:
				23	*
				24	* - Redistributions of source code must retain the above copyright
				25	* notice, this list of conditions and the following disclaimer.
				26	* - Redistributions in binary form must reproduce the above copyright
				27	* notice, this list of conditions and the following disclaimer in
				28	* the documentation and/or other materials provided with the
				29	* distribution.
				30	* - Neither the name of Intel Corporation nor the names of its
				31	* contributors may be used to endorse or promote products derived
				32	* from this software without specific prior written permission.
				33	*
				34	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				35	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				36	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
				37	* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
				38	* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
				39	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
				40	* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
				41	* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
				42	* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				43	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
				44	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				45	*
				46	*/
				47	#include <linux/topology.h>
				48	#include <linux/cpumask.h>
				49	#include <linux/module.h>
				50	#include <linux/interrupt.h>
				51
				52	#include "hfi.h"
				53	#include "affinity.h"
				54	#include "sdma.h"
				55	#include "trace.h"
				56
				57	struct hfi1_affinity_node_list node_affinity = {
				58	.list = LIST_HEAD_INIT(node_affinity.list),
				59	.lock = __MUTEX_INITIALIZER(node_affinity.lock)
				60	};
				61
				62	/* Name of IRQ types, indexed by enum irq_type */
				63	static const char * const irq_type_names[] = {
				64	"SDMA",
				65	"RCVCTXT",
				66	"GENERAL",
				67	"OTHER",
				68	};
				69
				70	/* Per NUMA node count of HFI devices */
				71	static unsigned int *hfi1_per_node_cntr;
				72
				73	static inline void init_cpu_mask_set(struct cpu_mask_set *set)
				74	{
				75	cpumask_clear(&set->mask);
				76	cpumask_clear(&set->used);
				77	set->gen = 0;
				78	}
				79
				80	/* Increment generation of CPU set if needed */
				81	static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set)
				82	{
				83	if (cpumask_equal(&set->mask, &set->used)) {
				84	/*
				85	* We've used up all the CPUs, bump up the generation
				86	* and reset the 'used' map
				87	*/
				88	set->gen++;
				89	cpumask_clear(&set->used);
				90	}
				91	}
				92
				93	static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set)
				94	{
				95	if (cpumask_empty(&set->used) && set->gen) {
				96	set->gen--;
				97	cpumask_copy(&set->used, &set->mask);
				98	}
				99	}
				100
				101	/* Get the first CPU from the list of unused CPUs in a CPU set data structure */
				102	static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff)
				103	{
				104	int cpu;
				105
				106	if (!diff \|\| !set)
				107	return -EINVAL;
				108
				109	_cpu_mask_set_gen_inc(set);
				110
				111	/* Find out CPUs left in CPU mask */
				112	cpumask_andnot(diff, &set->mask, &set->used);
				113
				114	cpu = cpumask_first(diff);
				115	if (cpu >= nr_cpu_ids) /* empty */
				116	cpu = -EINVAL;
				117	else
				118	cpumask_set_cpu(cpu, &set->used);
				119
				120	return cpu;
				121	}
				122
				123	static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu)
				124	{
				125	if (!set)
				126	return;
				127
				128	cpumask_clear_cpu(cpu, &set->used);
				129	_cpu_mask_set_gen_dec(set);
				130	}
				131
				132	/* Initialize non-HT cpu cores mask */
				133	void init_real_cpu_mask(void)
				134	{
				135	int possible, curr_cpu, i, ht;
				136
				137	cpumask_clear(&node_affinity.real_cpu_mask);
				138
				139	/* Start with cpu online mask as the real cpu mask */
				140	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
				141
				142	/*
				143	* Remove HT cores from the real cpu mask. Do this in two steps below.
				144	*/
				145	possible = cpumask_weight(&node_affinity.real_cpu_mask);
				146	ht = cpumask_weight(topology_sibling_cpumask(
				147	cpumask_first(&node_affinity.real_cpu_mask)));
				148	/*
				149	* Step 1. Skip over the first N HT siblings and use them as the
				150	* "real" cores. Assumes that HT cores are not enumerated in
				151	* succession (except in the single core case).
				152	*/
				153	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
				154	for (i = 0; i < possible / ht; i++)
				155	curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
				156	/*
				157	* Step 2. Remove the remaining HT siblings. Use cpumask_next() to
				158	* skip any gaps.
				159	*/
				160	for (; i < possible; i++) {
				161	cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
				162	curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
				163	}
				164	}
				165
				166	int node_affinity_init(void)
				167	{
				168	int node;
				169	struct pci_dev *dev = NULL;
				170	const struct pci_device_id *ids = hfi1_pci_tbl;
				171
				172	cpumask_clear(&node_affinity.proc.used);
				173	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
				174
				175	node_affinity.proc.gen = 0;
				176	node_affinity.num_core_siblings =
				177	cpumask_weight(topology_sibling_cpumask(
				178	cpumask_first(&node_affinity.proc.mask)
				179	));
				180	node_affinity.num_possible_nodes = num_possible_nodes();
				181	node_affinity.num_online_nodes = num_online_nodes();
				182	node_affinity.num_online_cpus = num_online_cpus();
				183
				184	/*
				185	* The real cpu mask is part of the affinity struct but it has to be
				186	* initialized early. It is needed to calculate the number of user
				187	* contexts in set_up_context_variables().
				188	*/
				189	init_real_cpu_mask();
				190
				191	hfi1_per_node_cntr = kcalloc(node_affinity.num_possible_nodes,
				192	sizeof(*hfi1_per_node_cntr), GFP_KERNEL);
				193	if (!hfi1_per_node_cntr)
				194	return -ENOMEM;
				195
				196	while (ids->vendor) {
				197	dev = NULL;
				198	while ((dev = pci_get_device(ids->vendor, ids->device, dev))) {
				199	node = pcibus_to_node(dev->bus);
				200	if (node < 0)
				201	goto out;
				202
				203	hfi1_per_node_cntr[node]++;
				204	}
				205	ids++;
				206	}
				207
				208	return 0;
				209
				210	out:
				211	/*
				212	* Invalid PCI NUMA node information found, note it, and populate
				213	* our database 1:1.
				214	*/
				215	pr_err("HFI: Invalid PCI NUMA node. Performance may be affected\n");
				216	pr_err("HFI: System BIOS may need to be upgraded\n");
				217	for (node = 0; node < node_affinity.num_possible_nodes; node++)
				218	hfi1_per_node_cntr[node] = 1;
				219
				220	return 0;
				221	}
				222
				223	static void node_affinity_destroy(struct hfi1_affinity_node *entry)
				224	{
				225	free_percpu(entry->comp_vect_affinity);
				226	kfree(entry);
				227	}
				228
				229	void node_affinity_destroy_all(void)
				230	{
				231	struct list_head pos, q;
				232	struct hfi1_affinity_node *entry;
				233
				234	mutex_lock(&node_affinity.lock);
				235	list_for_each_safe(pos, q, &node_affinity.list) {
				236	entry = list_entry(pos, struct hfi1_affinity_node,
				237	list);
				238	list_del(pos);
				239	node_affinity_destroy(entry);
				240	}
				241	mutex_unlock(&node_affinity.lock);
				242	kfree(hfi1_per_node_cntr);
				243	}
				244
				245	static struct hfi1_affinity_node *node_affinity_allocate(int node)
				246	{
				247	struct hfi1_affinity_node *entry;
				248
				249	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
				250	if (!entry)
				251	return NULL;
				252	entry->node = node;
				253	entry->comp_vect_affinity = alloc_percpu(u16);
				254	INIT_LIST_HEAD(&entry->list);
				255
				256	return entry;
				257	}
				258
				259	/*
				260	* It appends an entry to the list.
				261	* It must be called with node_affinity.lock held.
				262	*/
				263	static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
				264	{
				265	list_add_tail(&entry->list, &node_affinity.list);
				266	}
				267
				268	/* It must be called with node_affinity.lock held */
				269	static struct hfi1_affinity_node *node_affinity_lookup(int node)
				270	{
				271	struct list_head *pos;
				272	struct hfi1_affinity_node *entry;
				273
				274	list_for_each(pos, &node_affinity.list) {
				275	entry = list_entry(pos, struct hfi1_affinity_node, list);
				276	if (entry->node == node)
				277	return entry;
				278	}
				279
				280	return NULL;
				281	}
				282
				283	static int per_cpu_affinity_get(cpumask_var_t possible_cpumask,
				284	u16 __percpu *comp_vect_affinity)
				285	{
				286	int curr_cpu;
				287	u16 cntr;
				288	u16 prev_cntr;
				289	int ret_cpu;
				290
				291	if (!possible_cpumask) {
				292	ret_cpu = -EINVAL;
				293	goto fail;
				294	}
				295
				296	if (!comp_vect_affinity) {
				297	ret_cpu = -EINVAL;
				298	goto fail;
				299	}
				300
				301	ret_cpu = cpumask_first(possible_cpumask);
				302	if (ret_cpu >= nr_cpu_ids) {
				303	ret_cpu = -EINVAL;
				304	goto fail;
				305	}
				306
				307	prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu);
				308	for_each_cpu(curr_cpu, possible_cpumask) {
				309	cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);
				310
				311	if (cntr < prev_cntr) {
				312	ret_cpu = curr_cpu;
				313	prev_cntr = cntr;
				314	}
				315	}
				316
				317	*per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1;
				318
				319	fail:
				320	return ret_cpu;
				321	}
				322
				323	static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask,
				324	u16 __percpu *comp_vect_affinity)
				325	{
				326	int curr_cpu;
				327	int max_cpu;
				328	u16 cntr;
				329	u16 prev_cntr;
				330
				331	if (!possible_cpumask)
				332	return -EINVAL;
				333
				334	if (!comp_vect_affinity)
				335	return -EINVAL;
				336
				337	max_cpu = cpumask_first(possible_cpumask);
				338	if (max_cpu >= nr_cpu_ids)
				339	return -EINVAL;
				340
				341	prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu);
				342	for_each_cpu(curr_cpu, possible_cpumask) {
				343	cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu);
				344
				345	if (cntr > prev_cntr) {
				346	max_cpu = curr_cpu;
				347	prev_cntr = cntr;
				348	}
				349	}
				350
				351	*per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1;
				352
				353	return max_cpu;
				354	}
				355
				356	/*
				357	* Non-interrupt CPUs are used first, then interrupt CPUs.
				358	* Two already allocated cpu masks must be passed.
				359	*/
				360	static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd,
				361	struct hfi1_affinity_node *entry,
				362	cpumask_var_t non_intr_cpus,
				363	cpumask_var_t available_cpus)
				364	__must_hold(&node_affinity.lock)
				365	{
				366	int cpu;
				367	struct cpu_mask_set *set = dd->comp_vect;
				368
				369	lockdep_assert_held(&node_affinity.lock);
				370	if (!non_intr_cpus) {
				371	cpu = -1;
				372	goto fail;
				373	}
				374
				375	if (!available_cpus) {
				376	cpu = -1;
				377	goto fail;
				378	}
				379
				380	/* Available CPUs for pinning completion vectors */
				381	_cpu_mask_set_gen_inc(set);
				382	cpumask_andnot(available_cpus, &set->mask, &set->used);
				383
				384	/* Available CPUs without SDMA engine interrupts */
				385	cpumask_andnot(non_intr_cpus, available_cpus,
				386	&entry->def_intr.used);
				387
				388	/* If there are non-interrupt CPUs available, use them first */
				389	if (!cpumask_empty(non_intr_cpus))
				390	cpu = cpumask_first(non_intr_cpus);
				391	else /* Otherwise, use interrupt CPUs */
				392	cpu = cpumask_first(available_cpus);
				393
				394	if (cpu >= nr_cpu_ids) { /* empty */
				395	cpu = -1;
				396	goto fail;
				397	}
				398	cpumask_set_cpu(cpu, &set->used);
				399
				400	fail:
				401	return cpu;
				402	}
				403
				404	static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu)
				405	{
				406	struct cpu_mask_set *set = dd->comp_vect;
				407
				408	if (cpu < 0)
				409	return;
				410
				411	cpu_mask_set_put(set, cpu);
				412	}
				413
				414	/* _dev_comp_vect_mappings_destroy() is reentrant */
				415	static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd)
				416	{
				417	int i, cpu;
				418
				419	if (!dd->comp_vect_mappings)
				420	return;
				421
				422	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
				423	cpu = dd->comp_vect_mappings[i];
				424	_dev_comp_vect_cpu_put(dd, cpu);
				425	dd->comp_vect_mappings[i] = -1;
				426	hfi1_cdbg(AFFINITY,
				427	"[%s] Release CPU %d from completion vector %d",
				428	rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i);
				429	}
				430
				431	kfree(dd->comp_vect_mappings);
				432	dd->comp_vect_mappings = NULL;
				433	}
				434
				435	/*
				436	* This function creates the table for looking up CPUs for completion vectors.
				437	* num_comp_vectors needs to have been initilized before calling this function.
				438	*/
				439	static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd,
				440	struct hfi1_affinity_node *entry)
				441	__must_hold(&node_affinity.lock)
				442	{
				443	int i, cpu, ret;
				444	cpumask_var_t non_intr_cpus;
				445	cpumask_var_t available_cpus;
				446
				447	lockdep_assert_held(&node_affinity.lock);
				448
				449	if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL))
				450	return -ENOMEM;
				451
				452	if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) {
				453	free_cpumask_var(non_intr_cpus);
				454	return -ENOMEM;
				455	}
				456
				457	dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus,
				458	sizeof(*dd->comp_vect_mappings),
				459	GFP_KERNEL);
				460	if (!dd->comp_vect_mappings) {
				461	ret = -ENOMEM;
				462	goto fail;
				463	}
				464	for (i = 0; i < dd->comp_vect_possible_cpus; i++)
				465	dd->comp_vect_mappings[i] = -1;
				466
				467	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
				468	cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus,
				469	available_cpus);
				470	if (cpu < 0) {
				471	ret = -EINVAL;
				472	goto fail;
				473	}
				474
				475	dd->comp_vect_mappings[i] = cpu;
				476	hfi1_cdbg(AFFINITY,
				477	"[%s] Completion Vector %d -> CPU %d",
				478	rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu);
				479	}
				480
				481	return 0;
				482
				483	fail:
				484	free_cpumask_var(available_cpus);
				485	free_cpumask_var(non_intr_cpus);
				486	_dev_comp_vect_mappings_destroy(dd);
				487
				488	return ret;
				489	}
				490
				491	int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd)
				492	{
				493	int ret;
				494	struct hfi1_affinity_node *entry;
				495
				496	mutex_lock(&node_affinity.lock);
				497	entry = node_affinity_lookup(dd->node);
				498	if (!entry) {
				499	ret = -EINVAL;
				500	goto unlock;
				501	}
				502	ret = _dev_comp_vect_mappings_create(dd, entry);
				503	unlock:
				504	mutex_unlock(&node_affinity.lock);
				505
				506	return ret;
				507	}
				508
				509	void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd)
				510	{
				511	_dev_comp_vect_mappings_destroy(dd);
				512	}
				513
				514	int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect)
				515	{
				516	struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi);
				517	struct hfi1_devdata *dd = dd_from_dev(verbs_dev);
				518
				519	if (!dd->comp_vect_mappings)
				520	return -EINVAL;
				521	if (comp_vect >= dd->comp_vect_possible_cpus)
				522	return -EINVAL;
				523
				524	return dd->comp_vect_mappings[comp_vect];
				525	}
				526
				527	/*
				528	* It assumes dd->comp_vect_possible_cpus is available.
				529	*/
				530	static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd,
				531	struct hfi1_affinity_node *entry,
				532	bool first_dev_init)
				533	__must_hold(&node_affinity.lock)
				534	{
				535	int i, j, curr_cpu;
				536	int possible_cpus_comp_vect = 0;
				537	struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask;
				538
				539	lockdep_assert_held(&node_affinity.lock);
				540	/*
				541	* If there's only one CPU available for completion vectors, then
				542	* there will only be one completion vector available. Othewise,
				543	* the number of completion vector available will be the number of
				544	* available CPUs divide it by the number of devices in the
				545	* local NUMA node.
				546	*/
				547	if (cpumask_weight(&entry->comp_vect_mask) == 1) {
				548	possible_cpus_comp_vect = 1;
				549	dd_dev_warn(dd,
				550	"Number of kernel receive queues is too large for completion vector affinity to be effective\n");
				551	} else {
				552	possible_cpus_comp_vect +=
				553	cpumask_weight(&entry->comp_vect_mask) /
				554	hfi1_per_node_cntr[dd->node];
				555
				556	/*
				557	* If the completion vector CPUs available doesn't divide
				558	* evenly among devices, then the first device device to be
				559	* initialized gets an extra CPU.
				560	*/
				561	if (first_dev_init &&
				562	cpumask_weight(&entry->comp_vect_mask) %
				563	hfi1_per_node_cntr[dd->node] != 0)
				564	possible_cpus_comp_vect++;
				565	}
				566
				567	dd->comp_vect_possible_cpus = possible_cpus_comp_vect;
				568
				569	/* Reserving CPUs for device completion vector */
				570	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
				571	curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask,
				572	entry->comp_vect_affinity);
				573	if (curr_cpu < 0)
				574	goto fail;
				575
				576	cpumask_set_cpu(curr_cpu, dev_comp_vect_mask);
				577	}
				578
				579	hfi1_cdbg(AFFINITY,
				580	"[%s] Completion vector affinity CPU set(s) %*pbl",
				581	rvt_get_ibdev_name(&(dd)->verbs_dev.rdi),
				582	cpumask_pr_args(dev_comp_vect_mask));
				583
				584	return 0;
				585
				586	fail:
				587	for (j = 0; j < i; j++)
				588	per_cpu_affinity_put_max(&entry->comp_vect_mask,
				589	entry->comp_vect_affinity);
				590
				591	return curr_cpu;
				592	}
				593
				594	/*
				595	* It assumes dd->comp_vect_possible_cpus is available.
				596	*/
				597	static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd,
				598	struct hfi1_affinity_node *entry)
				599	__must_hold(&node_affinity.lock)
				600	{
				601	int i, cpu;
				602
				603	lockdep_assert_held(&node_affinity.lock);
				604	if (!dd->comp_vect_possible_cpus)
				605	return;
				606
				607	for (i = 0; i < dd->comp_vect_possible_cpus; i++) {
				608	cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask,
				609	entry->comp_vect_affinity);
				610	/* Clearing CPU in device completion vector cpu mask */
				611	if (cpu >= 0)
				612	cpumask_clear_cpu(cpu, &dd->comp_vect->mask);
				613	}
				614
				615	dd->comp_vect_possible_cpus = 0;
				616	}
				617
				618	/*
				619	* Interrupt affinity.
				620	*
				621	* non-rcv avail gets a default mask that
				622	* starts as possible cpus with threads reset
				623	* and each rcv avail reset.
				624	*
				625	* rcv avail gets node relative 1 wrapping back
				626	* to the node relative 1 as necessary.
				627	*
				628	*/
				629	int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
				630	{
				631	int node = pcibus_to_node(dd->pcidev->bus);
				632	struct hfi1_affinity_node *entry;
				633	const struct cpumask *local_mask;
				634	int curr_cpu, possible, i, ret;
				635	bool new_entry = false;
				636
				637	/*
				638	* If the BIOS does not have the NUMA node information set, select
				639	* NUMA 0 so we get consistent performance.
				640	*/
				641	if (node < 0) {
				642	dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
				643	node = 0;
				644	}
				645	dd->node = node;
				646
				647	local_mask = cpumask_of_node(dd->node);
				648	if (cpumask_first(local_mask) >= nr_cpu_ids)
				649	local_mask = topology_core_cpumask(0);
				650
				651	mutex_lock(&node_affinity.lock);
				652	entry = node_affinity_lookup(dd->node);
				653
				654	/*
				655	* If this is the first time this NUMA node's affinity is used,
				656	* create an entry in the global affinity structure and initialize it.
				657	*/
				658	if (!entry) {
				659	entry = node_affinity_allocate(node);
				660	if (!entry) {
				661	dd_dev_err(dd,
				662	"Unable to allocate global affinity node\n");
				663	ret = -ENOMEM;
				664	goto fail;
				665	}
				666	new_entry = true;
				667
				668	init_cpu_mask_set(&entry->def_intr);
				669	init_cpu_mask_set(&entry->rcv_intr);
				670	cpumask_clear(&entry->comp_vect_mask);
				671	cpumask_clear(&entry->general_intr_mask);
				672	/* Use the "real" cpu mask of this node as the default */
				673	cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
				674	local_mask);
				675
				676	/* fill in the receive list */
				677	possible = cpumask_weight(&entry->def_intr.mask);
				678	curr_cpu = cpumask_first(&entry->def_intr.mask);
				679
				680	if (possible == 1) {
				681	/* only one CPU, everyone will use it */
				682	cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
				683	cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
				684	} else {
				685	/*
				686	* The general/control context will be the first CPU in
				687	* the default list, so it is removed from the default
				688	* list and added to the general interrupt list.
				689	*/
				690	cpumask_clear_cpu(curr_cpu, &entry->def_intr.mask);
				691	cpumask_set_cpu(curr_cpu, &entry->general_intr_mask);
				692	curr_cpu = cpumask_next(curr_cpu,
				693	&entry->def_intr.mask);
				694
				695	/*
				696	* Remove the remaining kernel receive queues from
				697	* the default list and add them to the receive list.
				698	*/
				699	for (i = 0;
				700	i < (dd->n_krcv_queues - 1) *
				701	hfi1_per_node_cntr[dd->node];
				702	i++) {
				703	cpumask_clear_cpu(curr_cpu,
				704	&entry->def_intr.mask);
				705	cpumask_set_cpu(curr_cpu,
				706	&entry->rcv_intr.mask);
				707	curr_cpu = cpumask_next(curr_cpu,
				708	&entry->def_intr.mask);
				709	if (curr_cpu >= nr_cpu_ids)
				710	break;
				711	}
				712
				713	/*
				714	* If there ends up being 0 CPU cores leftover for SDMA
				715	* engines, use the same CPU cores as general/control
				716	* context.
				717	*/
				718	if (cpumask_weight(&entry->def_intr.mask) == 0)
				719	cpumask_copy(&entry->def_intr.mask,
				720	&entry->general_intr_mask);
				721	}
				722
				723	/* Determine completion vector CPUs for the entire node */
				724	cpumask_and(&entry->comp_vect_mask,
				725	&node_affinity.real_cpu_mask, local_mask);
				726	cpumask_andnot(&entry->comp_vect_mask,
				727	&entry->comp_vect_mask,
				728	&entry->rcv_intr.mask);
				729	cpumask_andnot(&entry->comp_vect_mask,
				730	&entry->comp_vect_mask,
				731	&entry->general_intr_mask);
				732
				733	/*
				734	* If there ends up being 0 CPU cores leftover for completion
				735	* vectors, use the same CPU core as the general/control
				736	* context.
				737	*/
				738	if (cpumask_weight(&entry->comp_vect_mask) == 0)
				739	cpumask_copy(&entry->comp_vect_mask,
				740	&entry->general_intr_mask);
				741	}
				742
				743	ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry);
				744	if (ret < 0)
				745	goto fail;
				746
				747	if (new_entry)
				748	node_affinity_add_tail(entry);
				749
				750	mutex_unlock(&node_affinity.lock);
				751
				752	return 0;
				753
				754	fail:
				755	if (new_entry)
				756	node_affinity_destroy(entry);
				757	mutex_unlock(&node_affinity.lock);
				758	return ret;
				759	}
				760
				761	void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd)
				762	{
				763	struct hfi1_affinity_node *entry;
				764
				765	if (dd->node < 0)
				766	return;
				767
				768	mutex_lock(&node_affinity.lock);
				769	entry = node_affinity_lookup(dd->node);
				770	if (!entry)
				771	goto unlock;
				772
				773	/*
				774	* Free device completion vector CPUs to be used by future
				775	* completion vectors
				776	*/
				777	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
				778	unlock:
				779	mutex_unlock(&node_affinity.lock);
				780	dd->node = -1;
				781	}
				782
				783	/*
				784	* Function updates the irq affinity hint for msix after it has been changed
				785	* by the user using the /proc/irq interface. This function only accepts
				786	* one cpu in the mask.
				787	*/
				788	static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu)
				789	{
				790	struct sdma_engine *sde = msix->arg;
				791	struct hfi1_devdata *dd = sde->dd;
				792	struct hfi1_affinity_node *entry;
				793	struct cpu_mask_set *set;
				794	int i, old_cpu;
				795
				796	if (cpu > num_online_cpus() \|\| cpu == sde->cpu)
				797	return;
				798
				799	mutex_lock(&node_affinity.lock);
				800	entry = node_affinity_lookup(dd->node);
				801	if (!entry)
				802	goto unlock;
				803
				804	old_cpu = sde->cpu;
				805	sde->cpu = cpu;
				806	cpumask_clear(&msix->mask);
				807	cpumask_set_cpu(cpu, &msix->mask);
				808	dd_dev_dbg(dd, "IRQ: %u, type %s engine %u -> cpu: %d\n",
				809	msix->irq, irq_type_names[msix->type],
				810	sde->this_idx, cpu);
				811	irq_set_affinity_hint(msix->irq, &msix->mask);
				812
				813	/*
				814	* Set the new cpu in the hfi1_affinity_node and clean
				815	* the old cpu if it is not used by any other IRQ
				816	*/
				817	set = &entry->def_intr;
				818	cpumask_set_cpu(cpu, &set->mask);
				819	cpumask_set_cpu(cpu, &set->used);
				820	for (i = 0; i < dd->num_msix_entries; i++) {
				821	struct hfi1_msix_entry *other_msix;
				822
				823	other_msix = &dd->msix_entries[i];
				824	if (other_msix->type != IRQ_SDMA \|\| other_msix == msix)
				825	continue;
				826
				827	if (cpumask_test_cpu(old_cpu, &other_msix->mask))
				828	goto unlock;
				829	}
				830	cpumask_clear_cpu(old_cpu, &set->mask);
				831	cpumask_clear_cpu(old_cpu, &set->used);
				832	unlock:
				833	mutex_unlock(&node_affinity.lock);
				834	}
				835
				836	static void hfi1_irq_notifier_notify(struct irq_affinity_notify *notify,
				837	const cpumask_t *mask)
				838	{
				839	int cpu = cpumask_first(mask);
				840	struct hfi1_msix_entry *msix = container_of(notify,
				841	struct hfi1_msix_entry,
				842	notify);
				843
				844	/* Only one CPU configuration supported currently */
				845	hfi1_update_sdma_affinity(msix, cpu);
				846	}
				847
				848	static void hfi1_irq_notifier_release(struct kref *ref)
				849	{
				850	/*
				851	* This is required by affinity notifier. We don't have anything to
				852	* free here.
				853	*/
				854	}
				855
				856	static void hfi1_setup_sdma_notifier(struct hfi1_msix_entry *msix)
				857	{
				858	struct irq_affinity_notify *notify = &msix->notify;
				859
				860	notify->irq = msix->irq;
				861	notify->notify = hfi1_irq_notifier_notify;
				862	notify->release = hfi1_irq_notifier_release;
				863
				864	if (irq_set_affinity_notifier(notify->irq, notify))
				865	pr_err("Failed to register sdma irq affinity notifier for irq %d\n",
				866	notify->irq);
				867	}
				868
				869	static void hfi1_cleanup_sdma_notifier(struct hfi1_msix_entry *msix)
				870	{
				871	struct irq_affinity_notify *notify = &msix->notify;
				872
				873	if (irq_set_affinity_notifier(notify->irq, NULL))
				874	pr_err("Failed to cleanup sdma irq affinity notifier for irq %d\n",
				875	notify->irq);
				876	}
				877
				878	/*
				879	* Function sets the irq affinity for msix.
				880	* It must be called with node_affinity.lock held.
				881	*/
				882	static int get_irq_affinity(struct hfi1_devdata *dd,
				883	struct hfi1_msix_entry *msix)
				884	{
				885	cpumask_var_t diff;
				886	struct hfi1_affinity_node *entry;
				887	struct cpu_mask_set *set = NULL;
				888	struct sdma_engine *sde = NULL;
				889	struct hfi1_ctxtdata *rcd = NULL;
				890	char extra[64];
				891	int cpu = -1;
				892
				893	extra[0] = '\0';
				894	cpumask_clear(&msix->mask);
				895
				896	entry = node_affinity_lookup(dd->node);
				897
				898	switch (msix->type) {
				899	case IRQ_SDMA:
				900	sde = (struct sdma_engine *)msix->arg;
				901	scnprintf(extra, 64, "engine %u", sde->this_idx);
				902	set = &entry->def_intr;
				903	break;
				904	case IRQ_GENERAL:
				905	cpu = cpumask_first(&entry->general_intr_mask);
				906	break;
				907	case IRQ_RCVCTXT:
				908	rcd = (struct hfi1_ctxtdata *)msix->arg;
				909	if (rcd->ctxt == HFI1_CTRL_CTXT)
				910	cpu = cpumask_first(&entry->general_intr_mask);
				911	else
				912	set = &entry->rcv_intr;
				913	scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
				914	break;
				915	default:
				916	dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
				917	return -EINVAL;
				918	}
				919
				920	/*
				921	* The general and control contexts are placed on a particular
				922	* CPU, which is set above. Skip accounting for it. Everything else
				923	* finds its CPU here.
				924	*/
				925	if (cpu == -1 && set) {
				926	if (!zalloc_cpumask_var(&diff, GFP_KERNEL))
				927	return -ENOMEM;
				928
				929	cpu = cpu_mask_set_get_first(set, diff);
				930	if (cpu < 0) {
				931	free_cpumask_var(diff);
				932	dd_dev_err(dd, "Failure to obtain CPU for IRQ\n");
				933	return cpu;
				934	}
				935
				936	free_cpumask_var(diff);
				937	}
				938
				939	cpumask_set_cpu(cpu, &msix->mask);
				940	dd_dev_info(dd, "IRQ: %u, type %s %s -> cpu: %d\n",
				941	msix->irq, irq_type_names[msix->type],
				942	extra, cpu);
				943	irq_set_affinity_hint(msix->irq, &msix->mask);
				944
				945	if (msix->type == IRQ_SDMA) {
				946	sde->cpu = cpu;
				947	hfi1_setup_sdma_notifier(msix);
				948	}
				949
				950	return 0;
				951	}
				952
				953	int hfi1_get_irq_affinity(struct hfi1_devdata dd, struct hfi1_msix_entry msix)
				954	{
				955	int ret;
				956
				957	mutex_lock(&node_affinity.lock);
				958	ret = get_irq_affinity(dd, msix);
				959	mutex_unlock(&node_affinity.lock);
				960	return ret;
				961	}
				962
				963	void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
				964	struct hfi1_msix_entry *msix)
				965	{
				966	struct cpu_mask_set *set = NULL;
				967	struct hfi1_ctxtdata *rcd;
				968	struct hfi1_affinity_node *entry;
				969
				970	mutex_lock(&node_affinity.lock);
				971	entry = node_affinity_lookup(dd->node);
				972
				973	switch (msix->type) {
				974	case IRQ_SDMA:
				975	set = &entry->def_intr;
				976	hfi1_cleanup_sdma_notifier(msix);
				977	break;
				978	case IRQ_GENERAL:
				979	/* Don't do accounting for general contexts */
				980	break;
				981	case IRQ_RCVCTXT:
				982	rcd = (struct hfi1_ctxtdata *)msix->arg;
				983	/* Don't do accounting for control contexts */
				984	if (rcd->ctxt != HFI1_CTRL_CTXT)
				985	set = &entry->rcv_intr;
				986	break;
				987	default:
				988	mutex_unlock(&node_affinity.lock);
				989	return;
				990	}
				991
				992	if (set) {
				993	cpumask_andnot(&set->used, &set->used, &msix->mask);
				994	_cpu_mask_set_gen_dec(set);
				995	}
				996
				997	irq_set_affinity_hint(msix->irq, NULL);
				998	cpumask_clear(&msix->mask);
				999	mutex_unlock(&node_affinity.lock);
				1000	}
				1001
				1002	/* This should be called with node_affinity.lock held */
				1003	static void find_hw_thread_mask(uint hw_thread_no, cpumask_var_t hw_thread_mask,
				1004	struct hfi1_affinity_node_list *affinity)
				1005	{
				1006	int possible, curr_cpu, i;
				1007	uint num_cores_per_socket = node_affinity.num_online_cpus /
				1008	affinity->num_core_siblings /
				1009	node_affinity.num_online_nodes;
				1010
				1011	cpumask_copy(hw_thread_mask, &affinity->proc.mask);
				1012	if (affinity->num_core_siblings > 0) {
				1013	/* Removing other siblings not needed for now */
				1014	possible = cpumask_weight(hw_thread_mask);
				1015	curr_cpu = cpumask_first(hw_thread_mask);
				1016	for (i = 0;
				1017	i < num_cores_per_socket * node_affinity.num_online_nodes;
				1018	i++)
				1019	curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
				1020
				1021	for (; i < possible; i++) {
				1022	cpumask_clear_cpu(curr_cpu, hw_thread_mask);
				1023	curr_cpu = cpumask_next(curr_cpu, hw_thread_mask);
				1024	}
				1025
				1026	/* Identifying correct HW threads within physical cores */
				1027	cpumask_shift_left(hw_thread_mask, hw_thread_mask,
				1028	num_cores_per_socket *
				1029	node_affinity.num_online_nodes *
				1030	hw_thread_no);
				1031	}
				1032	}
				1033
				1034	int hfi1_get_proc_affinity(int node)
				1035	{
				1036	int cpu = -1, ret, i;
				1037	struct hfi1_affinity_node *entry;
				1038	cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
				1039	const struct cpumask *node_mask,
				1040	*proc_mask = &current->cpus_allowed;
				1041	struct hfi1_affinity_node_list *affinity = &node_affinity;
				1042	struct cpu_mask_set *set = &affinity->proc;
				1043
				1044	/*
				1045	* check whether process/context affinity has already
				1046	* been set
				1047	*/
				1048	if (cpumask_weight(proc_mask) == 1) {
				1049	hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
				1050	current->pid, current->comm,
				1051	cpumask_pr_args(proc_mask));
				1052	/*
				1053	* Mark the pre-set CPU as used. This is atomic so we don't
				1054	* need the lock
				1055	*/
				1056	cpu = cpumask_first(proc_mask);
				1057	cpumask_set_cpu(cpu, &set->used);
				1058	goto done;
				1059	} else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
				1060	hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
				1061	current->pid, current->comm,
				1062	cpumask_pr_args(proc_mask));
				1063	goto done;
				1064	}
				1065
				1066	/*
				1067	* The process does not have a preset CPU affinity so find one to
				1068	* recommend using the following algorithm:
				1069	*
				1070	* For each user process that is opening a context on HFI Y:
				1071	* a) If all cores are filled, reinitialize the bitmask
				1072	* b) Fill real cores first, then HT cores (First set of HT
				1073	* cores on all physical cores, then second set of HT core,
				1074	* and, so on) in the following order:
				1075	*
				1076	* 1. Same NUMA node as HFI Y and not running an IRQ
				1077	* handler
				1078	* 2. Same NUMA node as HFI Y and running an IRQ handler
				1079	* 3. Different NUMA node to HFI Y and not running an IRQ
				1080	* handler
				1081	* 4. Different NUMA node to HFI Y and running an IRQ
				1082	* handler
				1083	* c) Mark core as filled in the bitmask. As user processes are
				1084	* done, clear cores from the bitmask.
				1085	*/
				1086
				1087	ret = zalloc_cpumask_var(&diff, GFP_KERNEL);
				1088	if (!ret)
				1089	goto done;
				1090	ret = zalloc_cpumask_var(&hw_thread_mask, GFP_KERNEL);
				1091	if (!ret)
				1092	goto free_diff;
				1093	ret = zalloc_cpumask_var(&available_mask, GFP_KERNEL);
				1094	if (!ret)
				1095	goto free_hw_thread_mask;
				1096	ret = zalloc_cpumask_var(&intrs_mask, GFP_KERNEL);
				1097	if (!ret)
				1098	goto free_available_mask;
				1099
				1100	mutex_lock(&affinity->lock);
				1101	/*
				1102	* If we've used all available HW threads, clear the mask and start
				1103	* overloading.
				1104	*/
				1105	_cpu_mask_set_gen_inc(set);
				1106
				1107	/*
				1108	* If NUMA node has CPUs used by interrupt handlers, include them in the
				1109	* interrupt handler mask.
				1110	*/
				1111	entry = node_affinity_lookup(node);
				1112	if (entry) {
				1113	cpumask_copy(intrs_mask, (entry->def_intr.gen ?
				1114	&entry->def_intr.mask :
				1115	&entry->def_intr.used));
				1116	cpumask_or(intrs_mask, intrs_mask, (entry->rcv_intr.gen ?
				1117	&entry->rcv_intr.mask :
				1118	&entry->rcv_intr.used));
				1119	cpumask_or(intrs_mask, intrs_mask, &entry->general_intr_mask);
				1120	}
				1121	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
				1122	cpumask_pr_args(intrs_mask));
				1123
				1124	cpumask_copy(hw_thread_mask, &set->mask);
				1125
				1126	/*
				1127	* If HT cores are enabled, identify which HW threads within the
				1128	* physical cores should be used.
				1129	*/
				1130	if (affinity->num_core_siblings > 0) {
				1131	for (i = 0; i < affinity->num_core_siblings; i++) {
				1132	find_hw_thread_mask(i, hw_thread_mask, affinity);
				1133
				1134	/*
				1135	* If there's at least one available core for this HW
				1136	* thread number, stop looking for a core.
				1137	*
				1138	* diff will always be not empty at least once in this
				1139	* loop as the used mask gets reset when
				1140	* (set->mask == set->used) before this loop.
				1141	*/
				1142	cpumask_andnot(diff, hw_thread_mask, &set->used);
				1143	if (!cpumask_empty(diff))
				1144	break;
				1145	}
				1146	}
				1147	hfi1_cdbg(PROC, "Same available HW thread on all physical CPUs: %*pbl",
				1148	cpumask_pr_args(hw_thread_mask));
				1149
				1150	node_mask = cpumask_of_node(node);
				1151	hfi1_cdbg(PROC, "Device on NUMA %u, CPUs %*pbl", node,
				1152	cpumask_pr_args(node_mask));
				1153
				1154	/* Get cpumask of available CPUs on preferred NUMA */
				1155	cpumask_and(available_mask, hw_thread_mask, node_mask);
				1156	cpumask_andnot(available_mask, available_mask, &set->used);
				1157	hfi1_cdbg(PROC, "Available CPUs on NUMA %u: %*pbl", node,
				1158	cpumask_pr_args(available_mask));
				1159
				1160	/*
				1161	* At first, we don't want to place processes on the same
				1162	* CPUs as interrupt handlers. Then, CPUs running interrupt
				1163	* handlers are used.
				1164	*
				1165	* 1) If diff is not empty, then there are CPUs not running
				1166	* non-interrupt handlers available, so diff gets copied
				1167	* over to available_mask.
				1168	* 2) If diff is empty, then all CPUs not running interrupt
				1169	* handlers are taken, so available_mask contains all
				1170	* available CPUs running interrupt handlers.
				1171	* 3) If available_mask is empty, then all CPUs on the
				1172	* preferred NUMA node are taken, so other NUMA nodes are
				1173	* used for process assignments using the same method as
				1174	* the preferred NUMA node.
				1175	*/
				1176	cpumask_andnot(diff, available_mask, intrs_mask);
				1177	if (!cpumask_empty(diff))
				1178	cpumask_copy(available_mask, diff);
				1179
				1180	/* If we don't have CPUs on the preferred node, use other NUMA nodes */
				1181	if (cpumask_empty(available_mask)) {
				1182	cpumask_andnot(available_mask, hw_thread_mask, &set->used);
				1183	/* Excluding preferred NUMA cores */
				1184	cpumask_andnot(available_mask, available_mask, node_mask);
				1185	hfi1_cdbg(PROC,
				1186	"Preferred NUMA node cores are taken, cores available in other NUMA nodes: %*pbl",
				1187	cpumask_pr_args(available_mask));
				1188
				1189	/*
				1190	* At first, we don't want to place processes on the same
				1191	* CPUs as interrupt handlers.
				1192	*/
				1193	cpumask_andnot(diff, available_mask, intrs_mask);
				1194	if (!cpumask_empty(diff))
				1195	cpumask_copy(available_mask, diff);
				1196	}
				1197	hfi1_cdbg(PROC, "Possible CPUs for process: %*pbl",
				1198	cpumask_pr_args(available_mask));
				1199
				1200	cpu = cpumask_first(available_mask);
				1201	if (cpu >= nr_cpu_ids) /* empty */
				1202	cpu = -1;
				1203	else
				1204	cpumask_set_cpu(cpu, &set->used);
				1205
				1206	mutex_unlock(&affinity->lock);
				1207	hfi1_cdbg(PROC, "Process assigned to CPU %d", cpu);
				1208
				1209	free_cpumask_var(intrs_mask);
				1210	free_available_mask:
				1211	free_cpumask_var(available_mask);
				1212	free_hw_thread_mask:
				1213	free_cpumask_var(hw_thread_mask);
				1214	free_diff:
				1215	free_cpumask_var(diff);
				1216	done:
				1217	return cpu;
				1218	}
				1219
				1220	void hfi1_put_proc_affinity(int cpu)
				1221	{
				1222	struct hfi1_affinity_node_list *affinity = &node_affinity;
				1223	struct cpu_mask_set *set = &affinity->proc;
				1224
				1225	if (cpu < 0)
				1226	return;
				1227
				1228	mutex_lock(&affinity->lock);
				1229	cpu_mask_set_put(set, cpu);
				1230	hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu);
				1231	mutex_unlock(&affinity->lock);
				1232	}