Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 79cc750..a50dadd 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -141,7 +141,8 @@
config XEN_GNTDEV_DMABUF
bool "Add support for dma-buf grant access device driver extension"
- depends on XEN_GNTDEV && XEN_GRANT_DMA_ALLOC && DMA_SHARED_BUFFER
+ depends on XEN_GNTDEV && XEN_GRANT_DMA_ALLOC
+ select DMA_SHARED_BUFFER
help
Allows userspace processes and kernel modules to use Xen backed
dma-buf implementation. With this extension grant references to
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index 5bae515..ebb0551 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -395,7 +395,8 @@
#else
static enum bp_state reserve_additional_memory(void)
{
- balloon_stats.target_pages = balloon_stats.current_pages;
+ balloon_stats.target_pages = balloon_stats.current_pages +
+ balloon_stats.target_unpopulated;
return BP_ECANCELED;
}
#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
@@ -569,11 +570,13 @@
if (xen_hotplug_unpopulated) {
st = reserve_additional_memory();
if (st != BP_ECANCELED) {
+ int rc;
+
mutex_unlock(&balloon_mutex);
- wait_event(balloon_wq,
+ rc = wait_event_interruptible(balloon_wq,
!list_empty(&ballooned_pages));
mutex_lock(&balloon_mutex);
- return 0;
+ return rc ? -ENOMEM : 0;
}
}
@@ -631,6 +634,12 @@
out_undo:
mutex_unlock(&balloon_mutex);
free_xenballooned_pages(pgno, pages);
+ /*
+ * NB: free_xenballooned_pages will only subtract pgno pages, but since
+ * target_unpopulated is incremented with nr_pages at the start we need
+ * to remove the remaining ones also, or accounting will be screwed.
+ */
+ balloon_stats.target_unpopulated -= nr_pages - pgno;
return ret;
}
EXPORT_SYMBOL(alloc_xenballooned_pages);
diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c
index 8edef51..77cc80b 100644
--- a/drivers/xen/events/events_2l.c
+++ b/drivers/xen/events/events_2l.c
@@ -47,6 +47,11 @@
return EVTCHN_2L_NR_CHANNELS;
}
+static void evtchn_2l_remove(evtchn_port_t evtchn, unsigned int cpu)
+{
+ clear_bit(evtchn, BM(per_cpu(cpu_evtchn_mask, cpu)));
+}
+
static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu)
{
clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu)));
@@ -71,12 +76,6 @@
return sync_test_bit(port, BM(&s->evtchn_pending[0]));
}
-static bool evtchn_2l_test_and_set_mask(unsigned port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0]));
-}
-
static void evtchn_2l_mask(unsigned port)
{
struct shared_info *s = HYPERVISOR_shared_info;
@@ -91,6 +90,8 @@
BUG_ON(!irqs_disabled());
+ smp_wmb(); /* All writes before unmask must be visible. */
+
if (unlikely((cpu != cpu_from_evtchn(port))))
do_hypercall = 1;
else {
@@ -159,7 +160,7 @@
* a bitset of words which contain pending event bits. The second
* level is a bitset of pending events themselves.
*/
-static void evtchn_2l_handle_events(unsigned cpu)
+static void evtchn_2l_handle_events(unsigned cpu, struct evtchn_loop_ctrl *ctrl)
{
int irq;
xen_ulong_t pending_words;
@@ -240,10 +241,7 @@
/* Process port. */
port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx;
- irq = get_evtchn_to_irq(port);
-
- if (irq != -1)
- generic_handle_irq(irq);
+ handle_irq_for_port(port, ctrl);
bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD;
@@ -355,18 +353,27 @@
EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD);
}
+static int evtchn_2l_percpu_deinit(unsigned int cpu)
+{
+ memset(per_cpu(cpu_evtchn_mask, cpu), 0, sizeof(xen_ulong_t) *
+ EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD);
+
+ return 0;
+}
+
static const struct evtchn_ops evtchn_ops_2l = {
.max_channels = evtchn_2l_max_channels,
.nr_channels = evtchn_2l_max_channels,
+ .remove = evtchn_2l_remove,
.bind_to_cpu = evtchn_2l_bind_to_cpu,
.clear_pending = evtchn_2l_clear_pending,
.set_pending = evtchn_2l_set_pending,
.is_pending = evtchn_2l_is_pending,
- .test_and_set_mask = evtchn_2l_test_and_set_mask,
.mask = evtchn_2l_mask,
.unmask = evtchn_2l_unmask,
.handle_events = evtchn_2l_handle_events,
.resume = evtchn_2l_resume,
+ .percpu_deinit = evtchn_2l_percpu_deinit,
};
void __init xen_evtchn_2l_init(void)
diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 6c88439..87cfadd 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -33,6 +33,10 @@
#include <linux/slab.h>
#include <linux/irqnr.h>
#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/cpuhotplug.h>
+#include <linux/atomic.h>
+#include <linux/ktime.h>
#ifdef CONFIG_X86
#include <asm/desc.h>
@@ -62,6 +66,15 @@
#include "events_internal.h"
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "xen."
+
+static uint __read_mostly event_loop_timeout = 2;
+module_param(event_loop_timeout, uint, 0644);
+
+static uint __read_mostly event_eoi_delay = 10;
+module_param(event_eoi_delay, uint, 0644);
+
const struct evtchn_ops *evtchn_ops;
/*
@@ -70,6 +83,25 @@
*/
static DEFINE_MUTEX(irq_mapping_update_lock);
+/*
+ * Lock protecting event handling loop against removing event channels.
+ * Adding of event channels is no issue as the associated IRQ becomes active
+ * only after everything is setup (before request_[threaded_]irq() the handler
+ * can't be entered for an event, as the event channel will be unmasked only
+ * then).
+ */
+static DEFINE_RWLOCK(evtchn_rwlock);
+
+/*
+ * Lock hierarchy:
+ *
+ * irq_mapping_update_lock
+ * evtchn_rwlock
+ * IRQ-desc lock
+ * percpu eoi_list_lock
+ * irq_info->lock
+ */
+
static LIST_HEAD(xen_irq_list_head);
/* IRQ <-> VIRQ mapping. */
@@ -91,18 +123,23 @@
/* Xen will never allocate port zero for any purpose. */
#define VALID_EVTCHN(chn) ((chn) != 0)
+static struct irq_info *legacy_info_ptrs[NR_IRQS_LEGACY];
+
static struct irq_chip xen_dynamic_chip;
+static struct irq_chip xen_lateeoi_chip;
static struct irq_chip xen_percpu_chip;
static struct irq_chip xen_pirq_chip;
static void enable_dynirq(struct irq_data *data);
static void disable_dynirq(struct irq_data *data);
-static void clear_evtchn_to_irq_row(unsigned row)
+static DEFINE_PER_CPU(unsigned int, irq_epoch);
+
+static void clear_evtchn_to_irq_row(int *evtchn_row)
{
unsigned col;
for (col = 0; col < EVTCHN_PER_ROW; col++)
- evtchn_to_irq[row][col] = -1;
+ WRITE_ONCE(evtchn_row[col], -1);
}
static void clear_evtchn_to_irq_all(void)
@@ -112,7 +149,7 @@
for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) {
if (evtchn_to_irq[row] == NULL)
continue;
- clear_evtchn_to_irq_row(row);
+ clear_evtchn_to_irq_row(evtchn_to_irq[row]);
}
}
@@ -120,6 +157,7 @@
{
unsigned row;
unsigned col;
+ int *evtchn_row;
if (evtchn >= xen_evtchn_max_channels())
return -EINVAL;
@@ -132,14 +170,21 @@
if (irq == -1)
return 0;
- evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL);
- if (evtchn_to_irq[row] == NULL)
+ evtchn_row = (int *) __get_free_pages(GFP_KERNEL, 0);
+ if (evtchn_row == NULL)
return -ENOMEM;
- clear_evtchn_to_irq_row(row);
+ clear_evtchn_to_irq_row(evtchn_row);
+
+ /*
+ * We've prepared an empty row for the mapping. If a different
+ * thread was faster inserting it, we can drop ours.
+ */
+ if (cmpxchg(&evtchn_to_irq[row], NULL, evtchn_row) != NULL)
+ free_page((unsigned long) evtchn_row);
}
- evtchn_to_irq[row][col] = irq;
+ WRITE_ONCE(evtchn_to_irq[row][col], irq);
return 0;
}
@@ -149,13 +194,24 @@
return -1;
if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL)
return -1;
- return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)];
+ return READ_ONCE(evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]);
}
/* Get info for IRQ */
struct irq_info *info_for_irq(unsigned irq)
{
- return irq_get_handler_data(irq);
+ if (irq < nr_legacy_irqs())
+ return legacy_info_ptrs[irq];
+ else
+ return irq_get_chip_data(irq);
+}
+
+static void set_info_for_irq(unsigned int irq, struct irq_info *info)
+{
+ if (irq < nr_legacy_irqs())
+ legacy_info_ptrs[irq] = info;
+ else
+ irq_set_chip_data(irq, info);
}
/* Constructors for packed IRQ information. */
@@ -173,6 +229,8 @@
info->irq = irq;
info->evtchn = evtchn;
info->cpu = cpu;
+ info->mask_reason = EVT_MASK_REASON_EXPLICIT;
+ raw_spin_lock_init(&info->lock);
ret = set_evtchn_to_irq(evtchn, irq);
if (ret < 0)
@@ -239,6 +297,7 @@
static void xen_irq_info_cleanup(struct irq_info *info)
{
set_evtchn_to_irq(info->evtchn, -1);
+ xen_evtchn_port_remove(info->evtchn, info->cpu);
info->evtchn = 0;
}
@@ -247,10 +306,14 @@
*/
unsigned int evtchn_from_irq(unsigned irq)
{
- if (WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))
+ const struct irq_info *info = NULL;
+
+ if (likely(irq < nr_irqs))
+ info = info_for_irq(irq);
+ if (!info)
return 0;
- return info_for_irq(irq)->evtchn;
+ return info->evtchn;
}
unsigned irq_from_evtchn(unsigned int evtchn)
@@ -315,6 +378,34 @@
return ret;
}
+static void do_mask(struct irq_info *info, u8 reason)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&info->lock, flags);
+
+ if (!info->mask_reason)
+ mask_evtchn(info->evtchn);
+
+ info->mask_reason |= reason;
+
+ raw_spin_unlock_irqrestore(&info->lock, flags);
+}
+
+static void do_unmask(struct irq_info *info, u8 reason)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&info->lock, flags);
+
+ info->mask_reason &= ~reason;
+
+ if (!info->mask_reason)
+ unmask_evtchn(info->evtchn);
+
+ raw_spin_unlock_irqrestore(&info->lock, flags);
+}
+
#ifdef CONFIG_X86
static bool pirq_check_eoi_map(unsigned irq)
{
@@ -361,9 +452,160 @@
}
EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+struct lateeoi_work {
+ struct delayed_work delayed;
+ spinlock_t eoi_list_lock;
+ struct list_head eoi_list;
+};
+
+static DEFINE_PER_CPU(struct lateeoi_work, lateeoi);
+
+static void lateeoi_list_del(struct irq_info *info)
+{
+ struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu);
+ unsigned long flags;
+
+ spin_lock_irqsave(&eoi->eoi_list_lock, flags);
+ list_del_init(&info->eoi_list);
+ spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
+}
+
+static void lateeoi_list_add(struct irq_info *info)
+{
+ struct lateeoi_work *eoi = &per_cpu(lateeoi, info->eoi_cpu);
+ struct irq_info *elem;
+ u64 now = get_jiffies_64();
+ unsigned long delay;
+ unsigned long flags;
+
+ if (now < info->eoi_time)
+ delay = info->eoi_time - now;
+ else
+ delay = 1;
+
+ spin_lock_irqsave(&eoi->eoi_list_lock, flags);
+
+ if (list_empty(&eoi->eoi_list)) {
+ list_add(&info->eoi_list, &eoi->eoi_list);
+ mod_delayed_work_on(info->eoi_cpu, system_wq,
+ &eoi->delayed, delay);
+ } else {
+ list_for_each_entry_reverse(elem, &eoi->eoi_list, eoi_list) {
+ if (elem->eoi_time <= info->eoi_time)
+ break;
+ }
+ list_add(&info->eoi_list, &elem->eoi_list);
+ }
+
+ spin_unlock_irqrestore(&eoi->eoi_list_lock, flags);
+}
+
+static void xen_irq_lateeoi_locked(struct irq_info *info, bool spurious)
+{
+ evtchn_port_t evtchn;
+ unsigned int cpu;
+ unsigned int delay = 0;
+
+ evtchn = info->evtchn;
+ if (!VALID_EVTCHN(evtchn) || !list_empty(&info->eoi_list))
+ return;
+
+ if (spurious) {
+ if ((1 << info->spurious_cnt) < (HZ << 2))
+ info->spurious_cnt++;
+ if (info->spurious_cnt > 1) {
+ delay = 1 << (info->spurious_cnt - 2);
+ if (delay > HZ)
+ delay = HZ;
+ if (!info->eoi_time)
+ info->eoi_cpu = smp_processor_id();
+ info->eoi_time = get_jiffies_64() + delay;
+ }
+ } else {
+ info->spurious_cnt = 0;
+ }
+
+ cpu = info->eoi_cpu;
+ if (info->eoi_time &&
+ (info->irq_epoch == per_cpu(irq_epoch, cpu) || delay)) {
+ lateeoi_list_add(info);
+ return;
+ }
+
+ info->eoi_time = 0;
+
+ /* is_active hasn't been reset yet, do it now. */
+ smp_store_release(&info->is_active, 0);
+ do_unmask(info, EVT_MASK_REASON_EOI_PENDING);
+}
+
+static void xen_irq_lateeoi_worker(struct work_struct *work)
+{
+ struct lateeoi_work *eoi;
+ struct irq_info *info;
+ u64 now = get_jiffies_64();
+ unsigned long flags;
+
+ eoi = container_of(to_delayed_work(work), struct lateeoi_work, delayed);
+
+ read_lock_irqsave(&evtchn_rwlock, flags);
+
+ while (true) {
+ spin_lock(&eoi->eoi_list_lock);
+
+ info = list_first_entry_or_null(&eoi->eoi_list, struct irq_info,
+ eoi_list);
+
+ if (info == NULL || now < info->eoi_time) {
+ spin_unlock(&eoi->eoi_list_lock);
+ break;
+ }
+
+ list_del_init(&info->eoi_list);
+
+ spin_unlock(&eoi->eoi_list_lock);
+
+ info->eoi_time = 0;
+
+ xen_irq_lateeoi_locked(info, false);
+ }
+
+ if (info)
+ mod_delayed_work_on(info->eoi_cpu, system_wq,
+ &eoi->delayed, info->eoi_time - now);
+
+ read_unlock_irqrestore(&evtchn_rwlock, flags);
+}
+
+static void xen_cpu_init_eoi(unsigned int cpu)
+{
+ struct lateeoi_work *eoi = &per_cpu(lateeoi, cpu);
+
+ INIT_DELAYED_WORK(&eoi->delayed, xen_irq_lateeoi_worker);
+ spin_lock_init(&eoi->eoi_list_lock);
+ INIT_LIST_HEAD(&eoi->eoi_list);
+}
+
+void xen_irq_lateeoi(unsigned int irq, unsigned int eoi_flags)
+{
+ struct irq_info *info;
+ unsigned long flags;
+
+ read_lock_irqsave(&evtchn_rwlock, flags);
+
+ info = info_for_irq(irq);
+
+ if (info)
+ xen_irq_lateeoi_locked(info, eoi_flags & XEN_EOI_FLAG_SPURIOUS);
+
+ read_unlock_irqrestore(&evtchn_rwlock, flags);
+}
+EXPORT_SYMBOL_GPL(xen_irq_lateeoi);
+
static void xen_irq_init(unsigned irq)
{
struct irq_info *info;
+
#ifdef CONFIG_SMP
/* By default all event channels notify CPU#0. */
cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0));
@@ -376,8 +618,9 @@
info->type = IRQT_UNBOUND;
info->refcnt = -1;
- irq_set_handler_data(irq, info);
+ set_info_for_irq(irq, info);
+ INIT_LIST_HEAD(&info->eoi_list);
list_add_tail(&info->list, &xen_irq_list_head);
}
@@ -425,17 +668,25 @@
static void xen_free_irq(unsigned irq)
{
- struct irq_info *info = irq_get_handler_data(irq);
+ struct irq_info *info = info_for_irq(irq);
+ unsigned long flags;
if (WARN_ON(!info))
return;
+ write_lock_irqsave(&evtchn_rwlock, flags);
+
+ if (!list_empty(&info->eoi_list))
+ lateeoi_list_del(info);
+
list_del(&info->list);
- irq_set_handler_data(irq, NULL);
+ set_info_for_irq(irq, NULL);
WARN_ON(info->refcnt > 0);
+ write_unlock_irqrestore(&evtchn_rwlock, flags);
+
kfree(info);
/* Legacy IRQ descriptors are managed by the arch. */
@@ -454,6 +705,12 @@
BUG();
}
+static void event_handler_exit(struct irq_info *info)
+{
+ smp_store_release(&info->is_active, 0);
+ clear_evtchn(info->evtchn);
+}
+
static void pirq_query_unmask(int irq)
{
struct physdev_irq_status_query irq_status;
@@ -472,7 +729,8 @@
static void eoi_pirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(data->irq);
+ struct irq_info *info = info_for_irq(data->irq);
+ int evtchn = info ? info->evtchn : 0;
struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) };
int rc = 0;
@@ -481,16 +739,15 @@
if (unlikely(irqd_is_setaffinity_pending(data)) &&
likely(!irqd_irq_disabled(data))) {
- int masked = test_and_set_mask(evtchn);
+ do_mask(info, EVT_MASK_REASON_TEMPORARY);
- clear_evtchn(evtchn);
+ event_handler_exit(info);
irq_move_masked_irq(data);
- if (!masked)
- unmask_evtchn(evtchn);
+ do_unmask(info, EVT_MASK_REASON_TEMPORARY);
} else
- clear_evtchn(evtchn);
+ event_handler_exit(info);
if (pirq_needs_eoi(data->irq)) {
rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
@@ -541,7 +798,8 @@
goto err;
out:
- unmask_evtchn(evtchn);
+ do_unmask(info, EVT_MASK_REASON_EXPLICIT);
+
eoi_pirq(irq_get_irq_data(irq));
return 0;
@@ -568,7 +826,7 @@
if (!VALID_EVTCHN(evtchn))
return;
- mask_evtchn(evtchn);
+ do_mask(info, EVT_MASK_REASON_EXPLICIT);
xen_evtchn_close(evtchn);
xen_irq_info_cleanup(info);
}
@@ -602,7 +860,7 @@
static void __unbind_from_irq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
- struct irq_info *info = irq_get_handler_data(irq);
+ struct irq_info *info = info_for_irq(irq);
if (info->refcnt > 0) {
info->refcnt--;
@@ -827,7 +1085,7 @@
}
EXPORT_SYMBOL_GPL(xen_pirq_from_irq);
-int bind_evtchn_to_irq(unsigned int evtchn)
+static int bind_evtchn_to_irq_chip(evtchn_port_t evtchn, struct irq_chip *chip)
{
int irq;
int ret;
@@ -844,7 +1102,7 @@
if (irq < 0)
goto out;
- irq_set_chip_and_handler_name(irq, &xen_dynamic_chip,
+ irq_set_chip_and_handler_name(irq, chip,
handle_edge_irq, "event");
ret = xen_irq_info_evtchn_setup(irq, evtchn);
@@ -865,8 +1123,19 @@
return irq;
}
+
+int bind_evtchn_to_irq(evtchn_port_t evtchn)
+{
+ return bind_evtchn_to_irq_chip(evtchn, &xen_dynamic_chip);
+}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
+int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn)
+{
+ return bind_evtchn_to_irq_chip(evtchn, &xen_lateeoi_chip);
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
+
static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
{
struct evtchn_bind_ipi bind_ipi;
@@ -908,8 +1177,9 @@
return irq;
}
-int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
- unsigned int remote_port)
+static int bind_interdomain_evtchn_to_irq_chip(unsigned int remote_domain,
+ evtchn_port_t remote_port,
+ struct irq_chip *chip)
{
struct evtchn_bind_interdomain bind_interdomain;
int err;
@@ -920,10 +1190,26 @@
err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
&bind_interdomain);
- return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
+ return err ? : bind_evtchn_to_irq_chip(bind_interdomain.local_port,
+ chip);
+}
+
+int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+ evtchn_port_t remote_port)
+{
+ return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port,
+ &xen_dynamic_chip);
}
EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq);
+int bind_interdomain_evtchn_to_irq_lateeoi(unsigned int remote_domain,
+ evtchn_port_t remote_port)
+{
+ return bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port,
+ &xen_lateeoi_chip);
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq_lateeoi);
+
static int find_virq(unsigned int virq, unsigned int cpu)
{
struct evtchn_status status;
@@ -1019,14 +1305,15 @@
mutex_unlock(&irq_mapping_update_lock);
}
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
- irq_handler_t handler,
- unsigned long irqflags,
- const char *devname, void *dev_id)
+static int bind_evtchn_to_irqhandler_chip(evtchn_port_t evtchn,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id,
+ struct irq_chip *chip)
{
int irq, retval;
- irq = bind_evtchn_to_irq(evtchn);
+ irq = bind_evtchn_to_irq_chip(evtchn, chip);
if (irq < 0)
return irq;
retval = request_irq(irq, handler, irqflags, devname, dev_id);
@@ -1037,31 +1324,76 @@
return irq;
}
+
+int bind_evtchn_to_irqhandler(evtchn_port_t evtchn,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
+ devname, dev_id,
+ &xen_dynamic_chip);
+}
EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+int bind_evtchn_to_irqhandler_lateeoi(evtchn_port_t evtchn,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname, void *dev_id)
+{
+ return bind_evtchn_to_irqhandler_chip(evtchn, handler, irqflags,
+ devname, dev_id,
+ &xen_lateeoi_chip);
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler_lateeoi);
+
+static int bind_interdomain_evtchn_to_irqhandler_chip(
+ unsigned int remote_domain, evtchn_port_t remote_port,
+ irq_handler_t handler, unsigned long irqflags,
+ const char *devname, void *dev_id, struct irq_chip *chip)
+{
+ int irq, retval;
+
+ irq = bind_interdomain_evtchn_to_irq_chip(remote_domain, remote_port,
+ chip);
+ if (irq < 0)
+ return irq;
+
+ retval = request_irq(irq, handler, irqflags, devname, dev_id);
+ if (retval != 0) {
+ unbind_from_irq(irq);
+ return retval;
+ }
+
+ return irq;
+}
+
int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
- unsigned int remote_port,
+ evtchn_port_t remote_port,
irq_handler_t handler,
unsigned long irqflags,
const char *devname,
void *dev_id)
{
- int irq, retval;
-
- irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port);
- if (irq < 0)
- return irq;
-
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
+ return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain,
+ remote_port, handler, irqflags, devname,
+ dev_id, &xen_dynamic_chip);
}
EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler);
+int bind_interdomain_evtchn_to_irqhandler_lateeoi(unsigned int remote_domain,
+ evtchn_port_t remote_port,
+ irq_handler_t handler,
+ unsigned long irqflags,
+ const char *devname,
+ void *dev_id)
+{
+ return bind_interdomain_evtchn_to_irqhandler_chip(remote_domain,
+ remote_port, handler, irqflags, devname,
+ dev_id, &xen_lateeoi_chip);
+}
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler_lateeoi);
+
int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
irq_handler_t handler,
unsigned long irqflags, const char *devname, void *dev_id)
@@ -1106,7 +1438,7 @@
void unbind_from_irqhandler(unsigned int irq, void *dev_id)
{
- struct irq_info *info = irq_get_handler_data(irq);
+ struct irq_info *info = info_for_irq(irq);
if (WARN_ON(!info))
return;
@@ -1140,7 +1472,7 @@
if (irq == -1)
return -ENOENT;
- info = irq_get_handler_data(irq);
+ info = info_for_irq(irq);
if (!info)
return -ENOENT;
@@ -1168,13 +1500,13 @@
if (irq == -1)
goto done;
- info = irq_get_handler_data(irq);
+ info = info_for_irq(irq);
if (!info)
goto done;
err = -EINVAL;
- if (info->refcnt <= 0)
+ if (info->refcnt <= 0 || info->refcnt == SHRT_MAX)
goto done;
info->refcnt++;
@@ -1213,6 +1545,56 @@
notify_remote_via_irq(irq);
}
+struct evtchn_loop_ctrl {
+ ktime_t timeout;
+ unsigned count;
+ bool defer_eoi;
+};
+
+void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl)
+{
+ int irq;
+ struct irq_info *info;
+
+ irq = get_evtchn_to_irq(port);
+ if (irq == -1)
+ return;
+
+ /*
+ * Check for timeout every 256 events.
+ * We are setting the timeout value only after the first 256
+ * events in order to not hurt the common case of few loop
+ * iterations. The 256 is basically an arbitrary value.
+ *
+ * In case we are hitting the timeout we need to defer all further
+ * EOIs in order to ensure to leave the event handling loop rather
+ * sooner than later.
+ */
+ if (!ctrl->defer_eoi && !(++ctrl->count & 0xff)) {
+ ktime_t kt = ktime_get();
+
+ if (!ctrl->timeout) {
+ kt = ktime_add_ms(kt,
+ jiffies_to_msecs(event_loop_timeout));
+ ctrl->timeout = kt;
+ } else if (kt > ctrl->timeout) {
+ ctrl->defer_eoi = true;
+ }
+ }
+
+ info = info_for_irq(irq);
+ if (xchg_acquire(&info->is_active, 1))
+ return;
+
+ if (ctrl->defer_eoi) {
+ info->eoi_cpu = smp_processor_id();
+ info->irq_epoch = __this_cpu_read(irq_epoch);
+ info->eoi_time = get_jiffies_64() + event_eoi_delay;
+ }
+
+ generic_handle_irq(irq);
+}
+
static DEFINE_PER_CPU(unsigned, xed_nesting_count);
static void __xen_evtchn_do_upcall(void)
@@ -1220,6 +1602,9 @@
struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu);
int cpu = get_cpu();
unsigned count;
+ struct evtchn_loop_ctrl ctrl = { 0 };
+
+ read_lock(&evtchn_rwlock);
do {
vcpu_info->evtchn_upcall_pending = 0;
@@ -1227,7 +1612,7 @@
if (__this_cpu_inc_return(xed_nesting_count) - 1)
goto out;
- xen_evtchn_handle_events(cpu);
+ xen_evtchn_handle_events(cpu, &ctrl);
BUG_ON(!irqs_disabled());
@@ -1236,6 +1621,14 @@
} while (count != 1 || vcpu_info->evtchn_upcall_pending);
out:
+ read_unlock(&evtchn_rwlock);
+
+ /*
+ * Increment irq_epoch only now to defer EOIs only for
+ * xen_irq_lateeoi() invocations occurring from inside the loop
+ * above.
+ */
+ __this_cpu_inc(irq_epoch);
put_cpu();
}
@@ -1294,10 +1687,10 @@
}
/* Rebind an evtchn so that it gets delivered to a specific cpu */
-static int xen_rebind_evtchn_to_cpu(int evtchn, unsigned int tcpu)
+static int xen_rebind_evtchn_to_cpu(struct irq_info *info, unsigned int tcpu)
{
struct evtchn_bind_vcpu bind_vcpu;
- int masked;
+ evtchn_port_t evtchn = info ? info->evtchn : 0;
if (!VALID_EVTCHN(evtchn))
return -1;
@@ -1313,7 +1706,7 @@
* Mask the event while changing the VCPU binding to prevent
* it being delivered on an unexpected VCPU.
*/
- masked = test_and_set_mask(evtchn);
+ do_mask(info, EVT_MASK_REASON_TEMPORARY);
/*
* If this fails, it usually just indicates that we're dealing with a
@@ -1323,8 +1716,7 @@
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
bind_evtchn_to_cpu(evtchn, tcpu);
- if (!masked)
- unmask_evtchn(evtchn);
+ do_unmask(info, EVT_MASK_REASON_TEMPORARY);
return 0;
}
@@ -1333,7 +1725,7 @@
bool force)
{
unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);
- int ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
+ int ret = xen_rebind_evtchn_to_cpu(info_for_irq(data->irq), tcpu);
if (!ret)
irq_data_update_effective_affinity(data, cpumask_of(tcpu));
@@ -1352,39 +1744,41 @@
static void enable_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(data->irq);
+ struct irq_info *info = info_for_irq(data->irq);
+ evtchn_port_t evtchn = info ? info->evtchn : 0;
if (VALID_EVTCHN(evtchn))
- unmask_evtchn(evtchn);
+ do_unmask(info, EVT_MASK_REASON_EXPLICIT);
}
static void disable_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(data->irq);
+ struct irq_info *info = info_for_irq(data->irq);
+ evtchn_port_t evtchn = info ? info->evtchn : 0;
if (VALID_EVTCHN(evtchn))
- mask_evtchn(evtchn);
+ do_mask(info, EVT_MASK_REASON_EXPLICIT);
}
static void ack_dynirq(struct irq_data *data)
{
- int evtchn = evtchn_from_irq(data->irq);
+ struct irq_info *info = info_for_irq(data->irq);
+ evtchn_port_t evtchn = info ? info->evtchn : 0;
if (!VALID_EVTCHN(evtchn))
return;
if (unlikely(irqd_is_setaffinity_pending(data)) &&
likely(!irqd_irq_disabled(data))) {
- int masked = test_and_set_mask(evtchn);
+ do_mask(info, EVT_MASK_REASON_TEMPORARY);
- clear_evtchn(evtchn);
+ event_handler_exit(info);
irq_move_masked_irq(data);
- if (!masked)
- unmask_evtchn(evtchn);
+ do_unmask(info, EVT_MASK_REASON_TEMPORARY);
} else
- clear_evtchn(evtchn);
+ event_handler_exit(info);
}
static void mask_ack_dynirq(struct irq_data *data)
@@ -1393,18 +1787,51 @@
ack_dynirq(data);
}
+static void lateeoi_ack_dynirq(struct irq_data *data)
+{
+ struct irq_info *info = info_for_irq(data->irq);
+ evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+ if (!VALID_EVTCHN(evtchn))
+ return;
+
+ do_mask(info, EVT_MASK_REASON_EOI_PENDING);
+
+ if (unlikely(irqd_is_setaffinity_pending(data)) &&
+ likely(!irqd_irq_disabled(data))) {
+ do_mask(info, EVT_MASK_REASON_TEMPORARY);
+
+ clear_evtchn(evtchn);
+
+ irq_move_masked_irq(data);
+
+ do_unmask(info, EVT_MASK_REASON_TEMPORARY);
+ } else
+ clear_evtchn(evtchn);
+}
+
+static void lateeoi_mask_ack_dynirq(struct irq_data *data)
+{
+ struct irq_info *info = info_for_irq(data->irq);
+ evtchn_port_t evtchn = info ? info->evtchn : 0;
+
+ if (VALID_EVTCHN(evtchn)) {
+ do_mask(info, EVT_MASK_REASON_EXPLICIT);
+ ack_dynirq(data);
+ }
+}
+
static int retrigger_dynirq(struct irq_data *data)
{
- unsigned int evtchn = evtchn_from_irq(data->irq);
- int masked;
+ struct irq_info *info = info_for_irq(data->irq);
+ evtchn_port_t evtchn = info ? info->evtchn : 0;
if (!VALID_EVTCHN(evtchn))
return 0;
- masked = test_and_set_mask(evtchn);
+ do_mask(info, EVT_MASK_REASON_TEMPORARY);
set_evtchn(evtchn);
- if (!masked)
- unmask_evtchn(evtchn);
+ do_unmask(info, EVT_MASK_REASON_TEMPORARY);
return 1;
}
@@ -1499,10 +1926,11 @@
/* Clear an irq's pending state, in preparation for polling on it */
void xen_clear_irq_pending(int irq)
{
- int evtchn = evtchn_from_irq(irq);
+ struct irq_info *info = info_for_irq(irq);
+ evtchn_port_t evtchn = info ? info->evtchn : 0;
if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
+ event_handler_exit(info);
}
EXPORT_SYMBOL(xen_clear_irq_pending);
void xen_set_irq_pending(int irq)
@@ -1602,6 +2030,21 @@
.irq_retrigger = retrigger_dynirq,
};
+static struct irq_chip xen_lateeoi_chip __read_mostly = {
+ /* The chip name needs to contain "xen-dyn" for irqbalance to work. */
+ .name = "xen-dyn-lateeoi",
+
+ .irq_disable = disable_dynirq,
+ .irq_mask = disable_dynirq,
+ .irq_unmask = enable_dynirq,
+
+ .irq_ack = lateeoi_ack_dynirq,
+ .irq_mask_ack = lateeoi_mask_ack_dynirq,
+
+ .irq_set_affinity = set_affinity_irq,
+ .irq_retrigger = retrigger_dynirq,
+};
+
static struct irq_chip xen_pirq_chip __read_mostly = {
.name = "xen-pirq",
@@ -1632,16 +2075,6 @@
.irq_ack = ack_dynirq,
};
-int xen_set_callback_via(uint64_t via)
-{
- struct xen_hvm_param a;
- a.domid = DOMID_SELF;
- a.index = HVM_PARAM_CALLBACK_IRQ;
- a.value = via;
- return HYPERVISOR_hvm_op(HVMOP_set_param, &a);
-}
-EXPORT_SYMBOL_GPL(xen_set_callback_via);
-
#ifdef CONFIG_XEN_PVHVM
/* Vector callbacks are better than PCI interrupts to receive event
* channel notifications because we can receive vector callbacks on any
@@ -1668,12 +2101,31 @@
void xen_callback_vector(void) {}
#endif
-#undef MODULE_PARAM_PREFIX
-#define MODULE_PARAM_PREFIX "xen."
-
static bool fifo_events = true;
module_param(fifo_events, bool, 0);
+static int xen_evtchn_cpu_prepare(unsigned int cpu)
+{
+ int ret = 0;
+
+ xen_cpu_init_eoi(cpu);
+
+ if (evtchn_ops->percpu_init)
+ ret = evtchn_ops->percpu_init(cpu);
+
+ return ret;
+}
+
+static int xen_evtchn_cpu_dead(unsigned int cpu)
+{
+ int ret = 0;
+
+ if (evtchn_ops->percpu_deinit)
+ ret = evtchn_ops->percpu_deinit(cpu);
+
+ return ret;
+}
+
void __init xen_init_IRQ(void)
{
int ret = -EINVAL;
@@ -1684,6 +2136,12 @@
if (ret < 0)
xen_evtchn_2l_init();
+ xen_cpu_init_eoi(smp_processor_id());
+
+ cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE,
+ "xen/evtchn:prepare",
+ xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead);
+
evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()),
sizeof(*evtchn_to_irq), GFP_KERNEL);
BUG_ON(!evtchn_to_irq);
diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c
index 76b318e..360a7f8 100644
--- a/drivers/xen/events/events_fifo.c
+++ b/drivers/xen/events/events_fifo.c
@@ -209,12 +209,6 @@
return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word));
}
-static bool evtchn_fifo_test_and_set_mask(unsigned port)
-{
- event_word_t *word = event_word_from_port(port);
- return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
-}
-
static void evtchn_fifo_mask(unsigned port)
{
event_word_t *word = event_word_from_port(port);
@@ -227,19 +221,25 @@
return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word));
}
/*
- * Clear MASKED, spinning if BUSY is set.
+ * Clear MASKED if not PENDING, spinning if BUSY is set.
+ * Return true if mask was cleared.
*/
-static void clear_masked(volatile event_word_t *word)
+static bool clear_masked_cond(volatile event_word_t *word)
{
event_word_t new, old, w;
w = *word;
do {
+ if (w & (1 << EVTCHN_FIFO_PENDING))
+ return false;
+
old = w & ~(1 << EVTCHN_FIFO_BUSY);
new = old & ~(1 << EVTCHN_FIFO_MASKED);
w = sync_cmpxchg(word, old, new);
} while (w != old);
+
+ return true;
}
static void evtchn_fifo_unmask(unsigned port)
@@ -248,8 +248,7 @@
BUG_ON(!irqs_disabled());
- clear_masked(word);
- if (evtchn_fifo_is_pending(port)) {
+ if (!clear_masked_cond(word)) {
struct evtchn_unmask unmask = { .port = port };
(void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
}
@@ -270,19 +269,9 @@
return w & EVTCHN_FIFO_LINK_MASK;
}
-static void handle_irq_for_port(unsigned port)
-{
- int irq;
-
- irq = get_evtchn_to_irq(port);
- if (irq != -1)
- generic_handle_irq(irq);
-}
-
-static void consume_one_event(unsigned cpu,
+static void consume_one_event(unsigned cpu, struct evtchn_loop_ctrl *ctrl,
struct evtchn_fifo_control_block *control_block,
- unsigned priority, unsigned long *ready,
- bool drop)
+ unsigned priority, unsigned long *ready)
{
struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu);
uint32_t head;
@@ -315,16 +304,17 @@
clear_bit(priority, ready);
if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) {
- if (unlikely(drop))
+ if (unlikely(!ctrl))
pr_warn("Dropping pending event for port %u\n", port);
else
- handle_irq_for_port(port);
+ handle_irq_for_port(port, ctrl);
}
q->head[priority] = head;
}
-static void __evtchn_fifo_handle_events(unsigned cpu, bool drop)
+static void __evtchn_fifo_handle_events(unsigned cpu,
+ struct evtchn_loop_ctrl *ctrl)
{
struct evtchn_fifo_control_block *control_block;
unsigned long ready;
@@ -336,14 +326,15 @@
while (ready) {
q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES);
- consume_one_event(cpu, control_block, q, &ready, drop);
+ consume_one_event(cpu, ctrl, control_block, q, &ready);
ready |= xchg(&control_block->ready, 0);
}
}
-static void evtchn_fifo_handle_events(unsigned cpu)
+static void evtchn_fifo_handle_events(unsigned cpu,
+ struct evtchn_loop_ctrl *ctrl)
{
- __evtchn_fifo_handle_events(cpu, false);
+ __evtchn_fifo_handle_events(cpu, ctrl);
}
static void evtchn_fifo_resume(void)
@@ -380,21 +371,6 @@
event_array_pages = 0;
}
-static const struct evtchn_ops evtchn_ops_fifo = {
- .max_channels = evtchn_fifo_max_channels,
- .nr_channels = evtchn_fifo_nr_channels,
- .setup = evtchn_fifo_setup,
- .bind_to_cpu = evtchn_fifo_bind_to_cpu,
- .clear_pending = evtchn_fifo_clear_pending,
- .set_pending = evtchn_fifo_set_pending,
- .is_pending = evtchn_fifo_is_pending,
- .test_and_set_mask = evtchn_fifo_test_and_set_mask,
- .mask = evtchn_fifo_mask,
- .unmask = evtchn_fifo_unmask,
- .handle_events = evtchn_fifo_handle_events,
- .resume = evtchn_fifo_resume,
-};
-
static int evtchn_fifo_alloc_control_block(unsigned cpu)
{
void *control_block = NULL;
@@ -417,19 +393,35 @@
return ret;
}
-static int xen_evtchn_cpu_prepare(unsigned int cpu)
+static int evtchn_fifo_percpu_init(unsigned int cpu)
{
if (!per_cpu(cpu_control_block, cpu))
return evtchn_fifo_alloc_control_block(cpu);
return 0;
}
-static int xen_evtchn_cpu_dead(unsigned int cpu)
+static int evtchn_fifo_percpu_deinit(unsigned int cpu)
{
- __evtchn_fifo_handle_events(cpu, true);
+ __evtchn_fifo_handle_events(cpu, NULL);
return 0;
}
+static const struct evtchn_ops evtchn_ops_fifo = {
+ .max_channels = evtchn_fifo_max_channels,
+ .nr_channels = evtchn_fifo_nr_channels,
+ .setup = evtchn_fifo_setup,
+ .bind_to_cpu = evtchn_fifo_bind_to_cpu,
+ .clear_pending = evtchn_fifo_clear_pending,
+ .set_pending = evtchn_fifo_set_pending,
+ .is_pending = evtchn_fifo_is_pending,
+ .mask = evtchn_fifo_mask,
+ .unmask = evtchn_fifo_unmask,
+ .handle_events = evtchn_fifo_handle_events,
+ .resume = evtchn_fifo_resume,
+ .percpu_init = evtchn_fifo_percpu_init,
+ .percpu_deinit = evtchn_fifo_percpu_deinit,
+};
+
int __init xen_evtchn_fifo_init(void)
{
int cpu = smp_processor_id();
@@ -443,9 +435,5 @@
evtchn_ops = &evtchn_ops_fifo;
- cpuhp_setup_state_nocalls(CPUHP_XEN_EVTCHN_PREPARE,
- "xen/evtchn:prepare",
- xen_evtchn_cpu_prepare, xen_evtchn_cpu_dead);
-
return ret;
}
diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h
index 82938cf..eb012fb 100644
--- a/drivers/xen/events/events_internal.h
+++ b/drivers/xen/events/events_internal.h
@@ -30,11 +30,22 @@
*/
struct irq_info {
struct list_head list;
- int refcnt;
- enum xen_irq_type type; /* type */
+ struct list_head eoi_list;
+ short refcnt;
+ short spurious_cnt;
+ short type; /* type */
+ u8 mask_reason; /* Why is event channel masked */
+#define EVT_MASK_REASON_EXPLICIT 0x01
+#define EVT_MASK_REASON_TEMPORARY 0x02
+#define EVT_MASK_REASON_EOI_PENDING 0x04
+ u8 is_active; /* Is event just being handled? */
unsigned irq;
unsigned int evtchn; /* event channel */
unsigned short cpu; /* cpu bound */
+ unsigned short eoi_cpu; /* EOI must happen on this cpu */
+ unsigned int irq_epoch; /* If eoi_cpu valid: irq_epoch of event */
+ u64 eoi_time; /* Time in jiffies when to EOI. */
+ raw_spinlock_t lock;
union {
unsigned short virq;
@@ -53,28 +64,34 @@
#define PIRQ_SHAREABLE (1 << 1)
#define PIRQ_MSI_GROUP (1 << 2)
+struct evtchn_loop_ctrl;
+
struct evtchn_ops {
unsigned (*max_channels)(void);
unsigned (*nr_channels)(void);
int (*setup)(struct irq_info *info);
+ void (*remove)(evtchn_port_t port, unsigned int cpu);
void (*bind_to_cpu)(struct irq_info *info, unsigned cpu);
void (*clear_pending)(unsigned port);
void (*set_pending)(unsigned port);
bool (*is_pending)(unsigned port);
- bool (*test_and_set_mask)(unsigned port);
void (*mask)(unsigned port);
void (*unmask)(unsigned port);
- void (*handle_events)(unsigned cpu);
+ void (*handle_events)(unsigned cpu, struct evtchn_loop_ctrl *ctrl);
void (*resume)(void);
+
+ int (*percpu_init)(unsigned int cpu);
+ int (*percpu_deinit)(unsigned int cpu);
};
extern const struct evtchn_ops *evtchn_ops;
extern int **evtchn_to_irq;
int get_evtchn_to_irq(unsigned int evtchn);
+void handle_irq_for_port(evtchn_port_t port, struct evtchn_loop_ctrl *ctrl);
struct irq_info *info_for_irq(unsigned irq);
unsigned cpu_from_irq(unsigned irq);
@@ -96,6 +113,13 @@
return 0;
}
+static inline void xen_evtchn_port_remove(evtchn_port_t evtchn,
+ unsigned int cpu)
+{
+ if (evtchn_ops->remove)
+ evtchn_ops->remove(evtchn, cpu);
+}
+
static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info,
unsigned cpu)
{
@@ -117,11 +141,6 @@
return evtchn_ops->is_pending(port);
}
-static inline bool test_and_set_mask(unsigned port)
-{
- return evtchn_ops->test_and_set_mask(port);
-}
-
static inline void mask_evtchn(unsigned port)
{
return evtchn_ops->mask(port);
@@ -132,9 +151,10 @@
return evtchn_ops->unmask(port);
}
-static inline void xen_evtchn_handle_events(unsigned cpu)
+static inline void xen_evtchn_handle_events(unsigned cpu,
+ struct evtchn_loop_ctrl *ctrl)
{
- return evtchn_ops->handle_events(cpu);
+ return evtchn_ops->handle_events(cpu, ctrl);
}
static inline void xen_evtchn_resume(void)
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 052b55a..a439301 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -166,7 +166,6 @@
"Interrupt for port %d, but apparently not enabled; per-user %p\n",
evtchn->port, u);
- disable_irq_nosync(irq);
evtchn->enabled = false;
spin_lock(&u->ring_prod_lock);
@@ -292,7 +291,7 @@
evtchn = find_evtchn(u, port);
if (evtchn && !evtchn->enabled) {
evtchn->enabled = true;
- enable_irq(irq_from_evtchn(port));
+ xen_irq_lateeoi(irq_from_evtchn(port), 0);
}
}
@@ -392,8 +391,8 @@
if (rc < 0)
goto err;
- rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0,
- u->name, evtchn);
+ rc = bind_evtchn_to_irqhandler_lateeoi(port, evtchn_interrupt, 0,
+ u->name, evtchn);
if (rc < 0)
goto err;
diff --git a/drivers/xen/gntdev-dmabuf.c b/drivers/xen/gntdev-dmabuf.c
index 2c4f324..da79992 100644
--- a/drivers/xen/gntdev-dmabuf.c
+++ b/drivers/xen/gntdev-dmabuf.c
@@ -641,6 +641,14 @@
goto fail_detach;
}
+ /* Check that we have zero offset. */
+ if (sgt->sgl->offset) {
+ ret = ERR_PTR(-EINVAL);
+ pr_debug("DMA buffer has %d bytes offset, user-space expects 0\n",
+ sgt->sgl->offset);
+ goto fail_unmap;
+ }
+
/* Check number of pages that imported buffer has. */
if (attach->dmabuf->size != gntdev_dmabuf->nr_pages << PAGE_SHIFT) {
ret = ERR_PTR(-EINVAL);
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index 81401f3..e953ea3 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -319,44 +319,47 @@
* to the kernel linear addresses of the struct pages.
* These ptes are completely different from the user ptes dealt
* with find_grant_ptes.
+ * Note that GNTMAP_device_map isn't needed here: The
+ * dev_bus_addr output field gets consumed only from ->map_ops,
+ * and by not requesting it when mapping we also avoid needing
+ * to mirror dev_bus_addr into ->unmap_ops (and holding an extra
+ * reference to the page in the hypervisor).
*/
+ unsigned int flags = (map->flags & ~GNTMAP_device_map) |
+ GNTMAP_host_map;
+
for (i = 0; i < map->count; i++) {
unsigned long address = (unsigned long)
pfn_to_kaddr(page_to_pfn(map->pages[i]));
BUG_ON(PageHighMem(map->pages[i]));
- gnttab_set_map_op(&map->kmap_ops[i], address,
- map->flags | GNTMAP_host_map,
+ gnttab_set_map_op(&map->kmap_ops[i], address, flags,
map->grants[i].ref,
map->grants[i].domid);
gnttab_set_unmap_op(&map->kunmap_ops[i], address,
- map->flags | GNTMAP_host_map, -1);
+ flags, -1);
}
}
pr_debug("map %d+%d\n", map->index, map->count);
err = gnttab_map_refs(map->map_ops, use_ptemod ? map->kmap_ops : NULL,
map->pages, map->count);
- if (err)
- return err;
for (i = 0; i < map->count; i++) {
- if (map->map_ops[i].status) {
+ if (map->map_ops[i].status == GNTST_okay)
+ map->unmap_ops[i].handle = map->map_ops[i].handle;
+ else if (!err)
err = -EINVAL;
- continue;
- }
- map->unmap_ops[i].handle = map->map_ops[i].handle;
- if (use_ptemod)
- map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
-#ifdef CONFIG_XEN_GRANT_DMA_ALLOC
- else if (map->dma_vaddr) {
- unsigned long bfn;
+ if (map->flags & GNTMAP_device_map)
+ map->unmap_ops[i].dev_bus_addr = map->map_ops[i].dev_bus_addr;
- bfn = pfn_to_bfn(page_to_pfn(map->pages[i]));
- map->unmap_ops[i].dev_bus_addr = __pfn_to_phys(bfn);
+ if (use_ptemod) {
+ if (map->kmap_ops[i].status == GNTST_okay)
+ map->kunmap_ops[i].handle = map->kmap_ops[i].handle;
+ else if (!err)
+ err = -EINVAL;
}
-#endif
}
return err;
}
@@ -831,17 +834,18 @@
s16 __user *status[GNTDEV_COPY_BATCH];
unsigned int nr_ops;
unsigned int nr_pages;
+ bool writeable;
};
static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt,
- bool writeable, unsigned long *gfn)
+ unsigned long *gfn)
{
unsigned long addr = (unsigned long)virt;
struct page *page;
unsigned long xen_pfn;
int ret;
- ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page);
+ ret = get_user_pages_fast(addr, 1, batch->writeable ? FOLL_WRITE : 0, &page);
if (ret < 0)
return ret;
@@ -857,9 +861,13 @@
{
unsigned int i;
- for (i = 0; i < batch->nr_pages; i++)
+ for (i = 0; i < batch->nr_pages; i++) {
+ if (batch->writeable && !PageDirty(batch->pages[i]))
+ set_page_dirty_lock(batch->pages[i]);
put_page(batch->pages[i]);
+ }
batch->nr_pages = 0;
+ batch->writeable = false;
}
static int gntdev_copy(struct gntdev_copy_batch *batch)
@@ -948,8 +956,9 @@
virt = seg->source.virt + copied;
off = (unsigned long)virt & ~XEN_PAGE_MASK;
len = min(len, (size_t)XEN_PAGE_SIZE - off);
+ batch->writeable = false;
- ret = gntdev_get_page(batch, virt, false, &gfn);
+ ret = gntdev_get_page(batch, virt, &gfn);
if (ret < 0)
return ret;
@@ -967,8 +976,9 @@
virt = seg->dest.virt + copied;
off = (unsigned long)virt & ~XEN_PAGE_MASK;
len = min(len, (size_t)XEN_PAGE_SIZE - off);
+ batch->writeable = true;
- ret = gntdev_get_page(batch, virt, true, &gfn);
+ ret = gntdev_get_page(batch, virt, &gfn);
if (ret < 0)
return ret;
diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c
index 5e30602..c456464 100644
--- a/drivers/xen/platform-pci.c
+++ b/drivers/xen/platform-pci.c
@@ -149,7 +149,6 @@
ret = gnttab_init();
if (ret)
goto grant_out;
- xenbus_probe(NULL);
return 0;
grant_out:
gnttab_free_auto_xlat_frames();
diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
index 8b9919c..98a9d68 100644
--- a/drivers/xen/preempt.c
+++ b/drivers/xen/preempt.c
@@ -27,13 +27,15 @@
asmlinkage __visible void xen_maybe_preempt_hcall(void)
{
if (unlikely(__this_cpu_read(xen_in_preemptible_hcall)
- && need_resched())) {
+ && need_resched() && !preempt_count())) {
/*
* Clear flag as we may be rescheduled on a different
* cpu.
*/
__this_cpu_write(xen_in_preemptible_hcall, false);
- _cond_resched();
+ local_irq_enable();
+ cond_resched();
+ local_irq_disable();
__this_cpu_write(xen_in_preemptible_hcall, true);
}
}
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index c6070e7..9c9422e 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -724,14 +724,15 @@
return 0;
}
-static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata)
+static long privcmd_ioctl_mmap_resource(struct file *file,
+ struct privcmd_mmap_resource __user *udata)
{
struct privcmd_data *data = file->private_data;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct privcmd_mmap_resource kdata;
xen_pfn_t *pfns = NULL;
- struct xen_mem_acquire_resource xdata;
+ struct xen_mem_acquire_resource xdata = { };
int rc;
if (copy_from_user(&kdata, udata, sizeof(kdata)))
@@ -741,6 +742,22 @@
if (data->domid != DOMID_INVALID && data->domid != kdata.dom)
return -EPERM;
+ /* Both fields must be set or unset */
+ if (!!kdata.addr != !!kdata.num)
+ return -EINVAL;
+
+ xdata.domid = kdata.dom;
+ xdata.type = kdata.type;
+ xdata.id = kdata.id;
+
+ if (!kdata.addr && !kdata.num) {
+ /* Query the size of the resource. */
+ rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata);
+ if (rc)
+ return rc;
+ return __put_user(xdata.nr_frames, &udata->num);
+ }
+
down_write(&mm->mmap_sem);
vma = find_vma(mm, kdata.addr);
@@ -775,10 +792,6 @@
} else
vma->vm_private_data = PRIV_VMA_LOCKED;
- memset(&xdata, 0, sizeof(xdata));
- xdata.domid = kdata.dom;
- xdata.type = kdata.type;
- xdata.id = kdata.id;
xdata.frame = kdata.idx;
xdata.nr_frames = kdata.num;
set_xen_guest_handle(xdata.frame_list, pfns);
diff --git a/drivers/xen/pvcalls-back.c b/drivers/xen/pvcalls-back.c
index c57c71b..9439de2 100644
--- a/drivers/xen/pvcalls-back.c
+++ b/drivers/xen/pvcalls-back.c
@@ -66,6 +66,7 @@
atomic_t write;
atomic_t io;
atomic_t release;
+ atomic_t eoi;
void (*saved_data_ready)(struct sock *sk);
struct pvcalls_ioworker ioworker;
};
@@ -87,7 +88,7 @@
struct pvcalls_fedata *fedata,
struct sock_mapping *map);
-static void pvcalls_conn_back_read(void *opaque)
+static bool pvcalls_conn_back_read(void *opaque)
{
struct sock_mapping *map = (struct sock_mapping *)opaque;
struct msghdr msg;
@@ -107,17 +108,17 @@
virt_mb();
if (error)
- return;
+ return false;
size = pvcalls_queued(prod, cons, array_size);
if (size >= array_size)
- return;
+ return false;
spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags);
if (skb_queue_empty(&map->sock->sk->sk_receive_queue)) {
atomic_set(&map->read, 0);
spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock,
flags);
- return;
+ return true;
}
spin_unlock_irqrestore(&map->sock->sk->sk_receive_queue.lock, flags);
wanted = array_size - size;
@@ -141,7 +142,7 @@
ret = inet_recvmsg(map->sock, &msg, wanted, MSG_DONTWAIT);
WARN_ON(ret > wanted);
if (ret == -EAGAIN) /* shouldn't happen */
- return;
+ return true;
if (!ret)
ret = -ENOTCONN;
spin_lock_irqsave(&map->sock->sk->sk_receive_queue.lock, flags);
@@ -160,10 +161,10 @@
virt_wmb();
notify_remote_via_irq(map->irq);
- return;
+ return true;
}
-static void pvcalls_conn_back_write(struct sock_mapping *map)
+static bool pvcalls_conn_back_write(struct sock_mapping *map)
{
struct pvcalls_data_intf *intf = map->ring;
struct pvcalls_data *data = &map->data;
@@ -180,7 +181,7 @@
array_size = XEN_FLEX_RING_SIZE(map->ring_order);
size = pvcalls_queued(prod, cons, array_size);
if (size == 0)
- return;
+ return false;
memset(&msg, 0, sizeof(msg));
msg.msg_flags |= MSG_DONTWAIT;
@@ -198,12 +199,11 @@
atomic_set(&map->write, 0);
ret = inet_sendmsg(map->sock, &msg, size);
- if (ret == -EAGAIN || (ret >= 0 && ret < size)) {
+ if (ret == -EAGAIN) {
atomic_inc(&map->write);
atomic_inc(&map->io);
+ return true;
}
- if (ret == -EAGAIN)
- return;
/* write the data, then update the indexes */
virt_wmb();
@@ -216,9 +216,13 @@
}
/* update the indexes, then notify the other end */
virt_wmb();
- if (prod != cons + ret)
+ if (prod != cons + ret) {
atomic_inc(&map->write);
+ atomic_inc(&map->io);
+ }
notify_remote_via_irq(map->irq);
+
+ return true;
}
static void pvcalls_back_ioworker(struct work_struct *work)
@@ -227,6 +231,7 @@
struct pvcalls_ioworker, register_work);
struct sock_mapping *map = container_of(ioworker, struct sock_mapping,
ioworker);
+ unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
while (atomic_read(&map->io) > 0) {
if (atomic_read(&map->release) > 0) {
@@ -234,10 +239,18 @@
return;
}
- if (atomic_read(&map->read) > 0)
- pvcalls_conn_back_read(map);
- if (atomic_read(&map->write) > 0)
- pvcalls_conn_back_write(map);
+ if (atomic_read(&map->read) > 0 &&
+ pvcalls_conn_back_read(map))
+ eoi_flags = 0;
+ if (atomic_read(&map->write) > 0 &&
+ pvcalls_conn_back_write(map))
+ eoi_flags = 0;
+
+ if (atomic_read(&map->eoi) > 0 && !atomic_read(&map->write)) {
+ atomic_set(&map->eoi, 0);
+ xen_irq_lateeoi(map->irq, eoi_flags);
+ eoi_flags = XEN_EOI_FLAG_SPURIOUS;
+ }
atomic_dec(&map->io);
}
@@ -334,12 +347,9 @@
goto out;
map->bytes = page;
- ret = bind_interdomain_evtchn_to_irqhandler(fedata->dev->otherend_id,
- evtchn,
- pvcalls_back_conn_event,
- 0,
- "pvcalls-backend",
- map);
+ ret = bind_interdomain_evtchn_to_irqhandler_lateeoi(
+ fedata->dev->otherend_id, evtchn,
+ pvcalls_back_conn_event, 0, "pvcalls-backend", map);
if (ret < 0)
goto out;
map->irq = ret;
@@ -873,15 +883,18 @@
{
struct xenbus_device *dev = dev_id;
struct pvcalls_fedata *fedata = NULL;
+ unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
- if (dev == NULL)
- return IRQ_HANDLED;
+ if (dev) {
+ fedata = dev_get_drvdata(&dev->dev);
+ if (fedata) {
+ pvcalls_back_work(fedata);
+ eoi_flags = 0;
+ }
+ }
- fedata = dev_get_drvdata(&dev->dev);
- if (fedata == NULL)
- return IRQ_HANDLED;
+ xen_irq_lateeoi(irq, eoi_flags);
- pvcalls_back_work(fedata);
return IRQ_HANDLED;
}
@@ -891,12 +904,15 @@
struct pvcalls_ioworker *iow;
if (map == NULL || map->sock == NULL || map->sock->sk == NULL ||
- map->sock->sk->sk_user_data != map)
+ map->sock->sk->sk_user_data != map) {
+ xen_irq_lateeoi(irq, 0);
return IRQ_HANDLED;
+ }
iow = &map->ioworker;
atomic_inc(&map->write);
+ atomic_inc(&map->eoi);
atomic_inc(&map->io);
queue_work(iow->wq, &iow->register_work);
@@ -931,7 +947,7 @@
goto error;
}
- err = bind_interdomain_evtchn_to_irq(dev->otherend_id, evtchn);
+ err = bind_interdomain_evtchn_to_irq_lateeoi(dev->otherend_id, evtchn);
if (err < 0)
goto error;
fedata->irq = err;
@@ -1087,7 +1103,8 @@
case XenbusStateInitialised:
switch (state) {
case XenbusStateConnected:
- backend_connect(dev);
+ if (backend_connect(dev))
+ return;
xenbus_switch_state(dev, XenbusStateConnected);
break;
case XenbusStateClosing:
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index bd3a10d..0634642 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -335,6 +335,7 @@
int order = get_order(size);
phys_addr_t phys;
u64 dma_mask = DMA_BIT_MASK(32);
+ struct page *page;
if (hwdev && hwdev->coherent_dma_mask)
dma_mask = hwdev->coherent_dma_mask;
@@ -346,9 +347,14 @@
/* Convert the size to actually allocated. */
size = 1UL << (order + XEN_PAGE_SHIFT);
+ if (is_vmalloc_addr(vaddr))
+ page = vmalloc_to_page(vaddr);
+ else
+ page = virt_to_page(vaddr);
+
if (!WARN_ON((dev_addr + size - 1 > dma_mask) ||
range_straddles_page_boundary(phys, size)) &&
- TestClearPageXenRemapped(virt_to_page(vaddr)))
+ TestClearPageXenRemapped(page))
xen_destroy_contiguous_region(phys, order);
xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs);
diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c
index 6d12fc3..a8d2443 100644
--- a/drivers/xen/xen-balloon.c
+++ b/drivers/xen/xen-balloon.c
@@ -94,7 +94,7 @@
"%llu", &static_max) == 1))
static_max >>= PAGE_SHIFT - 10;
else
- static_max = new_target;
+ static_max = balloon_stats.current_pages;
target_diff = (xen_pv_domain() || xen_initial_domain()) ? 0
: static_max - balloon_stats.target_pages;
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
index 097410a..adf3aae 100644
--- a/drivers/xen/xen-pciback/pci_stub.c
+++ b/drivers/xen/xen-pciback/pci_stub.c
@@ -733,10 +733,17 @@
wmb();
notify_remote_via_irq(pdev->evtchn_irq);
+ /* Enable IRQ to signal "request done". */
+ xen_pcibk_lateeoi(pdev, 0);
+
ret = wait_event_timeout(xen_pcibk_aer_wait_queue,
!(test_bit(_XEN_PCIB_active, (unsigned long *)
&sh_info->flags)), 300*HZ);
+ /* Enable IRQ for pcifront request if not already active. */
+ if (!test_bit(_PDEVF_op_active, &pdev->flags))
+ xen_pcibk_lateeoi(pdev, 0);
+
if (!ret) {
if (test_bit(_XEN_PCIB_active,
(unsigned long *)&sh_info->flags)) {
@@ -750,13 +757,6 @@
}
clear_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags);
- if (test_bit(_XEN_PCIF_active,
- (unsigned long *)&sh_info->flags)) {
- dev_dbg(&psdev->dev->dev,
- "schedule pci_conf service in " DRV_NAME "\n");
- xen_pcibk_test_and_schedule_op(psdev->pdev);
- }
-
res = (pci_ers_result_t)aer_op->err;
return res;
}
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
index 263c059..235cdfe 100644
--- a/drivers/xen/xen-pciback/pciback.h
+++ b/drivers/xen/xen-pciback/pciback.h
@@ -14,6 +14,7 @@
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/atomic.h>
+#include <xen/events.h>
#include <xen/interface/io/pciif.h>
#define DRV_NAME "xen-pciback"
@@ -27,6 +28,8 @@
#define PDEVF_op_active (1<<(_PDEVF_op_active))
#define _PCIB_op_pending (1)
#define PCIB_op_pending (1<<(_PCIB_op_pending))
+#define _EOI_pending (2)
+#define EOI_pending (1<<(_EOI_pending))
struct xen_pcibk_device {
void *pci_dev_data;
@@ -182,12 +185,17 @@
irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id);
void xen_pcibk_do_op(struct work_struct *data);
+static inline void xen_pcibk_lateeoi(struct xen_pcibk_device *pdev,
+ unsigned int eoi_flag)
+{
+ if (test_and_clear_bit(_EOI_pending, &pdev->flags))
+ xen_irq_lateeoi(pdev->evtchn_irq, eoi_flag);
+}
+
int xen_pcibk_xenbus_register(void);
void xen_pcibk_xenbus_unregister(void);
extern int verbose_request;
-
-void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev);
#endif
/* Handles shared IRQs that can to device domain and control domain. */
diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c
index 787966f..c4ed2c6 100644
--- a/drivers/xen/xen-pciback/pciback_ops.c
+++ b/drivers/xen/xen-pciback/pciback_ops.c
@@ -297,26 +297,41 @@
return 0;
}
#endif
+
+static inline bool xen_pcibk_test_op_pending(struct xen_pcibk_device *pdev)
+{
+ return test_bit(_XEN_PCIF_active,
+ (unsigned long *)&pdev->sh_info->flags) &&
+ !test_and_set_bit(_PDEVF_op_active, &pdev->flags);
+}
+
/*
* Now the same evtchn is used for both pcifront conf_read_write request
* as well as pcie aer front end ack. We use a new work_queue to schedule
* xen_pcibk conf_read_write service for avoiding confict with aer_core
* do_recovery job which also use the system default work_queue
*/
-void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev)
+static void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev)
{
+ bool eoi = true;
+
/* Check that frontend is requesting an operation and that we are not
* already processing a request */
- if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)
- && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) {
+ if (xen_pcibk_test_op_pending(pdev)) {
schedule_work(&pdev->op_work);
+ eoi = false;
}
/*_XEN_PCIB_active should have been cleared by pcifront. And also make
sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/
if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags)
&& test_bit(_PCIB_op_pending, &pdev->flags)) {
wake_up(&xen_pcibk_aer_wait_queue);
+ eoi = false;
}
+
+ /* EOI if there was nothing to do. */
+ if (eoi)
+ xen_pcibk_lateeoi(pdev, XEN_EOI_FLAG_SPURIOUS);
}
/* Performing the configuration space reads/writes must not be done in atomic
@@ -324,10 +339,8 @@
* use of semaphores). This function is intended to be called from a work
* queue in process context taking a struct xen_pcibk_device as a parameter */
-void xen_pcibk_do_op(struct work_struct *data)
+static void xen_pcibk_do_one_op(struct xen_pcibk_device *pdev)
{
- struct xen_pcibk_device *pdev =
- container_of(data, struct xen_pcibk_device, op_work);
struct pci_dev *dev;
struct xen_pcibk_dev_data *dev_data = NULL;
struct xen_pci_op *op = &pdev->op;
@@ -400,16 +413,31 @@
smp_mb__before_atomic(); /* /after/ clearing PCIF_active */
clear_bit(_PDEVF_op_active, &pdev->flags);
smp_mb__after_atomic(); /* /before/ final check for work */
+}
- /* Check to see if the driver domain tried to start another request in
- * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active.
- */
- xen_pcibk_test_and_schedule_op(pdev);
+void xen_pcibk_do_op(struct work_struct *data)
+{
+ struct xen_pcibk_device *pdev =
+ container_of(data, struct xen_pcibk_device, op_work);
+
+ do {
+ xen_pcibk_do_one_op(pdev);
+ } while (xen_pcibk_test_op_pending(pdev));
+
+ xen_pcibk_lateeoi(pdev, 0);
}
irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id)
{
struct xen_pcibk_device *pdev = dev_id;
+ bool eoi;
+
+ /* IRQs might come in before pdev->evtchn_irq is written. */
+ if (unlikely(pdev->evtchn_irq != irq))
+ pdev->evtchn_irq = irq;
+
+ eoi = test_and_set_bit(_EOI_pending, &pdev->flags);
+ WARN(eoi, "IRQ while EOI pending\n");
xen_pcibk_test_and_schedule_op(pdev);
diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c
index f6ba181..3031308 100644
--- a/drivers/xen/xen-pciback/vpci.c
+++ b/drivers/xen/xen-pciback/vpci.c
@@ -69,7 +69,7 @@
struct pci_dev *dev, int devid,
publish_pci_dev_cb publish_cb)
{
- int err = 0, slot, func = -1;
+ int err = 0, slot, func = PCI_FUNC(dev->devfn);
struct pci_dev_entry *t, *dev_entry;
struct vpci_dev_data *vpci_dev = pdev->pci_dev_data;
@@ -94,23 +94,26 @@
/*
* Keep multi-function devices together on the virtual PCI bus, except
- * virtual functions.
+ * that we want to keep virtual functions at func 0 on their own. They
+ * aren't multi-function devices and hence their presence at func 0
+ * may cause guests to not scan the other functions.
*/
- if (!dev->is_virtfn) {
+ if (!dev->is_virtfn || func) {
for (slot = 0; slot < PCI_SLOT_MAX; slot++) {
if (list_empty(&vpci_dev->dev_list[slot]))
continue;
t = list_entry(list_first(&vpci_dev->dev_list[slot]),
struct pci_dev_entry, list);
+ if (t->dev->is_virtfn && !PCI_FUNC(t->dev->devfn))
+ continue;
if (match_slot(dev, t->dev)) {
pr_info("vpci: %s: assign to virtual slot %d func %d\n",
pci_name(dev), slot,
- PCI_FUNC(dev->devfn));
+ func);
list_add_tail(&dev_entry->list,
&vpci_dev->dev_list[slot]);
- func = PCI_FUNC(dev->devfn);
goto unlock;
}
}
@@ -123,7 +126,6 @@
pci_name(dev), slot);
list_add_tail(&dev_entry->list,
&vpci_dev->dev_list[slot]);
- func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn);
goto unlock;
}
}
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
index 833b2d2..a255b08 100644
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -123,7 +123,7 @@
pdev->sh_info = vaddr;
- err = bind_interdomain_evtchn_to_irqhandler(
+ err = bind_interdomain_evtchn_to_irqhandler_lateeoi(
pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event,
0, DRV_NAME, pdev);
if (err < 0) {
@@ -358,7 +358,8 @@
return err;
}
-static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev)
+static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev,
+ enum xenbus_state state)
{
int err = 0;
int num_devs;
@@ -372,9 +373,7 @@
dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n");
mutex_lock(&pdev->dev_lock);
- /* Make sure we only reconfigure once */
- if (xenbus_read_driver_state(pdev->xdev->nodename) !=
- XenbusStateReconfiguring)
+ if (xenbus_read_driver_state(pdev->xdev->nodename) != state)
goto out;
err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d",
@@ -499,6 +498,10 @@
}
}
+ if (state != XenbusStateReconfiguring)
+ /* Make sure we only reconfigure once. */
+ goto out;
+
err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured);
if (err) {
xenbus_dev_fatal(pdev->xdev, err,
@@ -524,7 +527,7 @@
break;
case XenbusStateReconfiguring:
- xen_pcibk_reconfigure(pdev);
+ xen_pcibk_reconfigure(pdev, XenbusStateReconfiguring);
break;
case XenbusStateConnected:
@@ -663,6 +666,15 @@
xen_pcibk_setup_backend(pdev);
break;
+ case XenbusStateInitialised:
+ /*
+ * We typically move to Initialised when the first device was
+ * added. Hence subsequent devices getting added may need
+ * reconfiguring.
+ */
+ xen_pcibk_reconfigure(pdev, XenbusStateInitialised);
+ break;
+
default:
break;
}
@@ -688,7 +700,7 @@
/* watch the backend node for backend configuration information */
err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch,
- xen_pcibk_be_watch);
+ NULL, xen_pcibk_be_watch);
if (err)
goto out;
diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c
index ba0942e..32aba2e 100644
--- a/drivers/xen/xen-scsiback.c
+++ b/drivers/xen/xen-scsiback.c
@@ -91,7 +91,6 @@
unsigned int irq;
struct vscsiif_back_ring ring;
- int ring_error;
spinlock_t ring_lock;
atomic_t nr_unreplied_reqs;
@@ -423,12 +422,12 @@
return 0;
err = gnttab_map_refs(map, NULL, pg, cnt);
- BUG_ON(err);
for (i = 0; i < cnt; i++) {
if (unlikely(map[i].status != GNTST_okay)) {
pr_err("invalid buffer -- could not remap it\n");
map[i].handle = SCSIBACK_INVALID_HANDLE;
- err = -ENOMEM;
+ if (!err)
+ err = -ENOMEM;
} else {
get_page(pg[i]);
}
@@ -722,7 +721,8 @@
return pending_req;
}
-static int scsiback_do_cmd_fn(struct vscsibk_info *info)
+static int scsiback_do_cmd_fn(struct vscsibk_info *info,
+ unsigned int *eoi_flags)
{
struct vscsiif_back_ring *ring = &info->ring;
struct vscsiif_request ring_req;
@@ -739,11 +739,12 @@
rc = ring->rsp_prod_pvt;
pr_warn("Dom%d provided bogus ring requests (%#x - %#x = %u). Halting ring processing\n",
info->domid, rp, rc, rp - rc);
- info->ring_error = 1;
- return 0;
+ return -EINVAL;
}
while ((rc != rp)) {
+ *eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;
+
if (RING_REQUEST_CONS_OVERFLOW(ring, rc))
break;
@@ -802,13 +803,16 @@
static irqreturn_t scsiback_irq_fn(int irq, void *dev_id)
{
struct vscsibk_info *info = dev_id;
+ int rc;
+ unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
- if (info->ring_error)
- return IRQ_HANDLED;
-
- while (scsiback_do_cmd_fn(info))
+ while ((rc = scsiback_do_cmd_fn(info, &eoi_flags)) > 0)
cond_resched();
+ /* In case of a ring error we keep the event channel masked. */
+ if (!rc)
+ xen_irq_lateeoi(irq, eoi_flags);
+
return IRQ_HANDLED;
}
@@ -829,7 +833,7 @@
sring = (struct vscsiif_sring *)area;
BACK_RING_INIT(&info->ring, sring, PAGE_SIZE);
- err = bind_interdomain_evtchn_to_irq(info->domid, evtchn);
+ err = bind_interdomain_evtchn_to_irq_lateeoi(info->domid, evtchn);
if (err < 0)
goto unmap_page;
@@ -1252,7 +1256,6 @@
info->domid = dev->otherend_id;
spin_lock_init(&info->ring_lock);
- info->ring_error = 0;
atomic_set(&info->nr_unreplied_reqs, 0);
init_waitqueue_head(&info->waiting_to_free);
info->dev = dev;
diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h
index d75a238..88516a8 100644
--- a/drivers/xen/xenbus/xenbus.h
+++ b/drivers/xen/xenbus/xenbus.h
@@ -44,6 +44,8 @@
int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename);
int (*probe)(struct xen_bus_type *bus, const char *type,
const char *dir);
+ bool (*otherend_will_handle)(struct xenbus_watch *watch,
+ const char *path, const char *token);
void (*otherend_changed)(struct xenbus_watch *watch, const char *path,
const char *token);
struct bus_type bus;
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index e17ca81..81eddb8 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -114,18 +114,22 @@
*/
int xenbus_watch_path(struct xenbus_device *dev, const char *path,
struct xenbus_watch *watch,
+ bool (*will_handle)(struct xenbus_watch *,
+ const char *, const char *),
void (*callback)(struct xenbus_watch *,
const char *, const char *))
{
int err;
watch->node = path;
+ watch->will_handle = will_handle;
watch->callback = callback;
err = register_xenbus_watch(watch);
if (err) {
watch->node = NULL;
+ watch->will_handle = NULL;
watch->callback = NULL;
xenbus_dev_fatal(dev, err, "adding watch on %s", path);
}
@@ -152,6 +156,8 @@
*/
int xenbus_watch_pathfmt(struct xenbus_device *dev,
struct xenbus_watch *watch,
+ bool (*will_handle)(struct xenbus_watch *,
+ const char *, const char *),
void (*callback)(struct xenbus_watch *,
const char *, const char *),
const char *pathfmt, ...)
@@ -168,7 +174,7 @@
xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch");
return -ENOMEM;
}
- err = xenbus_watch_path(dev, path, watch, callback);
+ err = xenbus_watch_path(dev, path, watch, will_handle, callback);
if (err)
kfree(path);
@@ -363,8 +369,14 @@
int i, j;
for (i = 0; i < nr_pages; i++) {
- err = gnttab_grant_foreign_access(dev->otherend_id,
- virt_to_gfn(vaddr), 0);
+ unsigned long gfn;
+
+ if (is_vmalloc_addr(vaddr))
+ gfn = pfn_to_gfn(vmalloc_to_pfn(vaddr));
+ else
+ gfn = virt_to_gfn(vaddr);
+
+ err = gnttab_grant_foreign_access(dev->otherend_id, gfn, 0);
if (err < 0) {
xenbus_dev_fatal(dev, err,
"granting access to ring page");
@@ -448,7 +460,14 @@
int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t *gnt_refs,
unsigned int nr_grefs, void **vaddr)
{
- return ring_ops->map(dev, gnt_refs, nr_grefs, vaddr);
+ int err;
+
+ err = ring_ops->map(dev, gnt_refs, nr_grefs, vaddr);
+ /* Some hypervisors are buggy and can return 1. */
+ if (err > 0)
+ err = GNTST_general_error;
+
+ return err;
}
EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c
index d239fc3..e5fda02 100644
--- a/drivers/xen/xenbus/xenbus_comms.c
+++ b/drivers/xen/xenbus/xenbus_comms.c
@@ -57,16 +57,8 @@
static int xenbus_irq;
static struct task_struct *xenbus_task;
-static DECLARE_WORK(probe_work, xenbus_probe);
-
-
static irqreturn_t wake_waiting(int irq, void *unused)
{
- if (unlikely(xenstored_ready == 0)) {
- xenstored_ready = 1;
- schedule_work(&probe_work);
- }
-
wake_up(&xb_waitq);
return IRQ_HANDLED;
}
@@ -313,6 +305,8 @@
req->msg.type = state.msg.type;
req->msg.len = state.msg.len;
req->body = state.body;
+ /* write body, then update state */
+ virt_wmb();
req->state = xb_req_state_got_reply;
req->cb(req);
} else
@@ -395,6 +389,8 @@
if (state.req->state == xb_req_state_aborted)
kfree(state.req);
else {
+ /* write err, then update state */
+ virt_wmb();
state.req->state = xb_req_state_got_reply;
wake_up(&state.req->wq);
}
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 5b47188..652894d 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -136,6 +136,7 @@
container_of(dev->dev.bus, struct xen_bus_type, bus);
return xenbus_watch_pathfmt(dev, &dev->otherend_watch,
+ bus->otherend_will_handle,
bus->otherend_changed,
"%s/%s", dev->otherend, "state");
}
@@ -682,29 +683,107 @@
}
EXPORT_SYMBOL_GPL(unregister_xenstore_notifier);
-void xenbus_probe(struct work_struct *unused)
+static void xenbus_probe(void)
{
xenstored_ready = 1;
+ /*
+ * In the HVM case, xenbus_init() deferred its call to
+ * xs_init() in case callbacks were not operational yet.
+ * So do it now.
+ */
+ if (xen_store_domain_type == XS_HVM)
+ xs_init();
+
/* Notify others that xenstore is up */
blocking_notifier_call_chain(&xenstore_chain, 0, NULL);
}
-EXPORT_SYMBOL_GPL(xenbus_probe);
-static int __init xenbus_probe_initcall(void)
+/*
+ * Returns true when XenStore init must be deferred in order to
+ * allow the PCI platform device to be initialised, before we
+ * can actually have event channel interrupts working.
+ */
+static bool xs_hvm_defer_init_for_callback(void)
{
- if (!xen_domain())
- return -ENODEV;
+#ifdef CONFIG_XEN_PVHVM
+ return xen_store_domain_type == XS_HVM &&
+ !xen_have_vector_callback;
+#else
+ return false;
+#endif
+}
- if (xen_initial_domain() || xen_hvm_domain())
- return 0;
+static int xenbus_probe_thread(void *unused)
+{
+ DEFINE_WAIT(w);
- xenbus_probe(NULL);
+ /*
+ * We actually just want to wait for *any* trigger of xb_waitq,
+ * and run xenbus_probe() the moment it occurs.
+ */
+ prepare_to_wait(&xb_waitq, &w, TASK_INTERRUPTIBLE);
+ schedule();
+ finish_wait(&xb_waitq, &w);
+
+ DPRINTK("probing");
+ xenbus_probe();
return 0;
}
+static int __init xenbus_probe_initcall(void)
+{
+ /*
+ * Probe XenBus here in the XS_PV case, and also XS_HVM unless we
+ * need to wait for the platform PCI device to come up.
+ */
+ if (xen_store_domain_type == XS_PV ||
+ (xen_store_domain_type == XS_HVM &&
+ !xs_hvm_defer_init_for_callback()))
+ xenbus_probe();
+
+ /*
+ * For XS_LOCAL, spawn a thread which will wait for xenstored
+ * or a xenstore-stubdom to be started, then probe. It will be
+ * triggered when communication starts happening, by waiting
+ * on xb_waitq.
+ */
+ if (xen_store_domain_type == XS_LOCAL) {
+ struct task_struct *probe_task;
+
+ probe_task = kthread_run(xenbus_probe_thread, NULL,
+ "xenbus_probe");
+ if (IS_ERR(probe_task))
+ return PTR_ERR(probe_task);
+ }
+ return 0;
+}
device_initcall(xenbus_probe_initcall);
+int xen_set_callback_via(uint64_t via)
+{
+ struct xen_hvm_param a;
+ int ret;
+
+ a.domid = DOMID_SELF;
+ a.index = HVM_PARAM_CALLBACK_IRQ;
+ a.value = via;
+
+ ret = HYPERVISOR_hvm_op(HVMOP_set_param, &a);
+ if (ret)
+ return ret;
+
+ /*
+ * If xenbus_probe_initcall() deferred the xenbus_probe()
+ * due to the callback not functioning yet, we can do it now.
+ */
+ if (!xenstored_ready && xs_hvm_defer_init_for_callback())
+ xenbus_probe();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xen_set_callback_via);
+
/* Set up event channel for xenstored which is run as a local process
* (this is normally used only in dom0)
*/
@@ -817,11 +896,17 @@
break;
}
- /* Initialize the interface to xenstore. */
- err = xs_init();
- if (err) {
- pr_warn("Error initializing xenstore comms: %i\n", err);
- goto out_error;
+ /*
+ * HVM domains may not have a functional callback yet. In that
+ * case let xs_init() be called from xenbus_probe(), which will
+ * get invoked at an appropriate time.
+ */
+ if (xen_store_domain_type != XS_HVM) {
+ err = xs_init();
+ if (err) {
+ pr_warn("Error initializing xenstore comms: %i\n", err);
+ goto out_error;
+ }
}
if ((xen_store_domain_type != XS_LOCAL) &&
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c
index b0bed4f..4bb6030 100644
--- a/drivers/xen/xenbus/xenbus_probe_backend.c
+++ b/drivers/xen/xenbus/xenbus_probe_backend.c
@@ -180,6 +180,12 @@
return err;
}
+static bool frontend_will_handle(struct xenbus_watch *watch,
+ const char *path, const char *token)
+{
+ return watch->nr_pending == 0;
+}
+
static void frontend_changed(struct xenbus_watch *watch,
const char *path, const char *token)
{
@@ -191,6 +197,7 @@
.levels = 3, /* backend/type/<frontend>/<id> */
.get_bus_id = backend_bus_id,
.probe = xenbus_probe_backend,
+ .otherend_will_handle = frontend_will_handle,
.otherend_changed = frontend_changed,
.bus = {
.name = "xen-backend",
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index ddc18da..12e02eb 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -191,8 +191,11 @@
static bool test_reply(struct xb_req_data *req)
{
- if (req->state == xb_req_state_got_reply || !xenbus_ok())
+ if (req->state == xb_req_state_got_reply || !xenbus_ok()) {
+ /* read req->state before all other fields */
+ virt_rmb();
return true;
+ }
/* Make sure to reread req->state each time. */
barrier();
@@ -202,7 +205,7 @@
static void *read_reply(struct xb_req_data *req)
{
- while (req->state != xb_req_state_got_reply) {
+ do {
wait_event(req->wq, test_reply(req));
if (!xenbus_ok())
@@ -216,7 +219,7 @@
if (req->err)
return ERR_PTR(req->err);
- }
+ } while (req->state != xb_req_state_got_reply);
return req->body;
}
@@ -702,9 +705,13 @@
spin_lock(&watches_lock);
event->handle = find_watch(event->token);
- if (event->handle != NULL) {
+ if (event->handle != NULL &&
+ (!event->handle->will_handle ||
+ event->handle->will_handle(event->handle,
+ event->path, event->token))) {
spin_lock(&watch_events_lock);
list_add_tail(&event->list, &watch_events);
+ event->handle->nr_pending++;
wake_up(&watch_events_waitq);
spin_unlock(&watch_events_lock);
} else
@@ -762,6 +769,8 @@
sprintf(token, "%lX", (long)watch);
+ watch->nr_pending = 0;
+
down_read(&xs_watch_rwsem);
spin_lock(&watches_lock);
@@ -811,11 +820,14 @@
/* Cancel pending watch events. */
spin_lock(&watch_events_lock);
- list_for_each_entry_safe(event, tmp, &watch_events, list) {
- if (event->handle != watch)
- continue;
- list_del(&event->list);
- kfree(event);
+ if (watch->nr_pending) {
+ list_for_each_entry_safe(event, tmp, &watch_events, list) {
+ if (event->handle != watch)
+ continue;
+ list_del(&event->list);
+ kfree(event);
+ }
+ watch->nr_pending = 0;
}
spin_unlock(&watch_events_lock);
@@ -862,7 +874,6 @@
static int xenwatch_thread(void *unused)
{
- struct list_head *ent;
struct xs_watch_event *event;
xenwatch_pid = current->pid;
@@ -877,13 +888,15 @@
mutex_lock(&xenwatch_mutex);
spin_lock(&watch_events_lock);
- ent = watch_events.next;
- if (ent != &watch_events)
- list_del(ent);
+ event = list_first_entry_or_null(&watch_events,
+ struct xs_watch_event, list);
+ if (event) {
+ list_del(&event->list);
+ event->handle->nr_pending--;
+ }
spin_unlock(&watch_events_lock);
- if (ent != &watch_events) {
- event = list_entry(ent, struct xs_watch_event, list);
+ if (event) {
event->handle->callback(event->handle, event->path,
event->token);
kfree(event);