Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile
index a1eec71..94daf82 100644
--- a/drivers/hv/Makefile
+++ b/drivers/hv/Makefile
@@ -9,4 +9,5 @@
hv_vmbus-y := vmbus_drv.o \
hv.o connection.o channel.o \
channel_mgmt.o ring_buffer.o hv_trace.o
+hv_vmbus-$(CONFIG_HYPERV_TESTING) += hv_debugfs.o
hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o hv_fcopy.o hv_utils_transport.o
diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 23f358c..f064fa6 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -18,23 +18,101 @@
#include <linux/uio.h>
#include <linux/interrupt.h>
#include <asm/page.h>
+#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
-#define NUM_PAGES_SPANNED(addr, len) \
-((PAGE_ALIGN(addr + len) >> PAGE_SHIFT) - (addr >> PAGE_SHIFT))
-
-static unsigned long virt_to_hvpfn(void *addr)
+/*
+ * hv_gpadl_size - Return the real size of a gpadl, the size that Hyper-V uses
+ *
+ * For BUFFER gpadl, Hyper-V uses the exact same size as the guest does.
+ *
+ * For RING gpadl, in each ring, the guest uses one PAGE_SIZE as the header
+ * (because of the alignment requirement), however, the hypervisor only
+ * uses the first HV_HYP_PAGE_SIZE as the header, therefore leaving a
+ * (PAGE_SIZE - HV_HYP_PAGE_SIZE) gap. And since there are two rings in a
+ * ringbuffer, the total size for a RING gpadl that Hyper-V uses is the
+ * total size that the guest uses minus twice of the gap size.
+ */
+static inline u32 hv_gpadl_size(enum hv_gpadl_type type, u32 size)
{
- phys_addr_t paddr;
+ switch (type) {
+ case HV_GPADL_BUFFER:
+ return size;
+ case HV_GPADL_RING:
+ /* The size of a ringbuffer must be page-aligned */
+ BUG_ON(size % PAGE_SIZE);
+ /*
+ * Two things to notice here:
+ * 1) We're processing two ring buffers as a unit
+ * 2) We're skipping any space larger than HV_HYP_PAGE_SIZE in
+ * the first guest-size page of each of the two ring buffers.
+ * So we effectively subtract out two guest-size pages, and add
+ * back two Hyper-V size pages.
+ */
+ return size - 2 * (PAGE_SIZE - HV_HYP_PAGE_SIZE);
+ }
+ BUG();
+ return 0;
+}
- if (is_vmalloc_addr(addr))
- paddr = page_to_phys(vmalloc_to_page(addr)) +
- offset_in_page(addr);
- else
- paddr = __pa(addr);
+/*
+ * hv_ring_gpadl_send_hvpgoffset - Calculate the send offset (in unit of
+ * HV_HYP_PAGE) in a ring gpadl based on the
+ * offset in the guest
+ *
+ * @offset: the offset (in bytes) where the send ringbuffer starts in the
+ * virtual address space of the guest
+ */
+static inline u32 hv_ring_gpadl_send_hvpgoffset(u32 offset)
+{
- return paddr >> PAGE_SHIFT;
+ /*
+ * For RING gpadl, in each ring, the guest uses one PAGE_SIZE as the
+ * header (because of the alignment requirement), however, the
+ * hypervisor only uses the first HV_HYP_PAGE_SIZE as the header,
+ * therefore leaving a (PAGE_SIZE - HV_HYP_PAGE_SIZE) gap.
+ *
+ * And to calculate the effective send offset in gpadl, we need to
+ * substract this gap.
+ */
+ return (offset - (PAGE_SIZE - HV_HYP_PAGE_SIZE)) >> HV_HYP_PAGE_SHIFT;
+}
+
+/*
+ * hv_gpadl_hvpfn - Return the Hyper-V page PFN of the @i th Hyper-V page in
+ * the gpadl
+ *
+ * @type: the type of the gpadl
+ * @kbuffer: the pointer to the gpadl in the guest
+ * @size: the total size (in bytes) of the gpadl
+ * @send_offset: the offset (in bytes) where the send ringbuffer starts in the
+ * virtual address space of the guest
+ * @i: the index
+ */
+static inline u64 hv_gpadl_hvpfn(enum hv_gpadl_type type, void *kbuffer,
+ u32 size, u32 send_offset, int i)
+{
+ int send_idx = hv_ring_gpadl_send_hvpgoffset(send_offset);
+ unsigned long delta = 0UL;
+
+ switch (type) {
+ case HV_GPADL_BUFFER:
+ break;
+ case HV_GPADL_RING:
+ if (i == 0)
+ delta = 0;
+ else if (i <= send_idx)
+ delta = PAGE_SIZE - HV_HYP_PAGE_SIZE;
+ else
+ delta = 2 * (PAGE_SIZE - HV_HYP_PAGE_SIZE);
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ return virt_to_hvpfn(kbuffer + delta + (HV_HYP_PAGE_SIZE * i));
}
/*
@@ -111,164 +189,6 @@
}
EXPORT_SYMBOL_GPL(vmbus_alloc_ring);
-static int __vmbus_open(struct vmbus_channel *newchannel,
- void *userdata, u32 userdatalen,
- void (*onchannelcallback)(void *context), void *context)
-{
- struct vmbus_channel_open_channel *open_msg;
- struct vmbus_channel_msginfo *open_info = NULL;
- struct page *page = newchannel->ringbuffer_page;
- u32 send_pages, recv_pages;
- unsigned long flags;
- int err;
-
- if (userdatalen > MAX_USER_DEFINED_BYTES)
- return -EINVAL;
-
- send_pages = newchannel->ringbuffer_send_offset;
- recv_pages = newchannel->ringbuffer_pagecount - send_pages;
-
- spin_lock_irqsave(&newchannel->lock, flags);
- if (newchannel->state != CHANNEL_OPEN_STATE) {
- spin_unlock_irqrestore(&newchannel->lock, flags);
- return -EINVAL;
- }
- spin_unlock_irqrestore(&newchannel->lock, flags);
-
- newchannel->state = CHANNEL_OPENING_STATE;
- newchannel->onchannel_callback = onchannelcallback;
- newchannel->channel_callback_context = context;
-
- err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages);
- if (err)
- goto error_clean_ring;
-
- err = hv_ringbuffer_init(&newchannel->inbound,
- &page[send_pages], recv_pages);
- if (err)
- goto error_clean_ring;
-
- /* Establish the gpadl for the ring buffer */
- newchannel->ringbuffer_gpadlhandle = 0;
-
- err = vmbus_establish_gpadl(newchannel,
- page_address(newchannel->ringbuffer_page),
- (send_pages + recv_pages) << PAGE_SHIFT,
- &newchannel->ringbuffer_gpadlhandle);
- if (err)
- goto error_clean_ring;
-
- /* Create and init the channel open message */
- open_info = kmalloc(sizeof(*open_info) +
- sizeof(struct vmbus_channel_open_channel),
- GFP_KERNEL);
- if (!open_info) {
- err = -ENOMEM;
- goto error_free_gpadl;
- }
-
- init_completion(&open_info->waitevent);
- open_info->waiting_channel = newchannel;
-
- open_msg = (struct vmbus_channel_open_channel *)open_info->msg;
- open_msg->header.msgtype = CHANNELMSG_OPENCHANNEL;
- open_msg->openid = newchannel->offermsg.child_relid;
- open_msg->child_relid = newchannel->offermsg.child_relid;
- open_msg->ringbuffer_gpadlhandle = newchannel->ringbuffer_gpadlhandle;
- open_msg->downstream_ringbuffer_pageoffset = newchannel->ringbuffer_send_offset;
- open_msg->target_vp = newchannel->target_vp;
-
- if (userdatalen)
- memcpy(open_msg->userdata, userdata, userdatalen);
-
- spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
- list_add_tail(&open_info->msglistentry,
- &vmbus_connection.chn_msg_list);
- spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
-
- if (newchannel->rescind) {
- err = -ENODEV;
- goto error_free_info;
- }
-
- err = vmbus_post_msg(open_msg,
- sizeof(struct vmbus_channel_open_channel), true);
-
- trace_vmbus_open(open_msg, err);
-
- if (err != 0)
- goto error_clean_msglist;
-
- wait_for_completion(&open_info->waitevent);
-
- spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
- list_del(&open_info->msglistentry);
- spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
-
- if (newchannel->rescind) {
- err = -ENODEV;
- goto error_free_info;
- }
-
- if (open_info->response.open_result.status) {
- err = -EAGAIN;
- goto error_free_info;
- }
-
- newchannel->state = CHANNEL_OPENED_STATE;
- kfree(open_info);
- return 0;
-
-error_clean_msglist:
- spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
- list_del(&open_info->msglistentry);
- spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
-error_free_info:
- kfree(open_info);
-error_free_gpadl:
- vmbus_teardown_gpadl(newchannel, newchannel->ringbuffer_gpadlhandle);
- newchannel->ringbuffer_gpadlhandle = 0;
-error_clean_ring:
- hv_ringbuffer_cleanup(&newchannel->outbound);
- hv_ringbuffer_cleanup(&newchannel->inbound);
- newchannel->state = CHANNEL_OPEN_STATE;
- return err;
-}
-
-/*
- * vmbus_connect_ring - Open the channel but reuse ring buffer
- */
-int vmbus_connect_ring(struct vmbus_channel *newchannel,
- void (*onchannelcallback)(void *context), void *context)
-{
- return __vmbus_open(newchannel, NULL, 0, onchannelcallback, context);
-}
-EXPORT_SYMBOL_GPL(vmbus_connect_ring);
-
-/*
- * vmbus_open - Open the specified channel.
- */
-int vmbus_open(struct vmbus_channel *newchannel,
- u32 send_ringbuffer_size, u32 recv_ringbuffer_size,
- void *userdata, u32 userdatalen,
- void (*onchannelcallback)(void *context), void *context)
-{
- int err;
-
- err = vmbus_alloc_ring(newchannel, send_ringbuffer_size,
- recv_ringbuffer_size);
- if (err)
- return err;
-
- err = __vmbus_open(newchannel, userdata, userdatalen,
- onchannelcallback, context);
- if (err)
- vmbus_free_ring(newchannel);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(vmbus_open);
-
/* Used for Hyper-V Socket: a guest client's connect() to the host */
int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
const guid_t *shv_host_servie_id)
@@ -290,9 +210,38 @@
EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);
/*
+ * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt.
+ *
+ * CHANNELMSG_MODIFYCHANNEL messages are aynchronous. Also, Hyper-V does not
+ * ACK such messages. IOW we can't know when the host will stop interrupting
+ * the "old" vCPU and start interrupting the "new" vCPU for the given channel.
+ *
+ * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
+ * VERSION_WIN10_V4_1.
+ */
+int vmbus_send_modifychannel(u32 child_relid, u32 target_vp)
+{
+ struct vmbus_channel_modifychannel conn_msg;
+ int ret;
+
+ memset(&conn_msg, 0, sizeof(conn_msg));
+ conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+ conn_msg.child_relid = child_relid;
+ conn_msg.target_vp = target_vp;
+
+ ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true);
+
+ trace_vmbus_send_modifychannel(&conn_msg, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
+
+/*
* create_gpadl_header - Creates a gpadl for the specified buffer
*/
-static int create_gpadl_header(void *kbuffer, u32 size,
+static int create_gpadl_header(enum hv_gpadl_type type, void *kbuffer,
+ u32 size, u32 send_offset,
struct vmbus_channel_msginfo **msginfo)
{
int i;
@@ -305,7 +254,7 @@
int pfnsum, pfncount, pfnleft, pfncurr, pfnsize;
- pagecount = size >> PAGE_SHIFT;
+ pagecount = hv_gpadl_size(type, size) >> HV_HYP_PAGE_SHIFT;
/* do we need a gpadl body msg */
pfnsize = MAX_SIZE_CHANNEL_MESSAGE -
@@ -332,10 +281,10 @@
gpadl_header->range_buflen = sizeof(struct gpa_range) +
pagecount * sizeof(u64);
gpadl_header->range[0].byte_offset = 0;
- gpadl_header->range[0].byte_count = size;
+ gpadl_header->range[0].byte_count = hv_gpadl_size(type, size);
for (i = 0; i < pfncount; i++)
- gpadl_header->range[0].pfn_array[i] = virt_to_hvpfn(
- kbuffer + PAGE_SIZE * i);
+ gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn(
+ type, kbuffer, size, send_offset, i);
*msginfo = msgheader;
pfnsum = pfncount;
@@ -386,8 +335,8 @@
* so the hypervisor guarantees that this is ok.
*/
for (i = 0; i < pfncurr; i++)
- gpadl_body->pfn[i] = virt_to_hvpfn(
- kbuffer + PAGE_SIZE * (pfnsum + i));
+ gpadl_body->pfn[i] = hv_gpadl_hvpfn(type,
+ kbuffer, size, send_offset, pfnsum + i);
/* add to msg header */
list_add_tail(&msgbody->msglistentry,
@@ -413,10 +362,10 @@
gpadl_header->range_buflen = sizeof(struct gpa_range) +
pagecount * sizeof(u64);
gpadl_header->range[0].byte_offset = 0;
- gpadl_header->range[0].byte_count = size;
+ gpadl_header->range[0].byte_count = hv_gpadl_size(type, size);
for (i = 0; i < pagecount; i++)
- gpadl_header->range[0].pfn_array[i] = virt_to_hvpfn(
- kbuffer + PAGE_SIZE * i);
+ gpadl_header->range[0].pfn_array[i] = hv_gpadl_hvpfn(
+ type, kbuffer, size, send_offset, i);
*msginfo = msgheader;
}
@@ -429,15 +378,20 @@
}
/*
- * vmbus_establish_gpadl - Establish a GPADL for the specified buffer
+ * __vmbus_establish_gpadl - Establish a GPADL for a buffer or ringbuffer
*
* @channel: a channel
+ * @type: the type of the corresponding GPADL, only meaningful for the guest.
* @kbuffer: from kmalloc or vmalloc
* @size: page-size multiple
+ * @send_offset: the offset (in bytes) where the send ring buffer starts,
+ * should be 0 for BUFFER type gpadl
* @gpadl_handle: some funky thing
*/
-int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer,
- u32 size, u32 *gpadl_handle)
+static int __vmbus_establish_gpadl(struct vmbus_channel *channel,
+ enum hv_gpadl_type type, void *kbuffer,
+ u32 size, u32 send_offset,
+ u32 *gpadl_handle)
{
struct vmbus_channel_gpadl_header *gpadlmsg;
struct vmbus_channel_gpadl_body *gpadl_body;
@@ -451,7 +405,7 @@
next_gpadl_handle =
(atomic_inc_return(&vmbus_connection.next_gpadl_handle) - 1);
- ret = create_gpadl_header(kbuffer, size, &msginfo);
+ ret = create_gpadl_header(type, kbuffer, size, send_offset, &msginfo);
if (ret)
return ret;
@@ -532,8 +486,184 @@
kfree(msginfo);
return ret;
}
+
+/*
+ * vmbus_establish_gpadl - Establish a GPADL for the specified buffer
+ *
+ * @channel: a channel
+ * @kbuffer: from kmalloc or vmalloc
+ * @size: page-size multiple
+ * @gpadl_handle: some funky thing
+ */
+int vmbus_establish_gpadl(struct vmbus_channel *channel, void *kbuffer,
+ u32 size, u32 *gpadl_handle)
+{
+ return __vmbus_establish_gpadl(channel, HV_GPADL_BUFFER, kbuffer, size,
+ 0U, gpadl_handle);
+}
EXPORT_SYMBOL_GPL(vmbus_establish_gpadl);
+static int __vmbus_open(struct vmbus_channel *newchannel,
+ void *userdata, u32 userdatalen,
+ void (*onchannelcallback)(void *context), void *context)
+{
+ struct vmbus_channel_open_channel *open_msg;
+ struct vmbus_channel_msginfo *open_info = NULL;
+ struct page *page = newchannel->ringbuffer_page;
+ u32 send_pages, recv_pages;
+ unsigned long flags;
+ int err;
+
+ if (userdatalen > MAX_USER_DEFINED_BYTES)
+ return -EINVAL;
+
+ send_pages = newchannel->ringbuffer_send_offset;
+ recv_pages = newchannel->ringbuffer_pagecount - send_pages;
+
+ if (newchannel->state != CHANNEL_OPEN_STATE)
+ return -EINVAL;
+
+ newchannel->state = CHANNEL_OPENING_STATE;
+ newchannel->onchannel_callback = onchannelcallback;
+ newchannel->channel_callback_context = context;
+
+ err = hv_ringbuffer_init(&newchannel->outbound, page, send_pages);
+ if (err)
+ goto error_clean_ring;
+
+ err = hv_ringbuffer_init(&newchannel->inbound,
+ &page[send_pages], recv_pages);
+ if (err)
+ goto error_clean_ring;
+
+ /* Establish the gpadl for the ring buffer */
+ newchannel->ringbuffer_gpadlhandle = 0;
+
+ err = __vmbus_establish_gpadl(newchannel, HV_GPADL_RING,
+ page_address(newchannel->ringbuffer_page),
+ (send_pages + recv_pages) << PAGE_SHIFT,
+ newchannel->ringbuffer_send_offset << PAGE_SHIFT,
+ &newchannel->ringbuffer_gpadlhandle);
+ if (err)
+ goto error_clean_ring;
+
+ /* Create and init the channel open message */
+ open_info = kmalloc(sizeof(*open_info) +
+ sizeof(struct vmbus_channel_open_channel),
+ GFP_KERNEL);
+ if (!open_info) {
+ err = -ENOMEM;
+ goto error_free_gpadl;
+ }
+
+ init_completion(&open_info->waitevent);
+ open_info->waiting_channel = newchannel;
+
+ open_msg = (struct vmbus_channel_open_channel *)open_info->msg;
+ open_msg->header.msgtype = CHANNELMSG_OPENCHANNEL;
+ open_msg->openid = newchannel->offermsg.child_relid;
+ open_msg->child_relid = newchannel->offermsg.child_relid;
+ open_msg->ringbuffer_gpadlhandle = newchannel->ringbuffer_gpadlhandle;
+ /*
+ * The unit of ->downstream_ringbuffer_pageoffset is HV_HYP_PAGE and
+ * the unit of ->ringbuffer_send_offset (i.e. send_pages) is PAGE, so
+ * here we calculate it into HV_HYP_PAGE.
+ */
+ open_msg->downstream_ringbuffer_pageoffset =
+ hv_ring_gpadl_send_hvpgoffset(send_pages << PAGE_SHIFT);
+ open_msg->target_vp = hv_cpu_number_to_vp_number(newchannel->target_cpu);
+
+ if (userdatalen)
+ memcpy(open_msg->userdata, userdata, userdatalen);
+
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_add_tail(&open_info->msglistentry,
+ &vmbus_connection.chn_msg_list);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+ if (newchannel->rescind) {
+ err = -ENODEV;
+ goto error_clean_msglist;
+ }
+
+ err = vmbus_post_msg(open_msg,
+ sizeof(struct vmbus_channel_open_channel), true);
+
+ trace_vmbus_open(open_msg, err);
+
+ if (err != 0)
+ goto error_clean_msglist;
+
+ wait_for_completion(&open_info->waitevent);
+
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_del(&open_info->msglistentry);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+
+ if (newchannel->rescind) {
+ err = -ENODEV;
+ goto error_free_info;
+ }
+
+ if (open_info->response.open_result.status) {
+ err = -EAGAIN;
+ goto error_free_info;
+ }
+
+ newchannel->state = CHANNEL_OPENED_STATE;
+ kfree(open_info);
+ return 0;
+
+error_clean_msglist:
+ spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
+ list_del(&open_info->msglistentry);
+ spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
+error_free_info:
+ kfree(open_info);
+error_free_gpadl:
+ vmbus_teardown_gpadl(newchannel, newchannel->ringbuffer_gpadlhandle);
+ newchannel->ringbuffer_gpadlhandle = 0;
+error_clean_ring:
+ hv_ringbuffer_cleanup(&newchannel->outbound);
+ hv_ringbuffer_cleanup(&newchannel->inbound);
+ newchannel->state = CHANNEL_OPEN_STATE;
+ return err;
+}
+
+/*
+ * vmbus_connect_ring - Open the channel but reuse ring buffer
+ */
+int vmbus_connect_ring(struct vmbus_channel *newchannel,
+ void (*onchannelcallback)(void *context), void *context)
+{
+ return __vmbus_open(newchannel, NULL, 0, onchannelcallback, context);
+}
+EXPORT_SYMBOL_GPL(vmbus_connect_ring);
+
+/*
+ * vmbus_open - Open the specified channel.
+ */
+int vmbus_open(struct vmbus_channel *newchannel,
+ u32 send_ringbuffer_size, u32 recv_ringbuffer_size,
+ void *userdata, u32 userdatalen,
+ void (*onchannelcallback)(void *context), void *context)
+{
+ int err;
+
+ err = vmbus_alloc_ring(newchannel, send_ringbuffer_size,
+ recv_ringbuffer_size);
+ if (err)
+ return err;
+
+ err = __vmbus_open(newchannel, userdata, userdatalen,
+ onchannelcallback, context);
+ if (err)
+ vmbus_free_ring(newchannel);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(vmbus_open);
+
/*
* vmbus_teardown_gpadl -Teardown the specified GPADL handle
*/
@@ -594,35 +724,31 @@
}
EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl);
-static void reset_channel_cb(void *arg)
-{
- struct vmbus_channel *channel = arg;
-
- channel->onchannel_callback = NULL;
-}
-
void vmbus_reset_channel_cb(struct vmbus_channel *channel)
{
+ unsigned long flags;
+
/*
* vmbus_on_event(), running in the per-channel tasklet, can race
* with vmbus_close_internal() in the case of SMP guest, e.g., when
* the former is accessing channel->inbound.ring_buffer, the latter
* could be freeing the ring_buffer pages, so here we must stop it
* first.
+ *
+ * vmbus_chan_sched() might call the netvsc driver callback function
+ * that ends up scheduling NAPI work that accesses the ring buffer.
+ * At this point, we have to ensure that any such work is completed
+ * and that the channel ring buffer is no longer being accessed, cf.
+ * the calls to napi_disable() in netvsc_device_remove().
*/
tasklet_disable(&channel->callback_event);
- channel->sc_creation_callback = NULL;
+ /* See the inline comments in vmbus_chan_sched(). */
+ spin_lock_irqsave(&channel->sched_lock, flags);
+ channel->onchannel_callback = NULL;
+ spin_unlock_irqrestore(&channel->sched_lock, flags);
- /* Stop the callback asap */
- if (channel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(channel->target_cpu, reset_channel_cb,
- channel, true);
- } else {
- reset_channel_cb(channel);
- put_cpu();
- }
+ channel->sc_creation_callback = NULL;
/* Re-enable tasklet for use on re-open */
tasklet_enable(&channel->callback_event);
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 9260ad4..6476bfe 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -18,14 +18,15 @@
#include <linux/module.h>
#include <linux/completion.h>
#include <linux/delay.h>
+#include <linux/cpu.h>
#include <linux/hyperv.h>
#include <asm/mshyperv.h>
#include "hyperv_vmbus.h"
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type);
+static void init_vp_index(struct vmbus_channel *channel);
-static const struct vmbus_device vmbus_devs[] = {
+const struct vmbus_device vmbus_devs[] = {
/* IDE */
{ .dev_type = HV_IDE,
HV_IDE_GUID,
@@ -315,11 +316,10 @@
if (!channel)
return NULL;
- spin_lock_init(&channel->lock);
+ spin_lock_init(&channel->sched_lock);
init_completion(&channel->rescind_event);
INIT_LIST_HEAD(&channel->sc_list);
- INIT_LIST_HEAD(&channel->percpu_list);
tasklet_init(&channel->callback_event,
vmbus_on_event, (unsigned long)channel);
@@ -340,23 +340,49 @@
kobject_put(&channel->kobj);
}
-static void percpu_channel_enq(void *arg)
+void vmbus_channel_map_relid(struct vmbus_channel *channel)
{
- struct vmbus_channel *channel = arg;
- struct hv_per_cpu_context *hv_cpu
- = this_cpu_ptr(hv_context.cpu_context);
-
- list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list);
+ if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+ return;
+ /*
+ * The mapping of the channel's relid is visible from the CPUs that
+ * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will
+ * execute:
+ *
+ * (a) In the "normal (i.e., not resuming from hibernation)" path,
+ * the full barrier in smp_store_mb() guarantees that the store
+ * is propagated to all CPUs before the add_channel_work work
+ * is queued. In turn, add_channel_work is queued before the
+ * channel's ring buffer is allocated/initialized and the
+ * OPENCHANNEL message for the channel is sent in vmbus_open().
+ * Hyper-V won't start sending the interrupts for the channel
+ * before the OPENCHANNEL message is acked. The memory barrier
+ * in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures
+ * that vmbus_chan_sched() must find the channel's relid in
+ * recv_int_page before retrieving the channel pointer from the
+ * array of channels.
+ *
+ * (b) In the "resuming from hibernation" path, the smp_store_mb()
+ * guarantees that the store is propagated to all CPUs before
+ * the VMBus connection is marked as ready for the resume event
+ * (cf. check_ready_for_resume_event()). The interrupt handler
+ * of the VMBus driver and vmbus_chan_sched() can not run before
+ * vmbus_bus_resume() has completed execution (cf. resume_noirq).
+ */
+ smp_store_mb(
+ vmbus_connection.channels[channel->offermsg.child_relid],
+ channel);
}
-static void percpu_channel_deq(void *arg)
+void vmbus_channel_unmap_relid(struct vmbus_channel *channel)
{
- struct vmbus_channel *channel = arg;
-
- list_del_rcu(&channel->percpu_list);
+ if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS))
+ return;
+ WRITE_ONCE(
+ vmbus_connection.channels[channel->offermsg.child_relid],
+ NULL);
}
-
static void vmbus_release_relid(u32 relid)
{
struct vmbus_channel_relid_released msg;
@@ -373,39 +399,37 @@
void hv_process_channel_removal(struct vmbus_channel *channel)
{
- struct vmbus_channel *primary_channel;
- unsigned long flags;
-
- BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
BUG_ON(!channel->rescind);
- if (channel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(channel->target_cpu,
- percpu_channel_deq, channel, true);
- } else {
- percpu_channel_deq(channel);
- put_cpu();
- }
-
- if (channel->primary_channel == NULL) {
- list_del(&channel->listentry);
-
- primary_channel = channel;
- } else {
- primary_channel = channel->primary_channel;
- spin_lock_irqsave(&primary_channel->lock, flags);
- list_del(&channel->sc_list);
- spin_unlock_irqrestore(&primary_channel->lock, flags);
- }
+ /*
+ * hv_process_channel_removal() could find INVALID_RELID only for
+ * hv_sock channels. See the inline comments in vmbus_onoffer().
+ */
+ WARN_ON(channel->offermsg.child_relid == INVALID_RELID &&
+ !is_hvsock_channel(channel));
/*
- * We need to free the bit for init_vp_index() to work in the case
- * of sub-channel, when we reload drivers like hv_netvsc.
+ * Upon suspend, an in-use hv_sock channel is removed from the array of
+ * channels and the relid is invalidated. After hibernation, when the
+ * user-space appplication destroys the channel, it's unnecessary and
+ * unsafe to remove the channel from the array of channels. See also
+ * the inline comments before the call of vmbus_release_relid() below.
*/
- if (channel->affinity_policy == HV_LOCALIZED)
- cpumask_clear_cpu(channel->target_cpu,
- &primary_channel->alloced_cpus_in_node);
+ if (channel->offermsg.child_relid != INVALID_RELID)
+ vmbus_channel_unmap_relid(channel);
+
+ if (channel->primary_channel == NULL)
+ list_del(&channel->listentry);
+ else
+ list_del(&channel->sc_list);
+
+ /*
+ * If this is a "perf" channel, updates the hv_numa_map[] masks so that
+ * init_vp_index() can (re-)use the CPU.
+ */
+ if (hv_is_perf_channel(channel))
+ hv_clear_alloced_cpu(channel->target_cpu);
/*
* Upon suspend, an in-use hv_sock channel is marked as "rescinded" and
@@ -439,24 +463,8 @@
struct vmbus_channel *newchannel =
container_of(work, struct vmbus_channel, add_channel_work);
struct vmbus_channel *primary_channel = newchannel->primary_channel;
- unsigned long flags;
- u16 dev_type;
int ret;
- dev_type = hv_get_dev_type(newchannel);
-
- init_vp_index(newchannel, dev_type);
-
- if (newchannel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(newchannel->target_cpu,
- percpu_channel_enq,
- newchannel, true);
- } else {
- percpu_channel_enq(newchannel);
- put_cpu();
- }
-
/*
* This state is used to indicate a successful open
* so that when we do close the channel normally, we
@@ -488,7 +496,7 @@
if (!newchannel->device_obj)
goto err_deq_chan;
- newchannel->device_obj->device_id = dev_type;
+ newchannel->device_obj->device_id = newchannel->device_id;
/*
* Add the new device to the bus. This will kick off device-driver
* binding which eventually invokes the device driver's AddDevice()
@@ -515,26 +523,16 @@
*/
newchannel->probe_done = true;
- if (primary_channel == NULL) {
+ if (primary_channel == NULL)
list_del(&newchannel->listentry);
- } else {
- spin_lock_irqsave(&primary_channel->lock, flags);
+ else
list_del(&newchannel->sc_list);
- spin_unlock_irqrestore(&primary_channel->lock, flags);
- }
+
+ /* vmbus_process_offer() has mapped the channel. */
+ vmbus_channel_unmap_relid(newchannel);
mutex_unlock(&vmbus_connection.channel_mutex);
- if (newchannel->target_cpu != get_cpu()) {
- put_cpu();
- smp_call_function_single(newchannel->target_cpu,
- percpu_channel_deq,
- newchannel, true);
- } else {
- percpu_channel_deq(newchannel);
- put_cpu();
- }
-
vmbus_release_relid(newchannel->offermsg.child_relid);
free_channel(newchannel);
@@ -548,11 +546,37 @@
{
struct vmbus_channel *channel;
struct workqueue_struct *wq;
- unsigned long flags;
bool fnew = true;
+ /*
+ * Synchronize vmbus_process_offer() and CPU hotplugging:
+ *
+ * CPU1 CPU2
+ *
+ * [vmbus_process_offer()] [Hot removal of the CPU]
+ *
+ * CPU_READ_LOCK CPUS_WRITE_LOCK
+ * LOAD cpu_online_mask SEARCH chn_list
+ * STORE target_cpu LOAD target_cpu
+ * INSERT chn_list STORE cpu_online_mask
+ * CPUS_READ_UNLOCK CPUS_WRITE_UNLOCK
+ *
+ * Forbids: CPU1's LOAD from *not* seing CPU2's STORE &&
+ * CPU2's SEARCH from *not* seeing CPU1's INSERT
+ *
+ * Forbids: CPU2's SEARCH from seeing CPU1's INSERT &&
+ * CPU2's LOAD from *not* seing CPU1's STORE
+ */
+ cpus_read_lock();
+
+ /*
+ * Serializes the modifications of the chn_list list as well as
+ * the accesses to next_numa_node_id in init_vp_index().
+ */
mutex_lock(&vmbus_connection.channel_mutex);
+ init_vp_index(newchannel);
+
/* Remember the channels that should be cleaned up upon suspend. */
if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel))
atomic_inc(&vmbus_connection.nr_chan_close_on_suspend);
@@ -573,10 +597,10 @@
}
}
- if (fnew)
+ if (fnew) {
list_add_tail(&newchannel->listentry,
&vmbus_connection.chn_list);
- else {
+ } else {
/*
* Check to see if this is a valid sub-channel.
*/
@@ -594,12 +618,13 @@
* Process the sub-channel.
*/
newchannel->primary_channel = channel;
- spin_lock_irqsave(&channel->lock, flags);
list_add_tail(&newchannel->sc_list, &channel->sc_list);
- spin_unlock_irqrestore(&channel->lock, flags);
}
+ vmbus_channel_map_relid(newchannel);
+
mutex_unlock(&vmbus_connection.channel_mutex);
+ cpus_read_unlock();
/*
* vmbus_process_offer() mustn't call channel->sc_creation_callback()
@@ -632,73 +657,57 @@
* We use this state to statically distribute the channel interrupt load.
*/
static int next_numa_node_id;
-/*
- * init_vp_index() accesses global variables like next_numa_node_id, and
- * it can run concurrently for primary channels and sub-channels: see
- * vmbus_process_offer(), so we need the lock to protect the global
- * variables.
- */
-static DEFINE_SPINLOCK(bind_channel_to_cpu_lock);
/*
* Starting with Win8, we can statically distribute the incoming
* channel interrupt load by binding a channel to VCPU.
- * We distribute the interrupt loads to one or more NUMA nodes based on
- * the channel's affinity_policy.
*
* For pre-win8 hosts or non-performance critical channels we assign the
- * first CPU in the first NUMA node.
+ * VMBUS_CONNECT_CPU.
+ *
+ * Starting with win8, performance critical channels will be distributed
+ * evenly among all the available NUMA nodes. Once the node is assigned,
+ * we will assign the CPU based on a simple round robin scheme.
*/
-static void init_vp_index(struct vmbus_channel *channel, u16 dev_type)
+static void init_vp_index(struct vmbus_channel *channel)
{
- u32 cur_cpu;
- bool perf_chn = vmbus_devs[dev_type].perf_device;
- struct vmbus_channel *primary = channel->primary_channel;
- int next_node;
+ bool perf_chn = hv_is_perf_channel(channel);
cpumask_var_t available_mask;
struct cpumask *alloced_mask;
+ u32 target_cpu;
+ int numa_node;
if ((vmbus_proto_version == VERSION_WS2008) ||
(vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
!alloc_cpumask_var(&available_mask, GFP_KERNEL)) {
/*
* Prior to win8, all channel interrupts are
- * delivered on cpu 0.
+ * delivered on VMBUS_CONNECT_CPU.
* Also if the channel is not a performance critical
- * channel, bind it to cpu 0.
- * In case alloc_cpumask_var() fails, bind it to cpu 0.
+ * channel, bind it to VMBUS_CONNECT_CPU.
+ * In case alloc_cpumask_var() fails, bind it to
+ * VMBUS_CONNECT_CPU.
*/
- channel->numa_node = 0;
- channel->target_cpu = 0;
- channel->target_vp = hv_cpu_number_to_vp_number(0);
+ channel->target_cpu = VMBUS_CONNECT_CPU;
+ if (perf_chn)
+ hv_set_alloced_cpu(VMBUS_CONNECT_CPU);
return;
}
- spin_lock(&bind_channel_to_cpu_lock);
-
- /*
- * Based on the channel affinity policy, we will assign the NUMA
- * nodes.
- */
-
- if ((channel->affinity_policy == HV_BALANCED) || (!primary)) {
- while (true) {
- next_node = next_numa_node_id++;
- if (next_node == nr_node_ids) {
- next_node = next_numa_node_id = 0;
- continue;
- }
- if (cpumask_empty(cpumask_of_node(next_node)))
- continue;
- break;
+ while (true) {
+ numa_node = next_numa_node_id++;
+ if (numa_node == nr_node_ids) {
+ next_numa_node_id = 0;
+ continue;
}
- channel->numa_node = next_node;
- primary = channel;
+ if (cpumask_empty(cpumask_of_node(numa_node)))
+ continue;
+ break;
}
- alloced_mask = &hv_context.hv_numa_map[primary->numa_node];
+ alloced_mask = &hv_context.hv_numa_map[numa_node];
if (cpumask_weight(alloced_mask) ==
- cpumask_weight(cpumask_of_node(primary->numa_node))) {
+ cpumask_weight(cpumask_of_node(numa_node))) {
/*
* We have cycled through all the CPUs in the node;
* reset the alloced map.
@@ -706,59 +715,12 @@
cpumask_clear(alloced_mask);
}
- cpumask_xor(available_mask, alloced_mask,
- cpumask_of_node(primary->numa_node));
+ cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
- cur_cpu = -1;
+ target_cpu = cpumask_first(available_mask);
+ cpumask_set_cpu(target_cpu, alloced_mask);
- if (primary->affinity_policy == HV_LOCALIZED) {
- /*
- * Normally Hyper-V host doesn't create more subchannels
- * than there are VCPUs on the node but it is possible when not
- * all present VCPUs on the node are initialized by guest.
- * Clear the alloced_cpus_in_node to start over.
- */
- if (cpumask_equal(&primary->alloced_cpus_in_node,
- cpumask_of_node(primary->numa_node)))
- cpumask_clear(&primary->alloced_cpus_in_node);
- }
-
- while (true) {
- cur_cpu = cpumask_next(cur_cpu, available_mask);
- if (cur_cpu >= nr_cpu_ids) {
- cur_cpu = -1;
- cpumask_copy(available_mask,
- cpumask_of_node(primary->numa_node));
- continue;
- }
-
- if (primary->affinity_policy == HV_LOCALIZED) {
- /*
- * NOTE: in the case of sub-channel, we clear the
- * sub-channel related bit(s) in
- * primary->alloced_cpus_in_node in
- * hv_process_channel_removal(), so when we
- * reload drivers like hv_netvsc in SMP guest, here
- * we're able to re-allocate
- * bit from primary->alloced_cpus_in_node.
- */
- if (!cpumask_test_cpu(cur_cpu,
- &primary->alloced_cpus_in_node)) {
- cpumask_set_cpu(cur_cpu,
- &primary->alloced_cpus_in_node);
- cpumask_set_cpu(cur_cpu, alloced_mask);
- break;
- }
- } else {
- cpumask_set_cpu(cur_cpu, alloced_mask);
- break;
- }
- }
-
- channel->target_cpu = cur_cpu;
- channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu);
-
- spin_unlock(&bind_channel_to_cpu_lock);
+ channel->target_cpu = target_cpu;
free_cpumask_var(available_mask);
}
@@ -913,6 +875,7 @@
sizeof(struct vmbus_channel_offer_channel));
channel->monitor_grp = (u8)offer->monitorid / 32;
channel->monitor_bit = (u8)offer->monitorid % 32;
+ channel->device_id = hv_get_dev_type(channel);
}
/*
@@ -963,8 +926,6 @@
oldchannel = find_primary_channel_by_offer(offer);
if (oldchannel != NULL) {
- atomic_dec(&vmbus_connection.offer_in_progress);
-
/*
* We're resuming from hibernation: all the sub-channel and
* hv_sock channels we had before the hibernation should have
@@ -972,36 +933,65 @@
* primary channel that we had before the hibernation.
*/
+ /*
+ * { Initially: channel relid = INVALID_RELID,
+ * channels[valid_relid] = NULL }
+ *
+ * CPU1 CPU2
+ *
+ * [vmbus_onoffer()] [vmbus_device_release()]
+ *
+ * LOCK channel_mutex LOCK channel_mutex
+ * STORE channel relid = valid_relid LOAD r1 = channel relid
+ * MAP_RELID channel if (r1 != INVALID_RELID)
+ * UNLOCK channel_mutex UNMAP_RELID channel
+ * UNLOCK channel_mutex
+ *
+ * Forbids: r1 == valid_relid &&
+ * channels[valid_relid] == channel
+ *
+ * Note. r1 can be INVALID_RELID only for an hv_sock channel.
+ * None of the hv_sock channels which were present before the
+ * suspend are re-offered upon the resume. See the WARN_ON()
+ * in hv_process_channel_removal().
+ */
+ mutex_lock(&vmbus_connection.channel_mutex);
+
+ atomic_dec(&vmbus_connection.offer_in_progress);
+
WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID);
/* Fix up the relid. */
oldchannel->offermsg.child_relid = offer->child_relid;
offer_sz = sizeof(*offer);
- if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) {
- check_ready_for_resume_event();
- return;
+ if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) {
+ /*
+ * This is not an error, since the host can also change
+ * the other field(s) of the offer, e.g. on WS RS5
+ * (Build 17763), the offer->connection_id of the
+ * Mellanox VF vmbus device can change when the host
+ * reoffers the device upon resume.
+ */
+ pr_debug("vmbus offer changed: relid=%d\n",
+ offer->child_relid);
+
+ print_hex_dump_debug("Old vmbus offer: ",
+ DUMP_PREFIX_OFFSET, 16, 4,
+ &oldchannel->offermsg, offer_sz,
+ false);
+ print_hex_dump_debug("New vmbus offer: ",
+ DUMP_PREFIX_OFFSET, 16, 4,
+ offer, offer_sz, false);
+
+ /* Fix up the old channel. */
+ vmbus_setup_channel_state(oldchannel, offer);
}
- /*
- * This is not an error, since the host can also change the
- * other field(s) of the offer, e.g. on WS RS5 (Build 17763),
- * the offer->connection_id of the Mellanox VF vmbus device
- * can change when the host reoffers the device upon resume.
- */
- pr_debug("vmbus offer changed: relid=%d\n",
- offer->child_relid);
-
- print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET,
- 16, 4, &oldchannel->offermsg, offer_sz,
- false);
- print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET,
- 16, 4, offer, offer_sz, false);
-
- /* Fix up the old channel. */
- vmbus_setup_channel_state(oldchannel, offer);
-
+ /* Add the channel back to the array of channels. */
+ vmbus_channel_map_relid(oldchannel);
check_ready_for_resume_event();
+ mutex_unlock(&vmbus_connection.channel_mutex);
return;
}
@@ -1051,11 +1041,22 @@
* offer comes in first and then the rescind.
* Since we process these events in work elements,
* and with preemption, we may end up processing
- * the events out of order. Given that we handle these
- * work elements on the same CPU, this is possible only
- * in the case of preemption. In any case wait here
- * until the offer processing has moved beyond the
- * point where the channel is discoverable.
+ * the events out of order. We rely on the synchronization
+ * provided by offer_in_progress and by channel_mutex for
+ * ordering these events:
+ *
+ * { Initially: offer_in_progress = 1 }
+ *
+ * CPU1 CPU2
+ *
+ * [vmbus_onoffer()] [vmbus_onoffer_rescind()]
+ *
+ * LOCK channel_mutex WAIT_ON offer_in_progress == 0
+ * DECREMENT offer_in_progress LOCK channel_mutex
+ * STORE channels[] LOAD channels[]
+ * UNLOCK channel_mutex UNLOCK channel_mutex
+ *
+ * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE
*/
while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
@@ -1354,30 +1355,36 @@
/* Channel message dispatch table */
const struct vmbus_channel_message_table_entry
channel_message_table[CHANNELMSG_COUNT] = {
- { CHANNELMSG_INVALID, 0, NULL },
- { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer },
- { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind },
- { CHANNELMSG_REQUESTOFFERS, 0, NULL },
- { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered },
- { CHANNELMSG_OPENCHANNEL, 0, NULL },
- { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result },
- { CHANNELMSG_CLOSECHANNEL, 0, NULL },
- { CHANNELMSG_GPADL_HEADER, 0, NULL },
- { CHANNELMSG_GPADL_BODY, 0, NULL },
- { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created },
- { CHANNELMSG_GPADL_TEARDOWN, 0, NULL },
- { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown },
- { CHANNELMSG_RELID_RELEASED, 0, NULL },
- { CHANNELMSG_INITIATE_CONTACT, 0, NULL },
- { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response },
- { CHANNELMSG_UNLOAD, 0, NULL },
- { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response },
- { CHANNELMSG_18, 0, NULL },
- { CHANNELMSG_19, 0, NULL },
- { CHANNELMSG_20, 0, NULL },
- { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL },
- { CHANNELMSG_22, 0, NULL },
- { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL },
+ { CHANNELMSG_INVALID, 0, NULL, 0},
+ { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer,
+ sizeof(struct vmbus_channel_offer_channel)},
+ { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind,
+ sizeof(struct vmbus_channel_rescind_offer) },
+ { CHANNELMSG_REQUESTOFFERS, 0, NULL, 0},
+ { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered, 0},
+ { CHANNELMSG_OPENCHANNEL, 0, NULL, 0},
+ { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result,
+ sizeof(struct vmbus_channel_open_result)},
+ { CHANNELMSG_CLOSECHANNEL, 0, NULL, 0},
+ { CHANNELMSG_GPADL_HEADER, 0, NULL, 0},
+ { CHANNELMSG_GPADL_BODY, 0, NULL, 0},
+ { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created,
+ sizeof(struct vmbus_channel_gpadl_created)},
+ { CHANNELMSG_GPADL_TEARDOWN, 0, NULL, 0},
+ { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown,
+ sizeof(struct vmbus_channel_gpadl_torndown) },
+ { CHANNELMSG_RELID_RELEASED, 0, NULL, 0},
+ { CHANNELMSG_INITIATE_CONTACT, 0, NULL, 0},
+ { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response,
+ sizeof(struct vmbus_channel_version_response)},
+ { CHANNELMSG_UNLOAD, 0, NULL, 0},
+ { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response, 0},
+ { CHANNELMSG_18, 0, NULL, 0},
+ { CHANNELMSG_19, 0, NULL, 0},
+ { CHANNELMSG_20, 0, NULL, 0},
+ { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0},
+ { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0},
+ { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0},
};
/*
@@ -1385,13 +1392,8 @@
*
* This is invoked in the vmbus worker thread context.
*/
-void vmbus_onmessage(void *context)
+void vmbus_onmessage(struct vmbus_channel_message_header *hdr)
{
- struct hv_message *msg = context;
- struct vmbus_channel_message_header *hdr;
-
- hdr = (struct vmbus_channel_message_header *)msg->u.payload;
-
trace_vmbus_on_message(hdr);
/*
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index c90d790..bfd7f00 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -14,6 +14,7 @@
#include <linux/wait.h>
#include <linux/delay.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/hyperv.h>
@@ -40,29 +41,30 @@
__u32 vmbus_proto_version;
EXPORT_SYMBOL_GPL(vmbus_proto_version);
-static __u32 vmbus_get_next_version(__u32 current_version)
-{
- switch (current_version) {
- case (VERSION_WIN7):
- return VERSION_WS2008;
+/*
+ * Table of VMBus versions listed from newest to oldest.
+ */
+static __u32 vmbus_versions[] = {
+ VERSION_WIN10_V5_2,
+ VERSION_WIN10_V5_1,
+ VERSION_WIN10_V5,
+ VERSION_WIN10_V4_1,
+ VERSION_WIN10,
+ VERSION_WIN8_1,
+ VERSION_WIN8,
+ VERSION_WIN7,
+ VERSION_WS2008
+};
- case (VERSION_WIN8):
- return VERSION_WIN7;
+/*
+ * Maximal VMBus protocol version guests can negotiate. Useful to cap the
+ * VMBus version for testing and debugging purpose.
+ */
+static uint max_version = VERSION_WIN10_V5_2;
- case (VERSION_WIN8_1):
- return VERSION_WIN8;
-
- case (VERSION_WIN10):
- return VERSION_WIN8_1;
-
- case (VERSION_WIN10_V5):
- return VERSION_WIN10;
-
- case (VERSION_WS2008):
- default:
- return VERSION_INVAL;
- }
-}
+module_param(max_version, uint, S_IRUGO);
+MODULE_PARM_DESC(max_version,
+ "Maximal VMBus protocol version which can be negotiated");
int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
{
@@ -79,12 +81,12 @@
msg->vmbus_version_requested = version;
/*
- * VMBus protocol 5.0 (VERSION_WIN10_V5) requires that we must use
- * VMBUS_MESSAGE_CONNECTION_ID_4 for the Initiate Contact Message,
+ * VMBus protocol 5.0 (VERSION_WIN10_V5) and higher require that we must
+ * use VMBUS_MESSAGE_CONNECTION_ID_4 for the Initiate Contact Message,
* and for subsequent messages, we must use the Message Connection ID
* field in the host-returned Version Response Message. And, with
- * VERSION_WIN10_V5, we don't use msg->interrupt_page, but we tell
- * the host explicitly that we still use VMBUS_MESSAGE_SINT(2) for
+ * VERSION_WIN10_V5 and higher, we don't use msg->interrupt_page, but we
+ * tell the host explicitly that we still use VMBUS_MESSAGE_SINT(2) for
* compatibility.
*
* On old hosts, we should always use VMBUS_MESSAGE_CONNECTION_ID (1).
@@ -151,8 +153,8 @@
*/
int vmbus_connect(void)
{
- int ret = 0;
struct vmbus_channel_msginfo *msginfo = NULL;
+ int i, ret = 0;
__u32 version;
/* Initialize the vmbus connection */
@@ -188,7 +190,7 @@
* abstraction stuff
*/
vmbus_connection.int_page =
- (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
+ (void *)hv_alloc_hyperv_zeroed_page();
if (vmbus_connection.int_page == NULL) {
ret = -ENOMEM;
goto cleanup;
@@ -197,14 +199,14 @@
vmbus_connection.recv_int_page = vmbus_connection.int_page;
vmbus_connection.send_int_page =
(void *)((unsigned long)vmbus_connection.int_page +
- (PAGE_SIZE >> 1));
+ (HV_HYP_PAGE_SIZE >> 1));
/*
* Setup the monitor notification facility. The 1st page for
* parent->child and the 2nd page for child->parent
*/
- vmbus_connection.monitor_pages[0] = (void *)__get_free_pages((GFP_KERNEL|__GFP_ZERO), 0);
- vmbus_connection.monitor_pages[1] = (void *)__get_free_pages((GFP_KERNEL|__GFP_ZERO), 0);
+ vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_zeroed_page();
+ vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_zeroed_page();
if ((vmbus_connection.monitor_pages[0] == NULL) ||
(vmbus_connection.monitor_pages[1] == NULL)) {
ret = -ENOMEM;
@@ -226,26 +228,36 @@
* version.
*/
- version = VERSION_CURRENT;
+ for (i = 0; ; i++) {
+ if (i == ARRAY_SIZE(vmbus_versions)) {
+ ret = -EDOM;
+ goto cleanup;
+ }
- do {
+ version = vmbus_versions[i];
+ if (version > max_version)
+ continue;
+
ret = vmbus_negotiate_version(msginfo, version);
if (ret == -ETIMEDOUT)
goto cleanup;
if (vmbus_connection.conn_state == CONNECTED)
break;
-
- version = vmbus_get_next_version(version);
- } while (version != VERSION_INVAL);
-
- if (version == VERSION_INVAL)
- goto cleanup;
+ }
vmbus_proto_version = version;
pr_info("Vmbus version:%d.%d\n",
version >> 16, version & 0xFFFF);
+ vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS,
+ sizeof(struct vmbus_channel *),
+ GFP_KERNEL);
+ if (vmbus_connection.channels == NULL) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+
kfree(msginfo);
return 0;
@@ -277,12 +289,12 @@
destroy_workqueue(vmbus_connection.work_queue);
if (vmbus_connection.int_page) {
- free_pages((unsigned long)vmbus_connection.int_page, 0);
+ hv_free_hyperv_page((unsigned long)vmbus_connection.int_page);
vmbus_connection.int_page = NULL;
}
- free_pages((unsigned long)vmbus_connection.monitor_pages[0], 0);
- free_pages((unsigned long)vmbus_connection.monitor_pages[1], 0);
+ hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[0]);
+ hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[1]);
vmbus_connection.monitor_pages[0] = NULL;
vmbus_connection.monitor_pages[1] = NULL;
}
@@ -293,33 +305,9 @@
*/
struct vmbus_channel *relid2channel(u32 relid)
{
- struct vmbus_channel *channel;
- struct vmbus_channel *found_channel = NULL;
- struct list_head *cur, *tmp;
- struct vmbus_channel *cur_sc;
-
- BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex));
-
- list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
- if (channel->offermsg.child_relid == relid) {
- found_channel = channel;
- break;
- } else if (!list_empty(&channel->sc_list)) {
- /*
- * Deal with sub-channels.
- */
- list_for_each_safe(cur, tmp, &channel->sc_list) {
- cur_sc = list_entry(cur, struct vmbus_channel,
- sc_list);
- if (cur_sc->offermsg.child_relid == relid) {
- found_channel = cur_sc;
- break;
- }
- }
- }
- }
-
- return found_channel;
+ if (WARN_ON(relid >= MAX_CHANNEL_RELIDS))
+ return NULL;
+ return READ_ONCE(vmbus_connection.channels[relid]);
}
/*
@@ -343,6 +331,7 @@
trace_vmbus_on_event(channel);
+ hv_debug_delay_test(channel, INTERRUPT_DELAY);
do {
void (*callback_fn)(void *);
@@ -395,7 +384,7 @@
case HV_STATUS_INVALID_CONNECTION_ID:
/*
* See vmbus_negotiate_version(): VMBus protocol 5.0
- * requires that we must use
+ * and higher require that we must use
* VMBUS_MESSAGE_CONNECTION_ID_4 for the Initiate
* Contact message, but on old hosts that only
* support VMBus protocol 4.0 or lower, here we get
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index f849a1a..f202ac7 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -117,8 +117,6 @@
pr_err("Unable to allocate post msg page\n");
goto err;
}
-
- INIT_LIST_HEAD(&hv_cpu->chan_list);
}
return 0;
@@ -167,7 +165,7 @@
hv_get_simp(simp.as_uint64);
simp.simp_enabled = 1;
simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page)
- >> PAGE_SHIFT;
+ >> HV_HYP_PAGE_SHIFT;
hv_set_simp(simp.as_uint64);
@@ -175,20 +173,16 @@
hv_get_siefp(siefp.as_uint64);
siefp.siefp_enabled = 1;
siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page)
- >> PAGE_SHIFT;
+ >> HV_HYP_PAGE_SHIFT;
hv_set_siefp(siefp.as_uint64);
/* Setup the shared SINT. */
hv_get_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
- shared_sint.vector = HYPERVISOR_CALLBACK_VECTOR;
+ shared_sint.vector = hv_get_vector();
shared_sint.masked = false;
- if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
- shared_sint.auto_eoi = false;
- else
- shared_sint.auto_eoi = true;
-
+ shared_sint.auto_eoi = hv_recommend_using_aeoi();
hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64);
/* Enable the global synic bit */
@@ -202,7 +196,7 @@
{
hv_synic_enable_regs(cpu);
- hv_stimer_init(cpu);
+ hv_stimer_legacy_init(cpu, VMBUS_MESSAGE_SINT);
return 0;
}
@@ -247,7 +241,6 @@
{
struct vmbus_channel *channel, *sc;
bool channel_found = false;
- unsigned long flags;
/*
* Hyper-V does not provide a way to change the connect CPU once
@@ -262,9 +255,10 @@
/*
* Search for channels which are bound to the CPU we're about to
- * cleanup. In case we find one and vmbus is still connected we need to
- * fail, this will effectively prevent CPU offlining. There is no way
- * we can re-bind channels to different CPUs for now.
+ * cleanup. In case we find one and vmbus is still connected, we
+ * fail; this will effectively prevent CPU offlining.
+ *
+ * TODO: Re-bind the channels to different CPUs.
*/
mutex_lock(&vmbus_connection.channel_mutex);
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
@@ -272,14 +266,12 @@
channel_found = true;
break;
}
- spin_lock_irqsave(&channel->lock, flags);
list_for_each_entry(sc, &channel->sc_list, sc_list) {
if (sc->target_cpu == cpu) {
channel_found = true;
break;
}
}
- spin_unlock_irqrestore(&channel->lock, flags);
if (channel_found)
break;
}
@@ -288,7 +280,7 @@
if (channel_found && vmbus_connection.conn_state == CONNECTED)
return -EBUSY;
- hv_stimer_cleanup(cpu);
+ hv_stimer_legacy_cleanup(cpu);
hv_synic_disable_regs(cpu);
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index bd4e72f..eb56e09 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -23,6 +23,9 @@
#include <linux/percpu_counter.h>
#include <linux/hyperv.h>
+#include <asm/hyperv-tlfs.h>
+
+#include <asm/mshyperv.h>
#define CREATE_TRACE_POINTS
#include "hv_trace_balloon.h"
@@ -341,8 +344,6 @@
*
* mem_range: Memory range to hot add.
*
- * On Linux we currently don't support this since we cannot hot add
- * arbitrary granularity of memory.
*/
struct dm_hot_add {
@@ -457,6 +458,7 @@
struct work_struct wrk;
};
+static bool allow_hibernation;
static bool hot_add = true;
static bool do_hot_add;
/*
@@ -477,7 +479,7 @@
MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
static atomic_t trans_id = ATOMIC_INIT(0);
-static int dm_ring_size = (5 * PAGE_SIZE);
+static int dm_ring_size = 20 * 1024;
/*
* Driver specific state.
@@ -493,10 +495,10 @@
};
-static __u8 recv_buffer[PAGE_SIZE];
-static __u8 balloon_up_send_buffer[PAGE_SIZE];
-#define PAGES_IN_2M 512
-#define HA_CHUNK (32 * 1024)
+static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
+static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
+#define PAGES_IN_2M (2 * 1024 * 1024 / PAGE_SIZE)
+#define HA_CHUNK (128 * 1024 * 1024 / PAGE_SIZE)
struct hv_dynmem_device {
struct hv_device *dev;
@@ -531,7 +533,6 @@
* State to synchronize hot-add.
*/
struct completion ol_waitevent;
- bool ha_waiting;
/*
* This thread handles hot-add
* requests from the host as well as notifying
@@ -632,10 +633,7 @@
switch (val) {
case MEM_ONLINE:
case MEM_CANCEL_ONLINE:
- if (dm_device.ha_waiting) {
- dm_device.ha_waiting = false;
- complete(&dm_device.ol_waitevent);
- }
+ complete(&dm_device.ol_waitevent);
break;
case MEM_OFFLINE:
@@ -680,9 +678,7 @@
__ClearPageOffline(pg);
/* This frame is currently backed; online the page. */
- __online_page_set_limits(pg);
- __online_page_increment_counters(pg);
- __online_page_free(pg);
+ generic_online_page(pg, 0);
lockdep_assert_held(&dm_device.ha_lock);
dm_device.num_pages_onlined++;
@@ -726,12 +722,11 @@
has->covered_end_pfn += processed_pfn;
spin_unlock_irqrestore(&dm_device.ha_lock, flags);
- init_completion(&dm_device.ol_waitevent);
- dm_device.ha_waiting = !memhp_auto_online;
+ reinit_completion(&dm_device.ol_waitevent);
nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
ret = add_memory(nid, PFN_PHYS((start_pfn)),
- (HA_CHUNK << PAGE_SHIFT));
+ (HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE);
if (ret) {
pr_err("hot_add memory failed error is %d\n", ret);
@@ -753,15 +748,14 @@
}
/*
- * Wait for the memory block to be onlined when memory onlining
- * is done outside of kernel (memhp_auto_online). Since the hot
- * add has succeeded, it is ok to proceed even if the pages in
- * the hot added region have not been "onlined" within the
- * allowed time.
+ * Wait for memory to get onlined. If the kernel onlined the
+ * memory when adding it, this will return directly. Otherwise,
+ * it will wait for user space to online the memory. This helps
+ * to avoid adding memory faster than it is getting onlined. As
+ * adding succeeded, it is ok to proceed even if the memory was
+ * not onlined in time.
*/
- if (dm_device.ha_waiting)
- wait_for_completion_timeout(&dm_device.ol_waitevent,
- 5*HZ);
+ wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ);
post_status(&dm_device);
}
}
@@ -1053,8 +1047,12 @@
else
resp.result = 0;
- if (!do_hot_add || (resp.page_count == 0))
- pr_err("Memory hot add failed\n");
+ if (!do_hot_add || resp.page_count == 0) {
+ if (!allow_hibernation)
+ pr_err("Memory hot add failed\n");
+ else
+ pr_info("Ignore hot-add request!\n");
+ }
dm->state = DM_INITIALIZED;
resp.hdr.trans_id = atomic_inc_return(&trans_id);
@@ -1076,7 +1074,7 @@
__u64 *max_page_count = (__u64 *)&info_hdr[1];
pr_info("Max. dynamic memory size: %llu MB\n",
- (*max_page_count) >> (20 - PAGE_SHIFT));
+ (*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT));
}
break;
@@ -1215,7 +1213,7 @@
for (i = 0; i < num_pages / alloc_unit; i++) {
if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
- PAGE_SIZE)
+ HV_HYP_PAGE_SIZE)
return i * alloc_unit;
/*
@@ -1268,9 +1266,9 @@
/*
* We will attempt 2M allocations. However, if we fail to
- * allocate 2M chunks, we will go back to 4k allocations.
+ * allocate 2M chunks, we will go back to PAGE_SIZE allocations.
*/
- alloc_unit = 512;
+ alloc_unit = PAGES_IN_2M;
avail_pages = si_mem_available();
floor = compute_balloon_floor();
@@ -1285,7 +1283,7 @@
}
while (!done) {
- memset(balloon_up_send_buffer, 0, PAGE_SIZE);
+ memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE);
bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
bl_resp->hdr.type = DM_BALLOON_RESPONSE;
bl_resp->hdr.size = sizeof(struct dm_balloon_response);
@@ -1484,7 +1482,7 @@
memset(recv_buffer, 0, sizeof(recv_buffer));
vmbus_recvpacket(dev->channel, recv_buffer,
- PAGE_SIZE, &recvlen, &requestid);
+ HV_HYP_PAGE_SIZE, &recvlen, &requestid);
if (recvlen > 0) {
dm_msg = (struct dm_message *)recv_buffer;
@@ -1502,6 +1500,11 @@
break;
case DM_BALLOON_REQUEST:
+ if (allow_hibernation) {
+ pr_info("Ignore balloon-up request!\n");
+ break;
+ }
+
if (dm->state == DM_BALLOON_UP)
pr_warn("Currently ballooning\n");
bal_msg = (struct dm_balloon *)recv_buffer;
@@ -1511,6 +1514,11 @@
break;
case DM_UNBALLOON_REQUEST:
+ if (allow_hibernation) {
+ pr_info("Ignore balloon-down request!\n");
+ break;
+ }
+
dm->state = DM_BALLOON_DOWN;
balloon_down(dm,
(struct dm_unballoon_request *)recv_buffer);
@@ -1616,6 +1624,11 @@
cap_msg.hdr.size = sizeof(struct dm_capabilities);
cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
+ /*
+ * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host
+ * currently still requires the bits to be set, so we have to add code
+ * to fail the host's hot-add and balloon up/down requests, if any.
+ */
cap_msg.caps.cap_bits.balloon = 1;
cap_msg.caps.cap_bits.hot_add = 1;
@@ -1665,6 +1678,10 @@
{
int ret;
+ allow_hibernation = hv_is_hibernation_supported();
+ if (allow_hibernation)
+ hot_add = false;
+
#ifdef CONFIG_MEMORY_HOTPLUG
do_hot_add = hot_add;
#else
@@ -1683,6 +1700,7 @@
#ifdef CONFIG_MEMORY_HOTPLUG
set_online_page_callback(&hv_online_page);
+ init_completion(&dm_device.ol_waitevent);
register_memory_notifier(&hv_memory_nb);
#endif
@@ -1704,6 +1722,8 @@
return 0;
probe_error:
+ dm_device.state = DM_INIT_ERROR;
+ dm_device.thread = NULL;
vmbus_close(dev->channel);
#ifdef CONFIG_MEMORY_HOTPLUG
unregister_memory_notifier(&hv_memory_nb);
@@ -1745,6 +1765,59 @@
return 0;
}
+static int balloon_suspend(struct hv_device *hv_dev)
+{
+ struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev);
+
+ tasklet_disable(&hv_dev->channel->callback_event);
+
+ cancel_work_sync(&dm->balloon_wrk.wrk);
+ cancel_work_sync(&dm->ha_wrk.wrk);
+
+ if (dm->thread) {
+ kthread_stop(dm->thread);
+ dm->thread = NULL;
+ vmbus_close(hv_dev->channel);
+ }
+
+ tasklet_enable(&hv_dev->channel->callback_event);
+
+ return 0;
+
+}
+
+static int balloon_resume(struct hv_device *dev)
+{
+ int ret;
+
+ dm_device.state = DM_INITIALIZING;
+
+ ret = balloon_connect_vsp(dev);
+
+ if (ret != 0)
+ goto out;
+
+ dm_device.thread =
+ kthread_run(dm_thread_func, &dm_device, "hv_balloon");
+ if (IS_ERR(dm_device.thread)) {
+ ret = PTR_ERR(dm_device.thread);
+ dm_device.thread = NULL;
+ goto close_channel;
+ }
+
+ dm_device.state = DM_INITIALIZED;
+ return 0;
+close_channel:
+ vmbus_close(dev->channel);
+out:
+ dm_device.state = DM_INIT_ERROR;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ unregister_memory_notifier(&hv_memory_nb);
+ restore_online_page_callback(&hv_online_page);
+#endif
+ return ret;
+}
+
static const struct hv_vmbus_device_id id_table[] = {
/* Dynamic Memory Class ID */
/* 525074DC-8985-46e2-8057-A307DC18A502 */
@@ -1759,6 +1832,8 @@
.id_table = id_table,
.probe = balloon_probe,
.remove = balloon_remove,
+ .suspend = balloon_suspend,
+ .resume = balloon_resume,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
diff --git a/drivers/hv/hv_debugfs.c b/drivers/hv/hv_debugfs.c
new file mode 100644
index 0000000..ccf752b
--- /dev/null
+++ b/drivers/hv/hv_debugfs.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * Branden Bonaby <brandonbonaby94@gmail.com>
+ */
+
+#include <linux/hyperv.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+
+#include "hyperv_vmbus.h"
+
+static struct dentry *hv_debug_root;
+
+static int hv_debugfs_delay_get(void *data, u64 *val)
+{
+ *val = *(u32 *)data;
+ return 0;
+}
+
+static int hv_debugfs_delay_set(void *data, u64 val)
+{
+ if (val > 1000)
+ return -EINVAL;
+ *(u32 *)data = val;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(hv_debugfs_delay_fops, hv_debugfs_delay_get,
+ hv_debugfs_delay_set, "%llu\n");
+
+static int hv_debugfs_state_get(void *data, u64 *val)
+{
+ *val = *(bool *)data;
+ return 0;
+}
+
+static int hv_debugfs_state_set(void *data, u64 val)
+{
+ if (val == 1)
+ *(bool *)data = true;
+ else if (val == 0)
+ *(bool *)data = false;
+ else
+ return -EINVAL;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(hv_debugfs_state_fops, hv_debugfs_state_get,
+ hv_debugfs_state_set, "%llu\n");
+
+/* Setup delay files to store test values */
+static int hv_debug_delay_files(struct hv_device *dev, struct dentry *root)
+{
+ struct vmbus_channel *channel = dev->channel;
+ char *buffer = "fuzz_test_buffer_interrupt_delay";
+ char *message = "fuzz_test_message_delay";
+ int *buffer_val = &channel->fuzz_testing_interrupt_delay;
+ int *message_val = &channel->fuzz_testing_message_delay;
+ struct dentry *buffer_file, *message_file;
+
+ buffer_file = debugfs_create_file(buffer, 0644, root,
+ buffer_val,
+ &hv_debugfs_delay_fops);
+ if (IS_ERR(buffer_file)) {
+ pr_debug("debugfs_hyperv: file %s not created\n", buffer);
+ return PTR_ERR(buffer_file);
+ }
+
+ message_file = debugfs_create_file(message, 0644, root,
+ message_val,
+ &hv_debugfs_delay_fops);
+ if (IS_ERR(message_file)) {
+ pr_debug("debugfs_hyperv: file %s not created\n", message);
+ return PTR_ERR(message_file);
+ }
+
+ return 0;
+}
+
+/* Setup test state value for vmbus device */
+static int hv_debug_set_test_state(struct hv_device *dev, struct dentry *root)
+{
+ struct vmbus_channel *channel = dev->channel;
+ bool *state = &channel->fuzz_testing_state;
+ char *status = "fuzz_test_state";
+ struct dentry *test_state;
+
+ test_state = debugfs_create_file(status, 0644, root,
+ state,
+ &hv_debugfs_state_fops);
+ if (IS_ERR(test_state)) {
+ pr_debug("debugfs_hyperv: file %s not created\n", status);
+ return PTR_ERR(test_state);
+ }
+
+ return 0;
+}
+
+/* Bind hv device to a dentry for debugfs */
+static void hv_debug_set_dir_dentry(struct hv_device *dev, struct dentry *root)
+{
+ if (hv_debug_root)
+ dev->debug_dir = root;
+}
+
+/* Create all test dentry's and names for fuzz testing */
+int hv_debug_add_dev_dir(struct hv_device *dev)
+{
+ const char *device = dev_name(&dev->device);
+ char *delay_name = "delay";
+ struct dentry *delay, *dev_root;
+ int ret;
+
+ if (!IS_ERR(hv_debug_root)) {
+ dev_root = debugfs_create_dir(device, hv_debug_root);
+ if (IS_ERR(dev_root)) {
+ pr_debug("debugfs_hyperv: hyperv/%s/ not created\n",
+ device);
+ return PTR_ERR(dev_root);
+ }
+ hv_debug_set_test_state(dev, dev_root);
+ hv_debug_set_dir_dentry(dev, dev_root);
+ delay = debugfs_create_dir(delay_name, dev_root);
+
+ if (IS_ERR(delay)) {
+ pr_debug("debugfs_hyperv: hyperv/%s/%s/ not created\n",
+ device, delay_name);
+ return PTR_ERR(delay);
+ }
+ ret = hv_debug_delay_files(dev, delay);
+
+ return ret;
+ }
+ pr_debug("debugfs_hyperv: hyperv/ not in root debugfs path\n");
+ return PTR_ERR(hv_debug_root);
+}
+
+/* Remove dentry associated with released hv device */
+void hv_debug_rm_dev_dir(struct hv_device *dev)
+{
+ if (!IS_ERR(hv_debug_root))
+ debugfs_remove_recursive(dev->debug_dir);
+}
+
+/* Remove all dentrys associated with vmbus testing */
+void hv_debug_rm_all_dir(void)
+{
+ debugfs_remove_recursive(hv_debug_root);
+}
+
+/* Delay buffer/message reads on a vmbus channel */
+void hv_debug_delay_test(struct vmbus_channel *channel, enum delay delay_type)
+{
+ struct vmbus_channel *test_channel = channel->primary_channel ?
+ channel->primary_channel :
+ channel;
+ bool state = test_channel->fuzz_testing_state;
+
+ if (state) {
+ if (delay_type == 0)
+ udelay(test_channel->fuzz_testing_interrupt_delay);
+ else
+ udelay(test_channel->fuzz_testing_message_delay);
+ }
+}
+
+/* Initialize top dentry for vmbus testing */
+int hv_debug_init(void)
+{
+ hv_debug_root = debugfs_create_dir("hyperv", NULL);
+ if (IS_ERR(hv_debug_root)) {
+ pr_debug("debugfs_hyperv: hyperv/ not created\n");
+ return PTR_ERR(hv_debug_root);
+ }
+ return 0;
+}
diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c
index 7e30ae0..5040d7e 100644
--- a/drivers/hv/hv_fcopy.c
+++ b/drivers/hv/hv_fcopy.c
@@ -13,6 +13,7 @@
#include <linux/workqueue.h>
#include <linux/hyperv.h>
#include <linux/sched.h>
+#include <asm/hyperv-tlfs.h>
#include "hyperv_vmbus.h"
#include "hv_utils_transport.h"
@@ -70,7 +71,7 @@
{
/* Transaction is finished, reset the state here to avoid races. */
fcopy_transaction.state = HVUTIL_READY;
- hv_fcopy_onchannelcallback(channel);
+ tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event);
}
static void fcopy_timeout_func(struct work_struct *dummy)
@@ -234,7 +235,7 @@
if (fcopy_transaction.state > HVUTIL_READY)
return;
- vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen,
+ vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen,
&requestid);
if (recvlen <= 0)
return;
@@ -345,9 +346,61 @@
return 0;
}
+static void hv_fcopy_cancel_work(void)
+{
+ cancel_delayed_work_sync(&fcopy_timeout_work);
+ cancel_work_sync(&fcopy_send_work);
+}
+
+int hv_fcopy_pre_suspend(void)
+{
+ struct vmbus_channel *channel = fcopy_transaction.recv_channel;
+ struct hv_fcopy_hdr *fcopy_msg;
+
+ /*
+ * Fake a CANCEL_FCOPY message for the user space daemon in case the
+ * daemon is in the middle of copying some file. It doesn't matter if
+ * there is already a message pending to be delivered to the user
+ * space since we force fcopy_transaction.state to be HVUTIL_READY, so
+ * the user space daemon's write() will fail with EINVAL (see
+ * fcopy_on_msg()), and the daemon will reset the device by closing
+ * and re-opening it.
+ */
+ fcopy_msg = kzalloc(sizeof(*fcopy_msg), GFP_KERNEL);
+ if (!fcopy_msg)
+ return -ENOMEM;
+
+ tasklet_disable(&channel->callback_event);
+
+ fcopy_msg->operation = CANCEL_FCOPY;
+
+ hv_fcopy_cancel_work();
+
+ /* We don't care about the return value. */
+ hvutil_transport_send(hvt, fcopy_msg, sizeof(*fcopy_msg), NULL);
+
+ kfree(fcopy_msg);
+
+ fcopy_transaction.state = HVUTIL_READY;
+
+ /* tasklet_enable() will be called in hv_fcopy_pre_resume(). */
+ return 0;
+}
+
+int hv_fcopy_pre_resume(void)
+{
+ struct vmbus_channel *channel = fcopy_transaction.recv_channel;
+
+ tasklet_enable(&channel->callback_event);
+
+ return 0;
+}
+
void hv_fcopy_deinit(void)
{
fcopy_transaction.state = HVUTIL_DEVICE_DYING;
- cancel_delayed_work_sync(&fcopy_timeout_work);
+
+ hv_fcopy_cancel_work();
+
hvutil_transport_destroy(hvt);
}
diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 5054d11..754d35a 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -27,6 +27,7 @@
#include <linux/connector.h>
#include <linux/workqueue.h>
#include <linux/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include "hyperv_vmbus.h"
#include "hv_utils_transport.h"
@@ -353,7 +354,7 @@
out->body.kvp_ip_val.dhcp_enabled = in->kvp_ip_val.dhcp_enabled;
- /* fallthrough */
+ fallthrough;
case KVP_OP_GET_IP_INFO:
utf16s_to_utf8s((wchar_t *)in->kvp_ip_val.adapter_id,
@@ -661,7 +662,7 @@
if (kvp_transaction.state > HVUTIL_READY)
return;
- vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 4, &recvlen,
+ vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 4, &recvlen,
&requestid);
if (recvlen > 0) {
@@ -757,11 +758,50 @@
return 0;
}
-void hv_kvp_deinit(void)
+static void hv_kvp_cancel_work(void)
{
- kvp_transaction.state = HVUTIL_DEVICE_DYING;
cancel_delayed_work_sync(&kvp_host_handshake_work);
cancel_delayed_work_sync(&kvp_timeout_work);
cancel_work_sync(&kvp_sendkey_work);
+}
+
+int hv_kvp_pre_suspend(void)
+{
+ struct vmbus_channel *channel = kvp_transaction.recv_channel;
+
+ tasklet_disable(&channel->callback_event);
+
+ /*
+ * If there is a pending transtion, it's unnecessary to tell the host
+ * that the transaction will fail, because that is implied when
+ * util_suspend() calls vmbus_close() later.
+ */
+ hv_kvp_cancel_work();
+
+ /*
+ * Forece the state to READY to handle the ICMSGTYPE_NEGOTIATE message
+ * later. The user space daemon may go out of order and its write()
+ * may fail with EINVAL: this doesn't matter since the daemon will
+ * reset the device by closing and re-opening it.
+ */
+ kvp_transaction.state = HVUTIL_READY;
+ return 0;
+}
+
+int hv_kvp_pre_resume(void)
+{
+ struct vmbus_channel *channel = kvp_transaction.recv_channel;
+
+ tasklet_enable(&channel->callback_event);
+
+ return 0;
+}
+
+void hv_kvp_deinit(void)
+{
+ kvp_transaction.state = HVUTIL_DEVICE_DYING;
+
+ hv_kvp_cancel_work();
+
hvutil_transport_destroy(hvt);
}
diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c
index 20ba95b..783779e 100644
--- a/drivers/hv/hv_snapshot.c
+++ b/drivers/hv/hv_snapshot.c
@@ -12,6 +12,7 @@
#include <linux/connector.h>
#include <linux/workqueue.h>
#include <linux/hyperv.h>
+#include <asm/hyperv-tlfs.h>
#include "hyperv_vmbus.h"
#include "hv_utils_transport.h"
@@ -79,7 +80,7 @@
{
/* Transaction is finished, reset the state here to avoid races. */
vss_transaction.state = HVUTIL_READY;
- hv_vss_onchannelcallback(channel);
+ tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event);
}
/*
@@ -297,7 +298,7 @@
if (vss_transaction.state > HVUTIL_READY)
return;
- vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen,
+ vmbus_recvpacket(channel, recv_buffer, HV_HYP_PAGE_SIZE * 2, &recvlen,
&requestid);
if (recvlen > 0) {
@@ -378,10 +379,61 @@
return 0;
}
+static void hv_vss_cancel_work(void)
+{
+ cancel_delayed_work_sync(&vss_timeout_work);
+ cancel_work_sync(&vss_handle_request_work);
+}
+
+int hv_vss_pre_suspend(void)
+{
+ struct vmbus_channel *channel = vss_transaction.recv_channel;
+ struct hv_vss_msg *vss_msg;
+
+ /*
+ * Fake a THAW message for the user space daemon in case the daemon
+ * has frozen the file systems. It doesn't matter if there is already
+ * a message pending to be delivered to the user space since we force
+ * vss_transaction.state to be HVUTIL_READY, so the user space daemon's
+ * write() will fail with EINVAL (see vss_on_msg()), and the daemon
+ * will reset the device by closing and re-opening it.
+ */
+ vss_msg = kzalloc(sizeof(*vss_msg), GFP_KERNEL);
+ if (!vss_msg)
+ return -ENOMEM;
+
+ tasklet_disable(&channel->callback_event);
+
+ vss_msg->vss_hdr.operation = VSS_OP_THAW;
+
+ /* Cancel any possible pending work. */
+ hv_vss_cancel_work();
+
+ /* We don't care about the return value. */
+ hvutil_transport_send(hvt, vss_msg, sizeof(*vss_msg), NULL);
+
+ kfree(vss_msg);
+
+ vss_transaction.state = HVUTIL_READY;
+
+ /* tasklet_enable() will be called in hv_vss_pre_resume(). */
+ return 0;
+}
+
+int hv_vss_pre_resume(void)
+{
+ struct vmbus_channel *channel = vss_transaction.recv_channel;
+
+ tasklet_enable(&channel->callback_event);
+
+ return 0;
+}
+
void hv_vss_deinit(void)
{
vss_transaction.state = HVUTIL_DEVICE_DYING;
- cancel_delayed_work_sync(&vss_timeout_work);
- cancel_work_sync(&vss_handle_request_work);
+
+ hv_vss_cancel_work();
+
hvutil_transport_destroy(hvt);
}
diff --git a/drivers/hv/hv_trace.h b/drivers/hv/hv_trace.h
index e70783e..6063bb2 100644
--- a/drivers/hv/hv_trace.h
+++ b/drivers/hv/hv_trace.h
@@ -44,10 +44,8 @@
__entry->monitorid = offer->monitorid;
__entry->is_ddc_int = offer->is_dedicated_interrupt;
__entry->connection_id = offer->connection_id;
- memcpy(__entry->if_type,
- &offer->offer.if_type.b, 16);
- memcpy(__entry->if_instance,
- &offer->offer.if_instance.b, 16);
+ export_guid(__entry->if_type, &offer->offer.if_type);
+ export_guid(__entry->if_instance, &offer->offer.if_instance);
__entry->chn_flags = offer->offer.chn_flags;
__entry->mmio_mb = offer->offer.mmio_megabytes;
__entry->sub_idx = offer->offer.sub_channel_index;
@@ -286,8 +284,8 @@
__field(int, ret)
),
TP_fast_assign(
- memcpy(__entry->guest_id, &msg->guest_endpoint_id.b, 16);
- memcpy(__entry->host_id, &msg->host_service_id.b, 16);
+ export_guid(__entry->guest_id, &msg->guest_endpoint_id);
+ export_guid(__entry->host_id, &msg->host_service_id);
__entry->ret = ret;
),
TP_printk("sending guest_endpoint_id %pUl, host_service_id %pUl, "
@@ -296,6 +294,25 @@
)
);
+TRACE_EVENT(vmbus_send_modifychannel,
+ TP_PROTO(const struct vmbus_channel_modifychannel *msg,
+ int ret),
+ TP_ARGS(msg, ret),
+ TP_STRUCT__entry(
+ __field(u32, child_relid)
+ __field(u32, target_vp)
+ __field(int, ret)
+ ),
+ TP_fast_assign(
+ __entry->child_relid = msg->child_relid;
+ __entry->target_vp = msg->target_vp;
+ __entry->ret = ret;
+ ),
+ TP_printk("binding child_relid 0x%x to target_vp 0x%x, ret %d",
+ __entry->child_relid, __entry->target_vp, __entry->ret
+ )
+ );
+
DECLARE_EVENT_CLASS(vmbus_channel,
TP_PROTO(const struct vmbus_channel *channel),
TP_ARGS(channel),
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index 1671f6f..1b914e4 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -24,6 +24,10 @@
#define SD_MAJOR 3
#define SD_MINOR 0
+#define SD_MINOR_1 1
+#define SD_MINOR_2 2
+#define SD_VERSION_3_1 (SD_MAJOR << 16 | SD_MINOR_1)
+#define SD_VERSION_3_2 (SD_MAJOR << 16 | SD_MINOR_2)
#define SD_VERSION (SD_MAJOR << 16 | SD_MINOR)
#define SD_MAJOR_1 1
@@ -50,8 +54,10 @@
static int ts_srv_version;
static int hb_srv_version;
-#define SD_VER_COUNT 2
+#define SD_VER_COUNT 4
static const int sd_versions[] = {
+ SD_VERSION_3_2,
+ SD_VERSION_3_1,
SD_VERSION,
SD_VERSION_1
};
@@ -75,18 +81,56 @@
UTIL_WS2K8_FW_VERSION
};
+/*
+ * Send the "hibernate" udev event in a thread context.
+ */
+struct hibernate_work_context {
+ struct work_struct work;
+ struct hv_device *dev;
+};
+
+static struct hibernate_work_context hibernate_context;
+static bool hibernation_supported;
+
+static void send_hibernate_uevent(struct work_struct *work)
+{
+ char *uevent_env[2] = { "EVENT=hibernate", NULL };
+ struct hibernate_work_context *ctx;
+
+ ctx = container_of(work, struct hibernate_work_context, work);
+
+ kobject_uevent_env(&ctx->dev->device.kobj, KOBJ_CHANGE, uevent_env);
+
+ pr_info("Sent hibernation uevent\n");
+}
+
+static int hv_shutdown_init(struct hv_util_service *srv)
+{
+ struct vmbus_channel *channel = srv->channel;
+
+ INIT_WORK(&hibernate_context.work, send_hibernate_uevent);
+ hibernate_context.dev = channel->device_obj;
+
+ hibernation_supported = hv_is_hibernation_supported();
+
+ return 0;
+}
+
static void shutdown_onchannelcallback(void *context);
static struct hv_util_service util_shutdown = {
.util_cb = shutdown_onchannelcallback,
+ .util_init = hv_shutdown_init,
};
static int hv_timesync_init(struct hv_util_service *srv);
+static int hv_timesync_pre_suspend(void);
static void hv_timesync_deinit(void);
static void timesync_onchannelcallback(void *context);
static struct hv_util_service util_timesynch = {
.util_cb = timesync_onchannelcallback,
.util_init = hv_timesync_init,
+ .util_pre_suspend = hv_timesync_pre_suspend,
.util_deinit = hv_timesync_deinit,
};
@@ -98,18 +142,24 @@
static struct hv_util_service util_kvp = {
.util_cb = hv_kvp_onchannelcallback,
.util_init = hv_kvp_init,
+ .util_pre_suspend = hv_kvp_pre_suspend,
+ .util_pre_resume = hv_kvp_pre_resume,
.util_deinit = hv_kvp_deinit,
};
static struct hv_util_service util_vss = {
.util_cb = hv_vss_onchannelcallback,
.util_init = hv_vss_init,
+ .util_pre_suspend = hv_vss_pre_suspend,
+ .util_pre_resume = hv_vss_pre_resume,
.util_deinit = hv_vss_deinit,
};
static struct hv_util_service util_fcopy = {
.util_cb = hv_fcopy_onchannelcallback,
.util_init = hv_fcopy_init,
+ .util_pre_suspend = hv_fcopy_pre_suspend,
+ .util_pre_resume = hv_fcopy_pre_resume,
.util_deinit = hv_fcopy_deinit,
};
@@ -118,17 +168,27 @@
orderly_poweroff(true);
}
+static void perform_restart(struct work_struct *dummy)
+{
+ orderly_reboot();
+}
+
/*
* Perform the shutdown operation in a thread context.
*/
static DECLARE_WORK(shutdown_work, perform_shutdown);
+/*
+ * Perform the restart operation in a thread context.
+ */
+static DECLARE_WORK(restart_work, perform_restart);
+
static void shutdown_onchannelcallback(void *context)
{
struct vmbus_channel *channel = context;
+ struct work_struct *work = NULL;
u32 recvlen;
u64 requestid;
- bool execute_shutdown = false;
u8 *shut_txf_buf = util_shutdown.recv_buffer;
struct shutdown_msg_data *shutdown_msg;
@@ -136,7 +196,7 @@
struct icmsg_hdr *icmsghdrp;
vmbus_recvpacket(channel, shut_txf_buf,
- PAGE_SIZE, &recvlen, &requestid);
+ HV_HYP_PAGE_SIZE, &recvlen, &requestid);
if (recvlen > 0) {
icmsghdrp = (struct icmsg_hdr *)&shut_txf_buf[
@@ -157,19 +217,37 @@
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
+ /*
+ * shutdown_msg->flags can be 0(shut down), 2(reboot),
+ * or 4(hibernate). It may bitwise-OR 1, which means
+ * performing the request by force. Linux always tries
+ * to perform the request by force.
+ */
switch (shutdown_msg->flags) {
case 0:
case 1:
icmsghdrp->status = HV_S_OK;
- execute_shutdown = true;
-
+ work = &shutdown_work;
pr_info("Shutdown request received -"
" graceful shutdown initiated\n");
break;
+ case 2:
+ case 3:
+ icmsghdrp->status = HV_S_OK;
+ work = &restart_work;
+ pr_info("Restart request received -"
+ " graceful restart initiated\n");
+ break;
+ case 4:
+ case 5:
+ pr_info("Hibernation request received\n");
+ icmsghdrp->status = hibernation_supported ?
+ HV_S_OK : HV_E_FAIL;
+ if (hibernation_supported)
+ work = &hibernate_context.work;
+ break;
default:
icmsghdrp->status = HV_E_FAIL;
- execute_shutdown = false;
-
pr_info("Shutdown request received -"
" Invalid request\n");
break;
@@ -184,8 +262,8 @@
VM_PKT_DATA_INBAND, 0);
}
- if (execute_shutdown == true)
- schedule_work(&shutdown_work);
+ if (work)
+ schedule_work(work);
}
/*
@@ -204,26 +282,52 @@
spinlock_t lock;
} host_ts;
-static struct timespec64 hv_get_adj_host_time(void)
+static inline u64 reftime_to_ns(u64 reftime)
{
- struct timespec64 ts;
- u64 newtime, reftime;
+ return (reftime - WLTIMEDELTA) * 100;
+}
+
+/*
+ * Hard coded threshold for host timesync delay: 600 seconds
+ */
+static const u64 HOST_TIMESYNC_DELAY_THRESH = 600 * (u64)NSEC_PER_SEC;
+
+static int hv_get_adj_host_time(struct timespec64 *ts)
+{
+ u64 newtime, reftime, timediff_adj;
unsigned long flags;
+ int ret = 0;
spin_lock_irqsave(&host_ts.lock, flags);
- reftime = hyperv_cs->read(hyperv_cs);
- newtime = host_ts.host_time + (reftime - host_ts.ref_time);
- ts = ns_to_timespec64((newtime - WLTIMEDELTA) * 100);
+ reftime = hv_read_reference_counter();
+
+ /*
+ * We need to let the caller know that last update from host
+ * is older than the max allowable threshold. clock_gettime()
+ * and PTP ioctl do not have a documented error that we could
+ * return for this specific case. Use ESTALE to report this.
+ */
+ timediff_adj = reftime - host_ts.ref_time;
+ if (timediff_adj * 100 > HOST_TIMESYNC_DELAY_THRESH) {
+ pr_warn_once("TIMESYNC IC: Stale time stamp, %llu nsecs old\n",
+ (timediff_adj * 100));
+ ret = -ESTALE;
+ }
+
+ newtime = host_ts.host_time + timediff_adj;
+ *ts = ns_to_timespec64(reftime_to_ns(newtime));
spin_unlock_irqrestore(&host_ts.lock, flags);
- return ts;
+ return ret;
}
static void hv_set_host_time(struct work_struct *work)
{
- struct timespec64 ts = hv_get_adj_host_time();
- do_settimeofday64(&ts);
+ struct timespec64 ts;
+
+ if (!hv_get_adj_host_time(&ts))
+ do_settimeofday64(&ts);
}
/*
@@ -250,7 +354,7 @@
*/
spin_lock_irqsave(&host_ts.lock, flags);
- cur_reftime = hyperv_cs->read(hyperv_cs);
+ cur_reftime = hv_read_reference_counter();
host_ts.host_time = hosttime;
host_ts.ref_time = cur_reftime;
@@ -283,10 +387,23 @@
struct ictimesync_ref_data *refdata;
u8 *time_txf_buf = util_timesynch.recv_buffer;
- vmbus_recvpacket(channel, time_txf_buf,
- PAGE_SIZE, &recvlen, &requestid);
+ /*
+ * Drain the ring buffer and use the last packet to update
+ * host_ts
+ */
+ while (1) {
+ int ret = vmbus_recvpacket(channel, time_txf_buf,
+ HV_HYP_PAGE_SIZE, &recvlen,
+ &requestid);
+ if (ret) {
+ pr_warn_once("TimeSync IC pkt recv failed (Err: %d)\n",
+ ret);
+ break;
+ }
- if (recvlen > 0) {
+ if (!recvlen)
+ break;
+
icmsghdrp = (struct icmsg_hdr *)&time_txf_buf[
sizeof(struct vmbuspipe_hdr)];
@@ -315,7 +432,7 @@
sizeof(struct vmbuspipe_hdr) +
sizeof(struct icmsg_hdr)];
adj_guesttime(timedatap->parenttime,
- hyperv_cs->read(hyperv_cs),
+ hv_read_reference_counter(),
timedatap->flags);
}
}
@@ -346,7 +463,7 @@
while (1) {
vmbus_recvpacket(channel, hbeat_txf_buf,
- PAGE_SIZE, &recvlen, &requestid);
+ HV_HYP_PAGE_SIZE, &recvlen, &requestid);
if (!recvlen)
break;
@@ -383,6 +500,9 @@
}
}
+#define HV_UTIL_RING_SEND_SIZE VMBUS_RING_SIZE(3 * HV_HYP_PAGE_SIZE)
+#define HV_UTIL_RING_RECV_SIZE VMBUS_RING_SIZE(3 * HV_HYP_PAGE_SIZE)
+
static int util_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
@@ -390,7 +510,7 @@
(struct hv_util_service *)dev_id->driver_data;
int ret;
- srv->recv_buffer = kmalloc(PAGE_SIZE * 4, GFP_KERNEL);
+ srv->recv_buffer = kmalloc(HV_HYP_PAGE_SIZE * 4, GFP_KERNEL);
if (!srv->recv_buffer)
return -ENOMEM;
srv->channel = dev->channel;
@@ -413,8 +533,9 @@
hv_set_drvdata(dev, srv);
- ret = vmbus_open(dev->channel, 4 * PAGE_SIZE, 4 * PAGE_SIZE, NULL, 0,
- srv->util_cb, dev->channel);
+ ret = vmbus_open(dev->channel, HV_UTIL_RING_SEND_SIZE,
+ HV_UTIL_RING_RECV_SIZE, NULL, 0, srv->util_cb,
+ dev->channel);
if (ret)
goto error;
@@ -440,6 +561,44 @@
return 0;
}
+/*
+ * When we're in util_suspend(), all the userspace processes have been frozen
+ * (refer to hibernate() -> freeze_processes()). The userspace is thawed only
+ * after the whole resume procedure, including util_resume(), finishes.
+ */
+static int util_suspend(struct hv_device *dev)
+{
+ struct hv_util_service *srv = hv_get_drvdata(dev);
+ int ret = 0;
+
+ if (srv->util_pre_suspend) {
+ ret = srv->util_pre_suspend();
+ if (ret)
+ return ret;
+ }
+
+ vmbus_close(dev->channel);
+
+ return 0;
+}
+
+static int util_resume(struct hv_device *dev)
+{
+ struct hv_util_service *srv = hv_get_drvdata(dev);
+ int ret = 0;
+
+ if (srv->util_pre_resume) {
+ ret = srv->util_pre_resume();
+ if (ret)
+ return ret;
+ }
+
+ ret = vmbus_open(dev->channel, HV_UTIL_RING_SEND_SIZE,
+ HV_UTIL_RING_RECV_SIZE, NULL, 0, srv->util_cb,
+ dev->channel);
+ return ret;
+}
+
static const struct hv_vmbus_device_id id_table[] = {
/* Shutdown guid */
{ HV_SHUTDOWN_GUID,
@@ -476,6 +635,8 @@
.id_table = id_table,
.probe = util_probe,
.remove = util_remove,
+ .suspend = util_suspend,
+ .resume = util_resume,
.driver = {
.probe_type = PROBE_PREFER_ASYNCHRONOUS,
},
@@ -503,9 +664,7 @@
static int hv_ptp_gettime(struct ptp_clock_info *info, struct timespec64 *ts)
{
- *ts = hv_get_adj_host_time();
-
- return 0;
+ return hv_get_adj_host_time(ts);
}
static struct ptp_clock_info ptp_hyperv_info = {
@@ -523,7 +682,7 @@
static int hv_timesync_init(struct hv_util_service *srv)
{
/* TimeSync requires Hyper-V clocksource. */
- if (!hyperv_cs)
+ if (!hv_read_reference_counter)
return -ENODEV;
spin_lock_init(&host_ts.lock);
@@ -545,11 +704,23 @@
return 0;
}
+static void hv_timesync_cancel_work(void)
+{
+ cancel_work_sync(&adj_time_work);
+}
+
+static int hv_timesync_pre_suspend(void)
+{
+ hv_timesync_cancel_work();
+ return 0;
+}
+
static void hv_timesync_deinit(void)
{
if (hv_ptp_clock)
ptp_clock_unregister(hv_ptp_clock);
- cancel_work_sync(&adj_time_work);
+
+ hv_timesync_cancel_work();
}
static int __init init_hyperv_utils(void)
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index cabcb66..7845fa5 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -13,6 +13,7 @@
#define _HYPERV_VMBUS_H
#include <linux/list.h>
+#include <linux/bitops.h>
#include <asm/sync_bitops.h>
#include <asm/hyperv-tlfs.h>
#include <linux/atomic.h>
@@ -132,12 +133,6 @@
* basis.
*/
struct tasklet_struct msg_dpc;
-
- /*
- * To optimize the mapping of relid to channel, maintain
- * per-cpu list of the channels based on their CPU affinity.
- */
- struct list_head chan_list;
};
struct hv_context {
@@ -202,6 +197,8 @@
/* TODO: Need to make this configurable */
#define MAX_NUM_CHANNELS_SUPPORTED 256
+#define MAX_CHANNEL_RELIDS \
+ max(MAX_NUM_CHANNELS_SUPPORTED, HV_EVENT_FLAGS_COUNT)
enum vmbus_connect_state {
DISCONNECTED,
@@ -251,6 +248,9 @@
struct list_head chn_list;
struct mutex channel_mutex;
+ /* Array of channels */
+ struct vmbus_channel **channels;
+
/*
* An offer message is handled first on the work_queue, and then
* is further handled on handle_primary_chan_wq or
@@ -293,7 +293,7 @@
struct list_head msglist_entry;
/* The message itself */
- unsigned char msg[0];
+ unsigned char msg[];
};
@@ -318,6 +318,7 @@
enum vmbus_channel_message_type message_type;
enum vmbus_message_handler_type handler_type;
void (*message_handler)(struct vmbus_channel_message_header *msg);
+ u32 min_payload_len;
};
extern const struct vmbus_channel_message_table_entry
@@ -337,6 +338,9 @@
void vmbus_remove_channel_attr_group(struct vmbus_channel *channel);
+void vmbus_channel_map_relid(struct vmbus_channel *channel);
+void vmbus_channel_unmap_relid(struct vmbus_channel *channel);
+
struct vmbus_channel *relid2channel(u32 relid);
void vmbus_free_channels(void);
@@ -353,14 +357,20 @@
int hv_kvp_init(struct hv_util_service *srv);
void hv_kvp_deinit(void);
+int hv_kvp_pre_suspend(void);
+int hv_kvp_pre_resume(void);
void hv_kvp_onchannelcallback(void *context);
int hv_vss_init(struct hv_util_service *srv);
void hv_vss_deinit(void);
+int hv_vss_pre_suspend(void);
+int hv_vss_pre_resume(void);
void hv_vss_onchannelcallback(void *context);
int hv_fcopy_init(struct hv_util_service *srv);
void hv_fcopy_deinit(void);
+int hv_fcopy_pre_suspend(void);
+int hv_fcopy_pre_resume(void);
void hv_fcopy_onchannelcallback(void *context);
void vmbus_initiate_unload(bool crash);
@@ -369,12 +379,7 @@
{
if (!channel)
return;
-
- if (in_interrupt() && (channel->target_cpu == smp_processor_id())) {
- cb(channel);
- return;
- }
- smp_call_function_single(channel->target_cpu, cb, channel, true);
+ cb(channel);
}
enum hvutil_device_state {
@@ -386,4 +391,83 @@
HVUTIL_DEVICE_DYING, /* driver unload is in progress */
};
+enum delay {
+ INTERRUPT_DELAY = 0,
+ MESSAGE_DELAY = 1,
+};
+
+extern const struct vmbus_device vmbus_devs[];
+
+static inline bool hv_is_perf_channel(struct vmbus_channel *channel)
+{
+ return vmbus_devs[channel->device_id].perf_device;
+}
+
+static inline bool hv_is_alloced_cpu(unsigned int cpu)
+{
+ struct vmbus_channel *channel, *sc;
+
+ lockdep_assert_held(&vmbus_connection.channel_mutex);
+ /*
+ * List additions/deletions as well as updates of the target CPUs are
+ * protected by channel_mutex.
+ */
+ list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+ if (!hv_is_perf_channel(channel))
+ continue;
+ if (channel->target_cpu == cpu)
+ return true;
+ list_for_each_entry(sc, &channel->sc_list, sc_list) {
+ if (sc->target_cpu == cpu)
+ return true;
+ }
+ }
+ return false;
+}
+
+static inline void hv_set_alloced_cpu(unsigned int cpu)
+{
+ cpumask_set_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]);
+}
+
+static inline void hv_clear_alloced_cpu(unsigned int cpu)
+{
+ if (hv_is_alloced_cpu(cpu))
+ return;
+ cpumask_clear_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]);
+}
+
+static inline void hv_update_alloced_cpus(unsigned int old_cpu,
+ unsigned int new_cpu)
+{
+ hv_set_alloced_cpu(new_cpu);
+ hv_clear_alloced_cpu(old_cpu);
+}
+
+#ifdef CONFIG_HYPERV_TESTING
+
+int hv_debug_add_dev_dir(struct hv_device *dev);
+void hv_debug_rm_dev_dir(struct hv_device *dev);
+void hv_debug_rm_all_dir(void);
+int hv_debug_init(void);
+void hv_debug_delay_test(struct vmbus_channel *channel, enum delay delay_type);
+
+#else /* CONFIG_HYPERV_TESTING */
+
+static inline void hv_debug_rm_dev_dir(struct hv_device *dev) {};
+static inline void hv_debug_rm_all_dir(void) {};
+static inline void hv_debug_delay_test(struct vmbus_channel *channel,
+ enum delay delay_type) {};
+static inline int hv_debug_init(void)
+{
+ return -1;
+}
+
+static inline int hv_debug_add_dev_dir(struct hv_device *dev)
+{
+ return -1;
+}
+
+#endif /* CONFIG_HYPERV_TESTING */
+
#endif /* _HYPERV_VMBUS_H */
diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c
index 9a03b16..356e221 100644
--- a/drivers/hv/ring_buffer.c
+++ b/drivers/hv/ring_buffer.c
@@ -396,6 +396,7 @@
struct hv_ring_buffer_info *rbi = &channel->inbound;
struct vmpacket_descriptor *desc;
+ hv_debug_delay_test(channel, MESSAGE_DELAY);
if (hv_pkt_iter_avail(rbi) < sizeof(struct vmpacket_descriptor))
return NULL;
@@ -421,6 +422,7 @@
u32 packetlen = desc->len8 << 3;
u32 dsize = rbi->ring_datasize;
+ hv_debug_delay_test(channel, MESSAGE_DELAY);
/* bump offset to next potential packet */
rbi->priv_read_index += packetlen + VMBUS_PKT_TRAILER;
if (rbi->priv_read_index >= dsize)
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 2d2568d..362da2a 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -23,7 +23,6 @@
#include <linux/cpu.h>
#include <linux/sched/task_stack.h>
-#include <asm/mshyperv.h>
#include <linux/delay.h>
#include <linux/notifier.h>
#include <linux/ptrace.h>
@@ -49,6 +48,10 @@
static void *hv_panic_page;
+/* Values parsed from ACPI DSDT */
+static int vmbus_irq;
+int vmbus_interrupt;
+
/*
* Boolean to control whether to report panic messages over Hyper-V.
*
@@ -84,9 +87,13 @@
static int hyperv_die_event(struct notifier_block *nb, unsigned long val,
void *args)
{
- struct die_args *die = (struct die_args *)args;
+ struct die_args *die = args;
struct pt_regs *regs = die->regs;
+ /* Don't notify Hyper-V if the die event is other than oops */
+ if (val != DIE_OOPS)
+ return NOTIFY_DONE;
+
/*
* Hyper-V should be notified only once about a panic. If we will be
* doing hyperv_report_panic_msg() later with kmsg data, don't do
@@ -107,7 +114,7 @@
static const char *fb_mmio_name = "fb_range";
static struct resource *fb_mmio;
static struct resource *hyperv_mmio;
-static DEFINE_SEMAPHORE(hyperv_mmio_lock);
+static DEFINE_MUTEX(hyperv_mmio_lock);
static int vmbus_exists(void)
{
@@ -117,14 +124,6 @@
return 0;
}
-#define VMBUS_ALIAS_LEN ((sizeof((struct hv_vmbus_device_id *)0)->guid) * 2)
-static void print_alias_name(struct hv_device *hv_dev, char *alias_name)
-{
- int i;
- for (i = 0; i < VMBUS_ALIAS_LEN; i += 2)
- sprintf(&alias_name[i], "%02x", hv_dev->dev_type.b[i/2]);
-}
-
static u8 channel_monitor_group(const struct vmbus_channel *channel)
{
return (u8)channel->offermsg.monitorid / 32;
@@ -201,7 +200,7 @@
if (!hv_dev->channel)
return -ENODEV;
return sprintf(buf, "{%pUl}\n",
- hv_dev->channel->offermsg.offer.if_type.b);
+ &hv_dev->channel->offermsg.offer.if_type);
}
static DEVICE_ATTR_RO(class_id);
@@ -213,7 +212,7 @@
if (!hv_dev->channel)
return -ENODEV;
return sprintf(buf, "{%pUl}\n",
- hv_dev->channel->offermsg.offer.if_instance.b);
+ &hv_dev->channel->offermsg.offer.if_instance);
}
static DEVICE_ATTR_RO(device_id);
@@ -221,10 +220,8 @@
struct device_attribute *dev_attr, char *buf)
{
struct hv_device *hv_dev = device_to_hv_device(dev);
- char alias_name[VMBUS_ALIAS_LEN + 1];
- print_alias_name(hv_dev, alias_name);
- return sprintf(buf, "vmbus:%s\n", alias_name);
+ return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type);
}
static DEVICE_ATTR_RO(modalias);
@@ -237,7 +234,7 @@
if (!hv_dev->channel)
return -ENODEV;
- return sprintf(buf, "%d\n", hv_dev->channel->numa_node);
+ return sprintf(buf, "%d\n", cpu_to_node(hv_dev->channel->target_cpu));
}
static DEVICE_ATTR_RO(numa_node);
#endif
@@ -518,18 +515,17 @@
{
struct hv_device *hv_dev = device_to_hv_device(dev);
struct vmbus_channel *channel = hv_dev->channel, *cur_sc;
- unsigned long flags;
int buf_size = PAGE_SIZE, n_written, tot_written;
struct list_head *cur;
if (!channel)
return -ENODEV;
+ mutex_lock(&vmbus_connection.channel_mutex);
+
tot_written = snprintf(buf, buf_size, "%u:%u\n",
channel->offermsg.child_relid, channel->target_cpu);
- spin_lock_irqsave(&channel->lock, flags);
-
list_for_each(cur, &channel->sc_list) {
if (tot_written >= buf_size - 1)
break;
@@ -543,7 +539,7 @@
tot_written += n_written;
}
- spin_unlock_irqrestore(&channel->lock, flags);
+ mutex_unlock(&vmbus_connection.channel_mutex);
return tot_written;
}
@@ -693,12 +689,9 @@
static int vmbus_uevent(struct device *device, struct kobj_uevent_env *env)
{
struct hv_device *dev = device_to_hv_device(device);
- int ret;
- char alias_name[VMBUS_ALIAS_LEN + 1];
+ const char *format = "MODALIAS=vmbus:%*phN";
- print_alias_name(dev, alias_name);
- ret = add_uevent_var(env, "MODALIAS=vmbus:%s", alias_name);
- return ret;
+ return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type);
}
static const struct hv_vmbus_device_id *
@@ -991,6 +984,8 @@
struct hv_device *hv_dev = device_to_hv_device(device);
struct vmbus_channel *channel = hv_dev->channel;
+ hv_debug_rm_dev_dir(hv_dev);
+
mutex_lock(&vmbus_connection.channel_mutex);
hv_process_channel_removal(channel);
mutex_unlock(&vmbus_connection.channel_mutex);
@@ -1031,7 +1026,10 @@
struct onmessage_work_context {
struct work_struct work;
- struct hv_message msg;
+ struct {
+ struct hv_message_header header;
+ u8 payload[];
+ } msg;
};
static void vmbus_onmessage_work(struct work_struct *work)
@@ -1044,7 +1042,8 @@
ctx = container_of(work, struct onmessage_work_context,
work);
- vmbus_onmessage(&ctx->msg);
+ vmbus_onmessage((struct vmbus_channel_message_header *)
+ &ctx->msg.payload);
kfree(ctx);
}
@@ -1059,6 +1058,13 @@
struct onmessage_work_context *ctx;
u32 message_type = msg->header.message_type;
+ /*
+ * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as
+ * it is being used in 'struct vmbus_channel_message_header' definition
+ * which is supposed to match hypervisor ABI.
+ */
+ BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32));
+
if (message_type == HVMSG_NONE)
/* no msg */
return;
@@ -1072,24 +1078,39 @@
goto msg_handled;
}
+ if (msg->header.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) {
+ WARN_ONCE(1, "payload size is too large (%d)\n",
+ msg->header.payload_size);
+ goto msg_handled;
+ }
+
entry = &channel_message_table[hdr->msgtype];
if (!entry->message_handler)
goto msg_handled;
+ if (msg->header.payload_size < entry->min_payload_len) {
+ WARN_ONCE(1, "message too short: msgtype=%d len=%d\n",
+ hdr->msgtype, msg->header.payload_size);
+ goto msg_handled;
+ }
+
if (entry->handler_type == VMHT_BLOCKING) {
- ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
+ ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size,
+ GFP_ATOMIC);
if (ctx == NULL)
return;
INIT_WORK(&ctx->work, vmbus_onmessage_work);
- memcpy(&ctx->msg, msg, sizeof(*msg));
+ memcpy(&ctx->msg, msg, sizeof(msg->header) +
+ msg->header.payload_size);
/*
* The host can generate a rescind message while we
* may still be handling the original offer. We deal with
- * this condition by ensuring the processing is done on the
- * same CPU.
+ * this condition by relying on the synchronization provided
+ * by offer_in_progress and by channel_mutex. See also the
+ * inline comments in vmbus_onoffer_rescind().
*/
switch (hdr->msgtype) {
case CHANNELMSG_RESCIND_CHANNELOFFER:
@@ -1111,16 +1132,34 @@
* work queue: the RESCIND handler can not start to
* run before the OFFER handler finishes.
*/
- schedule_work_on(VMBUS_CONNECT_CPU,
- &ctx->work);
+ schedule_work(&ctx->work);
break;
case CHANNELMSG_OFFERCHANNEL:
+ /*
+ * The host sends the offer message of a given channel
+ * before sending the rescind message of the same
+ * channel. These messages are sent to the guest's
+ * connect CPU; the guest then starts processing them
+ * in the tasklet handler on this CPU:
+ *
+ * VMBUS_CONNECT_CPU
+ *
+ * [vmbus_on_msg_dpc()]
+ * atomic_inc() // CHANNELMSG_OFFERCHANNEL
+ * queue_work()
+ * ...
+ * [vmbus_on_msg_dpc()]
+ * schedule_work() // CHANNELMSG_RESCIND_CHANNELOFFER
+ *
+ * We rely on the memory-ordering properties of the
+ * queue_work() and schedule_work() primitives, which
+ * guarantee that the atomic increment will be visible
+ * to the CPUs which will execute the offer & rescind
+ * works by the time these works will start execution.
+ */
atomic_inc(&vmbus_connection.offer_in_progress);
- queue_work_on(VMBUS_CONNECT_CPU,
- vmbus_connection.work_queue,
- &ctx->work);
- break;
+ fallthrough;
default:
queue_work(vmbus_connection.work_queue, &ctx->work);
@@ -1145,10 +1184,11 @@
WARN_ON(!is_hvsock_channel(channel));
/*
- * sizeof(*ctx) is small and the allocation should really not fail,
+ * Allocation size is small and the allocation should really not fail,
* otherwise the state of the hv_sock connections ends up in limbo.
*/
- ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL);
+ ctx = kzalloc(sizeof(*ctx) + sizeof(*rescind),
+ GFP_KERNEL | __GFP_NOFAIL);
/*
* So far, these are not really used by Linux. Just set them to the
@@ -1158,31 +1198,17 @@
ctx->msg.header.payload_size = sizeof(*rescind);
/* These values are actually used by Linux. */
- rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload;
+ rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.payload;
rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER;
rescind->child_relid = channel->offermsg.child_relid;
INIT_WORK(&ctx->work, vmbus_onmessage_work);
- queue_work_on(VMBUS_CONNECT_CPU,
- vmbus_connection.work_queue,
- &ctx->work);
+ queue_work(vmbus_connection.work_queue, &ctx->work);
}
#endif /* CONFIG_PM_SLEEP */
/*
- * Direct callback for channels using other deferred processing
- */
-static void vmbus_channel_isr(struct vmbus_channel *channel)
-{
- void (*callback_fn)(void *);
-
- callback_fn = READ_ONCE(channel->onchannel_callback);
- if (likely(callback_fn != NULL))
- (*callback_fn)(channel->channel_callback_context);
-}
-
-/*
* Schedule all channels with events pending
*/
static void vmbus_chan_sched(struct hv_per_cpu_context *hv_cpu)
@@ -1212,6 +1238,7 @@
return;
for_each_set_bit(relid, recv_int_page, maxbits) {
+ void (*callback_fn)(void *context);
struct vmbus_channel *channel;
if (!sync_test_and_clear_bit(relid, recv_int_page))
@@ -1221,33 +1248,54 @@
if (relid == 0)
continue;
+ /*
+ * Pairs with the kfree_rcu() in vmbus_chan_release().
+ * Guarantees that the channel data structure doesn't
+ * get freed while the channel pointer below is being
+ * dereferenced.
+ */
rcu_read_lock();
/* Find channel based on relid */
- list_for_each_entry_rcu(channel, &hv_cpu->chan_list, percpu_list) {
- if (channel->offermsg.child_relid != relid)
- continue;
+ channel = relid2channel(relid);
+ if (channel == NULL)
+ goto sched_unlock_rcu;
- if (channel->rescind)
- continue;
+ if (channel->rescind)
+ goto sched_unlock_rcu;
- trace_vmbus_chan_sched(channel);
+ /*
+ * Make sure that the ring buffer data structure doesn't get
+ * freed while we dereference the ring buffer pointer. Test
+ * for the channel's onchannel_callback being NULL within a
+ * sched_lock critical section. See also the inline comments
+ * in vmbus_reset_channel_cb().
+ */
+ spin_lock(&channel->sched_lock);
- ++channel->interrupts;
+ callback_fn = channel->onchannel_callback;
+ if (unlikely(callback_fn == NULL))
+ goto sched_unlock;
- switch (channel->callback_mode) {
- case HV_CALL_ISR:
- vmbus_channel_isr(channel);
- break;
+ trace_vmbus_chan_sched(channel);
- case HV_CALL_BATCHED:
- hv_begin_read(&channel->inbound);
- /* fallthrough */
- case HV_CALL_DIRECT:
- tasklet_schedule(&channel->callback_event);
- }
+ ++channel->interrupts;
+
+ switch (channel->callback_mode) {
+ case HV_CALL_ISR:
+ (*callback_fn)(channel->channel_callback_context);
+ break;
+
+ case HV_CALL_BATCHED:
+ hv_begin_read(&channel->inbound);
+ fallthrough;
+ case HV_CALL_DIRECT:
+ tasklet_schedule(&channel->callback_event);
}
+sched_unlock:
+ spin_unlock(&channel->sched_lock);
+sched_unlock_rcu:
rcu_read_unlock();
}
}
@@ -1303,7 +1351,7 @@
tasklet_schedule(&hv_cpu->msg_dpc);
}
- add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0);
+ add_interrupt_randomness(hv_get_vector(), 0);
}
/*
@@ -1326,7 +1374,7 @@
* Write dump contents to the page. No need to synchronize; panic should
* be single-threaded.
*/
- kmsg_dump_get_buffer(dumper, true, hv_panic_page, PAGE_SIZE,
+ kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE,
&bytes_written);
if (bytes_written)
hyperv_report_panic_msg(panic_pa, bytes_written);
@@ -1376,7 +1424,6 @@
{
int ret;
- /* Hypervisor initialization...setup hypercall page..etc */
ret = hv_init();
if (ret != 0) {
pr_err("Unable to initialize the hypervisor - 0x%x\n", ret);
@@ -1387,16 +1434,14 @@
if (ret)
return ret;
- hv_setup_vmbus_irq(vmbus_isr);
+ ret = hv_setup_vmbus_irq(vmbus_irq, vmbus_isr);
+ if (ret)
+ goto err_setup;
ret = hv_synic_alloc();
if (ret)
goto err_alloc;
- ret = hv_stimer_alloc(VMBUS_MESSAGE_SINT);
- if (ret < 0)
- goto err_alloc;
-
/*
* Initialize the per-cpu interrupt state and stimer state.
* Then connect to the host.
@@ -1430,7 +1475,7 @@
*/
hv_get_crash_ctl(hyperv_crash_ctl);
if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) {
- hv_panic_page = (void *)get_zeroed_page(GFP_KERNEL);
+ hv_panic_page = (void *)hv_alloc_hyperv_zeroed_page();
if (hv_panic_page) {
ret = kmsg_dump_register(&hv_kmsg_dumper);
if (ret) {
@@ -1463,11 +1508,10 @@
err_connect:
cpuhp_remove_state(hyperv_cpuhp_online);
err_cpuhp:
- hv_stimer_free();
-err_alloc:
hv_synic_free();
+err_alloc:
hv_remove_vmbus_irq();
-
+err_setup:
bus_unregister(&hv_bus);
unregister_sysctl_table(hv_ctl_table_hdr);
hv_ctl_table_hdr = NULL;
@@ -1570,8 +1614,24 @@
return attribute->show(chan, buf);
}
+static ssize_t vmbus_chan_attr_store(struct kobject *kobj,
+ struct attribute *attr, const char *buf,
+ size_t count)
+{
+ const struct vmbus_chan_attribute *attribute
+ = container_of(attr, struct vmbus_chan_attribute, attr);
+ struct vmbus_channel *chan
+ = container_of(kobj, struct vmbus_channel, kobj);
+
+ if (!attribute->store)
+ return -EIO;
+
+ return attribute->store(chan, buf, count);
+}
+
static const struct sysfs_ops vmbus_chan_sysfs_ops = {
.show = vmbus_chan_attr_show,
+ .store = vmbus_chan_attr_store,
};
static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf)
@@ -1642,11 +1702,108 @@
}
static VMBUS_CHAN_ATTR_RO(write_avail);
-static ssize_t show_target_cpu(struct vmbus_channel *channel, char *buf)
+static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
{
return sprintf(buf, "%u\n", channel->target_cpu);
}
-static VMBUS_CHAN_ATTR(cpu, S_IRUGO, show_target_cpu, NULL);
+static ssize_t target_cpu_store(struct vmbus_channel *channel,
+ const char *buf, size_t count)
+{
+ u32 target_cpu, origin_cpu;
+ ssize_t ret = count;
+
+ if (vmbus_proto_version < VERSION_WIN10_V4_1)
+ return -EIO;
+
+ if (sscanf(buf, "%uu", &target_cpu) != 1)
+ return -EIO;
+
+ /* Validate target_cpu for the cpumask_test_cpu() operation below. */
+ if (target_cpu >= nr_cpumask_bits)
+ return -EINVAL;
+
+ /* No CPUs should come up or down during this. */
+ cpus_read_lock();
+
+ if (!cpu_online(target_cpu)) {
+ cpus_read_unlock();
+ return -EINVAL;
+ }
+
+ /*
+ * Synchronizes target_cpu_store() and channel closure:
+ *
+ * { Initially: state = CHANNEL_OPENED }
+ *
+ * CPU1 CPU2
+ *
+ * [target_cpu_store()] [vmbus_disconnect_ring()]
+ *
+ * LOCK channel_mutex LOCK channel_mutex
+ * LOAD r1 = state LOAD r2 = state
+ * IF (r1 == CHANNEL_OPENED) IF (r2 == CHANNEL_OPENED)
+ * SEND MODIFYCHANNEL STORE state = CHANNEL_OPEN
+ * [...] SEND CLOSECHANNEL
+ * UNLOCK channel_mutex UNLOCK channel_mutex
+ *
+ * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes
+ * CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND
+ *
+ * Note. The host processes the channel messages "sequentially", in
+ * the order in which they are received on a per-partition basis.
+ */
+ mutex_lock(&vmbus_connection.channel_mutex);
+
+ /*
+ * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
+ * avoid sending the message and fail here for such channels.
+ */
+ if (channel->state != CHANNEL_OPENED_STATE) {
+ ret = -EIO;
+ goto cpu_store_unlock;
+ }
+
+ origin_cpu = channel->target_cpu;
+ if (target_cpu == origin_cpu)
+ goto cpu_store_unlock;
+
+ if (vmbus_send_modifychannel(channel->offermsg.child_relid,
+ hv_cpu_number_to_vp_number(target_cpu))) {
+ ret = -EIO;
+ goto cpu_store_unlock;
+ }
+
+ /*
+ * Warning. At this point, there is *no* guarantee that the host will
+ * have successfully processed the vmbus_send_modifychannel() request.
+ * See the header comment of vmbus_send_modifychannel() for more info.
+ *
+ * Lags in the processing of the above vmbus_send_modifychannel() can
+ * result in missed interrupts if the "old" target CPU is taken offline
+ * before Hyper-V starts sending interrupts to the "new" target CPU.
+ * But apart from this offlining scenario, the code tolerates such
+ * lags. It will function correctly even if a channel interrupt comes
+ * in on a CPU that is different from the channel target_cpu value.
+ */
+
+ channel->target_cpu = target_cpu;
+
+ /* See init_vp_index(). */
+ if (hv_is_perf_channel(channel))
+ hv_update_alloced_cpus(origin_cpu, target_cpu);
+
+ /* Currently set only for storvsc channels. */
+ if (channel->change_target_cpu_callback) {
+ (*channel->change_target_cpu_callback)(channel,
+ origin_cpu, target_cpu);
+ }
+
+cpu_store_unlock:
+ mutex_unlock(&vmbus_connection.channel_mutex);
+ cpus_read_unlock();
+ return ret;
+}
+static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
static ssize_t channel_pending_show(struct vmbus_channel *channel,
char *buf)
@@ -1787,8 +1944,10 @@
kobj->kset = dev->channels_kset;
ret = kobject_init_and_add(kobj, &vmbus_chan_ktype, NULL,
"%u", relid);
- if (ret)
+ if (ret) {
+ kobject_put(kobj);
return ret;
+ }
ret = sysfs_create_group(kobj, &vmbus_chan_group);
@@ -1797,6 +1956,7 @@
* The calling functions' error handling paths will cleanup the
* empty channel directory.
*/
+ kobject_put(kobj);
dev_err(device, "Unable to set up channel sysfs files\n");
return ret;
}
@@ -1847,7 +2007,7 @@
int ret;
dev_set_name(&child_device_obj->device, "%pUl",
- child_device_obj->channel->offermsg.offer.if_instance.b);
+ &child_device_obj->channel->offermsg.offer.if_instance);
child_device_obj->device.bus = &hv_bus;
child_device_obj->device.parent = &hv_acpi_dev->dev;
@@ -1876,6 +2036,7 @@
pr_err("Unable to register primary channeln");
goto err_kset_unregister;
}
+ hv_debug_add_dev_dir(child_device_obj);
return 0;
@@ -1918,6 +2079,7 @@
struct resource *new_res;
struct resource **old_res = &hyperv_mmio;
struct resource **prev_res = NULL;
+ struct resource r;
switch (res->type) {
@@ -1936,6 +2098,23 @@
end = res->data.address64.address.maximum;
break;
+ /*
+ * The IRQ information is needed only on ARM64, which Hyper-V
+ * sets up in the extended format. IRQ information is present
+ * on x86/x64 in the non-extended format but it is not used by
+ * Linux. So don't bother checking for the non-extended format.
+ */
+ case ACPI_RESOURCE_TYPE_EXTENDED_IRQ:
+ if (!acpi_dev_resource_interrupt(res, 0, &r)) {
+ pr_err("Unable to parse Hyper-V ACPI interrupt\n");
+ return AE_ERROR;
+ }
+ /* ARM64 INTID for VMbus */
+ vmbus_interrupt = res->data.extended_irq.interrupts[0];
+ /* Linux IRQ number */
+ vmbus_irq = r.start;
+ return AE_OK;
+
default:
/* Unused resource type */
return AE_OK;
@@ -2077,7 +2256,7 @@
int retval;
retval = -ENXIO;
- down(&hyperv_mmio_lock);
+ mutex_lock(&hyperv_mmio_lock);
/*
* If overlaps with frame buffers are allowed, then first attempt to
@@ -2124,7 +2303,7 @@
}
exit:
- up(&hyperv_mmio_lock);
+ mutex_unlock(&hyperv_mmio_lock);
return retval;
}
EXPORT_SYMBOL_GPL(vmbus_allocate_mmio);
@@ -2141,7 +2320,7 @@
{
struct resource *iter;
- down(&hyperv_mmio_lock);
+ mutex_lock(&hyperv_mmio_lock);
for (iter = hyperv_mmio; iter; iter = iter->sibling) {
if ((iter->start >= start + size) || (iter->end <= start))
continue;
@@ -2149,7 +2328,7 @@
__release_region(iter, start, size);
}
release_mem_region(start, size);
- up(&hyperv_mmio_lock);
+ mutex_unlock(&hyperv_mmio_lock);
}
EXPORT_SYMBOL_GPL(vmbus_free_mmio);
@@ -2195,7 +2374,6 @@
static int vmbus_bus_suspend(struct device *dev)
{
struct vmbus_channel *channel, *sc;
- unsigned long flags;
while (atomic_read(&vmbus_connection.offer_in_progress) != 0) {
/*
@@ -2240,9 +2418,12 @@
list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
/*
- * Invalidate the field. Upon resume, vmbus_onoffer() will fix
- * up the field, and the other fields (if necessary).
+ * Remove the channel from the array of channels and invalidate
+ * the channel's relid. Upon resume, vmbus_onoffer() will fix
+ * up the relid (and other fields, if necessary) and add the
+ * channel back to the array.
*/
+ vmbus_channel_unmap_relid(channel);
channel->offermsg.child_relid = INVALID_RELID;
if (is_hvsock_channel(channel)) {
@@ -2253,12 +2434,10 @@
continue;
}
- spin_lock_irqsave(&channel->lock, flags);
list_for_each_entry(sc, &channel->sc_list, sc_list) {
pr_err("Sub-channel not deleted!\n");
WARN_ON_ONCE(1);
}
- spin_unlock_irqrestore(&channel->lock, flags);
atomic_inc(&vmbus_connection.nr_chan_fixup_on_resume);
}
@@ -2283,8 +2462,7 @@
* We only use the 'vmbus_proto_version', which was in use before
* hibernation, to re-negotiate with the host.
*/
- if (vmbus_proto_version == VERSION_INVAL ||
- vmbus_proto_version == 0) {
+ if (!vmbus_proto_version) {
pr_err("Invalid proto version = 0x%x\n", vmbus_proto_version);
return -EINVAL;
}
@@ -2367,7 +2545,6 @@
/* Make sure conn_state is set as hv_synic_cleanup checks for it */
mb();
cpuhp_remove_state(hyperv_cpuhp_online);
- hyperv_cleanup();
};
static void hv_crash_handler(struct pt_regs *regs)
@@ -2383,26 +2560,28 @@
cpu = smp_processor_id();
hv_stimer_cleanup(cpu);
hv_synic_disable_regs(cpu);
- hyperv_cleanup();
};
static int hv_synic_suspend(void)
{
/*
- * When we reach here, all the non-boot CPUs have been offlined, and
- * the stimers on them have been unbound in hv_synic_cleanup() ->
+ * When we reach here, all the non-boot CPUs have been offlined.
+ * If we're in a legacy configuration where stimer Direct Mode is
+ * not enabled, the stimers on the non-boot CPUs have been unbound
+ * in hv_synic_cleanup() -> hv_stimer_legacy_cleanup() ->
* hv_stimer_cleanup() -> clockevents_unbind_device().
*
- * hv_synic_suspend() only runs on CPU0 with interrupts disabled. Here
- * we do not unbind the stimer on CPU0 because: 1) it's unnecessary
- * because the interrupts remain disabled between syscore_suspend()
- * and syscore_resume(): see create_image() and resume_target_kernel();
+ * hv_synic_suspend() only runs on CPU0 with interrupts disabled.
+ * Here we do not call hv_stimer_legacy_cleanup() on CPU0 because:
+ * 1) it's unnecessary as interrupts remain disabled between
+ * syscore_suspend() and syscore_resume(): see create_image() and
+ * resume_target_kernel()
* 2) the stimer on CPU0 is automatically disabled later by
* syscore_suspend() -> timekeeping_suspend() -> tick_suspend() -> ...
- * -> clockevents_shutdown() -> ... -> hv_ce_shutdown(); 3) a warning
- * would be triggered if we call clockevents_unbind_device(), which
- * may sleep, in an interrupts-disabled context. So, we intentionally
- * don't call hv_stimer_cleanup(0) here.
+ * -> clockevents_shutdown() -> ... -> hv_ce_shutdown()
+ * 3) a warning would be triggered if we call
+ * clockevents_unbind_device(), which may sleep, in an
+ * interrupts-disabled context.
*/
hv_synic_disable_regs(0);
@@ -2449,6 +2628,7 @@
ret = -ETIMEDOUT;
goto cleanup;
}
+ hv_debug_init();
ret = vmbus_bus_init();
if (ret)
@@ -2485,7 +2665,10 @@
tasklet_kill(&hv_cpu->msg_dpc);
}
+ hv_debug_rm_all_dir();
+
vmbus_free_channels();
+ kfree(vmbus_connection.channels);
if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) {
kmsg_dump_unregister(&hv_kmsg_dumper);