Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index 8eb1675..9260ad4 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -763,13 +763,19 @@
free_cpumask_var(available_mask);
}
+#define UNLOAD_DELAY_UNIT_MS 10 /* 10 milliseconds */
+#define UNLOAD_WAIT_MS (100*1000) /* 100 seconds */
+#define UNLOAD_WAIT_LOOPS (UNLOAD_WAIT_MS/UNLOAD_DELAY_UNIT_MS)
+#define UNLOAD_MSG_MS (5*1000) /* Every 5 seconds */
+#define UNLOAD_MSG_LOOPS (UNLOAD_MSG_MS/UNLOAD_DELAY_UNIT_MS)
+
static void vmbus_wait_for_unload(void)
{
int cpu;
void *page_addr;
struct hv_message *msg;
struct vmbus_channel_message_header *hdr;
- u32 message_type;
+ u32 message_type, i;
/*
* CHANNELMSG_UNLOAD_RESPONSE is always delivered to the CPU which was
@@ -779,10 +785,18 @@
* functional and vmbus_unload_response() will complete
* vmbus_connection.unload_event. If not, the last thing we can do is
* read message pages for all CPUs directly.
+ *
+ * Wait up to 100 seconds since an Azure host must write back any dirty
+ * data in its disk cache before the VMBus UNLOAD request will
+ * complete. This flushing has been empirically observed to take up
+ * to 50 seconds in cases with a lot of dirty data, so allow additional
+ * leeway and account for inaccuracies in mdelay(). But eventually
+ * time out so that the panic path can't get hung forever in case the
+ * response message isn't seen.
*/
- while (1) {
+ for (i = 1; i <= UNLOAD_WAIT_LOOPS; i++) {
if (completion_done(&vmbus_connection.unload_event))
- break;
+ goto completed;
for_each_online_cpu(cpu) {
struct hv_per_cpu_context *hv_cpu
@@ -805,9 +819,18 @@
vmbus_signal_eom(msg, message_type);
}
- mdelay(10);
- }
+ /*
+ * Give a notice periodically so someone watching the
+ * serial output won't think it is completely hung.
+ */
+ if (!(i % UNLOAD_MSG_LOOPS))
+ pr_notice("Waiting for VMBus UNLOAD to complete\n");
+ mdelay(UNLOAD_DELAY_UNIT_MS);
+ }
+ pr_err("Continuing even though VMBus UNLOAD did not complete\n");
+
+completed:
/*
* We're crashing and already got the UNLOAD_RESPONSE, cleanup all
* maybe-pending messages on all CPUs to be able to receive new
@@ -839,6 +862,9 @@
{
struct vmbus_channel_message_header hdr;
+ if (xchg(&vmbus_connection.conn_state, DISCONNECTED) == DISCONNECTED)
+ return;
+
/* Pre-Win2012R2 hosts don't support reconnect */
if (vmbus_proto_version < VERSION_WIN8_1)
return;
@@ -1095,8 +1121,7 @@
vmbus_device_unregister(channel->device_obj);
put_device(dev);
}
- }
- if (channel->primary_channel != NULL) {
+ } else if (channel->primary_channel != NULL) {
/*
* Sub-channel is being rescinded. Following is the channel
* close sequence when initiated from the driveri (refer to
@@ -1351,6 +1376,8 @@
{ CHANNELMSG_19, 0, NULL },
{ CHANNELMSG_20, 0, NULL },
{ CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL },
+ { CHANNELMSG_22, 0, NULL },
+ { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL },
};
/*
@@ -1362,25 +1389,16 @@
{
struct hv_message *msg = context;
struct vmbus_channel_message_header *hdr;
- int size;
hdr = (struct vmbus_channel_message_header *)msg->u.payload;
- size = msg->header.payload_size;
trace_vmbus_on_message(hdr);
- if (hdr->msgtype >= CHANNELMSG_COUNT) {
- pr_err("Received invalid channel message type %d size %d\n",
- hdr->msgtype, size);
- print_hex_dump_bytes("", DUMP_PREFIX_NONE,
- (unsigned char *)msg->u.payload, size);
- return;
- }
-
- if (channel_message_table[hdr->msgtype].message_handler)
- channel_message_table[hdr->msgtype].message_handler(hdr);
- else
- pr_err("Unhandled channel message type %d\n", hdr->msgtype);
+ /*
+ * vmbus_on_msg_dpc() makes sure the hdr->msgtype here cannot go
+ * out of bounds and the message_handler pointer cannot be NULL.
+ */
+ channel_message_table[hdr->msgtype].message_handler(hdr);
}
/*
diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c
index 6e4c015..c90d790 100644
--- a/drivers/hv/connection.c
+++ b/drivers/hv/connection.c
@@ -67,7 +67,6 @@
int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version)
{
int ret = 0;
- unsigned int cur_cpu;
struct vmbus_channel_initiate_contact *msg;
unsigned long flags;
@@ -100,24 +99,7 @@
msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]);
msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]);
- /*
- * We want all channel messages to be delivered on CPU 0.
- * This has been the behavior pre-win8. This is not
- * perf issue and having all channel messages delivered on CPU 0
- * would be ok.
- * For post win8 hosts, we support receiving channel messagges on
- * all the CPUs. This is needed for kexec to work correctly where
- * the CPU attempting to connect may not be CPU 0.
- */
- if (version >= VERSION_WIN8_1) {
- cur_cpu = get_cpu();
- msg->target_vcpu = hv_cpu_number_to_vp_number(cur_cpu);
- vmbus_connection.connect_cpu = cur_cpu;
- put_cpu();
- } else {
- msg->target_vcpu = 0;
- vmbus_connection.connect_cpu = 0;
- }
+ msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU);
/*
* Add to list before we send the request since we may
diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c
index fcc5279..f849a1a 100644
--- a/drivers/hv/hv.c
+++ b/drivers/hv/hv.c
@@ -250,6 +250,17 @@
unsigned long flags;
/*
+ * Hyper-V does not provide a way to change the connect CPU once
+ * it is set; we must prevent the connect CPU from going offline
+ * while the VM is running normally. But in the panic or kexec()
+ * path where the vmbus is already disconnected, the CPU must be
+ * allowed to shut down.
+ */
+ if (cpu == VMBUS_CONNECT_CPU &&
+ vmbus_connection.conn_state == CONNECTED)
+ return -EBUSY;
+
+ /*
* Search for channels which are bound to the CPU we're about to
* cleanup. In case we find one and vmbus is still connected we need to
* fail, this will effectively prevent CPU offlining. There is no way
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 34bd735..bd4e72f 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -1213,10 +1213,7 @@
unsigned int i, j;
struct page *pg;
- if (num_pages < alloc_unit)
- return 0;
-
- for (i = 0; (i * alloc_unit) < num_pages; i++) {
+ for (i = 0; i < num_pages / alloc_unit; i++) {
if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
PAGE_SIZE)
return i * alloc_unit;
@@ -1254,7 +1251,7 @@
}
- return num_pages;
+ return i * alloc_unit;
}
static void balloon_up(struct work_struct *dummy)
@@ -1269,9 +1266,6 @@
long avail_pages;
unsigned long floor;
- /* The host balloons pages in 2M granularity. */
- WARN_ON_ONCE(num_pages % PAGES_IN_2M != 0);
-
/*
* We will attempt 2M allocations. However, if we fail to
* allocate 2M chunks, we will go back to 4k allocations.
@@ -1281,14 +1275,13 @@
avail_pages = si_mem_available();
floor = compute_balloon_floor();
- /* Refuse to balloon below the floor, keep the 2M granularity. */
+ /* Refuse to balloon below the floor. */
if (avail_pages < num_pages || avail_pages - num_pages < floor) {
- pr_warn("Balloon request will be partially fulfilled. %s\n",
+ pr_info("Balloon request will be partially fulfilled. %s\n",
avail_pages < num_pages ? "Not enough memory." :
"Balloon floor reached.");
num_pages = avail_pages > floor ? (avail_pages - floor) : 0;
- num_pages -= num_pages % PAGES_IN_2M;
}
while (!done) {
diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c
index e32681e..1671f6f 100644
--- a/drivers/hv/hv_util.c
+++ b/drivers/hv/hv_util.c
@@ -537,8 +537,8 @@
*/
hv_ptp_clock = ptp_clock_register(&ptp_hyperv_info, NULL);
if (IS_ERR_OR_NULL(hv_ptp_clock)) {
- pr_err("cannot register PTP clock: %ld\n",
- PTR_ERR(hv_ptp_clock));
+ pr_err("cannot register PTP clock: %d\n",
+ PTR_ERR_OR_ZERO(hv_ptp_clock));
hv_ptp_clock = NULL;
}
diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h
index af9379a..cabcb66 100644
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -212,12 +212,13 @@
#define MAX_SIZE_CHANNEL_MESSAGE HV_MESSAGE_PAYLOAD_BYTE_COUNT
-struct vmbus_connection {
- /*
- * CPU on which the initial host contact was made.
- */
- int connect_cpu;
+/*
+ * The CPU that Hyper-V will interrupt for VMBUS messages, such as
+ * CHANNELMSG_OFFERCHANNEL and CHANNELMSG_RESCIND_CHANNELOFFER.
+ */
+#define VMBUS_CONNECT_CPU 0
+struct vmbus_connection {
u32 msg_conn_id;
atomic_t offer_in_progress;
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 53a60c8..2d2568d 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -31,6 +31,7 @@
#include <linux/kdebug.h>
#include <linux/efi.h>
#include <linux/random.h>
+#include <linux/kernel.h>
#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
#include "hyperv_vmbus.h"
@@ -48,14 +49,35 @@
static void *hv_panic_page;
+/*
+ * Boolean to control whether to report panic messages over Hyper-V.
+ *
+ * It can be set via /proc/sys/kernel/hyperv/record_panic_msg
+ */
+static int sysctl_record_panic_msg = 1;
+
+static int hyperv_report_reg(void)
+{
+ return !sysctl_record_panic_msg || !hv_panic_page;
+}
+
static int hyperv_panic_event(struct notifier_block *nb, unsigned long val,
void *args)
{
struct pt_regs *regs;
- regs = current_pt_regs();
+ vmbus_initiate_unload(true);
- hyperv_report_panic(regs, val);
+ /*
+ * Hyper-V should be notified only once about a panic. If we will be
+ * doing hyperv_report_panic_msg() later with kmsg data, don't do
+ * the notification here.
+ */
+ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE
+ && hyperv_report_reg()) {
+ regs = current_pt_regs();
+ hyperv_report_panic(regs, val, false);
+ }
return NOTIFY_DONE;
}
@@ -65,7 +87,13 @@
struct die_args *die = (struct die_args *)args;
struct pt_regs *regs = die->regs;
- hyperv_report_panic(regs, val);
+ /*
+ * Hyper-V should be notified only once about a panic. If we will be
+ * doing hyperv_report_panic_msg() later with kmsg data, don't do
+ * the notification here.
+ */
+ if (hyperv_report_reg())
+ hyperv_report_panic(regs, val, true);
return NOTIFY_DONE;
}
@@ -950,6 +978,9 @@
return drv->resume(dev);
}
+#else
+#define vmbus_suspend NULL
+#define vmbus_resume NULL
#endif /* CONFIG_PM_SLEEP */
/*
@@ -967,11 +998,22 @@
}
/*
- * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than
- * SET_SYSTEM_SLEEP_PM_OPS: see the comment before vmbus_bus_pm.
+ * Note: we must use the "noirq" ops: see the comment before vmbus_bus_pm.
+ *
+ * suspend_noirq/resume_noirq are set to NULL to support Suspend-to-Idle: we
+ * shouldn't suspend the vmbus devices upon Suspend-to-Idle, otherwise there
+ * is no way to wake up a Generation-2 VM.
+ *
+ * The other 4 ops are for hibernation.
*/
+
static const struct dev_pm_ops vmbus_pm = {
- SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_suspend, vmbus_resume)
+ .suspend_noirq = NULL,
+ .resume_noirq = NULL,
+ .freeze_noirq = vmbus_suspend,
+ .thaw_noirq = vmbus_resume,
+ .poweroff_noirq = vmbus_suspend,
+ .restore_noirq = vmbus_resume,
};
/* The one and only one */
@@ -1031,6 +1073,10 @@
}
entry = &channel_message_table[hdr->msgtype];
+
+ if (!entry->message_handler)
+ goto msg_handled;
+
if (entry->handler_type == VMHT_BLOCKING) {
ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);
if (ctx == NULL)
@@ -1050,14 +1096,28 @@
/*
* If we are handling the rescind message;
* schedule the work on the global work queue.
+ *
+ * The OFFER message and the RESCIND message should
+ * not be handled by the same serialized work queue,
+ * because the OFFER handler may call vmbus_open(),
+ * which tries to open the channel by sending an
+ * OPEN_CHANNEL message to the host and waits for
+ * the host's response; however, if the host has
+ * rescinded the channel before it receives the
+ * OPEN_CHANNEL message, the host just silently
+ * ignores the OPEN_CHANNEL message; as a result,
+ * the guest's OFFER handler hangs forever if we
+ * handle the RESCIND message in the same serialized
+ * work queue: the RESCIND handler cannot start to
+ * run before the OFFER handler finishes.
*/
- schedule_work_on(vmbus_connection.connect_cpu,
+ schedule_work_on(VMBUS_CONNECT_CPU,
&ctx->work);
break;
case CHANNELMSG_OFFERCHANNEL:
atomic_inc(&vmbus_connection.offer_in_progress);
- queue_work_on(vmbus_connection.connect_cpu,
+ queue_work_on(VMBUS_CONNECT_CPU,
vmbus_connection.work_queue,
&ctx->work);
break;
@@ -1104,7 +1164,7 @@
INIT_WORK(&ctx->work, vmbus_onmessage_work);
- queue_work_on(vmbus_connection.connect_cpu,
+ queue_work_on(VMBUS_CONNECT_CPU,
vmbus_connection.work_queue,
&ctx->work);
}
@@ -1247,13 +1307,6 @@
}
/*
- * Boolean to control whether to report panic messages over Hyper-V.
- *
- * It can be set via /proc/sys/kernel/hyperv/record_panic_msg
- */
-static int sysctl_record_panic_msg = 1;
-
-/*
* Callback from kmsg_dump. Grab as much as possible from the end of the kmsg
* buffer and call into Hyper-V to transfer the data.
*/
@@ -1380,19 +1433,29 @@
hv_panic_page = (void *)get_zeroed_page(GFP_KERNEL);
if (hv_panic_page) {
ret = kmsg_dump_register(&hv_kmsg_dumper);
- if (ret)
+ if (ret) {
pr_err("Hyper-V: kmsg dump register "
"error 0x%x\n", ret);
+ hv_free_hyperv_page(
+ (unsigned long)hv_panic_page);
+ hv_panic_page = NULL;
+ }
} else
pr_err("Hyper-V: panic message page memory "
"allocation failed");
}
register_die_notifier(&hyperv_die_block);
- atomic_notifier_chain_register(&panic_notifier_list,
- &hyperv_panic_block);
}
+ /*
+ * Always register the panic notifier because we need to unload
+ * the VMBus channel connection to prevent any VMBus
+ * activity after the VM panics.
+ */
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &hyperv_panic_block);
+
vmbus_request_offers();
return 0;
@@ -1406,7 +1469,6 @@
hv_remove_vmbus_irq();
bus_unregister(&hv_bus);
- free_page((unsigned long)hv_panic_page);
unregister_sysctl_table(hv_ctl_table_hdr);
hv_ctl_table_hdr = NULL;
return ret;
@@ -2169,7 +2231,10 @@
if (atomic_read(&vmbus_connection.nr_chan_close_on_suspend) > 0)
wait_for_completion(&vmbus_connection.ready_for_suspend_event);
- WARN_ON(atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0);
+ if (atomic_read(&vmbus_connection.nr_chan_fixup_on_resume) != 0) {
+ pr_err("Can not suspend due to a previous failed resuming\n");
+ return -EBUSY;
+ }
mutex_lock(&vmbus_connection.channel_mutex);
@@ -2202,8 +2267,6 @@
vmbus_initiate_unload(false);
- vmbus_connection.conn_state = DISCONNECTED;
-
/* Reset the event for the next resume. */
reinit_completion(&vmbus_connection.ready_for_resume_event);
@@ -2245,13 +2308,18 @@
vmbus_request_offers();
- wait_for_completion(&vmbus_connection.ready_for_resume_event);
+ if (wait_for_completion_timeout(
+ &vmbus_connection.ready_for_resume_event, 10 * HZ) == 0)
+ pr_err("Some vmbus device is missing after suspending?\n");
/* Reset the event for the next suspend. */
reinit_completion(&vmbus_connection.ready_for_suspend_event);
return 0;
}
+#else
+#define vmbus_bus_suspend NULL
+#define vmbus_bus_resume NULL
#endif /* CONFIG_PM_SLEEP */
static const struct acpi_device_id vmbus_acpi_device_ids[] = {
@@ -2262,16 +2330,24 @@
MODULE_DEVICE_TABLE(acpi, vmbus_acpi_device_ids);
/*
- * Note: we must use SET_NOIRQ_SYSTEM_SLEEP_PM_OPS rather than
- * SET_SYSTEM_SLEEP_PM_OPS, otherwise NIC SR-IOV can not work, because the
- * "pci_dev_pm_ops" uses the "noirq" callbacks: in the resume path, the
- * pci "noirq" restore callback runs before "non-noirq" callbacks (see
+ * Note: we must use the "noirq" ops, otherwise hibernation cannot work with
+ * PCI device assignment, because "pci_dev_pm_ops" uses the "noirq" ops: in
+ * the resume path, the pci "noirq" restore op runs before "non-noirq" op (see
* resume_target_kernel() -> dpm_resume_start(), and hibernation_restore() ->
* dpm_resume_end()). This means vmbus_bus_resume() and the pci-hyperv's
- * resume callback must also run via the "noirq" callbacks.
+ * resume callback must also run via the "noirq" ops.
+ *
+ * Set suspend_noirq/resume_noirq to NULL for Suspend-to-Idle: see the comment
+ * earlier in this file before vmbus_pm.
*/
+
static const struct dev_pm_ops vmbus_bus_pm = {
- SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(vmbus_bus_suspend, vmbus_bus_resume)
+ .suspend_noirq = NULL,
+ .resume_noirq = NULL,
+ .freeze_noirq = vmbus_bus_suspend,
+ .thaw_noirq = vmbus_bus_resume,
+ .poweroff_noirq = vmbus_bus_suspend,
+ .restore_noirq = vmbus_bus_resume
};
static struct acpi_driver vmbus_acpi_driver = {
@@ -2288,7 +2364,6 @@
{
hv_stimer_global_cleanup();
vmbus_initiate_unload(false);
- vmbus_connection.conn_state = DISCONNECTED;
/* Make sure conn_state is set as hv_synic_cleanup checks for it */
mb();
cpuhp_remove_state(hyperv_cpuhp_online);
@@ -2305,10 +2380,9 @@
* doing the cleanup for current CPU only. This should be sufficient
* for kdump.
*/
- vmbus_connection.conn_state = DISCONNECTED;
cpu = smp_processor_id();
hv_stimer_cleanup(cpu);
- hv_synic_cleanup(cpu);
+ hv_synic_disable_regs(cpu);
hyperv_cleanup();
};