Blame - arch/powerpc/kernel/time.c - hafnium/third_party/linux

blob: 70f145e0248776d6692439d60e4fe6f10097abc0 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	/*
				2	* Common time routines among all ppc machines.
				3	*
				4	* Written by Cort Dougan (cort@cs.nmt.edu) to merge
				5	* Paul Mackerras' version and mine for PReP and Pmac.
				6	* MPC8xx/MBX changes by Dan Malek (dmalek@jlc.net).
				7	* Converted for 64-bit by Mike Corrigan (mikejc@us.ibm.com)
				8	*
				9	* First round of bugfixes by Gabriel Paubert (paubert@iram.es)
				10	* to make clock more stable (2.4.0-test5). The only thing
				11	* that this code assumes is that the timebases have been synchronized
				12	* by firmware on SMP and are never stopped (never do sleep
				13	* on SMP then, nap and doze are OK).
				14	*
				15	* Speeded up do_gettimeofday by getting rid of references to
				16	* xtime (which required locks for consistency). (mikejc@us.ibm.com)
				17	*
				18	* TODO (not necessarily in this file):
				19	* - improve precision and reproducibility of timebase frequency
				20	* measurement at boot time.
				21	* - for astronomical applications: add a new function to get
				22	* non ambiguous timestamps even around leap seconds. This needs
				23	* a new timestamp format and a good name.
				24	*
				25	* 1997-09-10 Updated NTP code according to technical memorandum Jan '96
				26	* "A Kernel Model for Precision Timekeeping" by Dave Mills
				27	*
				28	* This program is free software; you can redistribute it and/or
				29	* modify it under the terms of the GNU General Public License
				30	* as published by the Free Software Foundation; either version
				31	* 2 of the License, or (at your option) any later version.
				32	*/
				33
				34	#include <linux/errno.h>
				35	#include <linux/export.h>
				36	#include <linux/sched.h>
				37	#include <linux/sched/clock.h>
				38	#include <linux/kernel.h>
				39	#include <linux/param.h>
				40	#include <linux/string.h>
				41	#include <linux/mm.h>
				42	#include <linux/interrupt.h>
				43	#include <linux/timex.h>
				44	#include <linux/kernel_stat.h>
				45	#include <linux/time.h>
				46	#include <linux/clockchips.h>
				47	#include <linux/init.h>
				48	#include <linux/profile.h>
				49	#include <linux/cpu.h>
				50	#include <linux/security.h>
				51	#include <linux/percpu.h>
				52	#include <linux/rtc.h>
				53	#include <linux/jiffies.h>
				54	#include <linux/posix-timers.h>
				55	#include <linux/irq.h>
				56	#include <linux/delay.h>
				57	#include <linux/irq_work.h>
				58	#include <linux/clk-provider.h>
				59	#include <linux/suspend.h>
				60	#include <linux/rtc.h>
				61	#include <linux/sched/cputime.h>
				62	#include <linux/processor.h>
				63	#include <asm/trace.h>
				64
				65	#include <asm/io.h>
				66	#include <asm/nvram.h>
				67	#include <asm/cache.h>
				68	#include <asm/machdep.h>
				69	#include <linux/uaccess.h>
				70	#include <asm/time.h>
				71	#include <asm/prom.h>
				72	#include <asm/irq.h>
				73	#include <asm/div64.h>
				74	#include <asm/smp.h>
				75	#include <asm/vdso_datapage.h>
				76	#include <asm/firmware.h>
				77	#include <asm/asm-prototypes.h>
				78
				79	/* powerpc clocksource/clockevent code */
				80
				81	#include <linux/clockchips.h>
				82	#include <linux/timekeeper_internal.h>
				83
				84	static u64 rtc_read(struct clocksource *);
				85	static struct clocksource clocksource_rtc = {
				86	.name = "rtc",
				87	.rating = 400,
				88	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
				89	.mask = CLOCKSOURCE_MASK(64),
				90	.read = rtc_read,
				91	};
				92
				93	static u64 timebase_read(struct clocksource *);
				94	static struct clocksource clocksource_timebase = {
				95	.name = "timebase",
				96	.rating = 400,
				97	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
				98	.mask = CLOCKSOURCE_MASK(64),
				99	.read = timebase_read,
				100	};
				101
				102	#define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
				103	u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
				104
				105	static int decrementer_set_next_event(unsigned long evt,
				106	struct clock_event_device *dev);
				107	static int decrementer_shutdown(struct clock_event_device *evt);
				108
				109	struct clock_event_device decrementer_clockevent = {
				110	.name = "decrementer",
				111	.rating = 200,
				112	.irq = 0,
				113	.set_next_event = decrementer_set_next_event,
				114	.set_state_shutdown = decrementer_shutdown,
				115	.tick_resume = decrementer_shutdown,
				116	.features = CLOCK_EVT_FEAT_ONESHOT \|
				117	CLOCK_EVT_FEAT_C3STOP,
				118	};
				119	EXPORT_SYMBOL(decrementer_clockevent);
				120
				121	DEFINE_PER_CPU(u64, decrementers_next_tb);
				122	static DEFINE_PER_CPU(struct clock_event_device, decrementers);
				123
				124	#define XSEC_PER_SEC (1024*1024)
				125
				126	#ifdef CONFIG_PPC64
				127	#define SCALE_XSEC(xsec, max) (((xsec) * max) / XSEC_PER_SEC)
				128	#else
				129	/* compute ((xsec << 12) * max) >> 32 */
				130	#define SCALE_XSEC(xsec, max) mulhwu((xsec) << 12, max)
				131	#endif
				132
				133	unsigned long tb_ticks_per_jiffy;
				134	unsigned long tb_ticks_per_usec = 100; /* sane default */
				135	EXPORT_SYMBOL(tb_ticks_per_usec);
				136	unsigned long tb_ticks_per_sec;
				137	EXPORT_SYMBOL(tb_ticks_per_sec); /* for cputime_t conversions */
				138
				139	DEFINE_SPINLOCK(rtc_lock);
				140	EXPORT_SYMBOL_GPL(rtc_lock);
				141
				142	static u64 tb_to_ns_scale __read_mostly;
				143	static unsigned tb_to_ns_shift __read_mostly;
				144	static u64 boot_tb __read_mostly;
				145
				146	extern struct timezone sys_tz;
				147	static long timezone_offset;
				148
				149	unsigned long ppc_proc_freq;
				150	EXPORT_SYMBOL_GPL(ppc_proc_freq);
				151	unsigned long ppc_tb_freq;
				152	EXPORT_SYMBOL_GPL(ppc_tb_freq);
				153
				154	#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
				155	/*
				156	* Factor for converting from cputime_t (timebase ticks) to
				157	* microseconds. This is stored as 0.64 fixed-point binary fraction.
				158	*/
				159	u64 __cputime_usec_factor;
				160	EXPORT_SYMBOL(__cputime_usec_factor);
				161
				162	#ifdef CONFIG_PPC_SPLPAR
				163	void (dtl_consumer)(struct dtl_entry , u64);
				164	#endif
				165
				166	static void calc_cputime_factors(void)
				167	{
				168	struct div_result res;
				169
				170	div128_by_32(1000000, 0, tb_ticks_per_sec, &res);
				171	__cputime_usec_factor = res.result_low;
				172	}
				173
				174	/*
				175	* Read the SPURR on systems that have it, otherwise the PURR,
				176	* or if that doesn't exist return the timebase value passed in.
				177	*/
				178	static unsigned long read_spurr(unsigned long tb)
				179	{
				180	if (cpu_has_feature(CPU_FTR_SPURR))
				181	return mfspr(SPRN_SPURR);
				182	if (cpu_has_feature(CPU_FTR_PURR))
				183	return mfspr(SPRN_PURR);
				184	return tb;
				185	}
				186
				187	#ifdef CONFIG_PPC_SPLPAR
				188
				189	/*
				190	* Scan the dispatch trace log and count up the stolen time.
				191	* Should be called with interrupts disabled.
				192	*/
				193	static u64 scan_dispatch_log(u64 stop_tb)
				194	{
				195	u64 i = local_paca->dtl_ridx;
				196	struct dtl_entry *dtl = local_paca->dtl_curr;
				197	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
				198	struct lppaca *vpa = local_paca->lppaca_ptr;
				199	u64 tb_delta;
				200	u64 stolen = 0;
				201	u64 dtb;
				202
				203	if (!dtl)
				204	return 0;
				205
				206	if (i == be64_to_cpu(vpa->dtl_idx))
				207	return 0;
				208	while (i < be64_to_cpu(vpa->dtl_idx)) {
				209	dtb = be64_to_cpu(dtl->timebase);
				210	tb_delta = be32_to_cpu(dtl->enqueue_to_dispatch_time) +
				211	be32_to_cpu(dtl->ready_to_enqueue_time);
				212	barrier();
				213	if (i + N_DISPATCH_LOG < be64_to_cpu(vpa->dtl_idx)) {
				214	/* buffer has overflowed */
				215	i = be64_to_cpu(vpa->dtl_idx) - N_DISPATCH_LOG;
				216	dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
				217	continue;
				218	}
				219	if (dtb > stop_tb)
				220	break;
				221	if (dtl_consumer)
				222	dtl_consumer(dtl, i);
				223	stolen += tb_delta;
				224	++i;
				225	++dtl;
				226	if (dtl == dtl_end)
				227	dtl = local_paca->dispatch_log;
				228	}
				229	local_paca->dtl_ridx = i;
				230	local_paca->dtl_curr = dtl;
				231	return stolen;
				232	}
				233
				234	/*
				235	* Accumulate stolen time by scanning the dispatch trace log.
				236	* Called on entry from user mode.
				237	*/
				238	void accumulate_stolen_time(void)
				239	{
				240	u64 sst, ust;
				241	unsigned long save_irq_soft_mask = irq_soft_mask_return();
				242	struct cpu_accounting_data *acct = &local_paca->accounting;
				243
				244	/* We are called early in the exception entry, before
				245	* soft/hard_enabled are sync'ed to the expected state
				246	* for the exception. We are hard disabled but the PACA
				247	* needs to reflect that so various debug stuff doesn't
				248	* complain
				249	*/
				250	irq_soft_mask_set(IRQS_DISABLED);
				251
				252	sst = scan_dispatch_log(acct->starttime_user);
				253	ust = scan_dispatch_log(acct->starttime);
				254	acct->stime -= sst;
				255	acct->utime -= ust;
				256	acct->steal_time += ust + sst;
				257
				258	irq_soft_mask_set(save_irq_soft_mask);
				259	}
				260
				261	static inline u64 calculate_stolen_time(u64 stop_tb)
				262	{
				263	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
				264	return 0;
				265
				266	if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx))
				267	return scan_dispatch_log(stop_tb);
				268
				269	return 0;
				270	}
				271
				272	#else /* CONFIG_PPC_SPLPAR */
				273	static inline u64 calculate_stolen_time(u64 stop_tb)
				274	{
				275	return 0;
				276	}
				277
				278	#endif /* CONFIG_PPC_SPLPAR */
				279
				280	/*
				281	* Account time for a transition between system, hard irq
				282	* or soft irq state.
				283	*/
				284	static unsigned long vtime_delta(struct task_struct *tsk,
				285	unsigned long *stime_scaled,
				286	unsigned long *steal_time)
				287	{
				288	unsigned long now, nowscaled, deltascaled;
				289	unsigned long stime;
				290	unsigned long utime, utime_scaled;
				291	struct cpu_accounting_data *acct = get_accounting(tsk);
				292
				293	WARN_ON_ONCE(!irqs_disabled());
				294
				295	now = mftb();
				296	nowscaled = read_spurr(now);
				297	stime = now - acct->starttime;
				298	acct->starttime = now;
				299	deltascaled = nowscaled - acct->startspurr;
				300	acct->startspurr = nowscaled;
				301
				302	*steal_time = calculate_stolen_time(now);
				303
				304	utime = acct->utime - acct->utime_sspurr;
				305	acct->utime_sspurr = acct->utime;
				306
				307	/*
				308	* Because we don't read the SPURR on every kernel entry/exit,
				309	* deltascaled includes both user and system SPURR ticks.
				310	* Apportion these ticks to system SPURR ticks and user
				311	* SPURR ticks in the same ratio as the system time (delta)
				312	* and user time (udelta) values obtained from the timebase
				313	* over the same interval. The system ticks get accounted here;
				314	* the user ticks get saved up in paca->user_time_scaled to be
				315	* used by account_process_tick.
				316	*/
				317	*stime_scaled = stime;
				318	utime_scaled = utime;
				319	if (deltascaled != stime + utime) {
				320	if (utime) {
				321	stime_scaled = deltascaled stime / (stime + utime);
				322	utime_scaled = deltascaled - *stime_scaled;
				323	} else {
				324	*stime_scaled = deltascaled;
				325	}
				326	}
				327	acct->utime_scaled += utime_scaled;
				328
				329	return stime;
				330	}
				331
				332	void vtime_account_system(struct task_struct *tsk)
				333	{
				334	unsigned long stime, stime_scaled, steal_time;
				335	struct cpu_accounting_data *acct = get_accounting(tsk);
				336
				337	stime = vtime_delta(tsk, &stime_scaled, &steal_time);
				338
				339	stime -= min(stime, steal_time);
				340	acct->steal_time += steal_time;
				341
				342	if ((tsk->flags & PF_VCPU) && !irq_count()) {
				343	acct->gtime += stime;
				344	acct->utime_scaled += stime_scaled;
				345	} else {
				346	if (hardirq_count())
				347	acct->hardirq_time += stime;
				348	else if (in_serving_softirq())
				349	acct->softirq_time += stime;
				350	else
				351	acct->stime += stime;
				352
				353	acct->stime_scaled += stime_scaled;
				354	}
				355	}
				356	EXPORT_SYMBOL_GPL(vtime_account_system);
				357
				358	void vtime_account_idle(struct task_struct *tsk)
				359	{
				360	unsigned long stime, stime_scaled, steal_time;
				361	struct cpu_accounting_data *acct = get_accounting(tsk);
				362
				363	stime = vtime_delta(tsk, &stime_scaled, &steal_time);
				364	acct->idle_time += stime + steal_time;
				365	}
				366
				367	/*
				368	* Account the whole cputime accumulated in the paca
				369	* Must be called with interrupts disabled.
				370	* Assumes that vtime_account_system/idle() has been called
				371	* recently (i.e. since the last entry from usermode) so that
				372	* get_paca()->user_time_scaled is up to date.
				373	*/
				374	void vtime_flush(struct task_struct *tsk)
				375	{
				376	struct cpu_accounting_data *acct = get_accounting(tsk);
				377
				378	if (acct->utime)
				379	account_user_time(tsk, cputime_to_nsecs(acct->utime));
				380
				381	if (acct->utime_scaled)
				382	tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
				383
				384	if (acct->gtime)
				385	account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
				386
				387	if (acct->steal_time)
				388	account_steal_time(cputime_to_nsecs(acct->steal_time));
				389
				390	if (acct->idle_time)
				391	account_idle_time(cputime_to_nsecs(acct->idle_time));
				392
				393	if (acct->stime)
				394	account_system_index_time(tsk, cputime_to_nsecs(acct->stime),
				395	CPUTIME_SYSTEM);
				396	if (acct->stime_scaled)
				397	tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
				398
				399	if (acct->hardirq_time)
				400	account_system_index_time(tsk, cputime_to_nsecs(acct->hardirq_time),
				401	CPUTIME_IRQ);
				402	if (acct->softirq_time)
				403	account_system_index_time(tsk, cputime_to_nsecs(acct->softirq_time),
				404	CPUTIME_SOFTIRQ);
				405
				406	acct->utime = 0;
				407	acct->utime_scaled = 0;
				408	acct->utime_sspurr = 0;
				409	acct->gtime = 0;
				410	acct->steal_time = 0;
				411	acct->idle_time = 0;
				412	acct->stime = 0;
				413	acct->stime_scaled = 0;
				414	acct->hardirq_time = 0;
				415	acct->softirq_time = 0;
				416	}
				417
				418	#else /* ! CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
				419	#define calc_cputime_factors()
				420	#endif
				421
				422	void __delay(unsigned long loops)
				423	{
				424	unsigned long start;
				425	int diff;
				426
				427	spin_begin();
				428	if (__USE_RTC()) {
				429	start = get_rtcl();
				430	do {
				431	/* the RTCL register wraps at 1000000000 */
				432	diff = get_rtcl() - start;
				433	if (diff < 0)
				434	diff += 1000000000;
				435	spin_cpu_relax();
				436	} while (diff < loops);
				437	} else {
				438	start = get_tbl();
				439	while (get_tbl() - start < loops)
				440	spin_cpu_relax();
				441	}
				442	spin_end();
				443	}
				444	EXPORT_SYMBOL(__delay);
				445
				446	void udelay(unsigned long usecs)
				447	{
				448	__delay(tb_ticks_per_usec * usecs);
				449	}
				450	EXPORT_SYMBOL(udelay);
				451
				452	#ifdef CONFIG_SMP
				453	unsigned long profile_pc(struct pt_regs *regs)
				454	{
				455	unsigned long pc = instruction_pointer(regs);
				456
				457	if (in_lock_functions(pc))
				458	return regs->link;
				459
				460	return pc;
				461	}
				462	EXPORT_SYMBOL(profile_pc);
				463	#endif
				464
				465	#ifdef CONFIG_IRQ_WORK
				466
				467	/*
				468	* 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
				469	*/
				470	#ifdef CONFIG_PPC64
				471	static inline unsigned long test_irq_work_pending(void)
				472	{
				473	unsigned long x;
				474
				475	asm volatile("lbz %0,%1(13)"
				476	: "=r" (x)
				477	: "i" (offsetof(struct paca_struct, irq_work_pending)));
				478	return x;
				479	}
				480
				481	static inline void set_irq_work_pending_flag(void)
				482	{
				483	asm volatile("stb %0,%1(13)" : :
				484	"r" (1),
				485	"i" (offsetof(struct paca_struct, irq_work_pending)));
				486	}
				487
				488	static inline void clear_irq_work_pending(void)
				489	{
				490	asm volatile("stb %0,%1(13)" : :
				491	"r" (0),
				492	"i" (offsetof(struct paca_struct, irq_work_pending)));
				493	}
				494
				495	void arch_irq_work_raise(void)
				496	{
				497	preempt_disable();
				498	set_irq_work_pending_flag();
				499	/*
				500	* Non-nmi code running with interrupts disabled will replay
				501	* irq_happened before it re-enables interrupts, so setthe
				502	* decrementer there instead of causing a hardware exception
				503	* which would immediately hit the masked interrupt handler
				504	* and have the net effect of setting the decrementer in
				505	* irq_happened.
				506	*
				507	* NMI interrupts can not check this when they return, so the
				508	* decrementer hardware exception is raised, which will fire
				509	* when interrupts are next enabled.
				510	*
				511	* BookE does not support this yet, it must audit all NMI
				512	* interrupt handlers to ensure they call nmi_enter() so this
				513	* check would be correct.
				514	*/
				515	if (IS_ENABLED(CONFIG_BOOKE) \|\| !irqs_disabled() \|\| in_nmi()) {
				516	set_dec(1);
				517	} else {
				518	hard_irq_disable();
				519	local_paca->irq_happened \|= PACA_IRQ_DEC;
				520	}
				521	preempt_enable();
				522	}
				523
				524	#else /* 32-bit */
				525
				526	DEFINE_PER_CPU(u8, irq_work_pending);
				527
				528	#define set_irq_work_pending_flag() __this_cpu_write(irq_work_pending, 1)
				529	#define test_irq_work_pending() __this_cpu_read(irq_work_pending)
				530	#define clear_irq_work_pending() __this_cpu_write(irq_work_pending, 0)
				531
				532	void arch_irq_work_raise(void)
				533	{
				534	preempt_disable();
				535	set_irq_work_pending_flag();
				536	set_dec(1);
				537	preempt_enable();
				538	}
				539
				540	#endif /* 32 vs 64 bit */
				541
				542	#else /* CONFIG_IRQ_WORK */
				543
				544	#define test_irq_work_pending() 0
				545	#define clear_irq_work_pending()
				546
				547	#endif /* CONFIG_IRQ_WORK */
				548
				549	/*
				550	* timer_interrupt - gets called when the decrementer overflows,
				551	* with interrupts disabled.
				552	*/
				553	void timer_interrupt(struct pt_regs *regs)
				554	{
				555	struct clock_event_device *evt = this_cpu_ptr(&decrementers);
				556	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
				557	struct pt_regs *old_regs;
				558	u64 now;
				559
				560	/* Some implementations of hotplug will get timer interrupts while
				561	* offline, just ignore these and we also need to set
				562	* decrementers_next_tb as MAX to make sure __check_irq_replay
				563	* don't replay timer interrupt when return, otherwise we'll trap
				564	* here infinitely :(
				565	*/
				566	if (unlikely(!cpu_online(smp_processor_id()))) {
				567	*next_tb = ~(u64)0;
				568	set_dec(decrementer_max);
				569	return;
				570	}
				571
				572	/* Ensure a positive value is written to the decrementer, or else
				573	* some CPUs will continue to take decrementer exceptions. When the
				574	* PPC_WATCHDOG (decrementer based) is configured, keep this at most
				575	* 31 bits, which is about 4 seconds on most systems, which gives
				576	* the watchdog a chance of catching timer interrupt hard lockups.
				577	*/
				578	if (IS_ENABLED(CONFIG_PPC_WATCHDOG))
				579	set_dec(0x7fffffff);
				580	else
				581	set_dec(decrementer_max);
				582
				583	/* Conditionally hard-enable interrupts now that the DEC has been
				584	* bumped to its maximum value
				585	*/
				586	may_hard_irq_enable();
				587
				588
				589	#if defined(CONFIG_PPC32) && defined(CONFIG_PPC_PMAC)
				590	if (atomic_read(&ppc_n_lost_interrupts) != 0)
				591	do_IRQ(regs);
				592	#endif
				593
				594	old_regs = set_irq_regs(regs);
				595	irq_enter();
				596	trace_timer_interrupt_entry(regs);
				597
				598	if (test_irq_work_pending()) {
				599	clear_irq_work_pending();
				600	irq_work_run();
				601	}
				602
				603	now = get_tb_or_rtc();
				604	if (now >= *next_tb) {
				605	*next_tb = ~(u64)0;
				606	if (evt->event_handler)
				607	evt->event_handler(evt);
				608	__this_cpu_inc(irq_stat.timer_irqs_event);
				609	} else {
				610	now = *next_tb - now;
				611	if (now <= decrementer_max)
				612	set_dec(now);
				613	/* We may have raced with new irq work */
				614	if (test_irq_work_pending())
				615	set_dec(1);
				616	__this_cpu_inc(irq_stat.timer_irqs_others);
				617	}
				618
				619	trace_timer_interrupt_exit(regs);
				620	irq_exit();
				621	set_irq_regs(old_regs);
				622	}
				623	EXPORT_SYMBOL(timer_interrupt);
				624
				625	#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
				626	void timer_broadcast_interrupt(void)
				627	{
				628	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
				629
				630	*next_tb = ~(u64)0;
				631	tick_receive_broadcast();
				632	__this_cpu_inc(irq_stat.broadcast_irqs_event);
				633	}
				634	#endif
				635
				636	/*
				637	* Hypervisor decrementer interrupts shouldn't occur but are sometimes
				638	* left pending on exit from a KVM guest. We don't need to do anything
				639	* to clear them, as they are edge-triggered.
				640	*/
				641	void hdec_interrupt(struct pt_regs *regs)
				642	{
				643	}
				644
				645	#ifdef CONFIG_SUSPEND
				646	static void generic_suspend_disable_irqs(void)
				647	{
				648	/* Disable the decrementer, so that it doesn't interfere
				649	* with suspending.
				650	*/
				651
				652	set_dec(decrementer_max);
				653	local_irq_disable();
				654	set_dec(decrementer_max);
				655	}
				656
				657	static void generic_suspend_enable_irqs(void)
				658	{
				659	local_irq_enable();
				660	}
				661
				662	/* Overrides the weak version in kernel/power/main.c */
				663	void arch_suspend_disable_irqs(void)
				664	{
				665	if (ppc_md.suspend_disable_irqs)
				666	ppc_md.suspend_disable_irqs();
				667	generic_suspend_disable_irqs();
				668	}
				669
				670	/* Overrides the weak version in kernel/power/main.c */
				671	void arch_suspend_enable_irqs(void)
				672	{
				673	generic_suspend_enable_irqs();
				674	if (ppc_md.suspend_enable_irqs)
				675	ppc_md.suspend_enable_irqs();
				676	}
				677	#endif
				678
				679	unsigned long long tb_to_ns(unsigned long long ticks)
				680	{
				681	return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
				682	}
				683	EXPORT_SYMBOL_GPL(tb_to_ns);
				684
				685	/*
				686	* Scheduler clock - returns current time in nanosec units.
				687	*
				688	* Note: mulhdu(a, b) (multiply high double unsigned) returns
				689	* the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b
				690	* are 64-bit unsigned numbers.
				691	*/
				692	notrace unsigned long long sched_clock(void)
				693	{
				694	if (__USE_RTC())
				695	return get_rtc();
				696	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
				697	}
				698
				699
				700	#ifdef CONFIG_PPC_PSERIES
				701
				702	/*
				703	* Running clock - attempts to give a view of time passing for a virtualised
				704	* kernels.
				705	* Uses the VTB register if available otherwise a next best guess.
				706	*/
				707	unsigned long long running_clock(void)
				708	{
				709	/*
				710	* Don't read the VTB as a host since KVM does not switch in host
				711	* timebase into the VTB when it takes a guest off the CPU, reading the
				712	* VTB would result in reading 'last switched out' guest VTB.
				713	*
				714	* Host kernels are often compiled with CONFIG_PPC_PSERIES checked, it
				715	* would be unsafe to rely only on the #ifdef above.
				716	*/
				717	if (firmware_has_feature(FW_FEATURE_LPAR) &&
				718	cpu_has_feature(CPU_FTR_ARCH_207S))
				719	return mulhdu(get_vtb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
				720
				721	/*
				722	* This is a next best approximation without a VTB.
				723	* On a host which is running bare metal there should never be any stolen
				724	* time and on a host which doesn't do any virtualisation TB should equal
				725	* VTB so it makes no difference anyway.
				726	*/
				727	return local_clock() - kcpustat_this_cpu->cpustat[CPUTIME_STEAL];
				728	}
				729	#endif
				730
				731	static int __init get_freq(char name, int cells, unsigned long val)
				732	{
				733	struct device_node *cpu;
				734	const __be32 *fp;
				735	int found = 0;
				736
				737	/* The cpu node should have timebase and clock frequency properties */
				738	cpu = of_find_node_by_type(NULL, "cpu");
				739
				740	if (cpu) {
				741	fp = of_get_property(cpu, name, NULL);
				742	if (fp) {
				743	found = 1;
				744	*val = of_read_ulong(fp, cells);
				745	}
				746
				747	of_node_put(cpu);
				748	}
				749
				750	return found;
				751	}
				752
				753	static void start_cpu_decrementer(void)
				754	{
				755	#if defined(CONFIG_BOOKE) \|\| defined(CONFIG_40x)
				756	unsigned int tcr;
				757
				758	/* Clear any pending timer interrupts */
				759	mtspr(SPRN_TSR, TSR_ENW \| TSR_WIS \| TSR_DIS \| TSR_FIS);
				760
				761	tcr = mfspr(SPRN_TCR);
				762	/*
				763	* The watchdog may have already been enabled by u-boot. So leave
				764	* TRC[WP] (Watchdog Period) alone.
				765	*/
				766	tcr &= TCR_WP_MASK; /* Clear all bits except for TCR[WP] */
				767	tcr \|= TCR_DIE; /* Enable decrementer */
				768	mtspr(SPRN_TCR, tcr);
				769	#endif
				770	}
				771
				772	void __init generic_calibrate_decr(void)
				773	{
				774	ppc_tb_freq = DEFAULT_TB_FREQ; /* hardcoded default */
				775
				776	if (!get_freq("ibm,extended-timebase-frequency", 2, &ppc_tb_freq) &&
				777	!get_freq("timebase-frequency", 1, &ppc_tb_freq)) {
				778
				779	printk(KERN_ERR "WARNING: Estimating decrementer frequency "
				780	"(not found)\n");
				781	}
				782
				783	ppc_proc_freq = DEFAULT_PROC_FREQ; /* hardcoded default */
				784
				785	if (!get_freq("ibm,extended-clock-frequency", 2, &ppc_proc_freq) &&
				786	!get_freq("clock-frequency", 1, &ppc_proc_freq)) {
				787
				788	printk(KERN_ERR "WARNING: Estimating processor frequency "
				789	"(not found)\n");
				790	}
				791	}
				792
				793	int update_persistent_clock64(struct timespec64 now)
				794	{
				795	struct rtc_time tm;
				796
				797	if (!ppc_md.set_rtc_time)
				798	return -ENODEV;
				799
				800	rtc_time64_to_tm(now.tv_sec + 1 + timezone_offset, &tm);
				801
				802	return ppc_md.set_rtc_time(&tm);
				803	}
				804
				805	static void __read_persistent_clock(struct timespec64 *ts)
				806	{
				807	struct rtc_time tm;
				808	static int first = 1;
				809
				810	ts->tv_nsec = 0;
				811	/* XXX this is a litle fragile but will work okay in the short term */
				812	if (first) {
				813	first = 0;
				814	if (ppc_md.time_init)
				815	timezone_offset = ppc_md.time_init();
				816
				817	/* get_boot_time() isn't guaranteed to be safe to call late */
				818	if (ppc_md.get_boot_time) {
				819	ts->tv_sec = ppc_md.get_boot_time() - timezone_offset;
				820	return;
				821	}
				822	}
				823	if (!ppc_md.get_rtc_time) {
				824	ts->tv_sec = 0;
				825	return;
				826	}
				827	ppc_md.get_rtc_time(&tm);
				828
				829	ts->tv_sec = rtc_tm_to_time64(&tm);
				830	}
				831
				832	void read_persistent_clock64(struct timespec64 *ts)
				833	{
				834	__read_persistent_clock(ts);
				835
				836	/* Sanitize it in case real time clock is set below EPOCH */
				837	if (ts->tv_sec < 0) {
				838	ts->tv_sec = 0;
				839	ts->tv_nsec = 0;
				840	}
				841
				842	}
				843
				844	/* clocksource code */
				845	static notrace u64 rtc_read(struct clocksource *cs)
				846	{
				847	return (u64)get_rtc();
				848	}
				849
				850	static notrace u64 timebase_read(struct clocksource *cs)
				851	{
				852	return (u64)get_tb();
				853	}
				854
				855
				856	void update_vsyscall(struct timekeeper *tk)
				857	{
				858	struct timespec xt;
				859	struct clocksource *clock = tk->tkr_mono.clock;
				860	u32 mult = tk->tkr_mono.mult;
				861	u32 shift = tk->tkr_mono.shift;
				862	u64 cycle_last = tk->tkr_mono.cycle_last;
				863	u64 new_tb_to_xs, new_stamp_xsec;
				864	u64 frac_sec;
				865
				866	if (clock != &clocksource_timebase)
				867	return;
				868
				869	xt.tv_sec = tk->xtime_sec;
				870	xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
				871
				872	/* Make userspace gettimeofday spin until we're done. */
				873	++vdso_data->tb_update_count;
				874	smp_mb();
				875
				876	/*
				877	* This computes ((2^20 / 1e9) * mult) >> shift as a
				878	* 0.64 fixed-point fraction.
				879	* The computation in the else clause below won't overflow
				880	* (as long as the timebase frequency is >= 1.049 MHz)
				881	* but loses precision because we lose the low bits of the constant
				882	* in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9.
				883	* For a shift of 24 the error is about 0.5e-9, or about 0.5ns
				884	* over a second. (Shift values are usually 22, 23 or 24.)
				885	* For high frequency clocks such as the 512MHz timebase clock
				886	* on POWER[6789], the mult value is small (e.g. 32768000)
				887	* and so we can shift the constant by 16 initially
				888	* (295147905179 ~= 2^(20+64-16) / 1e9) and then do the
				889	* remaining shifts after the multiplication, which gives a
				890	* more accurate result (e.g. with mult = 32768000, shift = 24,
				891	* the error is only about 1.2e-12, or 0.7ns over 10 minutes).
				892	*/
				893	if (mult <= 62500000 && clock->shift >= 16)
				894	new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16);
				895	else
				896	new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
				897
				898	/*
				899	* Compute the fractional second in units of 2^-32 seconds.
				900	* The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift
				901	* in nanoseconds, so multiplying that by 2^32 / 1e9 gives
				902	* it in units of 2^-32 seconds.
				903	* We assume shift <= 32 because clocks_calc_mult_shift()
				904	* generates shift values in the range 0 - 32.
				905	*/
				906	frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift);
				907	do_div(frac_sec, NSEC_PER_SEC);
				908
				909	/*
				910	* Work out new stamp_xsec value for any legacy users of systemcfg.
				911	* stamp_xsec is in units of 2^-20 seconds.
				912	*/
				913	new_stamp_xsec = frac_sec >> 12;
				914	new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC;
				915
				916	/*
				917	* tb_update_count is used to allow the userspace gettimeofday code
				918	* to assure itself that it sees a consistent view of the tb_to_xs and
				919	* stamp_xsec variables. It reads the tb_update_count, then reads
				920	* tb_to_xs and stamp_xsec and then reads tb_update_count again. If
				921	* the two values of tb_update_count match and are even then the
				922	* tb_to_xs and stamp_xsec values are consistent. If not, then it
				923	* loops back and reads them again until this criteria is met.
				924	*/
				925	vdso_data->tb_orig_stamp = cycle_last;
				926	vdso_data->stamp_xsec = new_stamp_xsec;
				927	vdso_data->tb_to_xs = new_tb_to_xs;
				928	vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec;
				929	vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec;
				930	vdso_data->stamp_xtime = xt;
				931	vdso_data->stamp_sec_fraction = frac_sec;
				932	smp_wmb();
				933	++(vdso_data->tb_update_count);
				934	}
				935
				936	void update_vsyscall_tz(void)
				937	{
				938	vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
				939	vdso_data->tz_dsttime = sys_tz.tz_dsttime;
				940	}
				941
				942	static void __init clocksource_init(void)
				943	{
				944	struct clocksource *clock;
				945
				946	if (__USE_RTC())
				947	clock = &clocksource_rtc;
				948	else
				949	clock = &clocksource_timebase;
				950
				951	if (clocksource_register_hz(clock, tb_ticks_per_sec)) {
				952	printk(KERN_ERR "clocksource: %s is already registered\n",
				953	clock->name);
				954	return;
				955	}
				956
				957	printk(KERN_INFO "clocksource: %s mult[%x] shift[%d] registered\n",
				958	clock->name, clock->mult, clock->shift);
				959	}
				960
				961	static int decrementer_set_next_event(unsigned long evt,
				962	struct clock_event_device *dev)
				963	{
				964	__this_cpu_write(decrementers_next_tb, get_tb_or_rtc() + evt);
				965	set_dec(evt);
				966
				967	/* We may have raced with new irq work */
				968	if (test_irq_work_pending())
				969	set_dec(1);
				970
				971	return 0;
				972	}
				973
				974	static int decrementer_shutdown(struct clock_event_device *dev)
				975	{
				976	decrementer_set_next_event(decrementer_max, dev);
				977	return 0;
				978	}
				979
				980	static void register_decrementer_clockevent(int cpu)
				981	{
				982	struct clock_event_device *dec = &per_cpu(decrementers, cpu);
				983
				984	*dec = decrementer_clockevent;
				985	dec->cpumask = cpumask_of(cpu);
				986
				987	printk_once(KERN_DEBUG "clockevent: %s mult[%x] shift[%d] cpu[%d]\n",
				988	dec->name, dec->mult, dec->shift, cpu);
				989
				990	clockevents_register_device(dec);
				991	}
				992
				993	static void enable_large_decrementer(void)
				994	{
				995	if (!cpu_has_feature(CPU_FTR_ARCH_300))
				996	return;
				997
				998	if (decrementer_max <= DECREMENTER_DEFAULT_MAX)
				999	return;
				1000
				1001	/*
				1002	* If we're running as the hypervisor we need to enable the LD manually
				1003	* otherwise firmware should have done it for us.
				1004	*/
				1005	if (cpu_has_feature(CPU_FTR_HVMODE))
				1006	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) \| LPCR_LD);
				1007	}
				1008
				1009	static void __init set_decrementer_max(void)
				1010	{
				1011	struct device_node *cpu;
				1012	u32 bits = 32;
				1013
				1014	/* Prior to ISAv3 the decrementer is always 32 bit */
				1015	if (!cpu_has_feature(CPU_FTR_ARCH_300))
				1016	return;
				1017
				1018	cpu = of_find_node_by_type(NULL, "cpu");
				1019
				1020	if (of_property_read_u32(cpu, "ibm,dec-bits", &bits) == 0) {
				1021	if (bits > 64 \|\| bits < 32) {
				1022	pr_warn("time_init: firmware supplied invalid ibm,dec-bits");
				1023	bits = 32;
				1024	}
				1025
				1026	/* calculate the signed maximum given this many bits */
				1027	decrementer_max = (1ul << (bits - 1)) - 1;
				1028	}
				1029
				1030	of_node_put(cpu);
				1031
				1032	pr_info("time_init: %u bit decrementer (max: %llx)\n",
				1033	bits, decrementer_max);
				1034	}
				1035
				1036	static void __init init_decrementer_clockevent(void)
				1037	{
				1038	int cpu = smp_processor_id();
				1039
				1040	clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4);
				1041
				1042	decrementer_clockevent.max_delta_ns =
				1043	clockevent_delta2ns(decrementer_max, &decrementer_clockevent);
				1044	decrementer_clockevent.max_delta_ticks = decrementer_max;
				1045	decrementer_clockevent.min_delta_ns =
				1046	clockevent_delta2ns(2, &decrementer_clockevent);
				1047	decrementer_clockevent.min_delta_ticks = 2;
				1048
				1049	register_decrementer_clockevent(cpu);
				1050	}
				1051
				/*
				 * Per-CPU time setup: enable the large decrementer, start the
				 * decrementer and register this CPU's decrementer clockevent device.
				 */
				void secondary_cpu_time_init(void)
				{
					/* Enable and test the large decrementer for this cpu */
					enable_large_decrementer();

					/*
					 * Start the decrementer on CPUs that have manual control,
					 * such as BookE.
					 */
					start_cpu_decrementer();

					/*
					 * FIXME: Should make an unrelated change to move the
					 * snapshot_timebase call here!
					 */
					register_decrementer_clockevent(smp_processor_id());
				}
				1066
				1067	/* This function is only called on the boot processor */
				1068	void __init time_init(void)
				1069	{
				1070	struct div_result res;
				1071	u64 scale;
				1072	unsigned shift;
				1073
				1074	if (__USE_RTC()) {
				1075	/* 601 processor: dec counts down by 128 every 128ns */
				1076	ppc_tb_freq = 1000000000;
				1077	} else {
				1078	/* Normal PowerPC with timebase register */
				1079	ppc_md.calibrate_decr();
				1080	printk(KERN_DEBUG "time_init: decrementer frequency = %lu.%.6lu MHz\n",
				1081	ppc_tb_freq / 1000000, ppc_tb_freq % 1000000);
				1082	printk(KERN_DEBUG "time_init: processor frequency = %lu.%.6lu MHz\n",
				1083	ppc_proc_freq / 1000000, ppc_proc_freq % 1000000);
				1084	}
				1085
				1086	tb_ticks_per_jiffy = ppc_tb_freq / HZ;
				1087	tb_ticks_per_sec = ppc_tb_freq;
				1088	tb_ticks_per_usec = ppc_tb_freq / 1000000;
				1089	calc_cputime_factors();
				1090
				1091	/*
				1092	* Compute scale factor for sched_clock.
				1093	* The calibrate_decr() function has set tb_ticks_per_sec,
				1094	* which is the timebase frequency.
				1095	* We compute 1e9 * 2^64 / tb_ticks_per_sec and interpret
				1096	* the 128-bit result as a 64.64 fixed-point number.
				1097	* We then shift that number right until it is less than 1.0,
				1098	* giving us the scale factor and shift count to use in
				1099	* sched_clock().
				1100	*/
				1101	div128_by_32(1000000000, 0, tb_ticks_per_sec, &res);
				1102	scale = res.result_low;
				1103	for (shift = 0; res.result_high != 0; ++shift) {
				1104	scale = (scale >> 1) \| (res.result_high << 63);
				1105	res.result_high >>= 1;
				1106	}
				1107	tb_to_ns_scale = scale;
				1108	tb_to_ns_shift = shift;
				1109	/* Save the current timebase to pretty up CONFIG_PRINTK_TIME */
				1110	boot_tb = get_tb_or_rtc();
				1111
				1112	/* If platform provided a timezone (pmac), we correct the time */
				1113	if (timezone_offset) {
				1114	sys_tz.tz_minuteswest = -timezone_offset / 60;
				1115	sys_tz.tz_dsttime = 0;
				1116	}
				1117
				1118	vdso_data->tb_update_count = 0;
				1119	vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
				1120
				1121	/* initialise and enable the large decrementer (if we have one) */
				1122	set_decrementer_max();
				1123	enable_large_decrementer();
				1124
				1125	/* Start the decrementer on CPUs that have manual control
				1126	* such as BookE
				1127	*/
				1128	start_cpu_decrementer();
				1129
				1130	/* Register the clocksource */
				1131	clocksource_init();
				1132
				1133	init_decrementer_clockevent();
				1134	tick_setup_hrtimer_broadcast();
				1135
				1136	#ifdef CONFIG_COMMON_CLK
				1137	of_clk_init(NULL);
				1138	#endif
				1139	}
				1140
				1141	/*
				1142	* Divide a 128-bit dividend by a 32-bit divisor, leaving a 128 bit
				1143	* result.
				1144	*/
				1145	void div128_by_32(u64 dividend_high, u64 dividend_low,
				1146	unsigned divisor, struct div_result *dr)
				1147	{
				1148	unsigned long a, b, c, d;
				1149	unsigned long w, x, y, z;
				1150	u64 ra, rb, rc;
				1151
				1152	a = dividend_high >> 32;
				1153	b = dividend_high & 0xffffffff;
				1154	c = dividend_low >> 32;
				1155	d = dividend_low & 0xffffffff;
				1156
				1157	w = a / divisor;
				1158	ra = ((u64)(a - (w * divisor)) << 32) + b;
				1159
				1160	rb = ((u64) do_div(ra, divisor) << 32) + c;
				1161	x = ra;
				1162
				1163	rc = ((u64) do_div(rb, divisor) << 32) + d;
				1164	y = rb;
				1165
				1166	do_div(rc, divisor);
				1167	z = rc;
				1168
				1169	dr->result_high = ((u64)w << 32) + x;
				1170	dr->result_low = ((u64)y << 32) + z;
				1171
				1172	}
				1173
				1174	/* We don't need to calibrate delay, we use the CPU timebase for that */
				1175	void calibrate_delay(void)
				1176	{
				1177	/* Some generic code (such as spinlock debug) use loops_per_jiffy
				1178	* as the number of __delay(1) in a jiffy, so make it so
				1179	*/
				1180	loops_per_jiffy = tb_ticks_per_jiffy;
				1181	}
				1182
				1183	#if IS_ENABLED(CONFIG_RTC_DRV_GENERIC)
				1184	static int rtc_generic_get_time(struct device dev, struct rtc_time tm)
				1185	{
				1186	ppc_md.get_rtc_time(tm);
				1187	return 0;
				1188	}
				1189
				1190	static int rtc_generic_set_time(struct device dev, struct rtc_time tm)
				1191	{
				1192	if (!ppc_md.set_rtc_time)
				1193	return -EOPNOTSUPP;
				1194
				1195	if (ppc_md.set_rtc_time(tm) < 0)
				1196	return -EOPNOTSUPP;
				1197
				1198	return 0;
				1199	}
				1200
				1201	static const struct rtc_class_ops rtc_generic_ops = {
				1202	.read_time = rtc_generic_get_time,
				1203	.set_time = rtc_generic_set_time,
				1204	};
				1205
				1206	static int __init rtc_init(void)
				1207	{
				1208	struct platform_device *pdev;
				1209
				1210	if (!ppc_md.get_rtc_time)
				1211	return -ENODEV;
				1212
				1213	pdev = platform_device_register_data(NULL, "rtc-generic", -1,
				1214	&rtc_generic_ops,
				1215	sizeof(rtc_generic_ops));
				1216
				1217	return PTR_ERR_OR_ZERO(pdev);
				1218	}
				1219
				1220	device_initcall(rtc_init);
				1221	#endif