/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

#define VMX_THRESH 4096
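/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS wrap the calls to enter_vmx_ops()/
 * exit_vmx_ops() so that r3, r4, r5 and the link register survive the
 * call.  After ENTER_VMX_OPS, cr1 is EQ when enter_vmx_ops() returned 0,
 * i.e. VMX must not be used, and the callers below then fall back to
 * the non-VMX paths.
 */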
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 * | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * ^                                 ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                             ^
 *                             _vaddr
 *
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded;
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of _vaddr, to be loaded;
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res;
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

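/*
 * Illustrative C sketch only (not part of the build): the combined
 * effect of the caller's aligned lvx load plus LD_VSR_CROSS16B, in
 * scalar terms.  load_cross16b() is a made-up name for this sketch;
 * the real code uses lvx and VPERM with the LVS-generated mask.
 *
 *	static inline void load_cross16b(const unsigned char *p,
 *					 unsigned char out[16])
 *	{
 *		const unsigned char *base =
 *			(const unsigned char *)((unsigned long)p & ~15UL);
 *		unsigned long off = (unsigned long)p & 15;
 *		int i;
 *
 *		// out[] receives the 16 bytes starting at the unaligned p,
 *		// assembled from the two aligned quadwords straddling it
 *		for (i = 0; i < 16; i++)
 *			out[i] = (off + i < 16) ? base[off + i]
 *						: (base + 16)[off + i - 16];
 *	}
 */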
/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset from the 8-byte boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets from the 8-byte boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
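/*
 * Illustrative C sketch only (not part of the build) of the dispatch
 * done right below; sameoffset_cmp()/diffoffset_cmp() are made-up names
 * standing for the .Lsameoffset_* and .Ldiffoffset_* paths, and s1/s2/n
 * are memcmp()'s arguments.
 *
 *	if ((((unsigned long)s1 ^ (unsigned long)s2) & 0x7) == 0)
 *		return sameoffset_cmp(s1, s2, n);
 *	else
 *		return diffoffset_cmp(s1, s2, n);
 */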
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Check whether the src/dst addresses share the same offset
	 * from an 8-byte alignment boundary; cr0 is used at .Lno_short
	 * below to select the .Ldiffoffset path when they do not.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than
	 * 8 bytes.
	 */
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on 8-byte alignment.
	 */
	andi.	r6,r3,7

	/* Try to compare the first double word which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	 */
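	/*
	 * Illustrative C sketch only (not part of the build), assuming a
	 * hypothetical load_be64() helper that loads 8 bytes in memory
	 * order into an integer whose most significant byte is the first
	 * byte in memory (which is what LD yields on either endianness);
	 * s1/s2 are memcmp()'s arguments.
	 *
	 *	static unsigned long load_be64(const void *p)
	 *	{
	 *		const unsigned char *c = p;
	 *		unsigned long v = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < 8; i++)
	 *			v = (v << 8) | c[i];
	 *		return v;
	 *	}
	 *
	 *	// head handling: s1/s2 share the same low 3 address bits
	 *	unsigned long bits = ((unsigned long)s1 & 0x7) * 8;
	 *	unsigned long a = load_be64((void *)((unsigned long)s1 & ~7UL));
	 *	unsigned long b = load_be64((void *)((unsigned long)s2 & ~7UL));
	 *
	 *	a <<= bits;	// discard the bytes that precede s1
	 *	b <<= bits;
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 */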
	rlwinm	r6,r3,3,26,28
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi	r5,r5,61
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/* Here we have fewer than 8 bytes to compare. At least the s1
	 * address is aligned to 8 bytes.
	 * The next double words are loaded and shifted right by the
	 * appropriate number of bits.
	 */
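	/*
	 * Illustrative C sketch only (not part of the build), using the
	 * hypothetical load_be64() sketched earlier: keep only the first
	 * n remaining bytes of each doubleword by shifting the rest out.
	 *
	 *	unsigned long bits = (8 - n) * 8;
	 *	unsigned long a = load_be64(s1) >> bits;
	 *	unsigned long b = load_be64(s2) >> bits;
	 *
	 *	if (a != b)
	 *		return a > b ? 1 : -1;
	 *	return 0;
	 */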
	subfic	r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Use the vmx loop if the length is 4K or more */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip NV GPRS restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with src/dst addresses that have the same offset from
	 * an 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to find a difference early, within the first 32 bytes.
	 * Before using VMX instructions, which incur a 32x128-bit VMX
	 * register save/restore penalty, we compare the first 32 bytes
	 * so that we can catch the ~80% of cases that differ within them.
	 */
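	/*
	 * Illustrative C sketch only (not part of the build) of the
	 * 32-byte pre-check, again using the hypothetical load_be64()
	 * sketched earlier:
	 *
	 *	for (int i = 0; i < 4; i++) {	// 4 x 8 = 32 bytes
	 *		unsigned long a = load_be64(s1), b = load_be64(s2);
	 *
	 *		if (a != b)
	 *			return a > b ? 1 : -1;
	 *		s1 += 8; s2 += 8; n -= 8;
	 *	}
	 *	// only now pay the enter_vmx_ops()/exit_vmx_ops() cost
	 */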

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp

3:
	/* Check whether r4 has the same offset as r3 from a 16-byte
	 * boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is no less than 4KB. Align to a 16-byte boundary
	 * before the vector loop.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* save and restore cr0 */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes in each loop iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59
	li	off16,16

.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* the difference is within these 16 bytes; recheck them with 8-byte loads */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	rlwinm	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4	/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only do vmx ops when the size is 4K bytes or more */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif


	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* Perform a 32-byte pre-check before enabling
	 * VMX operations.
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First, try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned with 16 bytes */
	lvx	v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5	/* loop for 32 bytes each */
	clrldi	r5,r5,59
	mtctr	r6

.balign 16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* either way, the difference will appear within the next 16 bytes */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)