Blame - net/sched/sch_netem.c - hafnium/third_party/linux.git

blob: 74c0f656f28c5131ef0c7ed13c59b44a221808a9 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame^]	1	/*
				2	* net/sched/sch_netem.c Network emulator
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public License
				6	* as published by the Free Software Foundation; either version
				7	* 2 of the License.
				8	*
				9	* Many of the algorithms and ideas for this came from
				10	* NIST Net which is not copyrighted.
				11	*
				12	* Authors: Stephen Hemminger <shemminger@osdl.org>
				13	* Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
				14	*/
				15
				16	#include <linux/mm.h>
				17	#include <linux/module.h>
				18	#include <linux/slab.h>
				19	#include <linux/types.h>
				20	#include <linux/kernel.h>
				21	#include <linux/errno.h>
				22	#include <linux/skbuff.h>
				23	#include <linux/vmalloc.h>
				24	#include <linux/rtnetlink.h>
				25	#include <linux/reciprocal_div.h>
				26	#include <linux/rbtree.h>
				27
				28	#include <net/netlink.h>
				29	#include <net/pkt_sched.h>
				30	#include <net/inet_ecn.h>
				31
				32	#define VERSION "1.3"
				33
				34	/* Network Emulation Queuing algorithm.
				35	====================================
				36
				37	Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
				38	Network Emulation Tool"
				39	[2] Luigi Rizzo, DummyNet for FreeBSD
				40
				41	----------------------------------------------------------------
				42
				43	This started out as a simple way to delay outgoing packets to
				44	test TCP but has grown to include most of the functionality
				45	of a full blown network emulator like NISTnet. It can delay
				46	packets and add random jitter (and correlation). The random
				47	distribution can be loaded from a table as well to provide
				48	normal, Pareto, or experimental curves. Packet loss,
				49	duplication, and reordering can also be emulated.
				50
				51	This qdisc does not do classification that can be handled in
				52	layering other disciplines. It does not need to do bandwidth
				53	control either since that can be handled by using token
				54	bucket or other rate control.
				55
				56	Correlated Loss Generator models
				57
				58	Added generation of correlated loss according to the
				59	"Gilbert-Elliot" model, a 4-state markov model.
				60
				61	References:
				62	[1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
				63	[2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
				64	and intuitive loss model for packet networks and its implementation
				65	in the Netem module in the Linux kernel", available in [1]
				66
				67	Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
				68	Fabio Ludovici <fabio.ludovici at yahoo.it>
				69	*/
				70
				71	struct disttable {
				72	u32 size;
				73	s16 table[0];
				74	};
				75
				76	struct netem_sched_data {
				77	/* internal t(ime)fifo qdisc uses t_root and sch->limit */
				78	struct rb_root t_root;
				79
				80	/* optional qdisc for classful handling (NULL at netem init) */
				81	struct Qdisc *qdisc;
				82
				83	struct qdisc_watchdog watchdog;
				84
				85	s64 latency;
				86	s64 jitter;
				87
				88	u32 loss;
				89	u32 ecn;
				90	u32 limit;
				91	u32 counter;
				92	u32 gap;
				93	u32 duplicate;
				94	u32 reorder;
				95	u32 corrupt;
				96	u64 rate;
				97	s32 packet_overhead;
				98	u32 cell_size;
				99	struct reciprocal_value cell_size_reciprocal;
				100	s32 cell_overhead;
				101
				102	struct crndstate {
				103	u32 last;
				104	u32 rho;
				105	} delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
				106
				107	struct disttable *delay_dist;
				108
				109	enum {
				110	CLG_RANDOM,
				111	CLG_4_STATES,
				112	CLG_GILB_ELL,
				113	} loss_model;
				114
				115	enum {
				116	TX_IN_GAP_PERIOD = 1,
				117	TX_IN_BURST_PERIOD,
				118	LOST_IN_GAP_PERIOD,
				119	LOST_IN_BURST_PERIOD,
				120	} _4_state_model;
				121
				122	enum {
				123	GOOD_STATE = 1,
				124	BAD_STATE,
				125	} GE_state_model;
				126
				127	/* Correlated Loss Generation models */
				128	struct clgstate {
				129	/* state of the Markov chain */
				130	u8 state;
				131
				132	/* 4-states and Gilbert-Elliot models */
				133	u32 a1; /* p13 for 4-states or p for GE */
				134	u32 a2; /* p31 for 4-states or r for GE */
				135	u32 a3; /* p32 for 4-states or h for GE */
				136	u32 a4; /* p14 for 4-states or 1-k for GE */
				137	u32 a5; /* p23 used only in 4-states */
				138	} clg;
				139
				140	struct tc_netem_slot slot_config;
				141	struct slotstate {
				142	u64 slot_next;
				143	s32 packets_left;
				144	s32 bytes_left;
				145	} slot;
				146
				147	struct disttable *slot_dist;
				148	};
				149
				150	/* Time stamp put into socket buffer control block
				151	* Only valid when skbs are in our internal t(ime)fifo queue.
				152	*
				153	* As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
				154	* and skb->next & skb->prev are scratch space for a qdisc,
				155	* we save skb->tstamp value in skb->cb[] before destroying it.
				156	*/
				157	struct netem_skb_cb {
				158	u64 time_to_send;
				159	};
				160
				161	static inline struct netem_skb_cb netem_skb_cb(struct sk_buff skb)
				162	{
				163	/* we assume we can use skb next/prev/tstamp as storage for rb_node */
				164	qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
				165	return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
				166	}
				167
				168	/* init_crandom - initialize correlated random number generator
				169	* Use entropy source for initial seed.
				170	*/
				171	static void init_crandom(struct crndstate *state, unsigned long rho)
				172	{
				173	state->rho = rho;
				174	state->last = prandom_u32();
				175	}
				176
				177	/* get_crandom - correlated random number generator
				178	* Next number depends on last value.
				179	* rho is scaled to avoid floating point.
				180	*/
				181	static u32 get_crandom(struct crndstate *state)
				182	{
				183	u64 value, rho;
				184	unsigned long answer;
				185
				186	if (!state \|\| state->rho == 0) /* no correlation */
				187	return prandom_u32();
				188
				189	value = prandom_u32();
				190	rho = (u64)state->rho + 1;
				191	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
				192	state->last = answer;
				193	return answer;
				194	}
				195
				196	/* loss_4state - 4-state model loss generator
				197	* Generates losses according to the 4-state Markov chain adopted in
				198	* the GI (General and Intuitive) loss model.
				199	*/
				200	static bool loss_4state(struct netem_sched_data *q)
				201	{
				202	struct clgstate *clg = &q->clg;
				203	u32 rnd = prandom_u32();
				204
				205	/*
				206	* Makes a comparison between rnd and the transition
				207	* probabilities outgoing from the current state, then decides the
				208	* next state and if the next packet has to be transmitted or lost.
				209	* The four states correspond to:
				210	* TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
				211	* LOST_IN_BURST_PERIOD => isolated losses within a gap period
				212	* LOST_IN_GAP_PERIOD => lost packets within a burst period
				213	* TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
				214	*/
				215	switch (clg->state) {
				216	case TX_IN_GAP_PERIOD:
				217	if (rnd < clg->a4) {
				218	clg->state = LOST_IN_BURST_PERIOD;
				219	return true;
				220	} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
				221	clg->state = LOST_IN_GAP_PERIOD;
				222	return true;
				223	} else if (clg->a1 + clg->a4 < rnd) {
				224	clg->state = TX_IN_GAP_PERIOD;
				225	}
				226
				227	break;
				228	case TX_IN_BURST_PERIOD:
				229	if (rnd < clg->a5) {
				230	clg->state = LOST_IN_GAP_PERIOD;
				231	return true;
				232	} else {
				233	clg->state = TX_IN_BURST_PERIOD;
				234	}
				235
				236	break;
				237	case LOST_IN_GAP_PERIOD:
				238	if (rnd < clg->a3)
				239	clg->state = TX_IN_BURST_PERIOD;
				240	else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
				241	clg->state = TX_IN_GAP_PERIOD;
				242	} else if (clg->a2 + clg->a3 < rnd) {
				243	clg->state = LOST_IN_GAP_PERIOD;
				244	return true;
				245	}
				246	break;
				247	case LOST_IN_BURST_PERIOD:
				248	clg->state = TX_IN_GAP_PERIOD;
				249	break;
				250	}
				251
				252	return false;
				253	}
				254
				255	/* loss_gilb_ell - Gilbert-Elliot model loss generator
				256	* Generates losses according to the Gilbert-Elliot loss model or
				257	* its special cases (Gilbert or Simple Gilbert)
				258	*
				259	* Makes a comparison between random number and the transition
				260	* probabilities outgoing from the current state, then decides the
				261	* next state. A second random number is extracted and the comparison
				262	* with the loss probability of the current state decides if the next
				263	* packet will be transmitted or lost.
				264	*/
				265	static bool loss_gilb_ell(struct netem_sched_data *q)
				266	{
				267	struct clgstate *clg = &q->clg;
				268
				269	switch (clg->state) {
				270	case GOOD_STATE:
				271	if (prandom_u32() < clg->a1)
				272	clg->state = BAD_STATE;
				273	if (prandom_u32() < clg->a4)
				274	return true;
				275	break;
				276	case BAD_STATE:
				277	if (prandom_u32() < clg->a2)
				278	clg->state = GOOD_STATE;
				279	if (prandom_u32() > clg->a3)
				280	return true;
				281	}
				282
				283	return false;
				284	}
				285
				286	static bool loss_event(struct netem_sched_data *q)
				287	{
				288	switch (q->loss_model) {
				289	case CLG_RANDOM:
				290	/* Random packet drop 0 => none, ~0 => all */
				291	return q->loss && q->loss >= get_crandom(&q->loss_cor);
				292
				293	case CLG_4_STATES:
				294	/* 4state loss model algorithm (used also for GI model)
				295	* Extracts a value from the markov 4 state loss generator,
				296	* if it is 1 drops a packet and if needed writes the event in
				297	* the kernel logs
				298	*/
				299	return loss_4state(q);
				300
				301	case CLG_GILB_ELL:
				302	/* Gilbert-Elliot loss model algorithm
				303	* Extracts a value from the Gilbert-Elliot loss generator,
				304	* if it is 1 drops a packet and if needed writes the event in
				305	* the kernel logs
				306	*/
				307	return loss_gilb_ell(q);
				308	}
				309
				310	return false; /* not reached */
				311	}
				312
				313
				314	/* tabledist - return a pseudo-randomly distributed value with mean mu and
				315	* std deviation sigma. Uses table lookup to approximate the desired
				316	* distribution, and a uniformly-distributed pseudo-random source.
				317	*/
				318	static s64 tabledist(s64 mu, s32 sigma,
				319	struct crndstate *state,
				320	const struct disttable *dist)
				321	{
				322	s64 x;
				323	long t;
				324	u32 rnd;
				325
				326	if (sigma == 0)
				327	return mu;
				328
				329	rnd = get_crandom(state);
				330
				331	/* default uniform distribution */
				332	if (dist == NULL)
				333	return ((rnd % (2 * sigma)) + mu) - sigma;
				334
				335	t = dist->table[rnd % dist->size];
				336	x = (sigma % NETEM_DIST_SCALE) * t;
				337	if (x >= 0)
				338	x += NETEM_DIST_SCALE/2;
				339	else
				340	x -= NETEM_DIST_SCALE/2;
				341
				342	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
				343	}
				344
				345	static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
				346	{
				347	len += q->packet_overhead;
				348
				349	if (q->cell_size) {
				350	u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
				351
				352	if (len > cells * q->cell_size) /* extra cell needed for remainder */
				353	cells++;
				354	len = cells * (q->cell_size + q->cell_overhead);
				355	}
				356
				357	return div64_u64(len * NSEC_PER_SEC, q->rate);
				358	}
				359
				360	static void tfifo_reset(struct Qdisc *sch)
				361	{
				362	struct netem_sched_data *q = qdisc_priv(sch);
				363	struct rb_node *p = rb_first(&q->t_root);
				364
				365	while (p) {
				366	struct sk_buff *skb = rb_to_skb(p);
				367
				368	p = rb_next(p);
				369	rb_erase(&skb->rbnode, &q->t_root);
				370	rtnl_kfree_skbs(skb, skb);
				371	}
				372	}
				373
				374	static void tfifo_enqueue(struct sk_buff nskb, struct Qdisc sch)
				375	{
				376	struct netem_sched_data *q = qdisc_priv(sch);
				377	u64 tnext = netem_skb_cb(nskb)->time_to_send;
				378	struct rb_node *p = &q->t_root.rb_node, parent = NULL;
				379
				380	while (*p) {
				381	struct sk_buff *skb;
				382
				383	parent = *p;
				384	skb = rb_to_skb(parent);
				385	if (tnext >= netem_skb_cb(skb)->time_to_send)
				386	p = &parent->rb_right;
				387	else
				388	p = &parent->rb_left;
				389	}
				390	rb_link_node(&nskb->rbnode, parent, p);
				391	rb_insert_color(&nskb->rbnode, &q->t_root);
				392	sch->q.qlen++;
				393	}
				394
				395	/* netem can't properly corrupt a megapacket (like we get from GSO), so instead
				396	* when we statistically choose to corrupt one, we instead segment it, returning
				397	* the first packet to be corrupted, and re-enqueue the remaining frames
				398	*/
				399	static struct sk_buff netem_segment(struct sk_buff skb, struct Qdisc *sch,
				400	struct sk_buff **to_free)
				401	{
				402	struct sk_buff *segs;
				403	netdev_features_t features = netif_skb_features(skb);
				404
				405	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
				406
				407	if (IS_ERR_OR_NULL(segs)) {
				408	qdisc_drop(skb, sch, to_free);
				409	return NULL;
				410	}
				411	consume_skb(skb);
				412	return segs;
				413	}
				414
				415	static void netem_enqueue_skb_head(struct qdisc_skb_head qh, struct sk_buff skb)
				416	{
				417	skb->next = qh->head;
				418
				419	if (!qh->head)
				420	qh->tail = skb;
				421	qh->head = skb;
				422	qh->qlen++;
				423	}
				424
				425	/*
				426	* Insert one skb into qdisc.
				427	* Note: parent depends on return value to account for queue length.
				428	* NET_XMIT_DROP: queue length didn't change.
				429	* NET_XMIT_SUCCESS: one skb was queued.
				430	*/
				431	static int netem_enqueue(struct sk_buff skb, struct Qdisc sch,
				432	struct sk_buff **to_free)
				433	{
				434	struct netem_sched_data *q = qdisc_priv(sch);
				435	/* We don't fill cb now as skb_unshare() may invalidate it */
				436	struct netem_skb_cb *cb;
				437	struct sk_buff *skb2;
				438	struct sk_buff *segs = NULL;
				439	unsigned int len = 0, last_len, prev_len = qdisc_pkt_len(skb);
				440	int nb = 0;
				441	int count = 1;
				442	int rc = NET_XMIT_SUCCESS;
				443
				444	/* Do not fool qdisc_drop_all() */
				445	skb->prev = NULL;
				446
				447	/* Random duplication */
				448	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
				449	++count;
				450
				451	/* Drop packet? */
				452	if (loss_event(q)) {
				453	if (q->ecn && INET_ECN_set_ce(skb))
				454	qdisc_qstats_drop(sch); /* mark packet */
				455	else
				456	--count;
				457	}
				458	if (count == 0) {
				459	qdisc_qstats_drop(sch);
				460	__qdisc_drop(skb, to_free);
				461	return NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
				462	}
				463
				464	/* If a delay is expected, orphan the skb. (orphaning usually takes
				465	* place at TX completion time, so _before_ the link transit delay)
				466	*/
				467	if (q->latency \|\| q->jitter \|\| q->rate)
				468	skb_orphan_partial(skb);
				469
				470	/*
				471	* If we need to duplicate packet, then re-insert at top of the
				472	* qdisc tree, since parent queuer expects that only one
				473	* skb will be queued.
				474	*/
				475	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
				476	struct Qdisc *rootq = qdisc_root(sch);
				477	u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
				478
				479	q->duplicate = 0;
				480	rootq->enqueue(skb2, rootq, to_free);
				481	q->duplicate = dupsave;
				482	}
				483
				484	/*
				485	* Randomized packet corruption.
				486	* Make copy if needed since we are modifying
				487	* If packet is going to be hardware checksummed, then
				488	* do it now in software before we mangle it.
				489	*/
				490	if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
				491	if (skb_is_gso(skb)) {
				492	segs = netem_segment(skb, sch, to_free);
				493	if (!segs)
				494	return NET_XMIT_DROP;
				495	} else {
				496	segs = skb;
				497	}
				498
				499	skb = segs;
				500	segs = segs->next;
				501
				502	skb = skb_unshare(skb, GFP_ATOMIC);
				503	if (unlikely(!skb)) {
				504	qdisc_qstats_drop(sch);
				505	goto finish_segs;
				506	}
				507	if (skb->ip_summed == CHECKSUM_PARTIAL &&
				508	skb_checksum_help(skb)) {
				509	qdisc_drop(skb, sch, to_free);
				510	goto finish_segs;
				511	}
				512
				513	skb->data[prandom_u32() % skb_headlen(skb)] ^=
				514	1<<(prandom_u32() % 8);
				515	}
				516
				517	if (unlikely(sch->q.qlen >= sch->limit))
				518	return qdisc_drop_all(skb, sch, to_free);
				519
				520	qdisc_qstats_backlog_inc(sch, skb);
				521
				522	cb = netem_skb_cb(skb);
				523	if (q->gap == 0 \|\| /* not doing reordering */
				524	q->counter < q->gap - 1 \|\| /* inside last reordering gap */
				525	q->reorder < get_crandom(&q->reorder_cor)) {
				526	u64 now;
				527	s64 delay;
				528
				529	delay = tabledist(q->latency, q->jitter,
				530	&q->delay_cor, q->delay_dist);
				531
				532	now = ktime_get_ns();
				533
				534	if (q->rate) {
				535	struct netem_skb_cb *last = NULL;
				536
				537	if (sch->q.tail)
				538	last = netem_skb_cb(sch->q.tail);
				539	if (q->t_root.rb_node) {
				540	struct sk_buff *t_skb;
				541	struct netem_skb_cb *t_last;
				542
				543	t_skb = skb_rb_last(&q->t_root);
				544	t_last = netem_skb_cb(t_skb);
				545	if (!last \|\|
				546	t_last->time_to_send > last->time_to_send) {
				547	last = t_last;
				548	}
				549	}
				550
				551	if (last) {
				552	/*
				553	* Last packet in queue is reference point (now),
				554	* calculate this time bonus and subtract
				555	* from delay.
				556	*/
				557	delay -= last->time_to_send - now;
				558	delay = max_t(s64, 0, delay);
				559	now = last->time_to_send;
				560	}
				561
				562	delay += packet_time_ns(qdisc_pkt_len(skb), q);
				563	}
				564
				565	cb->time_to_send = now + delay;
				566	++q->counter;
				567	tfifo_enqueue(skb, sch);
				568	} else {
				569	/*
				570	* Do re-ordering by putting one out of N packets at the front
				571	* of the queue.
				572	*/
				573	cb->time_to_send = ktime_get_ns();
				574	q->counter = 0;
				575
				576	netem_enqueue_skb_head(&sch->q, skb);
				577	sch->qstats.requeues++;
				578	}
				579
				580	finish_segs:
				581	if (segs) {
				582	while (segs) {
				583	skb2 = segs->next;
				584	segs->next = NULL;
				585	qdisc_skb_cb(segs)->pkt_len = segs->len;
				586	last_len = segs->len;
				587	rc = qdisc_enqueue(segs, sch, to_free);
				588	if (rc != NET_XMIT_SUCCESS) {
				589	if (net_xmit_drop_count(rc))
				590	qdisc_qstats_drop(sch);
				591	} else {
				592	nb++;
				593	len += last_len;
				594	}
				595	segs = skb2;
				596	}
				597	sch->q.qlen += nb;
				598	if (nb > 1)
				599	qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
				600	}
				601	return NET_XMIT_SUCCESS;
				602	}
				603
				604	/* Delay the next round with a new future slot with a
				605	* correct number of bytes and packets.
				606	*/
				607
				608	static void get_slot_next(struct netem_sched_data *q, u64 now)
				609	{
				610	s64 next_delay;
				611
				612	if (!q->slot_dist)
				613	next_delay = q->slot_config.min_delay +
				614	(prandom_u32() *
				615	(q->slot_config.max_delay -
				616	q->slot_config.min_delay) >> 32);
				617	else
				618	next_delay = tabledist(q->slot_config.dist_delay,
				619	(s32)(q->slot_config.dist_jitter),
				620	NULL, q->slot_dist);
				621
				622	q->slot.slot_next = now + next_delay;
				623	q->slot.packets_left = q->slot_config.max_packets;
				624	q->slot.bytes_left = q->slot_config.max_bytes;
				625	}
				626
				627	static struct sk_buff netem_dequeue(struct Qdisc sch)
				628	{
				629	struct netem_sched_data *q = qdisc_priv(sch);
				630	struct sk_buff *skb;
				631	struct rb_node *p;
				632
				633	tfifo_dequeue:
				634	skb = __qdisc_dequeue_head(&sch->q);
				635	if (skb) {
				636	qdisc_qstats_backlog_dec(sch, skb);
				637	deliver:
				638	qdisc_bstats_update(sch, skb);
				639	return skb;
				640	}
				641	p = rb_first(&q->t_root);
				642	if (p) {
				643	u64 time_to_send;
				644	u64 now = ktime_get_ns();
				645
				646	skb = rb_to_skb(p);
				647
				648	/* if more time remaining? */
				649	time_to_send = netem_skb_cb(skb)->time_to_send;
				650	if (q->slot.slot_next && q->slot.slot_next < time_to_send)
				651	get_slot_next(q, now);
				652
				653	if (time_to_send <= now && q->slot.slot_next <= now) {
				654	rb_erase(p, &q->t_root);
				655	sch->q.qlen--;
				656	qdisc_qstats_backlog_dec(sch, skb);
				657	skb->next = NULL;
				658	skb->prev = NULL;
				659	/* skb->dev shares skb->rbnode area,
				660	* we need to restore its value.
				661	*/
				662	skb->dev = qdisc_dev(sch);
				663
				664	#ifdef CONFIG_NET_CLS_ACT
				665	/*
				666	* If it's at ingress let's pretend the delay is
				667	* from the network (tstamp will be updated).
				668	*/
				669	if (skb->tc_redirected && skb->tc_from_ingress)
				670	skb->tstamp = 0;
				671	#endif
				672
				673	if (q->slot.slot_next) {
				674	q->slot.packets_left--;
				675	q->slot.bytes_left -= qdisc_pkt_len(skb);
				676	if (q->slot.packets_left <= 0 \|\|
				677	q->slot.bytes_left <= 0)
				678	get_slot_next(q, now);
				679	}
				680
				681	if (q->qdisc) {
				682	unsigned int pkt_len = qdisc_pkt_len(skb);
				683	struct sk_buff *to_free = NULL;
				684	int err;
				685
				686	err = qdisc_enqueue(skb, q->qdisc, &to_free);
				687	kfree_skb_list(to_free);
				688	if (err != NET_XMIT_SUCCESS &&
				689	net_xmit_drop_count(err)) {
				690	qdisc_qstats_drop(sch);
				691	qdisc_tree_reduce_backlog(sch, 1,
				692	pkt_len);
				693	}
				694	goto tfifo_dequeue;
				695	}
				696	goto deliver;
				697	}
				698
				699	if (q->qdisc) {
				700	skb = q->qdisc->ops->dequeue(q->qdisc);
				701	if (skb)
				702	goto deliver;
				703	}
				704
				705	qdisc_watchdog_schedule_ns(&q->watchdog,
				706	max(time_to_send,
				707	q->slot.slot_next));
				708	}
				709
				710	if (q->qdisc) {
				711	skb = q->qdisc->ops->dequeue(q->qdisc);
				712	if (skb)
				713	goto deliver;
				714	}
				715	return NULL;
				716	}
				717
				718	static void netem_reset(struct Qdisc *sch)
				719	{
				720	struct netem_sched_data *q = qdisc_priv(sch);
				721
				722	qdisc_reset_queue(sch);
				723	tfifo_reset(sch);
				724	if (q->qdisc)
				725	qdisc_reset(q->qdisc);
				726	qdisc_watchdog_cancel(&q->watchdog);
				727	}
				728
				/* dist_free - release a distribution table through kvfree() */
				static void dist_free(struct disttable *table)
				{
					kvfree(table);
				}
				733
				734	/*
				735	* Distribution data is a variable size payload containing
				736	* signed 16 bit values.
				737	*/
				738
				739	static int get_dist_table(struct Qdisc sch, struct disttable *tbl,
				740	const struct nlattr *attr)
				741	{
				742	size_t n = nla_len(attr)/sizeof(__s16);
				743	const __s16 *data = nla_data(attr);
				744	spinlock_t *root_lock;
				745	struct disttable *d;
				746	int i;
				747
				748	if (n > NETEM_DIST_MAX)
				749	return -EINVAL;
				750
				751	d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
				752	if (!d)
				753	return -ENOMEM;
				754
				755	d->size = n;
				756	for (i = 0; i < n; i++)
				757	d->table[i] = data[i];
				758
				759	root_lock = qdisc_root_sleeping_lock(sch);
				760
				761	spin_lock_bh(root_lock);
				762	swap(*tbl, d);
				763	spin_unlock_bh(root_lock);
				764
				765	dist_free(d);
				766	return 0;
				767	}
				768
				769	static void get_slot(struct netem_sched_data q, const struct nlattr attr)
				770	{
				771	const struct tc_netem_slot *c = nla_data(attr);
				772
				773	q->slot_config = *c;
				774	if (q->slot_config.max_packets == 0)
				775	q->slot_config.max_packets = INT_MAX;
				776	if (q->slot_config.max_bytes == 0)
				777	q->slot_config.max_bytes = INT_MAX;
				778	q->slot.packets_left = q->slot_config.max_packets;
				779	q->slot.bytes_left = q->slot_config.max_bytes;
				780	if (q->slot_config.min_delay \| q->slot_config.max_delay \|
				781	q->slot_config.dist_jitter)
				782	q->slot.slot_next = ktime_get_ns();
				783	else
				784	q->slot.slot_next = 0;
				785	}
				786
				787	static void get_correlation(struct netem_sched_data q, const struct nlattr attr)
				788	{
				789	const struct tc_netem_corr *c = nla_data(attr);
				790
				791	init_crandom(&q->delay_cor, c->delay_corr);
				792	init_crandom(&q->loss_cor, c->loss_corr);
				793	init_crandom(&q->dup_cor, c->dup_corr);
				794	}
				795
				796	static void get_reorder(struct netem_sched_data q, const struct nlattr attr)
				797	{
				798	const struct tc_netem_reorder *r = nla_data(attr);
				799
				800	q->reorder = r->probability;
				801	init_crandom(&q->reorder_cor, r->correlation);
				802	}
				803
				804	static void get_corrupt(struct netem_sched_data q, const struct nlattr attr)
				805	{
				806	const struct tc_netem_corrupt *r = nla_data(attr);
				807
				808	q->corrupt = r->probability;
				809	init_crandom(&q->corrupt_cor, r->correlation);
				810	}
				811
				812	static void get_rate(struct netem_sched_data q, const struct nlattr attr)
				813	{
				814	const struct tc_netem_rate *r = nla_data(attr);
				815
				816	q->rate = r->rate;
				817	q->packet_overhead = r->packet_overhead;
				818	q->cell_size = r->cell_size;
				819	q->cell_overhead = r->cell_overhead;
				820	if (q->cell_size)
				821	q->cell_size_reciprocal = reciprocal_value(q->cell_size);
				822	else
				823	q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
				824	}
				825
				826	static int get_loss_clg(struct netem_sched_data q, const struct nlattr attr)
				827	{
				828	const struct nlattr *la;
				829	int rem;
				830
				831	nla_for_each_nested(la, attr, rem) {
				832	u16 type = nla_type(la);
				833
				834	switch (type) {
				835	case NETEM_LOSS_GI: {
				836	const struct tc_netem_gimodel *gi = nla_data(la);
				837
				838	if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
				839	pr_info("netem: incorrect gi model size\n");
				840	return -EINVAL;
				841	}
				842
				843	q->loss_model = CLG_4_STATES;
				844
				845	q->clg.state = TX_IN_GAP_PERIOD;
				846	q->clg.a1 = gi->p13;
				847	q->clg.a2 = gi->p31;
				848	q->clg.a3 = gi->p32;
				849	q->clg.a4 = gi->p14;
				850	q->clg.a5 = gi->p23;
				851	break;
				852	}
				853
				854	case NETEM_LOSS_GE: {
				855	const struct tc_netem_gemodel *ge = nla_data(la);
				856
				857	if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
				858	pr_info("netem: incorrect ge model size\n");
				859	return -EINVAL;
				860	}
				861
				862	q->loss_model = CLG_GILB_ELL;
				863	q->clg.state = GOOD_STATE;
				864	q->clg.a1 = ge->p;
				865	q->clg.a2 = ge->r;
				866	q->clg.a3 = ge->h;
				867	q->clg.a4 = ge->k1;
				868	break;
				869	}
				870
				871	default:
				872	pr_info("netem: unknown loss type %u\n", type);
				873	return -EINVAL;
				874	}
				875	}
				876
				877	return 0;
				878	}
				879
				880	static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
				881	[TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) },
				882	[TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) },
				883	[TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) },
				884	[TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) },
				885	[TCA_NETEM_LOSS] = { .type = NLA_NESTED },
				886	[TCA_NETEM_ECN] = { .type = NLA_U32 },
				887	[TCA_NETEM_RATE64] = { .type = NLA_U64 },
				888	[TCA_NETEM_LATENCY64] = { .type = NLA_S64 },
				889	[TCA_NETEM_JITTER64] = { .type = NLA_S64 },
				890	[TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) },
				891	};
				892
				893	static int parse_attr(struct nlattr tb[], int maxtype, struct nlattr nla,
				894	const struct nla_policy *policy, int len)
				895	{
				896	int nested_len = nla_len(nla) - NLA_ALIGN(len);
				897
				898	if (nested_len < 0) {
				899	pr_info("netem: invalid attributes len %d\n", nested_len);
				900	return -EINVAL;
				901	}
				902
				903	if (nested_len >= nla_attr_size(0))
				904	return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
				905	nested_len, policy, NULL);
				906
				907	memset(tb, 0, sizeof(struct nlattr ) (maxtype + 1));
				908	return 0;
				909	}
				910
				911	/* Parse netlink message to set options */
				912	static int netem_change(struct Qdisc sch, struct nlattr opt,
				913	struct netlink_ext_ack *extack)
				914	{
				915	struct netem_sched_data *q = qdisc_priv(sch);
				916	struct nlattr *tb[TCA_NETEM_MAX + 1];
				917	struct tc_netem_qopt *qopt;
				918	struct clgstate old_clg;
				919	int old_loss_model = CLG_RANDOM;
				920	int ret;
				921
				922	if (opt == NULL)
				923	return -EINVAL;
				924
				925	qopt = nla_data(opt);
				926	ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
				927	if (ret < 0)
				928	return ret;
				929
				930	/* backup q->clg and q->loss_model */
				931	old_clg = q->clg;
				932	old_loss_model = q->loss_model;
				933
				934	if (tb[TCA_NETEM_LOSS]) {
				935	ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
				936	if (ret) {
				937	q->loss_model = old_loss_model;
				938	return ret;
				939	}
				940	} else {
				941	q->loss_model = CLG_RANDOM;
				942	}
				943
				944	if (tb[TCA_NETEM_DELAY_DIST]) {
				945	ret = get_dist_table(sch, &q->delay_dist,
				946	tb[TCA_NETEM_DELAY_DIST]);
				947	if (ret)
				948	goto get_table_failure;
				949	}
				950
				951	if (tb[TCA_NETEM_SLOT_DIST]) {
				952	ret = get_dist_table(sch, &q->slot_dist,
				953	tb[TCA_NETEM_SLOT_DIST]);
				954	if (ret)
				955	goto get_table_failure;
				956	}
				957
				958	sch->limit = qopt->limit;
				959
				960	q->latency = PSCHED_TICKS2NS(qopt->latency);
				961	q->jitter = PSCHED_TICKS2NS(qopt->jitter);
				962	q->limit = qopt->limit;
				963	q->gap = qopt->gap;
				964	q->counter = 0;
				965	q->loss = qopt->loss;
				966	q->duplicate = qopt->duplicate;
				967
				968	/* for compatibility with earlier versions.
				969	* if gap is set, need to assume 100% probability
				970	*/
				971	if (q->gap)
				972	q->reorder = ~0;
				973
				974	if (tb[TCA_NETEM_CORR])
				975	get_correlation(q, tb[TCA_NETEM_CORR]);
				976
				977	if (tb[TCA_NETEM_REORDER])
				978	get_reorder(q, tb[TCA_NETEM_REORDER]);
				979
				980	if (tb[TCA_NETEM_CORRUPT])
				981	get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
				982
				983	if (tb[TCA_NETEM_RATE])
				984	get_rate(q, tb[TCA_NETEM_RATE]);
				985
				986	if (tb[TCA_NETEM_RATE64])
				987	q->rate = max_t(u64, q->rate,
				988	nla_get_u64(tb[TCA_NETEM_RATE64]));
				989
				990	if (tb[TCA_NETEM_LATENCY64])
				991	q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
				992
				993	if (tb[TCA_NETEM_JITTER64])
				994	q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
				995
				996	if (tb[TCA_NETEM_ECN])
				997	q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
				998
				999	if (tb[TCA_NETEM_SLOT])
				1000	get_slot(q, tb[TCA_NETEM_SLOT]);
				1001
				1002	return ret;
				1003
				1004	get_table_failure:
				1005	/* recover clg and loss_model, in case of
				1006	* q->clg and q->loss_model were modified
				1007	* in get_loss_clg()
				1008	*/
				1009	q->clg = old_clg;
				1010	q->loss_model = old_loss_model;
				1011	return ret;
				1012	}
				1013
				1014	static int netem_init(struct Qdisc sch, struct nlattr opt,
				1015	struct netlink_ext_ack *extack)
				1016	{
				1017	struct netem_sched_data *q = qdisc_priv(sch);
				1018	int ret;
				1019
				1020	qdisc_watchdog_init(&q->watchdog, sch);
				1021
				1022	if (!opt)
				1023	return -EINVAL;
				1024
				1025	q->loss_model = CLG_RANDOM;
				1026	ret = netem_change(sch, opt, extack);
				1027	if (ret)
				1028	pr_info("netem: change failed\n");
				1029	return ret;
				1030	}
				1031
				1032	static void netem_destroy(struct Qdisc *sch)
				1033	{
				1034	struct netem_sched_data *q = qdisc_priv(sch);
				1035
				1036	qdisc_watchdog_cancel(&q->watchdog);
				1037	if (q->qdisc)
				1038	qdisc_destroy(q->qdisc);
				1039	dist_free(q->delay_dist);
				1040	dist_free(q->slot_dist);
				1041	}
				1042
				1043	static int dump_loss_model(const struct netem_sched_data *q,
				1044	struct sk_buff *skb)
				1045	{
				1046	struct nlattr *nest;
				1047
				1048	nest = nla_nest_start(skb, TCA_NETEM_LOSS);
				1049	if (nest == NULL)
				1050	goto nla_put_failure;
				1051
				1052	switch (q->loss_model) {
				1053	case CLG_RANDOM:
				1054	/* legacy loss model */
				1055	nla_nest_cancel(skb, nest);
				1056	return 0; /* no data */
				1057
				1058	case CLG_4_STATES: {
				1059	struct tc_netem_gimodel gi = {
				1060	.p13 = q->clg.a1,
				1061	.p31 = q->clg.a2,
				1062	.p32 = q->clg.a3,
				1063	.p14 = q->clg.a4,
				1064	.p23 = q->clg.a5,
				1065	};
				1066
				1067	if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
				1068	goto nla_put_failure;
				1069	break;
				1070	}
				1071	case CLG_GILB_ELL: {
				1072	struct tc_netem_gemodel ge = {
				1073	.p = q->clg.a1,
				1074	.r = q->clg.a2,
				1075	.h = q->clg.a3,
				1076	.k1 = q->clg.a4,
				1077	};
				1078
				1079	if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
				1080	goto nla_put_failure;
				1081	break;
				1082	}
				1083	}
				1084
				1085	nla_nest_end(skb, nest);
				1086	return 0;
				1087
				1088	nla_put_failure:
				1089	nla_nest_cancel(skb, nest);
				1090	return -1;
				1091	}
				1092
				1093	static int netem_dump(struct Qdisc sch, struct sk_buff skb)
				1094	{
				1095	const struct netem_sched_data *q = qdisc_priv(sch);
				1096	struct nlattr nla = (struct nlattr ) skb_tail_pointer(skb);
				1097	struct tc_netem_qopt qopt;
				1098	struct tc_netem_corr cor;
				1099	struct tc_netem_reorder reorder;
				1100	struct tc_netem_corrupt corrupt;
				1101	struct tc_netem_rate rate;
				1102	struct tc_netem_slot slot;
				1103
				1104	qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
				1105	UINT_MAX);
				1106	qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
				1107	UINT_MAX);
				1108	qopt.limit = q->limit;
				1109	qopt.loss = q->loss;
				1110	qopt.gap = q->gap;
				1111	qopt.duplicate = q->duplicate;
				1112	if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
				1113	goto nla_put_failure;
				1114
				1115	if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
				1116	goto nla_put_failure;
				1117
				1118	if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
				1119	goto nla_put_failure;
				1120
				1121	cor.delay_corr = q->delay_cor.rho;
				1122	cor.loss_corr = q->loss_cor.rho;
				1123	cor.dup_corr = q->dup_cor.rho;
				1124	if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
				1125	goto nla_put_failure;
				1126
				1127	reorder.probability = q->reorder;
				1128	reorder.correlation = q->reorder_cor.rho;
				1129	if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
				1130	goto nla_put_failure;
				1131
				1132	corrupt.probability = q->corrupt;
				1133	corrupt.correlation = q->corrupt_cor.rho;
				1134	if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
				1135	goto nla_put_failure;
				1136
				1137	if (q->rate >= (1ULL << 32)) {
				1138	if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
				1139	TCA_NETEM_PAD))
				1140	goto nla_put_failure;
				1141	rate.rate = ~0U;
				1142	} else {
				1143	rate.rate = q->rate;
				1144	}
				1145	rate.packet_overhead = q->packet_overhead;
				1146	rate.cell_size = q->cell_size;
				1147	rate.cell_overhead = q->cell_overhead;
				1148	if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
				1149	goto nla_put_failure;
				1150
				1151	if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
				1152	goto nla_put_failure;
				1153
				1154	if (dump_loss_model(q, skb) != 0)
				1155	goto nla_put_failure;
				1156
				1157	if (q->slot_config.min_delay \| q->slot_config.max_delay \|
				1158	q->slot_config.dist_jitter) {
				1159	slot = q->slot_config;
				1160	if (slot.max_packets == INT_MAX)
				1161	slot.max_packets = 0;
				1162	if (slot.max_bytes == INT_MAX)
				1163	slot.max_bytes = 0;
				1164	if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
				1165	goto nla_put_failure;
				1166	}
				1167
				1168	return nla_nest_end(skb, nla);
				1169
				1170	nla_put_failure:
				1171	nlmsg_trim(skb, nla);
				1172	return -1;
				1173	}
				1174
				1175	static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
				1176	struct sk_buff skb, struct tcmsg tcm)
				1177	{
				1178	struct netem_sched_data *q = qdisc_priv(sch);
				1179
				1180	if (cl != 1 \|\| !q->qdisc) /* only one class */
				1181	return -ENOENT;
				1182
				1183	tcm->tcm_handle \|= TC_H_MIN(1);
				1184	tcm->tcm_info = q->qdisc->handle;
				1185
				1186	return 0;
				1187	}
				1188
				1189	static int netem_graft(struct Qdisc sch, unsigned long arg, struct Qdisc new,
				1190	struct Qdisc *old, struct netlink_ext_ack extack)
				1191	{
				1192	struct netem_sched_data *q = qdisc_priv(sch);
				1193
				1194	*old = qdisc_replace(sch, new, &q->qdisc);
				1195	return 0;
				1196	}
				1197
				1198	static struct Qdisc netem_leaf(struct Qdisc sch, unsigned long arg)
				1199	{
				1200	struct netem_sched_data *q = qdisc_priv(sch);
				1201	return q->qdisc;
				1202	}
				1203
				1204	static unsigned long netem_find(struct Qdisc *sch, u32 classid)
				1205	{
				1206	return 1;
				1207	}
				1208
				1209	static void netem_walk(struct Qdisc sch, struct qdisc_walker walker)
				1210	{
				1211	if (!walker->stop) {
				1212	if (walker->count >= walker->skip)
				1213	if (walker->fn(sch, 1, walker) < 0) {
				1214	walker->stop = 1;
				1215	return;
				1216	}
				1217	walker->count++;
				1218	}
				1219	}
				1220
				1221	static const struct Qdisc_class_ops netem_class_ops = {
				1222	.graft = netem_graft,
				1223	.leaf = netem_leaf,
				1224	.find = netem_find,
				1225	.walk = netem_walk,
				1226	.dump = netem_dump_class,
				1227	};
				1228
				1229	static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
				1230	.id = "netem",
				1231	.cl_ops = &netem_class_ops,
				1232	.priv_size = sizeof(struct netem_sched_data),
				1233	.enqueue = netem_enqueue,
				1234	.dequeue = netem_dequeue,
				1235	.peek = qdisc_peek_dequeued,
				1236	.init = netem_init,
				1237	.reset = netem_reset,
				1238	.destroy = netem_destroy,
				1239	.change = netem_change,
				1240	.dump = netem_dump,
				1241	.owner = THIS_MODULE,
				1242	};
				1243
				1244
				1245	static int __init netem_module_init(void)
				1246	{
				1247	pr_info("netem: version " VERSION "\n");
				1248	return register_qdisc(&netem_qdisc_ops);
				1249	}
				1250	static void __exit netem_module_exit(void)
				1251	{
				1252	unregister_qdisc(&netem_qdisc_ops);
				1253	}
				1254	module_init(netem_module_init)
				1255	module_exit(netem_module_exit)
				1256	MODULE_LICENSE("GPL");