Blame - fs/namespace.c - hafnium/third_party/linux.git

blob: 1fce41ba353570a8d84a5fcde9b8ea4d03438d80 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	/*
				2	* linux/fs/namespace.c
				3	*
				4	* (C) Copyright Al Viro 2000, 2001
				5	* Released under GPL v2.
				6	*
				7	* Based on code from fs/super.c, copyright Linus Torvalds and others.
				8	* Heavily rewritten.
				9	*/
				10
				11	#include <linux/syscalls.h>
				12	#include <linux/export.h>
				13	#include <linux/capability.h>
				14	#include <linux/mnt_namespace.h>
				15	#include <linux/user_namespace.h>
				16	#include <linux/namei.h>
				17	#include <linux/security.h>
				18	#include <linux/cred.h>
				19	#include <linux/idr.h>
				20	#include <linux/init.h> /* init_rootfs */
				21	#include <linux/fs_struct.h> /* get_fs_root et.al. */
				22	#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
				23	#include <linux/uaccess.h>
				24	#include <linux/proc_ns.h>
				25	#include <linux/magic.h>
				26	#include <linux/bootmem.h>
				27	#include <linux/task_work.h>
				28	#include <linux/sched/task.h>
				29
				30	#include "pnode.h"
				31	#include "internal.h"
				32
				33	/* Maximum number of mounts in a mount namespace */
				34	unsigned int sysctl_mount_max __read_mostly = 100000;
				35
				36	static unsigned int m_hash_mask __read_mostly;
				37	static unsigned int m_hash_shift __read_mostly;
				38	static unsigned int mp_hash_mask __read_mostly;
				39	static unsigned int mp_hash_shift __read_mostly;
				40
				41	static __initdata unsigned long mhash_entries;
				42	static int __init set_mhash_entries(char *str)
				43	{
				44	if (!str)
				45	return 0;
				46	mhash_entries = simple_strtoul(str, &str, 0);
				47	return 1;
				48	}
				49	__setup("mhash_entries=", set_mhash_entries);
				50
				51	static __initdata unsigned long mphash_entries;
				52	static int __init set_mphash_entries(char *str)
				53	{
				54	if (!str)
				55	return 0;
				56	mphash_entries = simple_strtoul(str, &str, 0);
				57	return 1;
				58	}
				59	__setup("mphash_entries=", set_mphash_entries);
				60
				61	static u64 event;
				62	static DEFINE_IDA(mnt_id_ida);
				63	static DEFINE_IDA(mnt_group_ida);
				64
				65	static struct hlist_head *mount_hashtable __read_mostly;
				66	static struct hlist_head *mountpoint_hashtable __read_mostly;
				67	static struct kmem_cache *mnt_cache __read_mostly;
				68	static DECLARE_RWSEM(namespace_sem);
				69
				70	/* /sys/fs */
				71	struct kobject *fs_kobj;
				72	EXPORT_SYMBOL_GPL(fs_kobj);
				73
				74	/*
				75	* vfsmount lock may be taken for read to prevent changes to the
				76	* vfsmount hash, ie. during mountpoint lookups or walking back
				77	* up the tree.
				78	*
				79	* It should be taken for write in all cases where the vfsmount
				80	* tree or hash is modified or when a vfsmount structure is modified.
				81	*/
				82	__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
				83
				84	static inline struct hlist_head m_hash(struct vfsmount mnt, struct dentry *dentry)
				85	{
				86	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
				87	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
				88	tmp = tmp + (tmp >> m_hash_shift);
				89	return &mount_hashtable[tmp & m_hash_mask];
				90	}
				91
				92	static inline struct hlist_head mp_hash(struct dentry dentry)
				93	{
				94	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
				95	tmp = tmp + (tmp >> mp_hash_shift);
				96	return &mountpoint_hashtable[tmp & mp_hash_mask];
				97	}
				98
				99	static int mnt_alloc_id(struct mount *mnt)
				100	{
				101	int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
				102
				103	if (res < 0)
				104	return res;
				105	mnt->mnt_id = res;
				106	return 0;
				107	}
				108
				109	static void mnt_free_id(struct mount *mnt)
				110	{
				111	ida_free(&mnt_id_ida, mnt->mnt_id);
				112	}
				113
				114	/*
				115	* Allocate a new peer group ID
				116	*/
				117	static int mnt_alloc_group_id(struct mount *mnt)
				118	{
				119	int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
				120
				121	if (res < 0)
				122	return res;
				123	mnt->mnt_group_id = res;
				124	return 0;
				125	}
				126
				127	/*
				128	* Release a peer group ID
				129	*/
				130	void mnt_release_group_id(struct mount *mnt)
				131	{
				132	ida_free(&mnt_group_ida, mnt->mnt_group_id);
				133	mnt->mnt_group_id = 0;
				134	}
				135
				136	/*
				137	* vfsmount lock must be held for read
				138	*/
				139	static inline void mnt_add_count(struct mount *mnt, int n)
				140	{
				141	#ifdef CONFIG_SMP
				142	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
				143	#else
				144	preempt_disable();
				145	mnt->mnt_count += n;
				146	preempt_enable();
				147	#endif
				148	}
				149
				150	/*
				151	* vfsmount lock must be held for write
				152	*/
				153	unsigned int mnt_get_count(struct mount *mnt)
				154	{
				155	#ifdef CONFIG_SMP
				156	unsigned int count = 0;
				157	int cpu;
				158
				159	for_each_possible_cpu(cpu) {
				160	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
				161	}
				162
				163	return count;
				164	#else
				165	return mnt->mnt_count;
				166	#endif
				167	}
				168
				169	static void drop_mountpoint(struct fs_pin *p)
				170	{
				171	struct mount *m = container_of(p, struct mount, mnt_umount);
				172	dput(m->mnt_ex_mountpoint);
				173	pin_remove(p);
				174	mntput(&m->mnt);
				175	}
				176
				177	static struct mount alloc_vfsmnt(const char name)
				178	{
				179	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
				180	if (mnt) {
				181	int err;
				182
				183	err = mnt_alloc_id(mnt);
				184	if (err)
				185	goto out_free_cache;
				186
				187	if (name) {
				188	mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
				189	if (!mnt->mnt_devname)
				190	goto out_free_id;
				191	}
				192
				193	#ifdef CONFIG_SMP
				194	mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
				195	if (!mnt->mnt_pcp)
				196	goto out_free_devname;
				197
				198	this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
				199	#else
				200	mnt->mnt_count = 1;
				201	mnt->mnt_writers = 0;
				202	#endif
				203
				204	INIT_HLIST_NODE(&mnt->mnt_hash);
				205	INIT_LIST_HEAD(&mnt->mnt_child);
				206	INIT_LIST_HEAD(&mnt->mnt_mounts);
				207	INIT_LIST_HEAD(&mnt->mnt_list);
				208	INIT_LIST_HEAD(&mnt->mnt_expire);
				209	INIT_LIST_HEAD(&mnt->mnt_share);
				210	INIT_LIST_HEAD(&mnt->mnt_slave_list);
				211	INIT_LIST_HEAD(&mnt->mnt_slave);
				212	INIT_HLIST_NODE(&mnt->mnt_mp_list);
				213	INIT_LIST_HEAD(&mnt->mnt_umounting);
				214	init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
				215	}
				216	return mnt;
				217
				218	#ifdef CONFIG_SMP
				219	out_free_devname:
				220	kfree_const(mnt->mnt_devname);
				221	#endif
				222	out_free_id:
				223	mnt_free_id(mnt);
				224	out_free_cache:
				225	kmem_cache_free(mnt_cache, mnt);
				226	return NULL;
				227	}
				228
				229	/*
				230	* Most r/o checks on a fs are for operations that take
				231	* discrete amounts of time, like a write() or unlink().
				232	* We must keep track of when those operations start
				233	* (for permission checks) and when they end, so that
				234	* we can determine when writes are able to occur to
				235	* a filesystem.
				236	*/
				237	/*
				238	* __mnt_is_readonly: check whether a mount is read-only
				239	* @mnt: the mount to check for its write status
				240	*
				241	* This shouldn't be used directly ouside of the VFS.
				242	* It does not guarantee that the filesystem will stay
				243	* r/w, just that it is right now. This can not and
				244	* should not be used in place of IS_RDONLY(inode).
				245	* mnt_want/drop_write() will _keep_ the filesystem
				246	* r/w.
				247	*/
				248	int __mnt_is_readonly(struct vfsmount *mnt)
				249	{
				250	if (mnt->mnt_flags & MNT_READONLY)
				251	return 1;
				252	if (sb_rdonly(mnt->mnt_sb))
				253	return 1;
				254	return 0;
				255	}
				256	EXPORT_SYMBOL_GPL(__mnt_is_readonly);
				257
				258	static inline void mnt_inc_writers(struct mount *mnt)
				259	{
				260	#ifdef CONFIG_SMP
				261	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
				262	#else
				263	mnt->mnt_writers++;
				264	#endif
				265	}
				266
				267	static inline void mnt_dec_writers(struct mount *mnt)
				268	{
				269	#ifdef CONFIG_SMP
				270	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
				271	#else
				272	mnt->mnt_writers--;
				273	#endif
				274	}
				275
				276	static unsigned int mnt_get_writers(struct mount *mnt)
				277	{
				278	#ifdef CONFIG_SMP
				279	unsigned int count = 0;
				280	int cpu;
				281
				282	for_each_possible_cpu(cpu) {
				283	count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
				284	}
				285
				286	return count;
				287	#else
				288	return mnt->mnt_writers;
				289	#endif
				290	}
				291
				292	static int mnt_is_readonly(struct vfsmount *mnt)
				293	{
				294	if (mnt->mnt_sb->s_readonly_remount)
				295	return 1;
				296	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
				297	smp_rmb();
				298	return __mnt_is_readonly(mnt);
				299	}
				300
				301	/*
				302	* Most r/o & frozen checks on a fs are for operations that take discrete
				303	* amounts of time, like a write() or unlink(). We must keep track of when
				304	* those operations start (for permission checks) and when they end, so that we
				305	* can determine when writes are able to occur to a filesystem.
				306	*/
				307	/**
				308	* __mnt_want_write - get write access to a mount without freeze protection
				309	* @m: the mount on which to take a write
				310	*
				311	* This tells the low-level filesystem that a write is about to be performed to
				312	* it, and makes sure that writes are allowed (mnt it read-write) before
				313	* returning success. This operation does not protect against filesystem being
				314	* frozen. When the write operation is finished, __mnt_drop_write() must be
				315	* called. This is effectively a refcount.
				316	*/
				317	int __mnt_want_write(struct vfsmount *m)
				318	{
				319	struct mount *mnt = real_mount(m);
				320	int ret = 0;
				321
				322	preempt_disable();
				323	mnt_inc_writers(mnt);
				324	/*
				325	* The store to mnt_inc_writers must be visible before we pass
				326	* MNT_WRITE_HOLD loop below, so that the slowpath can see our
				327	* incremented count after it has set MNT_WRITE_HOLD.
				328	*/
				329	smp_mb();
				330	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
				331	cpu_relax();
				332	/*
				333	* After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
				334	* be set to match its requirements. So we must not load that until
				335	* MNT_WRITE_HOLD is cleared.
				336	*/
				337	smp_rmb();
				338	if (mnt_is_readonly(m)) {
				339	mnt_dec_writers(mnt);
				340	ret = -EROFS;
				341	}
				342	preempt_enable();
				343
				344	return ret;
				345	}
				346
				347	/**
				348	* mnt_want_write - get write access to a mount
				349	* @m: the mount on which to take a write
				350	*
				351	* This tells the low-level filesystem that a write is about to be performed to
				352	* it, and makes sure that writes are allowed (mount is read-write, filesystem
				353	* is not frozen) before returning success. When the write operation is
				354	* finished, mnt_drop_write() must be called. This is effectively a refcount.
				355	*/
				356	int mnt_want_write(struct vfsmount *m)
				357	{
				358	int ret;
				359
				360	sb_start_write(m->mnt_sb);
				361	ret = __mnt_want_write(m);
				362	if (ret)
				363	sb_end_write(m->mnt_sb);
				364	return ret;
				365	}
				366	EXPORT_SYMBOL_GPL(mnt_want_write);
				367
				368	/**
				369	* mnt_clone_write - get write access to a mount
				370	* @mnt: the mount on which to take a write
				371	*
				372	* This is effectively like mnt_want_write, except
				373	* it must only be used to take an extra write reference
				374	* on a mountpoint that we already know has a write reference
				375	* on it. This allows some optimisation.
				376	*
				377	* After finished, mnt_drop_write must be called as usual to
				378	* drop the reference.
				379	*/
				380	int mnt_clone_write(struct vfsmount *mnt)
				381	{
				382	/* superblock may be r/o */
				383	if (__mnt_is_readonly(mnt))
				384	return -EROFS;
				385	preempt_disable();
				386	mnt_inc_writers(real_mount(mnt));
				387	preempt_enable();
				388	return 0;
				389	}
				390	EXPORT_SYMBOL_GPL(mnt_clone_write);
				391
				392	/**
				393	* __mnt_want_write_file - get write access to a file's mount
				394	* @file: the file who's mount on which to take a write
				395	*
				396	* This is like __mnt_want_write, but it takes a file and can
				397	* do some optimisations if the file is open for write already
				398	*/
				399	int __mnt_want_write_file(struct file *file)
				400	{
				401	if (!(file->f_mode & FMODE_WRITER))
				402	return __mnt_want_write(file->f_path.mnt);
				403	else
				404	return mnt_clone_write(file->f_path.mnt);
				405	}
				406
				407	/**
				408	* mnt_want_write_file - get write access to a file's mount
				409	* @file: the file who's mount on which to take a write
				410	*
				411	* This is like mnt_want_write, but it takes a file and can
				412	* do some optimisations if the file is open for write already
				413	*/
				414	int mnt_want_write_file(struct file *file)
				415	{
				416	int ret;
				417
				418	sb_start_write(file_inode(file)->i_sb);
				419	ret = __mnt_want_write_file(file);
				420	if (ret)
				421	sb_end_write(file_inode(file)->i_sb);
				422	return ret;
				423	}
				424	EXPORT_SYMBOL_GPL(mnt_want_write_file);
				425
				426	/**
				427	* __mnt_drop_write - give up write access to a mount
				428	* @mnt: the mount on which to give up write access
				429	*
				430	* Tells the low-level filesystem that we are done
				431	* performing writes to it. Must be matched with
				432	* __mnt_want_write() call above.
				433	*/
				434	void __mnt_drop_write(struct vfsmount *mnt)
				435	{
				436	preempt_disable();
				437	mnt_dec_writers(real_mount(mnt));
				438	preempt_enable();
				439	}
				440
				441	/**
				442	* mnt_drop_write - give up write access to a mount
				443	* @mnt: the mount on which to give up write access
				444	*
				445	* Tells the low-level filesystem that we are done performing writes to it and
				446	* also allows filesystem to be frozen again. Must be matched with
				447	* mnt_want_write() call above.
				448	*/
				449	void mnt_drop_write(struct vfsmount *mnt)
				450	{
				451	__mnt_drop_write(mnt);
				452	sb_end_write(mnt->mnt_sb);
				453	}
				454	EXPORT_SYMBOL_GPL(mnt_drop_write);
				455
				456	void __mnt_drop_write_file(struct file *file)
				457	{
				458	__mnt_drop_write(file->f_path.mnt);
				459	}
				460
				461	void mnt_drop_write_file(struct file *file)
				462	{
				463	__mnt_drop_write_file(file);
				464	sb_end_write(file_inode(file)->i_sb);
				465	}
				466	EXPORT_SYMBOL(mnt_drop_write_file);
				467
				468	static int mnt_make_readonly(struct mount *mnt)
				469	{
				470	int ret = 0;
				471
				472	lock_mount_hash();
				473	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
				474	/*
				475	* After storing MNT_WRITE_HOLD, we'll read the counters. This store
				476	* should be visible before we do.
				477	*/
				478	smp_mb();
				479
				480	/*
				481	* With writers on hold, if this value is zero, then there are
				482	* definitely no active writers (although held writers may subsequently
				483	* increment the count, they'll have to wait, and decrement it after
				484	* seeing MNT_READONLY).
				485	*
				486	* It is OK to have counter incremented on one CPU and decremented on
				487	* another: the sum will add up correctly. The danger would be when we
				488	* sum up each counter, if we read a counter before it is incremented,
				489	* but then read another CPU's count which it has been subsequently
				490	* decremented from -- we would see more decrements than we should.
				491	* MNT_WRITE_HOLD protects against this scenario, because
				492	* mnt_want_write first increments count, then smp_mb, then spins on
				493	* MNT_WRITE_HOLD, so it can't be decremented by another CPU while
				494	* we're counting up here.
				495	*/
				496	if (mnt_get_writers(mnt) > 0)
				497	ret = -EBUSY;
				498	else
				499	mnt->mnt.mnt_flags \|= MNT_READONLY;
				500	/*
				501	* MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
				502	* that become unheld will see MNT_READONLY.
				503	*/
				504	smp_wmb();
				505	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
				506	unlock_mount_hash();
				507	return ret;
				508	}
				509
				510	static void __mnt_unmake_readonly(struct mount *mnt)
				511	{
				512	lock_mount_hash();
				513	mnt->mnt.mnt_flags &= ~MNT_READONLY;
				514	unlock_mount_hash();
				515	}
				516
				517	int sb_prepare_remount_readonly(struct super_block *sb)
				518	{
				519	struct mount *mnt;
				520	int err = 0;
				521
				522	/* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */
				523	if (atomic_long_read(&sb->s_remove_count))
				524	return -EBUSY;
				525
				526	lock_mount_hash();
				527	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
				528	if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
				529	mnt->mnt.mnt_flags \|= MNT_WRITE_HOLD;
				530	smp_mb();
				531	if (mnt_get_writers(mnt) > 0) {
				532	err = -EBUSY;
				533	break;
				534	}
				535	}
				536	}
				537	if (!err && atomic_long_read(&sb->s_remove_count))
				538	err = -EBUSY;
				539
				540	if (!err) {
				541	sb->s_readonly_remount = 1;
				542	smp_wmb();
				543	}
				544	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
				545	if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
				546	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
				547	}
				548	unlock_mount_hash();
				549
				550	return err;
				551	}
				552
				553	static void free_vfsmnt(struct mount *mnt)
				554	{
				555	kfree_const(mnt->mnt_devname);
				556	#ifdef CONFIG_SMP
				557	free_percpu(mnt->mnt_pcp);
				558	#endif
				559	kmem_cache_free(mnt_cache, mnt);
				560	}
				561
				562	static void delayed_free_vfsmnt(struct rcu_head *head)
				563	{
				564	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
				565	}
				566
				567	/* call under rcu_read_lock */
				568	int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
				569	{
				570	struct mount *mnt;
				571	if (read_seqretry(&mount_lock, seq))
				572	return 1;
				573	if (bastard == NULL)
				574	return 0;
				575	mnt = real_mount(bastard);
				576	mnt_add_count(mnt, 1);
				577	smp_mb(); // see mntput_no_expire()
				578	if (likely(!read_seqretry(&mount_lock, seq)))
				579	return 0;
				580	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
				581	mnt_add_count(mnt, -1);
				582	return 1;
				583	}
				584	lock_mount_hash();
				585	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
				586	mnt_add_count(mnt, -1);
				587	unlock_mount_hash();
				588	return 1;
				589	}
				590	unlock_mount_hash();
				591	/* caller will mntput() */
				592	return -1;
				593	}
				594
				595	/* call under rcu_read_lock */
				596	bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
				597	{
				598	int res = __legitimize_mnt(bastard, seq);
				599	if (likely(!res))
				600	return true;
				601	if (unlikely(res < 0)) {
				602	rcu_read_unlock();
				603	mntput(bastard);
				604	rcu_read_lock();
				605	}
				606	return false;
				607	}
				608
				609	/*
				610	* find the first mount at @dentry on vfsmount @mnt.
				611	* call under rcu_read_lock()
				612	*/
				613	struct mount __lookup_mnt(struct vfsmount mnt, struct dentry *dentry)
				614	{
				615	struct hlist_head *head = m_hash(mnt, dentry);
				616	struct mount *p;
				617
				618	hlist_for_each_entry_rcu(p, head, mnt_hash)
				619	if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
				620	return p;
				621	return NULL;
				622	}
				623
				624	/*
				625	* lookup_mnt - Return the first child mount mounted at path
				626	*
				627	* "First" means first mounted chronologically. If you create the
				628	* following mounts:
				629	*
				630	* mount /dev/sda1 /mnt
				631	* mount /dev/sda2 /mnt
				632	* mount /dev/sda3 /mnt
				633	*
				634	* Then lookup_mnt() on the base /mnt dentry in the root mount will
				635	* return successively the root dentry and vfsmount of /dev/sda1, then
				636	* /dev/sda2, then /dev/sda3, then NULL.
				637	*
				638	* lookup_mnt takes a reference to the found vfsmount.
				639	*/
				640	struct vfsmount lookup_mnt(const struct path path)
				641	{
				642	struct mount *child_mnt;
				643	struct vfsmount *m;
				644	unsigned seq;
				645
				646	rcu_read_lock();
				647	do {
				648	seq = read_seqbegin(&mount_lock);
				649	child_mnt = __lookup_mnt(path->mnt, path->dentry);
				650	m = child_mnt ? &child_mnt->mnt : NULL;
				651	} while (!legitimize_mnt(m, seq));
				652	rcu_read_unlock();
				653	return m;
				654	}
				655
				656	/*
				657	* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
				658	* current mount namespace.
				659	*
				660	* The common case is dentries are not mountpoints at all and that
				661	* test is handled inline. For the slow case when we are actually
				662	* dealing with a mountpoint of some kind, walk through all of the
				663	* mounts in the current mount namespace and test to see if the dentry
				664	* is a mountpoint.
				665	*
				666	* The mount_hashtable is not usable in the context because we
				667	* need to identify all mounts that may be in the current mount
				668	* namespace not just a mount that happens to have some specified
				669	* parent mount.
				670	*/
				671	bool __is_local_mountpoint(struct dentry *dentry)
				672	{
				673	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
				674	struct mount *mnt;
				675	bool is_covered = false;
				676
				677	if (!d_mountpoint(dentry))
				678	goto out;
				679
				680	down_read(&namespace_sem);
				681	list_for_each_entry(mnt, &ns->list, mnt_list) {
				682	is_covered = (mnt->mnt_mountpoint == dentry);
				683	if (is_covered)
				684	break;
				685	}
				686	up_read(&namespace_sem);
				687	out:
				688	return is_covered;
				689	}
				690
				691	static struct mountpoint lookup_mountpoint(struct dentry dentry)
				692	{
				693	struct hlist_head *chain = mp_hash(dentry);
				694	struct mountpoint *mp;
				695
				696	hlist_for_each_entry(mp, chain, m_hash) {
				697	if (mp->m_dentry == dentry) {
				698	mp->m_count++;
				699	return mp;
				700	}
				701	}
				702	return NULL;
				703	}
				704
				705	static struct mountpoint get_mountpoint(struct dentry dentry)
				706	{
				707	struct mountpoint mp, new = NULL;
				708	int ret;
				709
				710	if (d_mountpoint(dentry)) {
				711	/* might be worth a WARN_ON() */
				712	if (d_unlinked(dentry))
				713	return ERR_PTR(-ENOENT);
				714	mountpoint:
				715	read_seqlock_excl(&mount_lock);
				716	mp = lookup_mountpoint(dentry);
				717	read_sequnlock_excl(&mount_lock);
				718	if (mp)
				719	goto done;
				720	}
				721
				722	if (!new)
				723	new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
				724	if (!new)
				725	return ERR_PTR(-ENOMEM);
				726
				727
				728	/* Exactly one processes may set d_mounted */
				729	ret = d_set_mounted(dentry);
				730
				731	/* Someone else set d_mounted? */
				732	if (ret == -EBUSY)
				733	goto mountpoint;
				734
				735	/* The dentry is not available as a mountpoint? */
				736	mp = ERR_PTR(ret);
				737	if (ret)
				738	goto done;
				739
				740	/* Add the new mountpoint to the hash table */
				741	read_seqlock_excl(&mount_lock);
				742	new->m_dentry = dentry;
				743	new->m_count = 1;
				744	hlist_add_head(&new->m_hash, mp_hash(dentry));
				745	INIT_HLIST_HEAD(&new->m_list);
				746	read_sequnlock_excl(&mount_lock);
				747
				748	mp = new;
				749	new = NULL;
				750	done:
				751	kfree(new);
				752	return mp;
				753	}
				754
				755	static void put_mountpoint(struct mountpoint *mp)
				756	{
				757	if (!--mp->m_count) {
				758	struct dentry *dentry = mp->m_dentry;
				759	BUG_ON(!hlist_empty(&mp->m_list));
				760	spin_lock(&dentry->d_lock);
				761	dentry->d_flags &= ~DCACHE_MOUNTED;
				762	spin_unlock(&dentry->d_lock);
				763	hlist_del(&mp->m_hash);
				764	kfree(mp);
				765	}
				766	}
				767
				768	static inline int check_mnt(struct mount *mnt)
				769	{
				770	return mnt->mnt_ns == current->nsproxy->mnt_ns;
				771	}
				772
				773	/*
				774	* vfsmount lock must be held for write
				775	*/
				776	static void touch_mnt_namespace(struct mnt_namespace *ns)
				777	{
				778	if (ns) {
				779	ns->event = ++event;
				780	wake_up_interruptible(&ns->poll);
				781	}
				782	}
				783
				784	/*
				785	* vfsmount lock must be held for write
				786	*/
				787	static void __touch_mnt_namespace(struct mnt_namespace *ns)
				788	{
				789	if (ns && ns->event != event) {
				790	ns->event = event;
				791	wake_up_interruptible(&ns->poll);
				792	}
				793	}
				794
				795	/*
				796	* vfsmount lock must be held for write
				797	*/
				798	static void unhash_mnt(struct mount *mnt)
				799	{
				800	mnt->mnt_parent = mnt;
				801	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				802	list_del_init(&mnt->mnt_child);
				803	hlist_del_init_rcu(&mnt->mnt_hash);
				804	hlist_del_init(&mnt->mnt_mp_list);
				805	put_mountpoint(mnt->mnt_mp);
				806	mnt->mnt_mp = NULL;
				807	}
				808
				809	/*
				810	* vfsmount lock must be held for write
				811	*/
				812	static void detach_mnt(struct mount mnt, struct path old_path)
				813	{
				814	old_path->dentry = mnt->mnt_mountpoint;
				815	old_path->mnt = &mnt->mnt_parent->mnt;
				816	unhash_mnt(mnt);
				817	}
				818
				819	/*
				820	* vfsmount lock must be held for write
				821	*/
				822	static void umount_mnt(struct mount *mnt)
				823	{
				824	/* old mountpoint will be dropped when we can do that */
				825	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
				826	unhash_mnt(mnt);
				827	}
				828
				829	/*
				830	* vfsmount lock must be held for write
				831	*/
				832	void mnt_set_mountpoint(struct mount *mnt,
				833	struct mountpoint *mp,
				834	struct mount *child_mnt)
				835	{
				836	mp->m_count++;
				837	mnt_add_count(mnt, 1); /* essentially, that's mntget */
				838	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
				839	child_mnt->mnt_parent = mnt;
				840	child_mnt->mnt_mp = mp;
				841	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
				842	}
				843
				844	static void __attach_mnt(struct mount mnt, struct mount parent)
				845	{
				846	hlist_add_head_rcu(&mnt->mnt_hash,
				847	m_hash(&parent->mnt, mnt->mnt_mountpoint));
				848	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
				849	}
				850
				851	/*
				852	* vfsmount lock must be held for write
				853	*/
				854	static void attach_mnt(struct mount *mnt,
				855	struct mount *parent,
				856	struct mountpoint *mp)
				857	{
				858	mnt_set_mountpoint(parent, mp, mnt);
				859	__attach_mnt(mnt, parent);
				860	}
				861
				862	void mnt_change_mountpoint(struct mount parent, struct mountpoint mp, struct mount *mnt)
				863	{
				864	struct mountpoint *old_mp = mnt->mnt_mp;
				865	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
				866	struct mount *old_parent = mnt->mnt_parent;
				867
				868	list_del_init(&mnt->mnt_child);
				869	hlist_del_init(&mnt->mnt_mp_list);
				870	hlist_del_init_rcu(&mnt->mnt_hash);
				871
				872	attach_mnt(mnt, parent, mp);
				873
				874	put_mountpoint(old_mp);
				875
				876	/*
				877	* Safely avoid even the suggestion this code might sleep or
				878	* lock the mount hash by taking advantage of the knowledge that
				879	* mnt_change_mountpoint will not release the final reference
				880	* to a mountpoint.
				881	*
				882	* During mounting, the mount passed in as the parent mount will
				883	* continue to use the old mountpoint and during unmounting, the
				884	* old mountpoint will continue to exist until namespace_unlock,
				885	* which happens well after mnt_change_mountpoint.
				886	*/
				887	spin_lock(&old_mountpoint->d_lock);
				888	old_mountpoint->d_lockref.count--;
				889	spin_unlock(&old_mountpoint->d_lock);
				890
				891	mnt_add_count(old_parent, -1);
				892	}
				893
				894	/*
				895	* vfsmount lock must be held for write
				896	*/
				897	static void commit_tree(struct mount *mnt)
				898	{
				899	struct mount *parent = mnt->mnt_parent;
				900	struct mount *m;
				901	LIST_HEAD(head);
				902	struct mnt_namespace *n = parent->mnt_ns;
				903
				904	BUG_ON(parent == mnt);
				905
				906	list_add_tail(&head, &mnt->mnt_list);
				907	list_for_each_entry(m, &head, mnt_list)
				908	m->mnt_ns = n;
				909
				910	list_splice(&head, n->list.prev);
				911
				912	n->mounts += n->pending_mounts;
				913	n->pending_mounts = 0;
				914
				915	__attach_mnt(mnt, parent);
				916	touch_mnt_namespace(n);
				917	}
				918
				919	static struct mount next_mnt(struct mount p, struct mount *root)
				920	{
				921	struct list_head *next = p->mnt_mounts.next;
				922	if (next == &p->mnt_mounts) {
				923	while (1) {
				924	if (p == root)
				925	return NULL;
				926	next = p->mnt_child.next;
				927	if (next != &p->mnt_parent->mnt_mounts)
				928	break;
				929	p = p->mnt_parent;
				930	}
				931	}
				932	return list_entry(next, struct mount, mnt_child);
				933	}
				934
				935	static struct mount skip_mnt_tree(struct mount p)
				936	{
				937	struct list_head *prev = p->mnt_mounts.prev;
				938	while (prev != &p->mnt_mounts) {
				939	p = list_entry(prev, struct mount, mnt_child);
				940	prev = p->mnt_mounts.prev;
				941	}
				942	return p;
				943	}
				944
				945	struct vfsmount *
				946	vfs_kern_mount(struct file_system_type type, int flags, const char name, void *data)
				947	{
				948	struct mount *mnt;
				949	struct dentry *root;
				950
				951	if (!type)
				952	return ERR_PTR(-ENODEV);
				953
				954	mnt = alloc_vfsmnt(name);
				955	if (!mnt)
				956	return ERR_PTR(-ENOMEM);
				957
				958	if (flags & SB_KERNMOUNT)
				959	mnt->mnt.mnt_flags = MNT_INTERNAL;
				960
				961	root = mount_fs(type, flags, name, data);
				962	if (IS_ERR(root)) {
				963	mnt_free_id(mnt);
				964	free_vfsmnt(mnt);
				965	return ERR_CAST(root);
				966	}
				967
				968	mnt->mnt.mnt_root = root;
				969	mnt->mnt.mnt_sb = root->d_sb;
				970	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				971	mnt->mnt_parent = mnt;
				972	lock_mount_hash();
				973	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
				974	unlock_mount_hash();
				975	return &mnt->mnt;
				976	}
				977	EXPORT_SYMBOL_GPL(vfs_kern_mount);
				978
				979	struct vfsmount *
				980	vfs_submount(const struct dentry mountpoint, struct file_system_type type,
				981	const char name, void data)
				982	{
				983	/* Until it is worked out how to pass the user namespace
				984	* through from the parent mount to the submount don't support
				985	* unprivileged mounts with submounts.
				986	*/
				987	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
				988	return ERR_PTR(-EPERM);
				989
				990	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
				991	}
				992	EXPORT_SYMBOL_GPL(vfs_submount);
				993
				994	static struct mount clone_mnt(struct mount old, struct dentry *root,
				995	int flag)
				996	{
				997	struct super_block *sb = old->mnt.mnt_sb;
				998	struct mount *mnt;
				999	int err;
				1000
				1001	mnt = alloc_vfsmnt(old->mnt_devname);
				1002	if (!mnt)
				1003	return ERR_PTR(-ENOMEM);
				1004
				1005	if (flag & (CL_SLAVE \| CL_PRIVATE \| CL_SHARED_TO_SLAVE))
				1006	mnt->mnt_group_id = 0; /* not a peer of original */
				1007	else
				1008	mnt->mnt_group_id = old->mnt_group_id;
				1009
				1010	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
				1011	err = mnt_alloc_group_id(mnt);
				1012	if (err)
				1013	goto out_free;
				1014	}
				1015
				1016	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
				1017	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD\|MNT_MARKED\|MNT_INTERNAL);
				1018	/* Don't allow unprivileged users to change mount flags */
				1019	if (flag & CL_UNPRIVILEGED) {
				1020	mnt->mnt.mnt_flags \|= MNT_LOCK_ATIME;
				1021
				1022	if (mnt->mnt.mnt_flags & MNT_READONLY)
				1023	mnt->mnt.mnt_flags \|= MNT_LOCK_READONLY;
				1024
				1025	if (mnt->mnt.mnt_flags & MNT_NODEV)
				1026	mnt->mnt.mnt_flags \|= MNT_LOCK_NODEV;
				1027
				1028	if (mnt->mnt.mnt_flags & MNT_NOSUID)
				1029	mnt->mnt.mnt_flags \|= MNT_LOCK_NOSUID;
				1030
				1031	if (mnt->mnt.mnt_flags & MNT_NOEXEC)
				1032	mnt->mnt.mnt_flags \|= MNT_LOCK_NOEXEC;
				1033	}
				1034
				1035	/* Don't allow unprivileged users to reveal what is under a mount */
				1036	if ((flag & CL_UNPRIVILEGED) &&
				1037	(!(flag & CL_EXPIRE) \|\| list_empty(&old->mnt_expire)))
				1038	mnt->mnt.mnt_flags \|= MNT_LOCKED;
				1039
				1040	atomic_inc(&sb->s_active);
				1041	mnt->mnt.mnt_sb = sb;
				1042	mnt->mnt.mnt_root = dget(root);
				1043	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
				1044	mnt->mnt_parent = mnt;
				1045	lock_mount_hash();
				1046	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
				1047	unlock_mount_hash();
				1048
				1049	if ((flag & CL_SLAVE) \|\|
				1050	((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
				1051	list_add(&mnt->mnt_slave, &old->mnt_slave_list);
				1052	mnt->mnt_master = old;
				1053	CLEAR_MNT_SHARED(mnt);
				1054	} else if (!(flag & CL_PRIVATE)) {
				1055	if ((flag & CL_MAKE_SHARED) \|\| IS_MNT_SHARED(old))
				1056	list_add(&mnt->mnt_share, &old->mnt_share);
				1057	if (IS_MNT_SLAVE(old))
				1058	list_add(&mnt->mnt_slave, &old->mnt_slave);
				1059	mnt->mnt_master = old->mnt_master;
				1060	} else {
				1061	CLEAR_MNT_SHARED(mnt);
				1062	}
				1063	if (flag & CL_MAKE_SHARED)
				1064	set_mnt_shared(mnt);
				1065
				1066	/* stick the duplicate mount on the same expiry list
				1067	* as the original if that was on one */
				1068	if (flag & CL_EXPIRE) {
				1069	if (!list_empty(&old->mnt_expire))
				1070	list_add(&mnt->mnt_expire, &old->mnt_expire);
				1071	}
				1072
				1073	return mnt;
				1074
				1075	out_free:
				1076	mnt_free_id(mnt);
				1077	free_vfsmnt(mnt);
				1078	return ERR_PTR(err);
				1079	}
				1080
				1081	static void cleanup_mnt(struct mount *mnt)
				1082	{
				1083	/*
				1084	* This probably indicates that somebody messed
				1085	* up a mnt_want/drop_write() pair. If this
				1086	* happens, the filesystem was probably unable
				1087	* to make r/w->r/o transitions.
				1088	*/
				1089	/*
				1090	* The locking used to deal with mnt_count decrement provides barriers,
				1091	* so mnt_get_writers() below is safe.
				1092	*/
				1093	WARN_ON(mnt_get_writers(mnt));
				1094	if (unlikely(mnt->mnt_pins.first))
				1095	mnt_pin_kill(mnt);
				1096	fsnotify_vfsmount_delete(&mnt->mnt);
				1097	dput(mnt->mnt.mnt_root);
				1098	deactivate_super(mnt->mnt.mnt_sb);
				1099	mnt_free_id(mnt);
				1100	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
				1101	}
				1102
				1103	static void __cleanup_mnt(struct rcu_head *head)
				1104	{
				1105	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
				1106	}
				1107
				1108	static LLIST_HEAD(delayed_mntput_list);
				1109	static void delayed_mntput(struct work_struct *unused)
				1110	{
				1111	struct llist_node *node = llist_del_all(&delayed_mntput_list);
				1112	struct mount m, t;
				1113
				1114	llist_for_each_entry_safe(m, t, node, mnt_llist)
				1115	cleanup_mnt(m);
				1116	}
				1117	static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
				1118
				1119	static void mntput_no_expire(struct mount *mnt)
				1120	{
				1121	rcu_read_lock();
				1122	if (likely(READ_ONCE(mnt->mnt_ns))) {
				1123	/*
				1124	* Since we don't do lock_mount_hash() here,
				1125	* ->mnt_ns can change under us. However, if it's
				1126	* non-NULL, then there's a reference that won't
				1127	* be dropped until after an RCU delay done after
				1128	* turning ->mnt_ns NULL. So if we observe it
				1129	* non-NULL under rcu_read_lock(), the reference
				1130	* we are dropping is not the final one.
				1131	*/
				1132	mnt_add_count(mnt, -1);
				1133	rcu_read_unlock();
				1134	return;
				1135	}
				1136	lock_mount_hash();
				1137	/*
				1138	* make sure that if __legitimize_mnt() has not seen us grab
				1139	* mount_lock, we'll see their refcount increment here.
				1140	*/
				1141	smp_mb();
				1142	mnt_add_count(mnt, -1);
				1143	if (mnt_get_count(mnt)) {
				1144	rcu_read_unlock();
				1145	unlock_mount_hash();
				1146	return;
				1147	}
				1148	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
				1149	rcu_read_unlock();
				1150	unlock_mount_hash();
				1151	return;
				1152	}
				1153	mnt->mnt.mnt_flags \|= MNT_DOOMED;
				1154	rcu_read_unlock();
				1155
				1156	list_del(&mnt->mnt_instance);
				1157
				1158	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
				1159	struct mount p, tmp;
				1160	list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
				1161	umount_mnt(p);
				1162	}
				1163	}
				1164	unlock_mount_hash();
				1165
				1166	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
				1167	struct task_struct *task = current;
				1168	if (likely(!(task->flags & PF_KTHREAD))) {
				1169	init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
				1170	if (!task_work_add(task, &mnt->mnt_rcu, true))
				1171	return;
				1172	}
				1173	if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
				1174	schedule_delayed_work(&delayed_mntput_work, 1);
				1175	return;
				1176	}
				1177	cleanup_mnt(mnt);
				1178	}
				1179
				1180	void mntput(struct vfsmount *mnt)
				1181	{
				1182	if (mnt) {
				1183	struct mount *m = real_mount(mnt);
				1184	/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
				1185	if (unlikely(m->mnt_expiry_mark))
				1186	m->mnt_expiry_mark = 0;
				1187	mntput_no_expire(m);
				1188	}
				1189	}
				1190	EXPORT_SYMBOL(mntput);
				1191
				1192	struct vfsmount mntget(struct vfsmount mnt)
				1193	{
				1194	if (mnt)
				1195	mnt_add_count(real_mount(mnt), 1);
				1196	return mnt;
				1197	}
				1198	EXPORT_SYMBOL(mntget);
				1199
				1200	/* path_is_mountpoint() - Check if path is a mount in the current
				1201	* namespace.
				1202	*
				1203	* d_mountpoint() can only be used reliably to establish if a dentry is
				1204	* not mounted in any namespace and that common case is handled inline.
				1205	* d_mountpoint() isn't aware of the possibility there may be multiple
				1206	* mounts using a given dentry in a different namespace. This function
				1207	* checks if the passed in path is a mountpoint rather than the dentry
				1208	* alone.
				1209	*/
				1210	bool path_is_mountpoint(const struct path *path)
				1211	{
				1212	unsigned seq;
				1213	bool res;
				1214
				1215	if (!d_mountpoint(path->dentry))
				1216	return false;
				1217
				1218	rcu_read_lock();
				1219	do {
				1220	seq = read_seqbegin(&mount_lock);
				1221	res = __path_is_mountpoint(path);
				1222	} while (read_seqretry(&mount_lock, seq));
				1223	rcu_read_unlock();
				1224
				1225	return res;
				1226	}
				1227	EXPORT_SYMBOL(path_is_mountpoint);
				1228
				1229	struct vfsmount mnt_clone_internal(const struct path path)
				1230	{
				1231	struct mount *p;
				1232	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
				1233	if (IS_ERR(p))
				1234	return ERR_CAST(p);
				1235	p->mnt.mnt_flags \|= MNT_INTERNAL;
				1236	return &p->mnt;
				1237	}
				1238
				1239	#ifdef CONFIG_PROC_FS
				1240	/* iterator; we want it to have access to namespace_sem, thus here... */
				1241	static void m_start(struct seq_file m, loff_t *pos)
				1242	{
				1243	struct proc_mounts *p = m->private;
				1244
				1245	down_read(&namespace_sem);
				1246	if (p->cached_event == p->ns->event) {
				1247	void *v = p->cached_mount;
				1248	if (*pos == p->cached_index)
				1249	return v;
				1250	if (*pos == p->cached_index + 1) {
				1251	v = seq_list_next(v, &p->ns->list, &p->cached_index);
				1252	return p->cached_mount = v;
				1253	}
				1254	}
				1255
				1256	p->cached_event = p->ns->event;
				1257	p->cached_mount = seq_list_start(&p->ns->list, *pos);
				1258	p->cached_index = *pos;
				1259	return p->cached_mount;
				1260	}
				1261
				1262	static void m_next(struct seq_file m, void v, loff_t pos)
				1263	{
				1264	struct proc_mounts *p = m->private;
				1265
				1266	p->cached_mount = seq_list_next(v, &p->ns->list, pos);
				1267	p->cached_index = *pos;
				1268	return p->cached_mount;
				1269	}
				1270
				1271	static void m_stop(struct seq_file m, void v)
				1272	{
				1273	up_read(&namespace_sem);
				1274	}
				1275
				1276	static int m_show(struct seq_file m, void v)
				1277	{
				1278	struct proc_mounts *p = m->private;
				1279	struct mount *r = list_entry(v, struct mount, mnt_list);
				1280	return p->show(m, &r->mnt);
				1281	}
				1282
				1283	const struct seq_operations mounts_op = {
				1284	.start = m_start,
				1285	.next = m_next,
				1286	.stop = m_stop,
				1287	.show = m_show,
				1288	};
				1289	#endif /* CONFIG_PROC_FS */
				1290
				1291	/**
				1292	* may_umount_tree - check if a mount tree is busy
				1293	* @mnt: root of mount tree
				1294	*
				1295	* This is called to check if a tree of mounts has any
				1296	* open files, pwds, chroots or sub mounts that are
				1297	* busy.
				1298	*/
				1299	int may_umount_tree(struct vfsmount *m)
				1300	{
				1301	struct mount *mnt = real_mount(m);
				1302	int actual_refs = 0;
				1303	int minimum_refs = 0;
				1304	struct mount *p;
				1305	BUG_ON(!m);
				1306
				1307	/* write lock needed for mnt_get_count */
				1308	lock_mount_hash();
				1309	for (p = mnt; p; p = next_mnt(p, mnt)) {
				1310	actual_refs += mnt_get_count(p);
				1311	minimum_refs += 2;
				1312	}
				1313	unlock_mount_hash();
				1314
				1315	if (actual_refs > minimum_refs)
				1316	return 0;
				1317
				1318	return 1;
				1319	}
				1320
				1321	EXPORT_SYMBOL(may_umount_tree);
				1322
				1323	/**
				1324	* may_umount - check if a mount point is busy
				1325	* @mnt: root of mount
				1326	*
				1327	* This is called to check if a mount point has any
				1328	* open files, pwds, chroots or sub mounts. If the
				1329	* mount has sub mounts this will return busy
				1330	* regardless of whether the sub mounts are busy.
				1331	*
				1332	* Doesn't take quota and stuff into account. IOW, in some cases it will
				1333	* give false negatives. The main reason why it's here is that we need
				1334	* a non-destructive way to look for easily umountable filesystems.
				1335	*/
				1336	int may_umount(struct vfsmount *mnt)
				1337	{
				1338	int ret = 1;
				1339	down_read(&namespace_sem);
				1340	lock_mount_hash();
				1341	if (propagate_mount_busy(real_mount(mnt), 2))
				1342	ret = 0;
				1343	unlock_mount_hash();
				1344	up_read(&namespace_sem);
				1345	return ret;
				1346	}
				1347
				1348	EXPORT_SYMBOL(may_umount);
				1349
				1350	static HLIST_HEAD(unmounted); /* protected by namespace_sem */
				1351
				1352	static void namespace_unlock(void)
				1353	{
				1354	struct hlist_head head;
				1355
				1356	hlist_move_list(&unmounted, &head);
				1357
				1358	up_write(&namespace_sem);
				1359
				1360	if (likely(hlist_empty(&head)))
				1361	return;
				1362
				1363	synchronize_rcu();
				1364
				1365	group_pin_kill(&head);
				1366	}
				1367
				1368	static inline void namespace_lock(void)
				1369	{
				1370	down_write(&namespace_sem);
				1371	}
				1372
				1373	enum umount_tree_flags {
				1374	UMOUNT_SYNC = 1,
				1375	UMOUNT_PROPAGATE = 2,
				1376	UMOUNT_CONNECTED = 4,
				1377	};
				1378
				1379	static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
				1380	{
				1381	/* Leaving mounts connected is only valid for lazy umounts */
				1382	if (how & UMOUNT_SYNC)
				1383	return true;
				1384
				1385	/* A mount without a parent has nothing to be connected to */
				1386	if (!mnt_has_parent(mnt))
				1387	return true;
				1388
				1389	/* Because the reference counting rules change when mounts are
				1390	* unmounted and connected, umounted mounts may not be
				1391	* connected to mounted mounts.
				1392	*/
				1393	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
				1394	return true;
				1395
				1396	/* Has it been requested that the mount remain connected? */
				1397	if (how & UMOUNT_CONNECTED)
				1398	return false;
				1399
				1400	/* Is the mount locked such that it needs to remain connected? */
				1401	if (IS_MNT_LOCKED(mnt))
				1402	return false;
				1403
				1404	/* By default disconnect the mount */
				1405	return true;
				1406	}
				1407
				1408	/*
				1409	* mount_lock must be held
				1410	* namespace_sem must be held for write
				1411	*/
				1412	static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
				1413	{
				1414	LIST_HEAD(tmp_list);
				1415	struct mount *p;
				1416
				1417	if (how & UMOUNT_PROPAGATE)
				1418	propagate_mount_unlock(mnt);
				1419
				1420	/* Gather the mounts to umount */
				1421	for (p = mnt; p; p = next_mnt(p, mnt)) {
				1422	p->mnt.mnt_flags \|= MNT_UMOUNT;
				1423	list_move(&p->mnt_list, &tmp_list);
				1424	}
				1425
				1426	/* Hide the mounts from mnt_mounts */
				1427	list_for_each_entry(p, &tmp_list, mnt_list) {
				1428	list_del_init(&p->mnt_child);
				1429	}
				1430
				1431	/* Add propogated mounts to the tmp_list */
				1432	if (how & UMOUNT_PROPAGATE)
				1433	propagate_umount(&tmp_list);
				1434
				1435	while (!list_empty(&tmp_list)) {
				1436	struct mnt_namespace *ns;
				1437	bool disconnect;
				1438	p = list_first_entry(&tmp_list, struct mount, mnt_list);
				1439	list_del_init(&p->mnt_expire);
				1440	list_del_init(&p->mnt_list);
				1441	ns = p->mnt_ns;
				1442	if (ns) {
				1443	ns->mounts--;
				1444	__touch_mnt_namespace(ns);
				1445	}
				1446	p->mnt_ns = NULL;
				1447	if (how & UMOUNT_SYNC)
				1448	p->mnt.mnt_flags \|= MNT_SYNC_UMOUNT;
				1449
				1450	disconnect = disconnect_mount(p, how);
				1451
				1452	pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
				1453	disconnect ? &unmounted : NULL);
				1454	if (mnt_has_parent(p)) {
				1455	mnt_add_count(p->mnt_parent, -1);
				1456	if (!disconnect) {
				1457	/* Don't forget about p */
				1458	list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
				1459	} else {
				1460	umount_mnt(p);
				1461	}
				1462	}
				1463	change_mnt_propagation(p, MS_PRIVATE);
				1464	}
				1465	}
				1466
				1467	static void shrink_submounts(struct mount *mnt);
				1468
				1469	static int do_umount(struct mount *mnt, int flags)
				1470	{
				1471	struct super_block *sb = mnt->mnt.mnt_sb;
				1472	int retval;
				1473
				1474	retval = security_sb_umount(&mnt->mnt, flags);
				1475	if (retval)
				1476	return retval;
				1477
				1478	/*
				1479	* Allow userspace to request a mountpoint be expired rather than
				1480	* unmounting unconditionally. Unmount only happens if:
				1481	* (1) the mark is already set (the mark is cleared by mntput())
				1482	* (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
				1483	*/
				1484	if (flags & MNT_EXPIRE) {
				1485	if (&mnt->mnt == current->fs->root.mnt \|\|
				1486	flags & (MNT_FORCE \| MNT_DETACH))
				1487	return -EINVAL;
				1488
				1489	/*
				1490	* probably don't strictly need the lock here if we examined
				1491	* all race cases, but it's a slowpath.
				1492	*/
				1493	lock_mount_hash();
				1494	if (mnt_get_count(mnt) != 2) {
				1495	unlock_mount_hash();
				1496	return -EBUSY;
				1497	}
				1498	unlock_mount_hash();
				1499
				1500	if (!xchg(&mnt->mnt_expiry_mark, 1))
				1501	return -EAGAIN;
				1502	}
				1503
				1504	/*
				1505	* If we may have to abort operations to get out of this
				1506	* mount, and they will themselves hold resources we must
				1507	* allow the fs to do things. In the Unix tradition of
				1508	* 'Gee thats tricky lets do it in userspace' the umount_begin
				1509	* might fail to complete on the first run through as other tasks
				1510	* must return, and the like. Thats for the mount program to worry
				1511	* about for the moment.
				1512	*/
				1513
				1514	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
				1515	sb->s_op->umount_begin(sb);
				1516	}
				1517
				1518	/*
				1519	* No sense to grab the lock for this test, but test itself looks
				1520	* somewhat bogus. Suggestions for better replacement?
				1521	* Ho-hum... In principle, we might treat that as umount + switch
				1522	* to rootfs. GC would eventually take care of the old vfsmount.
				1523	* Actually it makes sense, especially if rootfs would contain a
				1524	* /reboot - static binary that would close all descriptors and
				1525	* call reboot(9). Then init(8) could umount root and exec /reboot.
				1526	*/
				1527	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
				1528	/*
				1529	* Special case for "unmounting" root ...
				1530	* we just try to remount it readonly.
				1531	*/
				1532	if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
				1533	return -EPERM;
				1534	down_write(&sb->s_umount);
				1535	if (!sb_rdonly(sb))
				1536	retval = do_remount_sb(sb, SB_RDONLY, NULL, 0);
				1537	up_write(&sb->s_umount);
				1538	return retval;
				1539	}
				1540
				1541	namespace_lock();
				1542	lock_mount_hash();
				1543
				1544	/* Recheck MNT_LOCKED with the locks held */
				1545	retval = -EINVAL;
				1546	if (mnt->mnt.mnt_flags & MNT_LOCKED)
				1547	goto out;
				1548
				1549	event++;
				1550	if (flags & MNT_DETACH) {
				1551	if (!list_empty(&mnt->mnt_list))
				1552	umount_tree(mnt, UMOUNT_PROPAGATE);
				1553	retval = 0;
				1554	} else {
				1555	shrink_submounts(mnt);
				1556	retval = -EBUSY;
				1557	if (!propagate_mount_busy(mnt, 2)) {
				1558	if (!list_empty(&mnt->mnt_list))
				1559	umount_tree(mnt, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				1560	retval = 0;
				1561	}
				1562	}
				1563	out:
				1564	unlock_mount_hash();
				1565	namespace_unlock();
				1566	return retval;
				1567	}
				1568
				1569	/*
				1570	* __detach_mounts - lazily unmount all mounts on the specified dentry
				1571	*
				1572	* During unlink, rmdir, and d_drop it is possible to loose the path
				1573	* to an existing mountpoint, and wind up leaking the mount.
				1574	* detach_mounts allows lazily unmounting those mounts instead of
				1575	* leaking them.
				1576	*
				1577	* The caller may hold dentry->d_inode->i_mutex.
				1578	*/
				1579	void __detach_mounts(struct dentry *dentry)
				1580	{
				1581	struct mountpoint *mp;
				1582	struct mount *mnt;
				1583
				1584	namespace_lock();
				1585	lock_mount_hash();
				1586	mp = lookup_mountpoint(dentry);
				1587	if (IS_ERR_OR_NULL(mp))
				1588	goto out_unlock;
				1589
				1590	event++;
				1591	while (!hlist_empty(&mp->m_list)) {
				1592	mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
				1593	if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
				1594	hlist_add_head(&mnt->mnt_umount.s_list, &unmounted);
				1595	umount_mnt(mnt);
				1596	}
				1597	else umount_tree(mnt, UMOUNT_CONNECTED);
				1598	}
				1599	put_mountpoint(mp);
				1600	out_unlock:
				1601	unlock_mount_hash();
				1602	namespace_unlock();
				1603	}
				1604
				1605	/*
				1606	* Is the caller allowed to modify his namespace?
				1607	*/
				1608	static inline bool may_mount(void)
				1609	{
				1610	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
				1611	}
				1612
				1613	static inline bool may_mandlock(void)
				1614	{
				1615	#ifndef CONFIG_MANDATORY_FILE_LOCKING
				1616	return false;
				1617	#endif
				1618	return capable(CAP_SYS_ADMIN);
				1619	}
				1620
				1621	/*
				1622	* Now umount can handle mount points as well as block devices.
				1623	* This is important for filesystems which use unnamed block devices.
				1624	*
				1625	* We now support a flag for forced unmount like the other 'big iron'
				1626	* unixes. Our API is identical to OSF/1 to avoid making a mess of AMD
				1627	*/
				1628
				1629	int ksys_umount(char __user *name, int flags)
				1630	{
				1631	struct path path;
				1632	struct mount *mnt;
				1633	int retval;
				1634	int lookup_flags = 0;
				1635
				1636	if (flags & ~(MNT_FORCE \| MNT_DETACH \| MNT_EXPIRE \| UMOUNT_NOFOLLOW))
				1637	return -EINVAL;
				1638
				1639	if (!may_mount())
				1640	return -EPERM;
				1641
				1642	if (!(flags & UMOUNT_NOFOLLOW))
				1643	lookup_flags \|= LOOKUP_FOLLOW;
				1644
				1645	retval = user_path_mountpoint_at(AT_FDCWD, name, lookup_flags, &path);
				1646	if (retval)
				1647	goto out;
				1648	mnt = real_mount(path.mnt);
				1649	retval = -EINVAL;
				1650	if (path.dentry != path.mnt->mnt_root)
				1651	goto dput_and_out;
				1652	if (!check_mnt(mnt))
				1653	goto dput_and_out;
				1654	if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
				1655	goto dput_and_out;
				1656	retval = -EPERM;
				1657	if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
				1658	goto dput_and_out;
				1659
				1660	retval = do_umount(mnt, flags);
				1661	dput_and_out:
				1662	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
				1663	dput(path.dentry);
				1664	mntput_no_expire(mnt);
				1665	out:
				1666	return retval;
				1667	}
				1668
				1669	SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
				1670	{
				1671	return ksys_umount(name, flags);
				1672	}
				1673
				1674	#ifdef __ARCH_WANT_SYS_OLDUMOUNT
				1675
				1676	/*
				1677	* The 2.0 compatible umount. No flags.
				1678	*/
				1679	SYSCALL_DEFINE1(oldumount, char __user *, name)
				1680	{
				1681	return ksys_umount(name, 0);
				1682	}
				1683
				1684	#endif
				1685
				1686	static bool is_mnt_ns_file(struct dentry *dentry)
				1687	{
				1688	/* Is this a proxy for a mount namespace? */
				1689	return dentry->d_op == &ns_dentry_operations &&
				1690	dentry->d_fsdata == &mntns_operations;
				1691	}
				1692
				1693	struct mnt_namespace to_mnt_ns(struct ns_common ns)
				1694	{
				1695	return container_of(ns, struct mnt_namespace, ns);
				1696	}
				1697
				1698	static bool mnt_ns_loop(struct dentry *dentry)
				1699	{
				1700	/* Could bind mounting the mount namespace inode cause a
				1701	* mount namespace loop?
				1702	*/
				1703	struct mnt_namespace *mnt_ns;
				1704	if (!is_mnt_ns_file(dentry))
				1705	return false;
				1706
				1707	mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
				1708	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
				1709	}
				1710
				1711	struct mount copy_tree(struct mount mnt, struct dentry *dentry,
				1712	int flag)
				1713	{
				1714	struct mount res, p, q, r, *parent;
				1715
				1716	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt))
				1717	return ERR_PTR(-EINVAL);
				1718
				1719	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
				1720	return ERR_PTR(-EINVAL);
				1721
				1722	res = q = clone_mnt(mnt, dentry, flag);
				1723	if (IS_ERR(q))
				1724	return q;
				1725
				1726	q->mnt_mountpoint = mnt->mnt_mountpoint;
				1727
				1728	p = mnt;
				1729	list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) {
				1730	struct mount *s;
				1731	if (!is_subdir(r->mnt_mountpoint, dentry))
				1732	continue;
				1733
				1734	for (s = r; s; s = next_mnt(s, r)) {
				1735	if (!(flag & CL_COPY_UNBINDABLE) &&
				1736	IS_MNT_UNBINDABLE(s)) {
				1737	if (s->mnt.mnt_flags & MNT_LOCKED) {
				1738	/* Both unbindable and locked. */
				1739	q = ERR_PTR(-EPERM);
				1740	goto out;
				1741	} else {
				1742	s = skip_mnt_tree(s);
				1743	continue;
				1744	}
				1745	}
				1746	if (!(flag & CL_COPY_MNT_NS_FILE) &&
				1747	is_mnt_ns_file(s->mnt.mnt_root)) {
				1748	s = skip_mnt_tree(s);
				1749	continue;
				1750	}
				1751	while (p != s->mnt_parent) {
				1752	p = p->mnt_parent;
				1753	q = q->mnt_parent;
				1754	}
				1755	p = s;
				1756	parent = q;
				1757	q = clone_mnt(p, p->mnt.mnt_root, flag);
				1758	if (IS_ERR(q))
				1759	goto out;
				1760	lock_mount_hash();
				1761	list_add_tail(&q->mnt_list, &res->mnt_list);
				1762	attach_mnt(q, parent, p->mnt_mp);
				1763	unlock_mount_hash();
				1764	}
				1765	}
				1766	return res;
				1767	out:
				1768	if (res) {
				1769	lock_mount_hash();
				1770	umount_tree(res, UMOUNT_SYNC);
				1771	unlock_mount_hash();
				1772	}
				1773	return q;
				1774	}
				1775
				1776	/* Caller should check returned pointer for errors */
				1777
				1778	struct vfsmount collect_mounts(const struct path path)
				1779	{
				1780	struct mount *tree;
				1781	namespace_lock();
				1782	if (!check_mnt(real_mount(path->mnt)))
				1783	tree = ERR_PTR(-EINVAL);
				1784	else
				1785	tree = copy_tree(real_mount(path->mnt), path->dentry,
				1786	CL_COPY_ALL \| CL_PRIVATE);
				1787	namespace_unlock();
				1788	if (IS_ERR(tree))
				1789	return ERR_CAST(tree);
				1790	return &tree->mnt;
				1791	}
				1792
				1793	void drop_collected_mounts(struct vfsmount *mnt)
				1794	{
				1795	namespace_lock();
				1796	lock_mount_hash();
				1797	umount_tree(real_mount(mnt), 0);
				1798	unlock_mount_hash();
				1799	namespace_unlock();
				1800	}
				1801
				1802	/**
				1803	* clone_private_mount - create a private clone of a path
				1804	*
				1805	* This creates a new vfsmount, which will be the clone of @path. The new will
				1806	* not be attached anywhere in the namespace and will be private (i.e. changes
				1807	* to the originating mount won't be propagated into this).
				1808	*
				1809	* Release with mntput().
				1810	*/
				1811	struct vfsmount clone_private_mount(const struct path path)
				1812	{
				1813	struct mount *old_mnt = real_mount(path->mnt);
				1814	struct mount *new_mnt;
				1815
				1816	if (IS_MNT_UNBINDABLE(old_mnt))
				1817	return ERR_PTR(-EINVAL);
				1818
				1819	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
				1820	if (IS_ERR(new_mnt))
				1821	return ERR_CAST(new_mnt);
				1822
				1823	return &new_mnt->mnt;
				1824	}
				1825	EXPORT_SYMBOL_GPL(clone_private_mount);
				1826
				1827	int iterate_mounts(int (f)(struct vfsmount , void ), void arg,
				1828	struct vfsmount *root)
				1829	{
				1830	struct mount *mnt;
				1831	int res = f(root, arg);
				1832	if (res)
				1833	return res;
				1834	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
				1835	res = f(&mnt->mnt, arg);
				1836	if (res)
				1837	return res;
				1838	}
				1839	return 0;
				1840	}
				1841
				1842	static void cleanup_group_ids(struct mount mnt, struct mount end)
				1843	{
				1844	struct mount *p;
				1845
				1846	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
				1847	if (p->mnt_group_id && !IS_MNT_SHARED(p))
				1848	mnt_release_group_id(p);
				1849	}
				1850	}
				1851
				1852	static int invent_group_ids(struct mount *mnt, bool recurse)
				1853	{
				1854	struct mount *p;
				1855
				1856	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
				1857	if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
				1858	int err = mnt_alloc_group_id(p);
				1859	if (err) {
				1860	cleanup_group_ids(mnt, p);
				1861	return err;
				1862	}
				1863	}
				1864	}
				1865
				1866	return 0;
				1867	}
				1868
				1869	int count_mounts(struct mnt_namespace ns, struct mount mnt)
				1870	{
				1871	unsigned int max = READ_ONCE(sysctl_mount_max);
				1872	unsigned int mounts = 0, old, pending, sum;
				1873	struct mount *p;
				1874
				1875	for (p = mnt; p; p = next_mnt(p, mnt))
				1876	mounts++;
				1877
				1878	old = ns->mounts;
				1879	pending = ns->pending_mounts;
				1880	sum = old + pending;
				1881	if ((old > sum) \|\|
				1882	(pending > sum) \|\|
				1883	(max < sum) \|\|
				1884	(mounts > (max - sum)))
				1885	return -ENOSPC;
				1886
				1887	ns->pending_mounts = pending + mounts;
				1888	return 0;
				1889	}
				1890
				1891	/*
				1892	* @source_mnt : mount tree to be attached
				1893	* @nd : place the mount tree @source_mnt is attached
				1894	* @parent_nd : if non-null, detach the source_mnt from its parent and
				1895	* store the parent mount and mountpoint dentry.
				1896	* (done when source_mnt is moved)
				1897	*
				1898	* NOTE: the table below explains the semantics when a source mount
				1899	* of a given type is attached to a destination mount of a given type.
				1900	* ---------------------------------------------------------------------------
				1901	* \| BIND MOUNT OPERATION \|
				1902	* \|**************************************************************************
				1903	* \| source-->\| shared \| private \| slave \| unbindable \|
				1904	* \| dest \| \| \| \| \|
				1905	* \| \| \| \| \| \| \|
				1906	* \| v \| \| \| \| \|
				1907	* \|**************************************************************************
				1908	* \| shared \| shared (++) \| shared (+) \| shared(+++)\| invalid \|
				1909	* \| \| \| \| \| \|
				1910	* \|non-shared\| shared (+) \| private \| slave (*) \| invalid \|
				1911	* ***************************************************************************
				1912	* A bind operation clones the source mount and mounts the clone on the
				1913	* destination mount.
				1914	*
				1915	* (++) the cloned mount is propagated to all the mounts in the propagation
				1916	* tree of the destination mount and the cloned mount is added to
				1917	* the peer group of the source mount.
				1918	* (+) the cloned mount is created under the destination mount and is marked
				1919	* as shared. The cloned mount is added to the peer group of the source
				1920	* mount.
				1921	* (+++) the mount is propagated to all the mounts in the propagation tree
				1922	* of the destination mount and the cloned mount is made slave
				1923	* of the same master as that of the source mount. The cloned mount
				1924	* is marked as 'shared and slave'.
				1925	* (*) the cloned mount is made a slave of the same master as that of the
				1926	* source mount.
				1927	*
				1928	* ---------------------------------------------------------------------------
				1929	* \| MOVE MOUNT OPERATION \|
				1930	* \|**************************************************************************
				1931	* \| source-->\| shared \| private \| slave \| unbindable \|
				1932	* \| dest \| \| \| \| \|
				1933	* \| \| \| \| \| \| \|
				1934	* \| v \| \| \| \| \|
				1935	* \|**************************************************************************
				1936	* \| shared \| shared (+) \| shared (+) \| shared(+++) \| invalid \|
				1937	* \| \| \| \| \| \|
				1938	* \|non-shared\| shared (+*) \| private \| slave (*) \| unbindable \|
				1939	* ***************************************************************************
				1940	*
				1941	* (+) the mount is moved to the destination. And is then propagated to
				1942	* all the mounts in the propagation tree of the destination mount.
				1943	* (+*) the mount is moved to the destination.
				1944	* (+++) the mount is moved to the destination and is then propagated to
				1945	* all the mounts belonging to the destination mount's propagation tree.
				1946	* the mount is marked as 'shared and slave'.
				1947	* (*) the mount continues to be a slave at the new location.
				1948	*
				1949	* if the source mount is a tree, the operations explained above is
				1950	* applied to each mount in the tree.
				1951	* Must be called without spinlocks held, since this function can sleep
				1952	* in allocations.
				1953	*/
				1954	static int attach_recursive_mnt(struct mount *source_mnt,
				1955	struct mount *dest_mnt,
				1956	struct mountpoint *dest_mp,
				1957	struct path *parent_path)
				1958	{
				1959	HLIST_HEAD(tree_list);
				1960	struct mnt_namespace *ns = dest_mnt->mnt_ns;
				1961	struct mountpoint *smp;
				1962	struct mount child, p;
				1963	struct hlist_node *n;
				1964	int err;
				1965
				1966	/* Preallocate a mountpoint in case the new mounts need
				1967	* to be tucked under other mounts.
				1968	*/
				1969	smp = get_mountpoint(source_mnt->mnt.mnt_root);
				1970	if (IS_ERR(smp))
				1971	return PTR_ERR(smp);
				1972
				1973	/* Is there space to add these mounts to the mount namespace? */
				1974	if (!parent_path) {
				1975	err = count_mounts(ns, source_mnt);
				1976	if (err)
				1977	goto out;
				1978	}
				1979
				1980	if (IS_MNT_SHARED(dest_mnt)) {
				1981	err = invent_group_ids(source_mnt, true);
				1982	if (err)
				1983	goto out;
				1984	err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
				1985	lock_mount_hash();
				1986	if (err)
				1987	goto out_cleanup_ids;
				1988	for (p = source_mnt; p; p = next_mnt(p, source_mnt))
				1989	set_mnt_shared(p);
				1990	} else {
				1991	lock_mount_hash();
				1992	}
				1993	if (parent_path) {
				1994	detach_mnt(source_mnt, parent_path);
				1995	attach_mnt(source_mnt, dest_mnt, dest_mp);
				1996	touch_mnt_namespace(source_mnt->mnt_ns);
				1997	} else {
				1998	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
				1999	commit_tree(source_mnt);
				2000	}
				2001
				2002	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
				2003	struct mount *q;
				2004	hlist_del_init(&child->mnt_hash);
				2005	q = __lookup_mnt(&child->mnt_parent->mnt,
				2006	child->mnt_mountpoint);
				2007	if (q)
				2008	mnt_change_mountpoint(child, smp, q);
				2009	commit_tree(child);
				2010	}
				2011	put_mountpoint(smp);
				2012	unlock_mount_hash();
				2013
				2014	return 0;
				2015
				2016	out_cleanup_ids:
				2017	while (!hlist_empty(&tree_list)) {
				2018	child = hlist_entry(tree_list.first, struct mount, mnt_hash);
				2019	child->mnt_parent->mnt_ns->pending_mounts = 0;
				2020	umount_tree(child, UMOUNT_SYNC);
				2021	}
				2022	unlock_mount_hash();
				2023	cleanup_group_ids(source_mnt, NULL);
				2024	out:
				2025	ns->pending_mounts = 0;
				2026
				2027	read_seqlock_excl(&mount_lock);
				2028	put_mountpoint(smp);
				2029	read_sequnlock_excl(&mount_lock);
				2030
				2031	return err;
				2032	}
				2033
				2034	static struct mountpoint lock_mount(struct path path)
				2035	{
				2036	struct vfsmount *mnt;
				2037	struct dentry *dentry = path->dentry;
				2038	retry:
				2039	inode_lock(dentry->d_inode);
				2040	if (unlikely(cant_mount(dentry))) {
				2041	inode_unlock(dentry->d_inode);
				2042	return ERR_PTR(-ENOENT);
				2043	}
				2044	namespace_lock();
				2045	mnt = lookup_mnt(path);
				2046	if (likely(!mnt)) {
				2047	struct mountpoint *mp = get_mountpoint(dentry);
				2048	if (IS_ERR(mp)) {
				2049	namespace_unlock();
				2050	inode_unlock(dentry->d_inode);
				2051	return mp;
				2052	}
				2053	return mp;
				2054	}
				2055	namespace_unlock();
				2056	inode_unlock(path->dentry->d_inode);
				2057	path_put(path);
				2058	path->mnt = mnt;
				2059	dentry = path->dentry = dget(mnt->mnt_root);
				2060	goto retry;
				2061	}
				2062
				2063	static void unlock_mount(struct mountpoint *where)
				2064	{
				2065	struct dentry *dentry = where->m_dentry;
				2066
				2067	read_seqlock_excl(&mount_lock);
				2068	put_mountpoint(where);
				2069	read_sequnlock_excl(&mount_lock);
				2070
				2071	namespace_unlock();
				2072	inode_unlock(dentry->d_inode);
				2073	}
				2074
				2075	static int graft_tree(struct mount mnt, struct mount p, struct mountpoint *mp)
				2076	{
				2077	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
				2078	return -EINVAL;
				2079
				2080	if (d_is_dir(mp->m_dentry) !=
				2081	d_is_dir(mnt->mnt.mnt_root))
				2082	return -ENOTDIR;
				2083
				2084	return attach_recursive_mnt(mnt, p, mp, NULL);
				2085	}
				2086
				2087	/*
				2088	* Sanity check the flags to change_mnt_propagation.
				2089	*/
				2090
				2091	static int flags_to_propagation_type(int ms_flags)
				2092	{
				2093	int type = ms_flags & ~(MS_REC \| MS_SILENT);
				2094
				2095	/* Fail if any non-propagation flags are set */
				2096	if (type & ~(MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
				2097	return 0;
				2098	/* Only one propagation flag should be set */
				2099	if (!is_power_of_2(type))
				2100	return 0;
				2101	return type;
				2102	}
				2103
				2104	/*
				2105	* recursively change the type of the mountpoint.
				2106	*/
				2107	static int do_change_type(struct path *path, int ms_flags)
				2108	{
				2109	struct mount *m;
				2110	struct mount *mnt = real_mount(path->mnt);
				2111	int recurse = ms_flags & MS_REC;
				2112	int type;
				2113	int err = 0;
				2114
				2115	if (path->dentry != path->mnt->mnt_root)
				2116	return -EINVAL;
				2117
				2118	type = flags_to_propagation_type(ms_flags);
				2119	if (!type)
				2120	return -EINVAL;
				2121
				2122	namespace_lock();
				2123	if (type == MS_SHARED) {
				2124	err = invent_group_ids(mnt, recurse);
				2125	if (err)
				2126	goto out_unlock;
				2127	}
				2128
				2129	lock_mount_hash();
				2130	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
				2131	change_mnt_propagation(m, type);
				2132	unlock_mount_hash();
				2133
				2134	out_unlock:
				2135	namespace_unlock();
				2136	return err;
				2137	}
				2138
				2139	static bool has_locked_children(struct mount mnt, struct dentry dentry)
				2140	{
				2141	struct mount *child;
				2142	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
				2143	if (!is_subdir(child->mnt_mountpoint, dentry))
				2144	continue;
				2145
				2146	if (child->mnt.mnt_flags & MNT_LOCKED)
				2147	return true;
				2148	}
				2149	return false;
				2150	}
				2151
				2152	/*
				2153	* do loopback mount.
				2154	*/
				2155	static int do_loopback(struct path path, const char old_name,
				2156	int recurse)
				2157	{
				2158	struct path old_path;
				2159	struct mount mnt = NULL, old, *parent;
				2160	struct mountpoint *mp;
				2161	int err;
				2162	if (!old_name \|\| !*old_name)
				2163	return -EINVAL;
				2164	err = kern_path(old_name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &old_path);
				2165	if (err)
				2166	return err;
				2167
				2168	err = -EINVAL;
				2169	if (mnt_ns_loop(old_path.dentry))
				2170	goto out;
				2171
				2172	mp = lock_mount(path);
				2173	err = PTR_ERR(mp);
				2174	if (IS_ERR(mp))
				2175	goto out;
				2176
				2177	old = real_mount(old_path.mnt);
				2178	parent = real_mount(path->mnt);
				2179
				2180	err = -EINVAL;
				2181	if (IS_MNT_UNBINDABLE(old))
				2182	goto out2;
				2183
				2184	if (!check_mnt(parent))
				2185	goto out2;
				2186
				2187	if (!check_mnt(old) && old_path.dentry->d_op != &ns_dentry_operations)
				2188	goto out2;
				2189
				2190	if (!recurse && has_locked_children(old, old_path.dentry))
				2191	goto out2;
				2192
				2193	if (recurse)
				2194	mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
				2195	else
				2196	mnt = clone_mnt(old, old_path.dentry, 0);
				2197
				2198	if (IS_ERR(mnt)) {
				2199	err = PTR_ERR(mnt);
				2200	goto out2;
				2201	}
				2202
				2203	mnt->mnt.mnt_flags &= ~MNT_LOCKED;
				2204
				2205	err = graft_tree(mnt, parent, mp);
				2206	if (err) {
				2207	lock_mount_hash();
				2208	umount_tree(mnt, UMOUNT_SYNC);
				2209	unlock_mount_hash();
				2210	}
				2211	out2:
				2212	unlock_mount(mp);
				2213	out:
				2214	path_put(&old_path);
				2215	return err;
				2216	}
				2217
				2218	static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
				2219	{
				2220	int error = 0;
				2221	int readonly_request = 0;
				2222
				2223	if (ms_flags & MS_RDONLY)
				2224	readonly_request = 1;
				2225	if (readonly_request == __mnt_is_readonly(mnt))
				2226	return 0;
				2227
				2228	if (readonly_request)
				2229	error = mnt_make_readonly(real_mount(mnt));
				2230	else
				2231	__mnt_unmake_readonly(real_mount(mnt));
				2232	return error;
				2233	}
				2234
				2235	/*
				2236	* change filesystem flags. dir should be a physical root of filesystem.
				2237	* If you've mounted a non-root directory somewhere and want to do remount
				2238	* on it - tough luck.
				2239	*/
				2240	static int do_remount(struct path *path, int ms_flags, int sb_flags,
				2241	int mnt_flags, void *data)
				2242	{
				2243	int err;
				2244	struct super_block *sb = path->mnt->mnt_sb;
				2245	struct mount *mnt = real_mount(path->mnt);
				2246
				2247	if (!check_mnt(mnt))
				2248	return -EINVAL;
				2249
				2250	if (path->dentry != path->mnt->mnt_root)
				2251	return -EINVAL;
				2252
				2253	/* Don't allow changing of locked mnt flags.
				2254	*
				2255	* No locks need to be held here while testing the various
				2256	* MNT_LOCK flags because those flags can never be cleared
				2257	* once they are set.
				2258	*/
				2259	if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
				2260	!(mnt_flags & MNT_READONLY)) {
				2261	return -EPERM;
				2262	}
				2263	if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
				2264	!(mnt_flags & MNT_NODEV)) {
				2265	return -EPERM;
				2266	}
				2267	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
				2268	!(mnt_flags & MNT_NOSUID)) {
				2269	return -EPERM;
				2270	}
				2271	if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
				2272	!(mnt_flags & MNT_NOEXEC)) {
				2273	return -EPERM;
				2274	}
				2275	if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
				2276	((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
				2277	return -EPERM;
				2278	}
				2279
				2280	err = security_sb_remount(sb, data);
				2281	if (err)
				2282	return err;
				2283
				2284	down_write(&sb->s_umount);
				2285	if (ms_flags & MS_BIND)
				2286	err = change_mount_flags(path->mnt, ms_flags);
				2287	else if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
				2288	err = -EPERM;
				2289	else
				2290	err = do_remount_sb(sb, sb_flags, data, 0);
				2291	if (!err) {
				2292	lock_mount_hash();
				2293	mnt_flags \|= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
				2294	mnt->mnt.mnt_flags = mnt_flags;
				2295	touch_mnt_namespace(mnt->mnt_ns);
				2296	unlock_mount_hash();
				2297	}
				2298	up_write(&sb->s_umount);
				2299	return err;
				2300	}
				2301
				2302	static inline int tree_contains_unbindable(struct mount *mnt)
				2303	{
				2304	struct mount *p;
				2305	for (p = mnt; p; p = next_mnt(p, mnt)) {
				2306	if (IS_MNT_UNBINDABLE(p))
				2307	return 1;
				2308	}
				2309	return 0;
				2310	}
				2311
				2312	static int do_move_mount(struct path path, const char old_name)
				2313	{
				2314	struct path old_path, parent_path;
				2315	struct mount *p;
				2316	struct mount *old;
				2317	struct mountpoint *mp;
				2318	int err;
				2319	if (!old_name \|\| !*old_name)
				2320	return -EINVAL;
				2321	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
				2322	if (err)
				2323	return err;
				2324
				2325	mp = lock_mount(path);
				2326	err = PTR_ERR(mp);
				2327	if (IS_ERR(mp))
				2328	goto out;
				2329
				2330	old = real_mount(old_path.mnt);
				2331	p = real_mount(path->mnt);
				2332
				2333	err = -EINVAL;
				2334	if (!check_mnt(p) \|\| !check_mnt(old))
				2335	goto out1;
				2336
				2337	if (old->mnt.mnt_flags & MNT_LOCKED)
				2338	goto out1;
				2339
				2340	err = -EINVAL;
				2341	if (old_path.dentry != old_path.mnt->mnt_root)
				2342	goto out1;
				2343
				2344	if (!mnt_has_parent(old))
				2345	goto out1;
				2346
				2347	if (d_is_dir(path->dentry) !=
				2348	d_is_dir(old_path.dentry))
				2349	goto out1;
				2350	/*
				2351	* Don't move a mount residing in a shared parent.
				2352	*/
				2353	if (IS_MNT_SHARED(old->mnt_parent))
				2354	goto out1;
				2355	/*
				2356	* Don't move a mount tree containing unbindable mounts to a destination
				2357	* mount which is shared.
				2358	*/
				2359	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
				2360	goto out1;
				2361	err = -ELOOP;
				2362	for (; mnt_has_parent(p); p = p->mnt_parent)
				2363	if (p == old)
				2364	goto out1;
				2365
				2366	err = attach_recursive_mnt(old, real_mount(path->mnt), mp, &parent_path);
				2367	if (err)
				2368	goto out1;
				2369
				2370	/* if the mount is moved, it should no longer be expire
				2371	* automatically */
				2372	list_del_init(&old->mnt_expire);
				2373	out1:
				2374	unlock_mount(mp);
				2375	out:
				2376	if (!err)
				2377	path_put(&parent_path);
				2378	path_put(&old_path);
				2379	return err;
				2380	}
				2381
				2382	static struct vfsmount fs_set_subtype(struct vfsmount mnt, const char *fstype)
				2383	{
				2384	int err;
				2385	const char *subtype = strchr(fstype, '.');
				2386	if (subtype) {
				2387	subtype++;
				2388	err = -EINVAL;
				2389	if (!subtype[0])
				2390	goto err;
				2391	} else
				2392	subtype = "";
				2393
				2394	mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
				2395	err = -ENOMEM;
				2396	if (!mnt->mnt_sb->s_subtype)
				2397	goto err;
				2398	return mnt;
				2399
				2400	err:
				2401	mntput(mnt);
				2402	return ERR_PTR(err);
				2403	}
				2404
				2405	/*
				2406	* add a mount into a namespace's mount tree
				2407	*/
				2408	static int do_add_mount(struct mount newmnt, struct path path, int mnt_flags)
				2409	{
				2410	struct mountpoint *mp;
				2411	struct mount *parent;
				2412	int err;
				2413
				2414	mnt_flags &= ~MNT_INTERNAL_FLAGS;
				2415
				2416	mp = lock_mount(path);
				2417	if (IS_ERR(mp))
				2418	return PTR_ERR(mp);
				2419
				2420	parent = real_mount(path->mnt);
				2421	err = -EINVAL;
				2422	if (unlikely(!check_mnt(parent))) {
				2423	/* that's acceptable only for automounts done in private ns */
				2424	if (!(mnt_flags & MNT_SHRINKABLE))
				2425	goto unlock;
				2426	/* ... and for those we'd better have mountpoint still alive */
				2427	if (!parent->mnt_ns)
				2428	goto unlock;
				2429	}
				2430
				2431	/* Refuse the same filesystem on the same mount point */
				2432	err = -EBUSY;
				2433	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
				2434	path->mnt->mnt_root == path->dentry)
				2435	goto unlock;
				2436
				2437	err = -EINVAL;
				2438	if (d_is_symlink(newmnt->mnt.mnt_root))
				2439	goto unlock;
				2440
				2441	newmnt->mnt.mnt_flags = mnt_flags;
				2442	err = graft_tree(newmnt, parent, mp);
				2443
				2444	unlock:
				2445	unlock_mount(mp);
				2446	return err;
				2447	}
				2448
				2449	static bool mount_too_revealing(struct vfsmount mnt, int new_mnt_flags);
				2450
				2451	/*
				2452	* create a new mount for userspace and request it to be added into the
				2453	* namespace's tree
				2454	*/
				2455	static int do_new_mount(struct path path, const char fstype, int sb_flags,
				2456	int mnt_flags, const char name, void data)
				2457	{
				2458	struct file_system_type *type;
				2459	struct vfsmount *mnt;
				2460	int err;
				2461
				2462	if (!fstype)
				2463	return -EINVAL;
				2464
				2465	type = get_fs_type(fstype);
				2466	if (!type)
				2467	return -ENODEV;
				2468
				2469	mnt = vfs_kern_mount(type, sb_flags, name, data);
				2470	if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
				2471	!mnt->mnt_sb->s_subtype)
				2472	mnt = fs_set_subtype(mnt, fstype);
				2473
				2474	put_filesystem(type);
				2475	if (IS_ERR(mnt))
				2476	return PTR_ERR(mnt);
				2477
				2478	if (mount_too_revealing(mnt, &mnt_flags)) {
				2479	mntput(mnt);
				2480	return -EPERM;
				2481	}
				2482
				2483	err = do_add_mount(real_mount(mnt), path, mnt_flags);
				2484	if (err)
				2485	mntput(mnt);
				2486	return err;
				2487	}
				2488
				2489	int finish_automount(struct vfsmount m, struct path path)
				2490	{
				2491	struct mount *mnt = real_mount(m);
				2492	int err;
				2493	/* The new mount record should have at least 2 refs to prevent it being
				2494	* expired before we get a chance to add it
				2495	*/
				2496	BUG_ON(mnt_get_count(mnt) < 2);
				2497
				2498	if (m->mnt_sb == path->mnt->mnt_sb &&
				2499	m->mnt_root == path->dentry) {
				2500	err = -ELOOP;
				2501	goto fail;
				2502	}
				2503
				2504	err = do_add_mount(mnt, path, path->mnt->mnt_flags \| MNT_SHRINKABLE);
				2505	if (!err)
				2506	return 0;
				2507	fail:
				2508	/* remove m from any expiration list it may be on */
				2509	if (!list_empty(&mnt->mnt_expire)) {
				2510	namespace_lock();
				2511	list_del_init(&mnt->mnt_expire);
				2512	namespace_unlock();
				2513	}
				2514	mntput(m);
				2515	mntput(m);
				2516	return err;
				2517	}
				2518
				2519	/**
				2520	* mnt_set_expiry - Put a mount on an expiration list
				2521	* @mnt: The mount to list.
				2522	* @expiry_list: The list to add the mount to.
				2523	*/
				2524	void mnt_set_expiry(struct vfsmount mnt, struct list_head expiry_list)
				2525	{
				2526	namespace_lock();
				2527
				2528	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
				2529
				2530	namespace_unlock();
				2531	}
				2532	EXPORT_SYMBOL(mnt_set_expiry);
				2533
				2534	/*
				2535	* process a list of expirable mountpoints with the intent of discarding any
				2536	* mountpoints that aren't in use and haven't been touched since last we came
				2537	* here
				2538	*/
				2539	void mark_mounts_for_expiry(struct list_head *mounts)
				2540	{
				2541	struct mount mnt, next;
				2542	LIST_HEAD(graveyard);
				2543
				2544	if (list_empty(mounts))
				2545	return;
				2546
				2547	namespace_lock();
				2548	lock_mount_hash();
				2549
				2550	/* extract from the expiration list every vfsmount that matches the
				2551	* following criteria:
				2552	* - only referenced by its parent vfsmount
				2553	* - still marked for expiry (marked on the last call here; marks are
				2554	* cleared by mntput())
				2555	*/
				2556	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
				2557	if (!xchg(&mnt->mnt_expiry_mark, 1) \|\|
				2558	propagate_mount_busy(mnt, 1))
				2559	continue;
				2560	list_move(&mnt->mnt_expire, &graveyard);
				2561	}
				2562	while (!list_empty(&graveyard)) {
				2563	mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
				2564	touch_mnt_namespace(mnt->mnt_ns);
				2565	umount_tree(mnt, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				2566	}
				2567	unlock_mount_hash();
				2568	namespace_unlock();
				2569	}
				2570
				2571	EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
				2572
				2573	/*
				2574	* Ripoff of 'select_parent()'
				2575	*
				2576	* search the list of submounts for a given mountpoint, and move any
				2577	* shrinkable submounts to the 'graveyard' list.
				2578	*/
				2579	static int select_submounts(struct mount parent, struct list_head graveyard)
				2580	{
				2581	struct mount *this_parent = parent;
				2582	struct list_head *next;
				2583	int found = 0;
				2584
				2585	repeat:
				2586	next = this_parent->mnt_mounts.next;
				2587	resume:
				2588	while (next != &this_parent->mnt_mounts) {
				2589	struct list_head *tmp = next;
				2590	struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
				2591
				2592	next = tmp->next;
				2593	if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
				2594	continue;
				2595	/*
				2596	* Descend a level if the d_mounts list is non-empty.
				2597	*/
				2598	if (!list_empty(&mnt->mnt_mounts)) {
				2599	this_parent = mnt;
				2600	goto repeat;
				2601	}
				2602
				2603	if (!propagate_mount_busy(mnt, 1)) {
				2604	list_move_tail(&mnt->mnt_expire, graveyard);
				2605	found++;
				2606	}
				2607	}
				2608	/*
				2609	* All done at this level ... ascend and resume the search
				2610	*/
				2611	if (this_parent != parent) {
				2612	next = this_parent->mnt_child.next;
				2613	this_parent = this_parent->mnt_parent;
				2614	goto resume;
				2615	}
				2616	return found;
				2617	}
				2618
				2619	/*
				2620	* process a list of expirable mountpoints with the intent of discarding any
				2621	* submounts of a specific parent mountpoint
				2622	*
				2623	* mount_lock must be held for write
				2624	*/
				2625	static void shrink_submounts(struct mount *mnt)
				2626	{
				2627	LIST_HEAD(graveyard);
				2628	struct mount *m;
				2629
				2630	/* extract submounts of 'mountpoint' from the expiration list */
				2631	while (select_submounts(mnt, &graveyard)) {
				2632	while (!list_empty(&graveyard)) {
				2633	m = list_first_entry(&graveyard, struct mount,
				2634	mnt_expire);
				2635	touch_mnt_namespace(m->mnt_ns);
				2636	umount_tree(m, UMOUNT_PROPAGATE\|UMOUNT_SYNC);
				2637	}
				2638	}
				2639	}
				2640
				2641	/*
				2642	* Some copy_from_user() implementations do not return the exact number of
				2643	* bytes remaining to copy on a fault. But copy_mount_options() requires that.
				2644	* Note that this function differs from copy_from_user() in that it will oops
				2645	* on bad values of `to', rather than returning a short copy.
				2646	*/
				2647	static long exact_copy_from_user(void to, const void __user from,
				2648	unsigned long n)
				2649	{
				2650	char *t = to;
				2651	const char __user *f = from;
				2652	char c;
				2653
				2654	if (!access_ok(VERIFY_READ, from, n))
				2655	return n;
				2656
				2657	while (n) {
				2658	if (__get_user(c, f)) {
				2659	memset(t, 0, n);
				2660	break;
				2661	}
				2662	*t++ = c;
				2663	f++;
				2664	n--;
				2665	}
				2666	return n;
				2667	}
				2668
				2669	void copy_mount_options(const void __user data)
				2670	{
				2671	int i;
				2672	unsigned long size;
				2673	char *copy;
				2674
				2675	if (!data)
				2676	return NULL;
				2677
				2678	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
				2679	if (!copy)
				2680	return ERR_PTR(-ENOMEM);
				2681
				2682	/* We only care that some data at the address the user
				2683	* gave us is valid. Just in case, we'll zero
				2684	* the remainder of the page.
				2685	*/
				2686	/* copy_from_user cannot cross TASK_SIZE ! */
				2687	size = TASK_SIZE - (unsigned long)data;
				2688	if (size > PAGE_SIZE)
				2689	size = PAGE_SIZE;
				2690
				2691	i = size - exact_copy_from_user(copy, data, size);
				2692	if (!i) {
				2693	kfree(copy);
				2694	return ERR_PTR(-EFAULT);
				2695	}
				2696	if (i != PAGE_SIZE)
				2697	memset(copy + i, 0, PAGE_SIZE - i);
				2698	return copy;
				2699	}
				2700
				2701	char copy_mount_string(const void __user data)
				2702	{
				2703	return data ? strndup_user(data, PAGE_SIZE) : NULL;
				2704	}
				2705
				2706	/*
				2707	* Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
				2708	* be given to the mount() call (ie: read-only, no-dev, no-suid etc).
				2709	*
				2710	* data is a (void *) that can point to any structure up to
				2711	* PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
				2712	* information (or be NULL).
				2713	*
				2714	* Pre-0.97 versions of mount() didn't have a flags word.
				2715	* When the flags word was introduced its top half was required
				2716	* to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
				2717	* Therefore, if this magic number is present, it carries no information
				2718	* and must be discarded.
				2719	*/
				2720	long do_mount(const char dev_name, const char __user dir_name,
				2721	const char type_page, unsigned long flags, void data_page)
				2722	{
				2723	struct path path;
				2724	unsigned int mnt_flags = 0, sb_flags;
				2725	int retval = 0;
				2726
				2727	/* Discard magic */
				2728	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
				2729	flags &= ~MS_MGC_MSK;
				2730
				2731	/* Basic sanity checks */
				2732	if (data_page)
				2733	((char *)data_page)[PAGE_SIZE - 1] = 0;
				2734
				2735	if (flags & MS_NOUSER)
				2736	return -EINVAL;
				2737
				2738	/* ... and get the mountpoint */
				2739	retval = user_path(dir_name, &path);
				2740	if (retval)
				2741	return retval;
				2742
				2743	retval = security_sb_mount(dev_name, &path,
				2744	type_page, flags, data_page);
				2745	if (!retval && !may_mount())
				2746	retval = -EPERM;
				2747	if (!retval && (flags & SB_MANDLOCK) && !may_mandlock())
				2748	retval = -EPERM;
				2749	if (retval)
				2750	goto dput_out;
				2751
				2752	/* Default to relatime unless overriden */
				2753	if (!(flags & MS_NOATIME))
				2754	mnt_flags \|= MNT_RELATIME;
				2755
				2756	/* Separate the per-mountpoint flags */
				2757	if (flags & MS_NOSUID)
				2758	mnt_flags \|= MNT_NOSUID;
				2759	if (flags & MS_NODEV)
				2760	mnt_flags \|= MNT_NODEV;
				2761	if (flags & MS_NOEXEC)
				2762	mnt_flags \|= MNT_NOEXEC;
				2763	if (flags & MS_NOATIME)
				2764	mnt_flags \|= MNT_NOATIME;
				2765	if (flags & MS_NODIRATIME)
				2766	mnt_flags \|= MNT_NODIRATIME;
				2767	if (flags & MS_STRICTATIME)
				2768	mnt_flags &= ~(MNT_RELATIME \| MNT_NOATIME);
				2769	if (flags & MS_RDONLY)
				2770	mnt_flags \|= MNT_READONLY;
				2771
				2772	/* The default atime for remount is preservation */
				2773	if ((flags & MS_REMOUNT) &&
				2774	((flags & (MS_NOATIME \| MS_NODIRATIME \| MS_RELATIME \|
				2775	MS_STRICTATIME)) == 0)) {
				2776	mnt_flags &= ~MNT_ATIME_MASK;
				2777	mnt_flags \|= path.mnt->mnt_flags & MNT_ATIME_MASK;
				2778	}
				2779
				2780	sb_flags = flags & (SB_RDONLY \|
				2781	SB_SYNCHRONOUS \|
				2782	SB_MANDLOCK \|
				2783	SB_DIRSYNC \|
				2784	SB_SILENT \|
				2785	SB_POSIXACL \|
				2786	SB_LAZYTIME \|
				2787	SB_I_VERSION);
				2788
				2789	if (flags & MS_REMOUNT)
				2790	retval = do_remount(&path, flags, sb_flags, mnt_flags,
				2791	data_page);
				2792	else if (flags & MS_BIND)
				2793	retval = do_loopback(&path, dev_name, flags & MS_REC);
				2794	else if (flags & (MS_SHARED \| MS_PRIVATE \| MS_SLAVE \| MS_UNBINDABLE))
				2795	retval = do_change_type(&path, flags);
				2796	else if (flags & MS_MOVE)
				2797	retval = do_move_mount(&path, dev_name);
				2798	else
				2799	retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
				2800	dev_name, data_page);
				2801	dput_out:
				2802	path_put(&path);
				2803	return retval;
				2804	}
				2805
				2806	static struct ucounts inc_mnt_namespaces(struct user_namespace ns)
				2807	{
				2808	return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
				2809	}
				2810
				2811	static void dec_mnt_namespaces(struct ucounts *ucounts)
				2812	{
				2813	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
				2814	}
				2815
				2816	static void free_mnt_ns(struct mnt_namespace *ns)
				2817	{
				2818	ns_free_inum(&ns->ns);
				2819	dec_mnt_namespaces(ns->ucounts);
				2820	put_user_ns(ns->user_ns);
				2821	kfree(ns);
				2822	}
				2823
				2824	/*
				2825	* Assign a sequence number so we can detect when we attempt to bind
				2826	* mount a reference to an older mount namespace into the current
				2827	* mount namespace, preventing reference counting loops. A 64bit
				2828	* number incrementing at 10Ghz will take 12,427 years to wrap which
				2829	* is effectively never, so we can ignore the possibility.
				2830	*/
				2831	static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
				2832
				2833	static struct mnt_namespace alloc_mnt_ns(struct user_namespace user_ns)
				2834	{
				2835	struct mnt_namespace *new_ns;
				2836	struct ucounts *ucounts;
				2837	int ret;
				2838
				2839	ucounts = inc_mnt_namespaces(user_ns);
				2840	if (!ucounts)
				2841	return ERR_PTR(-ENOSPC);
				2842
				2843	new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
				2844	if (!new_ns) {
				2845	dec_mnt_namespaces(ucounts);
				2846	return ERR_PTR(-ENOMEM);
				2847	}
				2848	ret = ns_alloc_inum(&new_ns->ns);
				2849	if (ret) {
				2850	kfree(new_ns);
				2851	dec_mnt_namespaces(ucounts);
				2852	return ERR_PTR(ret);
				2853	}
				2854	new_ns->ns.ops = &mntns_operations;
				2855	new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
				2856	atomic_set(&new_ns->count, 1);
				2857	new_ns->root = NULL;
				2858	INIT_LIST_HEAD(&new_ns->list);
				2859	init_waitqueue_head(&new_ns->poll);
				2860	new_ns->event = 0;
				2861	new_ns->user_ns = get_user_ns(user_ns);
				2862	new_ns->ucounts = ucounts;
				2863	new_ns->mounts = 0;
				2864	new_ns->pending_mounts = 0;
				2865	return new_ns;
				2866	}
				2867
				2868	__latent_entropy
				2869	struct mnt_namespace copy_mnt_ns(unsigned long flags, struct mnt_namespace ns,
				2870	struct user_namespace user_ns, struct fs_struct new_fs)
				2871	{
				2872	struct mnt_namespace *new_ns;
				2873	struct vfsmount rootmnt = NULL, pwdmnt = NULL;
				2874	struct mount p, q;
				2875	struct mount *old;
				2876	struct mount *new;
				2877	int copy_flags;
				2878
				2879	BUG_ON(!ns);
				2880
				2881	if (likely(!(flags & CLONE_NEWNS))) {
				2882	get_mnt_ns(ns);
				2883	return ns;
				2884	}
				2885
				2886	old = ns->root;
				2887
				2888	new_ns = alloc_mnt_ns(user_ns);
				2889	if (IS_ERR(new_ns))
				2890	return new_ns;
				2891
				2892	namespace_lock();
				2893	/* First pass: copy the tree topology */
				2894	copy_flags = CL_COPY_UNBINDABLE \| CL_EXPIRE;
				2895	if (user_ns != ns->user_ns)
				2896	copy_flags \|= CL_SHARED_TO_SLAVE \| CL_UNPRIVILEGED;
				2897	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
				2898	if (IS_ERR(new)) {
				2899	namespace_unlock();
				2900	free_mnt_ns(new_ns);
				2901	return ERR_CAST(new);
				2902	}
				2903	new_ns->root = new;
				2904	list_add_tail(&new_ns->list, &new->mnt_list);
				2905
				2906	/*
				2907	* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
				2908	* as belonging to new namespace. We have already acquired a private
				2909	* fs_struct, so tsk->fs->lock is not needed.
				2910	*/
				2911	p = old;
				2912	q = new;
				2913	while (p) {
				2914	q->mnt_ns = new_ns;
				2915	new_ns->mounts++;
				2916	if (new_fs) {
				2917	if (&p->mnt == new_fs->root.mnt) {
				2918	new_fs->root.mnt = mntget(&q->mnt);
				2919	rootmnt = &p->mnt;
				2920	}
				2921	if (&p->mnt == new_fs->pwd.mnt) {
				2922	new_fs->pwd.mnt = mntget(&q->mnt);
				2923	pwdmnt = &p->mnt;
				2924	}
				2925	}
				2926	p = next_mnt(p, old);
				2927	q = next_mnt(q, new);
				2928	if (!q)
				2929	break;
				2930	while (p->mnt.mnt_root != q->mnt.mnt_root)
				2931	p = next_mnt(p, old);
				2932	}
				2933	namespace_unlock();
				2934
				2935	if (rootmnt)
				2936	mntput(rootmnt);
				2937	if (pwdmnt)
				2938	mntput(pwdmnt);
				2939
				2940	return new_ns;
				2941	}
				2942
				2943	/**
				2944	* create_mnt_ns - creates a private namespace and adds a root filesystem
				2945	* @mnt: pointer to the new root filesystem mountpoint
				2946	*/
				2947	static struct mnt_namespace create_mnt_ns(struct vfsmount m)
				2948	{
				2949	struct mnt_namespace *new_ns = alloc_mnt_ns(&init_user_ns);
				2950	if (!IS_ERR(new_ns)) {
				2951	struct mount *mnt = real_mount(m);
				2952	mnt->mnt_ns = new_ns;
				2953	new_ns->root = mnt;
				2954	new_ns->mounts++;
				2955	list_add(&mnt->mnt_list, &new_ns->list);
				2956	} else {
				2957	mntput(m);
				2958	}
				2959	return new_ns;
				2960	}
				2961
				2962	struct dentry mount_subtree(struct vfsmount mnt, const char *name)
				2963	{
				2964	struct mnt_namespace *ns;
				2965	struct super_block *s;
				2966	struct path path;
				2967	int err;
				2968
				2969	ns = create_mnt_ns(mnt);
				2970	if (IS_ERR(ns))
				2971	return ERR_CAST(ns);
				2972
				2973	err = vfs_path_lookup(mnt->mnt_root, mnt,
				2974	name, LOOKUP_FOLLOW\|LOOKUP_AUTOMOUNT, &path);
				2975
				2976	put_mnt_ns(ns);
				2977
				2978	if (err)
				2979	return ERR_PTR(err);
				2980
				2981	/* trade a vfsmount reference for active sb one */
				2982	s = path.mnt->mnt_sb;
				2983	atomic_inc(&s->s_active);
				2984	mntput(path.mnt);
				2985	/* lock the sucker */
				2986	down_write(&s->s_umount);
				2987	/* ... and return the root of (sub)tree on it */
				2988	return path.dentry;
				2989	}
				2990	EXPORT_SYMBOL(mount_subtree);
				2991
				2992	int ksys_mount(char __user dev_name, char __user dir_name, char __user *type,
				2993	unsigned long flags, void __user *data)
				2994	{
				2995	int ret;
				2996	char *kernel_type;
				2997	char *kernel_dev;
				2998	void *options;
				2999
				3000	kernel_type = copy_mount_string(type);
				3001	ret = PTR_ERR(kernel_type);
				3002	if (IS_ERR(kernel_type))
				3003	goto out_type;
				3004
				3005	kernel_dev = copy_mount_string(dev_name);
				3006	ret = PTR_ERR(kernel_dev);
				3007	if (IS_ERR(kernel_dev))
				3008	goto out_dev;
				3009
				3010	options = copy_mount_options(data);
				3011	ret = PTR_ERR(options);
				3012	if (IS_ERR(options))
				3013	goto out_data;
				3014
				3015	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
				3016
				3017	kfree(options);
				3018	out_data:
				3019	kfree(kernel_dev);
				3020	out_dev:
				3021	kfree(kernel_type);
				3022	out_type:
				3023	return ret;
				3024	}
				3025
				3026	SYSCALL_DEFINE5(mount, char __user , dev_name, char __user , dir_name,
				3027	char __user , type, unsigned long, flags, void __user , data)
				3028	{
				3029	return ksys_mount(dev_name, dir_name, type, flags, data);
				3030	}
				3031
				3032	/*
				3033	* Return true if path is reachable from root
				3034	*
				3035	* namespace_sem or mount_lock is held
				3036	*/
				3037	bool is_path_reachable(struct mount mnt, struct dentry dentry,
				3038	const struct path *root)
				3039	{
				3040	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
				3041	dentry = mnt->mnt_mountpoint;
				3042	mnt = mnt->mnt_parent;
				3043	}
				3044	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
				3045	}
				3046
				3047	bool path_is_under(const struct path path1, const struct path path2)
				3048	{
				3049	bool res;
				3050	read_seqlock_excl(&mount_lock);
				3051	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
				3052	read_sequnlock_excl(&mount_lock);
				3053	return res;
				3054	}
				3055	EXPORT_SYMBOL(path_is_under);
				3056
				3057	/*
				3058	* pivot_root Semantics:
				3059	* Moves the root file system of the current process to the directory put_old,
				3060	* makes new_root as the new root file system of the current process, and sets
				3061	* root/cwd of all processes which had them on the current root to new_root.
				3062	*
				3063	* Restrictions:
				3064	* The new_root and put_old must be directories, and must not be on the
				3065	* same file system as the current process root. The put_old must be
				3066	* underneath new_root, i.e. adding a non-zero number of /.. to the string
				3067	* pointed to by put_old must yield the same directory as new_root. No other
				3068	* file system may be mounted on put_old. After all, new_root is a mountpoint.
				3069	*
				3070	* Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
				3071	* See Documentation/filesystems/ramfs-rootfs-initramfs.txt for alternatives
				3072	* in this situation.
				3073	*
				3074	* Notes:
				3075	* - we don't move root/cwd if they are not at the root (reason: if something
				3076	* cared enough to change them, it's probably wrong to force them elsewhere)
				3077	* - it's okay to pick a root that isn't the root of a file system, e.g.
				3078	* /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
				3079	* though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
				3080	* first.
				3081	*/
				3082	SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
				3083	const char __user *, put_old)
				3084	{
				3085	struct path new, old, parent_path, root_parent, root;
				3086	struct mount new_mnt, root_mnt, *old_mnt;
				3087	struct mountpoint old_mp, root_mp;
				3088	int error;
				3089
				3090	if (!may_mount())
				3091	return -EPERM;
				3092
				3093	error = user_path_dir(new_root, &new);
				3094	if (error)
				3095	goto out0;
				3096
				3097	error = user_path_dir(put_old, &old);
				3098	if (error)
				3099	goto out1;
				3100
				3101	error = security_sb_pivotroot(&old, &new);
				3102	if (error)
				3103	goto out2;
				3104
				3105	get_fs_root(current->fs, &root);
				3106	old_mp = lock_mount(&old);
				3107	error = PTR_ERR(old_mp);
				3108	if (IS_ERR(old_mp))
				3109	goto out3;
				3110
				3111	error = -EINVAL;
				3112	new_mnt = real_mount(new.mnt);
				3113	root_mnt = real_mount(root.mnt);
				3114	old_mnt = real_mount(old.mnt);
				3115	if (IS_MNT_SHARED(old_mnt) \|\|
				3116	IS_MNT_SHARED(new_mnt->mnt_parent) \|\|
				3117	IS_MNT_SHARED(root_mnt->mnt_parent))
				3118	goto out4;
				3119	if (!check_mnt(root_mnt) \|\| !check_mnt(new_mnt))
				3120	goto out4;
				3121	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
				3122	goto out4;
				3123	error = -ENOENT;
				3124	if (d_unlinked(new.dentry))
				3125	goto out4;
				3126	error = -EBUSY;
				3127	if (new_mnt == root_mnt \|\| old_mnt == root_mnt)
				3128	goto out4; /* loop, on the same file system */
				3129	error = -EINVAL;
				3130	if (root.mnt->mnt_root != root.dentry)
				3131	goto out4; /* not a mountpoint */
				3132	if (!mnt_has_parent(root_mnt))
				3133	goto out4; /* not attached */
				3134	root_mp = root_mnt->mnt_mp;
				3135	if (new.mnt->mnt_root != new.dentry)
				3136	goto out4; /* not a mountpoint */
				3137	if (!mnt_has_parent(new_mnt))
				3138	goto out4; /* not attached */
				3139	/* make sure we can reach put_old from new_root */
				3140	if (!is_path_reachable(old_mnt, old.dentry, &new))
				3141	goto out4;
				3142	/* make certain new is below the root */
				3143	if (!is_path_reachable(new_mnt, new.dentry, &root))
				3144	goto out4;
				3145	root_mp->m_count++; /* pin it so it won't go away */
				3146	lock_mount_hash();
				3147	detach_mnt(new_mnt, &parent_path);
				3148	detach_mnt(root_mnt, &root_parent);
				3149	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
				3150	new_mnt->mnt.mnt_flags \|= MNT_LOCKED;
				3151	root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
				3152	}
				3153	/* mount old root on put_old */
				3154	attach_mnt(root_mnt, old_mnt, old_mp);
				3155	/* mount new_root on / */
				3156	attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
				3157	touch_mnt_namespace(current->nsproxy->mnt_ns);
				3158	/* A moved mount should not expire automatically */
				3159	list_del_init(&new_mnt->mnt_expire);
				3160	put_mountpoint(root_mp);
				3161	unlock_mount_hash();
				3162	chroot_fs_refs(&root, &new);
				3163	error = 0;
				3164	out4:
				3165	unlock_mount(old_mp);
				3166	if (!error) {
				3167	path_put(&root_parent);
				3168	path_put(&parent_path);
				3169	}
				3170	out3:
				3171	path_put(&root);
				3172	out2:
				3173	path_put(&old);
				3174	out1:
				3175	path_put(&new);
				3176	out0:
				3177	return error;
				3178	}
				3179
				3180	static void __init init_mount_tree(void)
				3181	{
				3182	struct vfsmount *mnt;
				3183	struct mnt_namespace *ns;
				3184	struct path root;
				3185	struct file_system_type *type;
				3186
				3187	type = get_fs_type("rootfs");
				3188	if (!type)
				3189	panic("Can't find rootfs type");
				3190	mnt = vfs_kern_mount(type, 0, "rootfs", NULL);
				3191	put_filesystem(type);
				3192	if (IS_ERR(mnt))
				3193	panic("Can't create rootfs");
				3194
				3195	ns = create_mnt_ns(mnt);
				3196	if (IS_ERR(ns))
				3197	panic("Can't allocate initial namespace");
				3198
				3199	init_task.nsproxy->mnt_ns = ns;
				3200	get_mnt_ns(ns);
				3201
				3202	root.mnt = mnt;
				3203	root.dentry = mnt->mnt_root;
				3204	mnt->mnt_flags \|= MNT_LOCKED;
				3205
				3206	set_fs_pwd(current->fs, &root);
				3207	set_fs_root(current->fs, &root);
				3208	}
				3209
				3210	void __init mnt_init(void)
				3211	{
				3212	int err;
				3213
				3214	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
				3215	0, SLAB_HWCACHE_ALIGN \| SLAB_PANIC, NULL);
				3216
				3217	mount_hashtable = alloc_large_system_hash("Mount-cache",
				3218	sizeof(struct hlist_head),
				3219	mhash_entries, 19,
				3220	HASH_ZERO,
				3221	&m_hash_shift, &m_hash_mask, 0, 0);
				3222	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
				3223	sizeof(struct hlist_head),
				3224	mphash_entries, 19,
				3225	HASH_ZERO,
				3226	&mp_hash_shift, &mp_hash_mask, 0, 0);
				3227
				3228	if (!mount_hashtable \|\| !mountpoint_hashtable)
				3229	panic("Failed to allocate mount hash table\n");
				3230
				3231	kernfs_init();
				3232
				3233	err = sysfs_init();
				3234	if (err)
				3235	printk(KERN_WARNING "%s: sysfs_init error: %d\n",
				3236	__func__, err);
				3237	fs_kobj = kobject_create_and_add("fs", NULL);
				3238	if (!fs_kobj)
				3239	printk(KERN_WARNING "%s: kobj create error\n", __func__);
				3240	init_rootfs();
				3241	init_mount_tree();
				3242	}
				3243
				3244	void put_mnt_ns(struct mnt_namespace *ns)
				3245	{
				3246	if (!atomic_dec_and_test(&ns->count))
				3247	return;
				3248	drop_collected_mounts(&ns->root->mnt);
				3249	free_mnt_ns(ns);
				3250	}
				3251
				3252	struct vfsmount kern_mount_data(struct file_system_type type, void *data)
				3253	{
				3254	struct vfsmount *mnt;
				3255	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, data);
				3256	if (!IS_ERR(mnt)) {
				3257	/*
				3258	* it is a longterm mount, don't release mnt until
				3259	* we unmount before file sys is unregistered
				3260	*/
				3261	real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
				3262	}
				3263	return mnt;
				3264	}
				3265	EXPORT_SYMBOL_GPL(kern_mount_data);
				3266
				3267	void kern_unmount(struct vfsmount *mnt)
				3268	{
				3269	/* release long term mount so mount point can be released */
				3270	if (!IS_ERR_OR_NULL(mnt)) {
				3271	real_mount(mnt)->mnt_ns = NULL;
				3272	synchronize_rcu(); /* yecchhh... */
				3273	mntput(mnt);
				3274	}
				3275	}
				3276	EXPORT_SYMBOL(kern_unmount);
				3277
				3278	bool our_mnt(struct vfsmount *mnt)
				3279	{
				3280	return check_mnt(real_mount(mnt));
				3281	}
				3282
				3283	bool current_chrooted(void)
				3284	{
				3285	/* Does the current process have a non-standard root */
				3286	struct path ns_root;
				3287	struct path fs_root;
				3288	bool chrooted;
				3289
				3290	/* Find the namespace root */
				3291	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
				3292	ns_root.dentry = ns_root.mnt->mnt_root;
				3293	path_get(&ns_root);
				3294	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
				3295	;
				3296
				3297	get_fs_root(current->fs, &fs_root);
				3298
				3299	chrooted = !path_equal(&fs_root, &ns_root);
				3300
				3301	path_put(&fs_root);
				3302	path_put(&ns_root);
				3303
				3304	return chrooted;
				3305	}
				3306
				3307	static bool mnt_already_visible(struct mnt_namespace ns, struct vfsmount new,
				3308	int *new_mnt_flags)
				3309	{
				3310	int new_flags = *new_mnt_flags;
				3311	struct mount *mnt;
				3312	bool visible = false;
				3313
				3314	down_read(&namespace_sem);
				3315	list_for_each_entry(mnt, &ns->list, mnt_list) {
				3316	struct mount *child;
				3317	int mnt_flags;
				3318
				3319	if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
				3320	continue;
				3321
				3322	/* This mount is not fully visible if it's root directory
				3323	* is not the root directory of the filesystem.
				3324	*/
				3325	if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
				3326	continue;
				3327
				3328	/* A local view of the mount flags */
				3329	mnt_flags = mnt->mnt.mnt_flags;
				3330
				3331	/* Don't miss readonly hidden in the superblock flags */
				3332	if (sb_rdonly(mnt->mnt.mnt_sb))
				3333	mnt_flags \|= MNT_LOCK_READONLY;
				3334
				3335	/* Verify the mount flags are equal to or more permissive
				3336	* than the proposed new mount.
				3337	*/
				3338	if ((mnt_flags & MNT_LOCK_READONLY) &&
				3339	!(new_flags & MNT_READONLY))
				3340	continue;
				3341	if ((mnt_flags & MNT_LOCK_ATIME) &&
				3342	((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
				3343	continue;
				3344
				3345	/* This mount is not fully visible if there are any
				3346	* locked child mounts that cover anything except for
				3347	* empty directories.
				3348	*/
				3349	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
				3350	struct inode *inode = child->mnt_mountpoint->d_inode;
				3351	/* Only worry about locked mounts */
				3352	if (!(child->mnt.mnt_flags & MNT_LOCKED))
				3353	continue;
				3354	/* Is the directory permanetly empty? */
				3355	if (!is_empty_dir_inode(inode))
				3356	goto next;
				3357	}
				3358	/* Preserve the locked attributes */
				3359	*new_mnt_flags \|= mnt_flags & (MNT_LOCK_READONLY \| \
				3360	MNT_LOCK_ATIME);
				3361	visible = true;
				3362	goto found;
				3363	next: ;
				3364	}
				3365	found:
				3366	up_read(&namespace_sem);
				3367	return visible;
				3368	}
				3369
				3370	static bool mount_too_revealing(struct vfsmount mnt, int new_mnt_flags)
				3371	{
				3372	const unsigned long required_iflags = SB_I_NOEXEC \| SB_I_NODEV;
				3373	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
				3374	unsigned long s_iflags;
				3375
				3376	if (ns->user_ns == &init_user_ns)
				3377	return false;
				3378
				3379	/* Can this filesystem be too revealing? */
				3380	s_iflags = mnt->mnt_sb->s_iflags;
				3381	if (!(s_iflags & SB_I_USERNS_VISIBLE))
				3382	return false;
				3383
				3384	if ((s_iflags & required_iflags) != required_iflags) {
				3385	WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
				3386	required_iflags);
				3387	return true;
				3388	}
				3389
				3390	return !mnt_already_visible(ns, mnt, new_mnt_flags);
				3391	}
				3392
				3393	bool mnt_may_suid(struct vfsmount *mnt)
				3394	{
				3395	/*
				3396	* Foreign mounts (accessed via fchdir or through /proc
				3397	* symlinks) are always treated as if they are nosuid. This
				3398	* prevents namespaces from trusting potentially unsafe
				3399	* suid/sgid bits, file caps, or security labels that originate
				3400	* in other namespaces.
				3401	*/
				3402	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
				3403	current_in_userns(mnt->mnt_sb->s_user_ns);
				3404	}
				3405
				3406	static struct ns_common mntns_get(struct task_struct task)
				3407	{
				3408	struct ns_common *ns = NULL;
				3409	struct nsproxy *nsproxy;
				3410
				3411	task_lock(task);
				3412	nsproxy = task->nsproxy;
				3413	if (nsproxy) {
				3414	ns = &nsproxy->mnt_ns->ns;
				3415	get_mnt_ns(to_mnt_ns(ns));
				3416	}
				3417	task_unlock(task);
				3418
				3419	return ns;
				3420	}
				3421
				3422	static void mntns_put(struct ns_common *ns)
				3423	{
				3424	put_mnt_ns(to_mnt_ns(ns));
				3425	}
				3426
				3427	static int mntns_install(struct nsproxy nsproxy, struct ns_common ns)
				3428	{
				3429	struct fs_struct *fs = current->fs;
				3430	struct mnt_namespace mnt_ns = to_mnt_ns(ns), old_mnt_ns;
				3431	struct path root;
				3432	int err;
				3433
				3434	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) \|\|
				3435	!ns_capable(current_user_ns(), CAP_SYS_CHROOT) \|\|
				3436	!ns_capable(current_user_ns(), CAP_SYS_ADMIN))
				3437	return -EPERM;
				3438
				3439	if (fs->users != 1)
				3440	return -EINVAL;
				3441
				3442	get_mnt_ns(mnt_ns);
				3443	old_mnt_ns = nsproxy->mnt_ns;
				3444	nsproxy->mnt_ns = mnt_ns;
				3445
				3446	/* Find the root */
				3447	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
				3448	"/", LOOKUP_DOWN, &root);
				3449	if (err) {
				3450	/* revert to old namespace */
				3451	nsproxy->mnt_ns = old_mnt_ns;
				3452	put_mnt_ns(mnt_ns);
				3453	return err;
				3454	}
				3455
				3456	put_mnt_ns(old_mnt_ns);
				3457
				3458	/* Update the pwd and root */
				3459	set_fs_pwd(fs, &root);
				3460	set_fs_root(fs, &root);
				3461
				3462	path_put(&root);
				3463	return 0;
				3464	}
				3465
				3466	static struct user_namespace mntns_owner(struct ns_common ns)
				3467	{
				3468	return to_mnt_ns(ns)->user_ns;
				3469	}
				3470
				3471	const struct proc_ns_operations mntns_operations = {
				3472	.name = "mnt",
				3473	.type = CLONE_NEWNS,
				3474	.get = mntns_get,
				3475	.put = mntns_put,
				3476	.install = mntns_install,
				3477	.owner = mntns_owner,
				3478	};