    /*
     *  linux/fs/namespace.c
     *
     * (C) Copyright Al Viro 2000, 2001
     *	Released under GPL v2.
     *
     * Based on code from fs/super.c, copyright Linus Torvalds and others.
     * Heavily rewritten.
     */
    
    #include <linux/syscalls.h>
    #include <linux/export.h>
    #include <linux/capability.h>
    #include <linux/mnt_namespace.h>
    #include <linux/user_namespace.h>
    #include <linux/namei.h>
    #include <linux/security.h>
    #include <linux/idr.h>
    #include <linux/init.h>		/* init_rootfs */
    #include <linux/fs_struct.h>	/* get_fs_root et al. */
    #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
    #include <linux/uaccess.h>
    #include <linux/proc_ns.h>
    #include <linux/magic.h>
    #include <linux/bootmem.h>
    #include <linux/task_work.h>
    #include "pnode.h"
    #include "internal.h"
    
    /* Maximum number of mounts in a mount namespace */
    unsigned int sysctl_mount_max __read_mostly = 100000;
    
    static unsigned int m_hash_mask __read_mostly;
    static unsigned int m_hash_shift __read_mostly;
    static unsigned int mp_hash_mask __read_mostly;
    static unsigned int mp_hash_shift __read_mostly;
    
    static __initdata unsigned long mhash_entries;
    static int __init set_mhash_entries(char *str)
    {
    	if (!str)
    		return 0;
    	mhash_entries = simple_strtoul(str, &str, 0);
    	return 1;
    }
    __setup("mhash_entries=", set_mhash_entries);
    
    static __initdata unsigned long mphash_entries;
    static int __init set_mphash_entries(char *str)
    {
    	if (!str)
    		return 0;
    	mphash_entries = simple_strtoul(str, &str, 0);
    	return 1;
    }
    __setup("mphash_entries=", set_mphash_entries);
    
    static u64 event;
    static DEFINE_IDA(mnt_id_ida);
    static DEFINE_IDA(mnt_group_ida);
    static DEFINE_SPINLOCK(mnt_id_lock);
    static int mnt_id_start = 0;
    static int mnt_group_start = 1;
    
    static struct hlist_head *mount_hashtable __read_mostly;
    static struct hlist_head *mountpoint_hashtable __read_mostly;
    static struct kmem_cache *mnt_cache __read_mostly;
    static DECLARE_RWSEM(namespace_sem);
    
    /* /sys/fs */
    struct kobject *fs_kobj;
    EXPORT_SYMBOL_GPL(fs_kobj);
    
    /*
     * vfsmount lock may be taken for read to prevent changes to the
     * vfsmount hash, i.e. during mountpoint lookups or walking back
     * up the tree.
     *
     * It should be taken for write in all cases where the vfsmount
     * tree or hash is modified or when a vfsmount structure is modified.
     */
    __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
    
    static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
    {
    	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
    	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
    	tmp = tmp + (tmp >> m_hash_shift);
    	return &mount_hashtable[tmp & m_hash_mask];
    }
    
    static inline struct hlist_head *mp_hash(struct dentry *dentry)
    {
    	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
    	tmp = tmp + (tmp >> mp_hash_shift);
    	return &mountpoint_hashtable[tmp & mp_hash_mask];
    }
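    
    /*
     * Editor's note (illustrative): both hash functions above divide the
     * pointers by L1_CACHE_BYTES to shed the low-order bits, which carry
     * little entropy for pointers into slab-allocated objects, then fold
     * the upper bits back in via the ">> *_hash_shift" addition before
     * masking down to the table size with *_hash_mask.
     */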
    
    /*
     * allocation is serialized by namespace_sem, but we need the spinlock to
     * serialize with freeing.
     */
    static int mnt_alloc_id(struct mount *mnt)
    {
    	int res;
    
    retry:
    	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
    	spin_lock(&mnt_id_lock);
    	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
    	if (!res)
    		mnt_id_start = mnt->mnt_id + 1;
    	spin_unlock(&mnt_id_lock);
    	if (res == -EAGAIN)
    		goto retry;
    
    	return res;
    }
    
    static void mnt_free_id(struct mount *mnt)
    {
    	int id = mnt->mnt_id;
    	spin_lock(&mnt_id_lock);
    	ida_remove(&mnt_id_ida, id);
    	if (mnt_id_start > id)
    		mnt_id_start = id;
    	spin_unlock(&mnt_id_lock);
    }
    
    /*
     * Allocate a new peer group ID
     *
     * mnt_group_ida is protected by namespace_sem
     */
    static int mnt_alloc_group_id(struct mount *mnt)
    {
    	int res;
    
    	if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
    		return -ENOMEM;
    
    	res = ida_get_new_above(&mnt_group_ida,
    				mnt_group_start,
    				&mnt->mnt_group_id);
    	if (!res)
    		mnt_group_start = mnt->mnt_group_id + 1;
    
    	return res;
    }
    
    /*
     * Release a peer group ID
     */
    void mnt_release_group_id(struct mount *mnt)
    {
    	int id = mnt->mnt_group_id;
    	ida_remove(&mnt_group_ida, id);
    	if (mnt_group_start > id)
    		mnt_group_start = id;
    	mnt->mnt_group_id = 0;
    }
    
    /*
     * vfsmount lock must be held for read
     */
    static inline void mnt_add_count(struct mount *mnt, int n)
    {
    #ifdef CONFIG_SMP
    	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
    #else
    	preempt_disable();
    	mnt->mnt_count += n;
    	preempt_enable();
    #endif
    }
    
    /*
     * vfsmount lock must be held for write
     */
    unsigned int mnt_get_count(struct mount *mnt)
    {
    #ifdef CONFIG_SMP
    	unsigned int count = 0;
    	int cpu;
    
    	for_each_possible_cpu(cpu) {
    		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
    	}
    
    	return count;
    #else
    	return mnt->mnt_count;
    #endif
    }
    
    static void drop_mountpoint(struct fs_pin *p)
    {
    	struct mount *m = container_of(p, struct mount, mnt_umount);
    	dput(m->mnt_ex_mountpoint);
    	pin_remove(p);
    	mntput(&m->mnt);
    }
    
    static struct mount *alloc_vfsmnt(const char *name)
    {
    	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
    	if (mnt) {
    		int err;
    
    		err = mnt_alloc_id(mnt);
    		if (err)
    			goto out_free_cache;
    
    		if (name) {
    			mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
    			if (!mnt->mnt_devname)
    				goto out_free_id;
    		}
    
    #ifdef CONFIG_SMP
    		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
    		if (!mnt->mnt_pcp)
    			goto out_free_devname;
    
    		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
    #else
    		mnt->mnt_count = 1;
    		mnt->mnt_writers = 0;
    #endif
    
    		INIT_HLIST_NODE(&mnt->mnt_hash);
    		INIT_LIST_HEAD(&mnt->mnt_child);
    		INIT_LIST_HEAD(&mnt->mnt_mounts);
    		INIT_LIST_HEAD(&mnt->mnt_list);
    		INIT_LIST_HEAD(&mnt->mnt_expire);
    		INIT_LIST_HEAD(&mnt->mnt_share);
    		INIT_LIST_HEAD(&mnt->mnt_slave_list);
    		INIT_LIST_HEAD(&mnt->mnt_slave);
    		INIT_HLIST_NODE(&mnt->mnt_mp_list);
    		INIT_LIST_HEAD(&mnt->mnt_umounting);
    #ifdef CONFIG_FSNOTIFY
    		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
    #endif
    		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
    	}
    	return mnt;
    
    #ifdef CONFIG_SMP
    out_free_devname:
    	kfree_const(mnt->mnt_devname);
    #endif
    out_free_id:
    	mnt_free_id(mnt);
    out_free_cache:
    	kmem_cache_free(mnt_cache, mnt);
    	return NULL;
    }
    
    /*
     * Most r/o checks on a fs are for operations that take
     * discrete amounts of time, like a write() or unlink().
     * We must keep track of when those operations start
     * (for permission checks) and when they end, so that
     * we can determine when writes are able to occur to
     * a filesystem.
     */
    /*
     * __mnt_is_readonly: check whether a mount is read-only
     * @mnt: the mount to check for its write status
     *
     * This shouldn't be used directly outside of the VFS.
     * It does not guarantee that the filesystem will stay
     * r/w, just that it is r/w right *now*.  This cannot and
     * should not be used in place of IS_RDONLY(inode).
     * mnt_want/drop_write() will _keep_ the filesystem
     * r/w.
     */
    int __mnt_is_readonly(struct vfsmount *mnt)
    {
    	if (mnt->mnt_flags & MNT_READONLY)
    		return 1;
    	if (mnt->mnt_sb->s_flags & MS_RDONLY)
    		return 1;
    	return 0;
    }
    EXPORT_SYMBOL_GPL(__mnt_is_readonly);
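    
    /*
     * Editor's sketch (illustrative, not part of the original file): a
     * point-in-time query of the effective ro/rw state of a path, e.g. for
     * reporting.  Code that actually writes must use mnt_want_write() /
     * mnt_drop_write() below instead.  The helper name is hypothetical.
     */
    #if 0
    static bool example_path_is_ro(const struct path *path)
    {
    	return __mnt_is_readonly(path->mnt);
    }
    #endif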
    
    static inline void mnt_inc_writers(struct mount *mnt)
    {
    #ifdef CONFIG_SMP
    	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
    #else
    	mnt->mnt_writers++;
    #endif
    }
    
    static inline void mnt_dec_writers(struct mount *mnt)
    {
    #ifdef CONFIG_SMP
    	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
    #else
    	mnt->mnt_writers--;
    #endif
    }
    
    static unsigned int mnt_get_writers(struct mount *mnt)
    {
    #ifdef CONFIG_SMP
    	unsigned int count = 0;
    	int cpu;
    
    	for_each_possible_cpu(cpu) {
    		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
    	}
    
    	return count;
    #else
    	return mnt->mnt_writers;
    #endif
    }
    
    static int mnt_is_readonly(struct vfsmount *mnt)
    {
    	if (mnt->mnt_sb->s_readonly_remount)
    		return 1;
    	/* Order wrt setting s_flags/s_readonly_remount in do_remount() */
    	smp_rmb();
    	return __mnt_is_readonly(mnt);
    }
    
    /*
     * Most r/o & frozen checks on a fs are for operations that take discrete
     * amounts of time, like a write() or unlink().  We must keep track of when
     * those operations start (for permission checks) and when they end, so that we
     * can determine when writes are able to occur to a filesystem.
     */
    /**
     * __mnt_want_write - get write access to a mount without freeze protection
     * @m: the mount on which to take a write
     *
     * This tells the low-level filesystem that a write is about to be performed to
     * it, and makes sure that writes are allowed (mount is read-write) before
     * returning success. This operation does not protect against the filesystem
     * being frozen. When the write operation is finished, __mnt_drop_write() must be
     * called. This is effectively a refcount.
     */
    int __mnt_want_write(struct vfsmount *m)
    {
    	struct mount *mnt = real_mount(m);
    	int ret = 0;
    
    	preempt_disable();
    	mnt_inc_writers(mnt);
    	/*
    	 * The store from mnt_inc_writers() must be visible before we enter
    	 * the MNT_WRITE_HOLD loop below, so that the slowpath can see our
    	 * incremented count after it has set MNT_WRITE_HOLD.
    	 */
    	smp_mb();
    	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
    		cpu_relax();
    	/*
    	 * After the slowpath clears MNT_WRITE_HOLD, the state reported by
    	 * mnt_is_readonly() will have been set to match its requirements.
    	 * So we must not load that until MNT_WRITE_HOLD is cleared.
    	 */
    	smp_rmb();
    	if (mnt_is_readonly(m)) {
    		mnt_dec_writers(mnt);
    		ret = -EROFS;
    	}
    	preempt_enable();
    
    	return ret;
    }
    
    /**
     * mnt_want_write - get write access to a mount
     * @m: the mount on which to take a write
     *
     * This tells the low-level filesystem that a write is about to be performed to
     * it, and makes sure that writes are allowed (mount is read-write, filesystem
     * is not frozen) before returning success.  When the write operation is
     * finished, mnt_drop_write() must be called.  This is effectively a refcount.
     */
    int mnt_want_write(struct vfsmount *m)
    {
    	int ret;
    
    	sb_start_write(m->mnt_sb);
    	ret = __mnt_want_write(m);
    	if (ret)
    		sb_end_write(m->mnt_sb);
    	return ret;
    }
    EXPORT_SYMBOL_GPL(mnt_want_write);
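    
    /*
     * Editor's sketch (illustrative, not part of the original file): the
     * typical calling pattern around a path-modifying operation.  The
     * helper name below is hypothetical; only mnt_want_write() and
     * mnt_drop_write() are real.
     */
    #if 0
    static int example_modify(struct path *path)
    {
    	int err = mnt_want_write(path->mnt);
    	if (err)
    		return err;	/* mount read-only or fs frozen */
    	/* ... modify something under path->dentry ... */
    	mnt_drop_write(path->mnt);
    	return 0;
    }
    #endif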
    
    /**
     * mnt_clone_write - get write access to a mount
     * @mnt: the mount on which to take a write
     *
     * This is effectively like mnt_want_write, except
     * it must only be used to take an extra write reference
     * on a mountpoint that we already know has a write reference
     * on it. This allows some optimisation.
     *
     * When finished, mnt_drop_write() must be called as usual to
     * drop the reference.
     */
    int mnt_clone_write(struct vfsmount *mnt)
    {
    	/* superblock may be r/o */
    	if (__mnt_is_readonly(mnt))
    		return -EROFS;
    	preempt_disable();
    	mnt_inc_writers(real_mount(mnt));
    	preempt_enable();
    	return 0;
    }
    EXPORT_SYMBOL_GPL(mnt_clone_write);
    
    /**
     * __mnt_want_write_file - get write access to a file's mount
     * @file: the file whose mount to take a write on
     *
     * This is like __mnt_want_write, but it takes a file and can
     * do some optimisations if the file is open for write already
     */
    int __mnt_want_write_file(struct file *file)
    {
    	if (!(file->f_mode & FMODE_WRITER))
    		return __mnt_want_write(file->f_path.mnt);
    	else
    		return mnt_clone_write(file->f_path.mnt);
    }
    
    /**
     * mnt_want_write_file - get write access to a file's mount
     * @file: the file whose mount to take a write on
     *
     * This is like mnt_want_write, but it takes a file and can
     * do some optimisations if the file is open for write already
     */
    int mnt_want_write_file(struct file *file)
    {
    	int ret;
    
    	sb_start_write(file->f_path.mnt->mnt_sb);
    	ret = __mnt_want_write_file(file);
    	if (ret)
    		sb_end_write(file->f_path.mnt->mnt_sb);
    	return ret;
    }
    EXPORT_SYMBOL_GPL(mnt_want_write_file);
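    
    /*
     * Editor's sketch (illustrative, not part of the original file): the
     * same pattern keyed off an open struct file, as write-side ioctls and
     * similar operations use it.  The helper name is hypothetical;
     * mnt_drop_write_file() is defined just below.
     */
    #if 0
    static int example_file_op(struct file *file)
    {
    	int err = mnt_want_write_file(file);
    	if (err)
    		return err;
    	/* ... modify the file's inode ... */
    	mnt_drop_write_file(file);
    	return 0;
    }
    #endif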
    
    /**
     * __mnt_drop_write - give up write access to a mount
     * @mnt: the mount on which to give up write access
     *
     * Tells the low-level filesystem that we are done
     * performing writes to it.  Must be matched with
     * __mnt_want_write() call above.
     */
    void __mnt_drop_write(struct vfsmount *mnt)
    {
    	preempt_disable();
    	mnt_dec_writers(real_mount(mnt));
    	preempt_enable();
    }
    
    /**
     * mnt_drop_write - give up write access to a mount
     * @mnt: the mount on which to give up write access
     *
     * Tells the low-level filesystem that we are done performing writes to it and
     * also allows filesystem to be frozen again.  Must be matched with
     * mnt_want_write() call above.
     */
    void mnt_drop_write(struct vfsmount *mnt)
    {
    	__mnt_drop_write(mnt);
    	sb_end_write(mnt->mnt_sb);
    }
    EXPORT_SYMBOL_GPL(mnt_drop_write);
    
    void __mnt_drop_write_file(struct file *file)
    {
    	__mnt_drop_write(file->f_path.mnt);
    }
    
    void mnt_drop_write_file(struct file *file)
    {
    	mnt_drop_write(file->f_path.mnt);
    }
    EXPORT_SYMBOL(mnt_drop_write_file);
    
    static int mnt_make_readonly(struct mount *mnt)
    {
    	int ret = 0;
    
    	lock_mount_hash();
    	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
    	/*
    	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
    	 * should be visible before we do.
    	 */
    	smp_mb();
    
    	/*
    	 * With writers on hold, if this value is zero, then there are
    	 * definitely no active writers (although held writers may subsequently
    	 * increment the count, they'll have to wait, and decrement it after
    	 * seeing MNT_READONLY).
    	 *
    	 * It is OK to have counter incremented on one CPU and decremented on
    	 * another: the sum will add up correctly. The danger would be when we
    	 * sum up each counter, if we read a counter before it is incremented,
    	 * but then read another CPU's count which it has been subsequently
    	 * decremented from -- we would see more decrements than we should.
    	 * MNT_WRITE_HOLD protects against this scenario, because
    	 * mnt_want_write first increments count, then smp_mb, then spins on
    	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
    	 * we're counting up here.
    	 */
    	if (mnt_get_writers(mnt) > 0)
    		ret = -EBUSY;
    	else
    		mnt->mnt.mnt_flags |= MNT_READONLY;
    	/*
    	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
    	 * that become unheld will see MNT_READONLY.
    	 */
    	smp_wmb();
    	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
    	unlock_mount_hash();
    	return ret;
    }
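    
    /*
     * Editor's note (illustrative): mnt_make_readonly() and __mnt_want_write()
     * form a two-sided protocol around MNT_WRITE_HOLD:
     *
     *   writer (fast path)             remount-ro (slow path)
     *   ---------------------------    ---------------------------------
     *   mnt_inc_writers()              set MNT_WRITE_HOLD
     *   smp_mb()                       smp_mb()
     *   spin while MNT_WRITE_HOLD      sum counters via mnt_get_writers()
     *   smp_rmb()                      set MNT_READONLY if sum is zero
     *   check mnt_is_readonly()        smp_wmb(); clear MNT_WRITE_HOLD
     *
     * Either the writer's increment is seen by the sum (the remount fails
     * with -EBUSY), or the writer sees MNT_READONLY after the spin and
     * backs off with -EROFS.
     */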
    
    static void __mnt_unmake_readonly(struct mount *mnt)
    {
    	lock_mount_hash();
    	mnt->mnt.mnt_flags &= ~MNT_READONLY;
    	unlock_mount_hash();
    }
    
    int sb_prepare_remount_readonly(struct super_block *sb)
    {
    	struct mount *mnt;
    	int err = 0;
    
    	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
    	if (atomic_long_read(&sb->s_remove_count))
    		return -EBUSY;
    
    	lock_mount_hash();
    	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
    		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
    			mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
    			smp_mb();
    			if (mnt_get_writers(mnt) > 0) {
    				err = -EBUSY;
    				break;
    			}
    		}
    	}
    	if (!err && atomic_long_read(&sb->s_remove_count))
    		err = -EBUSY;
    
    	if (!err) {
    		sb->s_readonly_remount = 1;
    		smp_wmb();
    	}
    	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
    		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
    			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
    	}
    	unlock_mount_hash();
    
    	return err;
    }
    
    static void free_vfsmnt(struct mount *mnt)
    {
    	kfree_const(mnt->mnt_devname);
    #ifdef CONFIG_SMP
    	free_percpu(mnt->mnt_pcp);
    #endif
    	kmem_cache_free(mnt_cache, mnt);
    }
    
    static void delayed_free_vfsmnt(struct rcu_head *head)
    {
    	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
    }
    
    /* call under rcu_read_lock */
    int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
    {
    	struct mount *mnt;
    	if (read_seqretry(&mount_lock, seq))
    		return 1;
    	if (bastard == NULL)
    		return 0;
    	mnt = real_mount(bastard);
    	mnt_add_count(mnt, 1);
    	if (likely(!read_seqretry(&mount_lock, seq)))
    		return 0;
    	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
    		mnt_add_count(mnt, -1);
    		return 1;
    	}
    	return -1;
    }
    
    /* call under rcu_read_lock */
    bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
    {
    	int res = __legitimize_mnt(bastard, seq);
    	if (likely(!res))
    		return true;
    	if (unlikely(res < 0)) {
    		rcu_read_unlock();
    		mntput(bastard);
    		rcu_read_lock();
    	}
    	return false;
    }
    
    /*
     * find the first mount at @dentry on vfsmount @mnt.
     * call under rcu_read_lock()
     */
    struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
    {
    	struct hlist_head *head = m_hash(mnt, dentry);
    	struct mount *p;
    
    	hlist_for_each_entry_rcu(p, head, mnt_hash)
    		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
    			return p;
    	return NULL;
    }
    
    /*
     * lookup_mnt - Return the first child mount mounted at path
     *
     * "First" means first mounted chronologically.  If you create the
     * following mounts:
     *
     * mount /dev/sda1 /mnt
     * mount /dev/sda2 /mnt
     * mount /dev/sda3 /mnt
     *
     * Then lookup_mnt() on the base /mnt dentry in the root mount will
     * return successively the root dentry and vfsmount of /dev/sda1, then
     * /dev/sda2, then /dev/sda3, then NULL.
     *
     * lookup_mnt takes a reference to the found vfsmount.
     */
    struct vfsmount *lookup_mnt(struct path *path)
    {
    	struct mount *child_mnt;
    	struct vfsmount *m;
    	unsigned seq;
    
    	rcu_read_lock();
    	do {
    		seq = read_seqbegin(&mount_lock);
    		child_mnt = __lookup_mnt(path->mnt, path->dentry);
    		m = child_mnt ? &child_mnt->mnt : NULL;
    	} while (!legitimize_mnt(m, seq));
    	rcu_read_unlock();
    	return m;
    }
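    
    /*
     * Editor's sketch (illustrative, not part of the original file): a
     * caller probing whether anything is mounted on a path.  lookup_mnt()
     * takes a reference, so a non-NULL result must be released with
     * mntput().  The helper name is hypothetical.
     */
    #if 0
    static bool example_something_mounted(struct path *path)
    {
    	struct vfsmount *m = lookup_mnt(path);
    
    	if (!m)
    		return false;
    	mntput(m);
    	return true;
    }
    #endif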
    
    /*
     * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
     *                         current mount namespace.
     *
     * The common case is dentries are not mountpoints at all and that
     * test is handled inline.  For the slow case when we are actually
     * dealing with a mountpoint of some kind, walk through all of the
     * mounts in the current mount namespace and test to see if the dentry
     * is a mountpoint.
     *
     * The mount_hashtable is not usable in this context because we
     * need to identify all mounts that may be in the current mount
     * namespace, not just a mount that happens to have some specified
     * parent mount.
     */
    bool __is_local_mountpoint(struct dentry *dentry)
    {
    	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
    	struct mount *mnt;
    	bool is_covered = false;
    
    	if (!d_mountpoint(dentry))
    		goto out;
    
    	down_read(&namespace_sem);
    	list_for_each_entry(mnt, &ns->list, mnt_list) {
    		is_covered = (mnt->mnt_mountpoint == dentry);
    		if (is_covered)
    			break;
    	}
    	up_read(&namespace_sem);
    out:
    	return is_covered;
    }
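    
    /*
     * Editor's note (assumption: matches the inline wrapper declared in
     * fs/mount.h): the "handled inline" fast path mentioned above is a
     * wrapper that performs the cheap d_mountpoint() test before calling
     * into this function, roughly:
     *
     *	static inline bool is_local_mountpoint(struct dentry *dentry)
     *	{
     *		if (!d_mountpoint(dentry))
     *			return false;
     *		return __is_local_mountpoint(dentry);
     *	}
     */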
    
    static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
    {
    	struct hlist_head *chain = mp_hash(dentry);
    	struct mountpoint *mp;
    
    	hlist_for_each_entry(mp, chain, m_hash) {
    		if (mp->m_dentry == dentry) {
    			/* might be worth a WARN_ON() */
    			if (d_unlinked(dentry))
    				return ERR_PTR(-ENOENT);
    			mp->m_count++;
    			return mp;
    		}
    	}
    	return NULL;
    }
    
    static struct mountpoint *get_mountpoint(struct dentry *dentry)
    {
    	struct mountpoint *mp, *new = NULL;
    	int ret;
    
    	if (d_mountpoint(dentry)) {
    mountpoint:
    		read_seqlock_excl(&mount_lock);
    		mp = lookup_mountpoint(dentry);
    		read_sequnlock_excl(&mount_lock);
    		if (mp)
    			goto done;
    	}
    
    	if (!new)
    		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
    	if (!new)
    		return ERR_PTR(-ENOMEM);
    
    
    	/* Exactly one process may set d_mounted */
    	ret = d_set_mounted(dentry);
    
    	/* Someone else set d_mounted? */
    	if (ret == -EBUSY)
    		goto mountpoint;
    
    	/* The dentry is not available as a mountpoint? */
    	mp = ERR_PTR(ret);
    	if (ret)
    		goto done;
    
    	/* Add the new mountpoint to the hash table */
    	read_seqlock_excl(&mount_lock);
    	new->m_dentry = dentry;
    	new->m_count = 1;
    	hlist_add_head(&new->m_hash, mp_hash(dentry));
    	INIT_HLIST_HEAD(&new->m_list);
    	read_sequnlock_excl(&mount_lock);
    
    	mp = new;
    	new = NULL;
    done:
    	kfree(new);
    	return mp;
    }
    
    static void put_mountpoint(struct mountpoint *mp)
    {
    	if (!--mp->m_count) {
    		struct dentry *dentry = mp->m_dentry;
    		BUG_ON(!hlist_empty(&mp->m_list));
    		spin_lock(&dentry->d_lock);
    		dentry->d_flags &= ~DCACHE_MOUNTED;
    		spin_unlock(&dentry->d_lock);
    		hlist_del(&mp->m_hash);
    		kfree(mp);
    	}
    }
    
    static inline int check_mnt(struct mount *mnt)
    {
    	return mnt->mnt_ns == current->nsproxy->mnt_ns;
    }
    
    /*
     * vfsmount lock must be held for write
     */
    static void touch_mnt_namespace(struct mnt_namespace *ns)
    {
    	if (ns) {
    		ns->event = ++event;
    		wake_up_interruptible(&ns->poll);
    	}
    }
    
    /*
     * vfsmount lock must be held for write
     */
    static void __touch_mnt_namespace(struct mnt_namespace *ns)
    {
    	if (ns && ns->event != event) {
    		ns->event = event;
    		wake_up_interruptible(&ns->poll);
    	}
    }
    
    /*
     * vfsmount lock must be held for write
     */
    static void unhash_mnt(struct mount *mnt)
    {
    	mnt->mnt_parent = mnt;
    	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
    	list_del_init(&mnt->mnt_child);
    	hlist_del_init_rcu(&mnt->mnt_hash);
    	hlist_del_init(&mnt->mnt_mp_list);
    	put_mountpoint(mnt->mnt_mp);
    	mnt->mnt_mp = NULL;
    }
    
    /*
     * vfsmount lock must be held for write
     */
    static void detach_mnt(struct mount *mnt, struct path *old_path)
    {
    	old_path->dentry = mnt->mnt_mountpoint;
    	old_path->mnt = &mnt->mnt_parent->mnt;
    	unhash_mnt(mnt);
    }
    
    /*
     * vfsmount lock must be held for write
     */
    static void umount_mnt(struct mount *mnt)
    {
    	/* old mountpoint will be dropped when we can do that */
    	mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
    	unhash_mnt(mnt);
    }
    
    /*
     * vfsmount lock must be held for write
     */
    void mnt_set_mountpoint(struct mount *mnt,
    			struct mountpoint *mp,
    			struct mount *child_mnt)
    {
    	mp->m_count++;
    	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
    	child_mnt->mnt_mountpoint = dget(mp->m_dentry);
    	child_mnt->mnt_parent = mnt;
    	child_mnt->mnt_mp = mp;
    	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
    }
    
    static void __attach_mnt(struct mount *mnt, struct mount *parent)
    {
    	hlist_add_head_rcu(&mnt->mnt_hash,
    			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
    	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
    }
    
    /*
     * vfsmount lock must be held for write
     */
    static void attach_mnt(struct mount *mnt,
    			struct mount *parent,
    			struct mountpoint *mp)
    {
    	mnt_set_mountpoint(parent, mp, mnt);
    	__attach_mnt(mnt, parent);
    }
    
    void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
    {
    	struct mountpoint *old_mp = mnt->mnt_mp;
    	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
    	struct mount *old_parent = mnt->mnt_parent;
    
    	list_del_init(&mnt->mnt_child);
    	hlist_del_init(&mnt->mnt_mp_list);
    	hlist_del_init_rcu(&mnt->mnt_hash);
    
    	attach_mnt(mnt, parent, mp);
    
    	put_mountpoint(old_mp);
    
    	/*
    	 * Safely avoid even the suggestion this code might sleep or
    	 * lock the mount hash by taking advantage of the knowledge that
    	 * mnt_change_mountpoint will not release the final reference
    	 * to a mountpoint.
    	 *
    	 * During mounting, the mount passed in as the parent mount will
    	 * continue to use the old mountpoint and during unmounting, the
    	 * old mountpoint will continue to exist until namespace_unlock,
    	 * which happens well after mnt_change_mountpoint.
    	 */
    	spin_lock(&old_mountpoint->d_lock);
    	old_mountpoint->d_lockref.count--;
    	spin_unlock(&old_mountpoint->d_lock);
    
    	mnt_add_count(old_parent, -1);
    }
    
    /*
     * vfsmount lock must be held for write
     */
    static void commit_tree(struct mount *mnt)
    {
    	struct mount *parent = mnt->mnt_parent;
    	struct mount *m;
    	LIST_HEAD(head);
    	struct mnt_namespace *n = parent->mnt_ns;
    
    	BUG_ON(parent == mnt);
    
    	list_add_tail(&head, &mnt->mnt_list);
    	list_for_each_entry(m, &head, mnt_list)
    		m->mnt_ns = n;
    
    	list_splice(&head, n->list.prev);
    
    	n->mounts += n->pending_mounts;
    	n->pending_mounts = 0;
    
    	__attach_mnt(mnt, parent);
    	touch_mnt_namespace(n);
    }
    
    static struct mount *next_mnt(struct mount *p, struct mount *root)
    {
    	struct list_head *next = p->mnt_mounts.next;
    	if (next == &p->mnt_mounts) {
    		while (1) {
    			if (p == root)
    				return NULL;
    			next = p->mnt_child.next;
    			if (next != &p->mnt_parent->mnt_mounts)
    				break;
    			p = p->mnt_parent;
    		}
    	}
    	return list_entry(next, struct mount, mnt_child);
    }
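    
    /*
     * Editor's sketch (illustrative, not part of the original file):
     * next_mnt() is normally consumed as a pre-order walk of the mount
     * subtree rooted at @root, which is how the propagation and umount
     * code in this file iterates:
     */
    #if 0
    static void example_walk(struct mount *root)
    {
    	struct mount *p;
    
    	for (p = root; p; p = next_mnt(p, root)) {
    		/* visit p */
    	}
    }
    #endif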
    
    static struct mount *skip_mnt_tree(struct mount *p)
    {
    	struct list_head *prev = p->mnt_mounts.prev;
    	while (prev != &p->mnt_mounts) {
    		p = list_entry(prev, struct mount, mnt_child);
    		prev = p->mnt_mounts.prev;
    	}
    	return p;
    }
    
    struct vfsmount *
    vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
    {
    	struct mount *mnt;
    	struct dentry *root;
    
    	if (!type)
    		return ERR_PTR(-ENODEV);
    
    	mnt = alloc_vfsmnt(name);
    	if (!mnt)
    		return ERR_PTR(-ENOMEM);
    
    	if (flags & MS_KERNMOUNT)
    		mnt->mnt.mnt_flags = MNT_INTERNAL;
    
    	root = mount_fs(type, flags, name, data);
    	if (IS_ERR(root)) {
    		mnt_free_id(mnt);
    		free_vfsmnt(mnt);
    		return ERR_CAST(root);
    	}
    
    	mnt->mnt.mnt_root = root;
    	mnt->mnt.mnt_sb = root->d_sb;
    	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
    	mnt->mnt_parent = mnt;
    	lock_mount_hash();
    	list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
    	unlock_mount_hash();
    	return &mnt->mnt;
    }
    EXPORT_SYMBOL_GPL(vfs_kern_mount);
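    
    /*
     * Editor's sketch (illustrative, not part of the original file): an
     * in-kernel user mounting a filesystem type internally, in the style
     * of kern_mount().  Error handling and the eventual mntput() are the
     * caller's responsibility; the helper name is hypothetical.
     */
    #if 0
    static struct vfsmount *example_internal_mount(struct file_system_type *type)
    {
    	struct vfsmount *mnt;
    
    	mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL);
    	if (IS_ERR(mnt))
    		return mnt;	/* ERR_PTR propagated from mount_fs() */
    	/* ... use mnt->mnt_sb / mnt->mnt_root, then mntput(mnt) ... */
    	return mnt;
    }
    #endif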
    
    struct vfsmount *
    vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,