    /*
     * Generic hugetlb support.
     * (C) Nadia Yvette Chambers, April 2004
     */
    #include <linux/list.h>
    #include <linux/init.h>
    #include <linux/mm.h>
    #include <linux/seq_file.h>
    #include <linux/sysctl.h>
    #include <linux/highmem.h>
    #include <linux/mmu_notifier.h>
    #include <linux/nodemask.h>
    #include <linux/pagemap.h>
    #include <linux/mempolicy.h>
    #include <linux/compiler.h>
    #include <linux/cpuset.h>
    #include <linux/mutex.h>
    #include <linux/bootmem.h>
    #include <linux/sysfs.h>
    #include <linux/slab.h>
    #include <linux/rmap.h>
    #include <linux/swap.h>
    #include <linux/swapops.h>
    #include <linux/page-isolation.h>
    #include <linux/jhash.h>
    
    #include <asm/page.h>
    #include <asm/pgtable.h>
    #include <asm/tlb.h>
    
    #include <linux/io.h>
    #include <linux/hugetlb.h>
    #include <linux/hugetlb_cgroup.h>
    #include <linux/node.h>
    #include "internal.h"
    
    int hugepages_treat_as_movable;
    
    int hugetlb_max_hstate __read_mostly;
    unsigned int default_hstate_idx;
    struct hstate hstates[HUGE_MAX_HSTATE];
    /*
     * Minimum page order among possible hugepage sizes, set to a proper value
     * at boot time.
     */
    static unsigned int minimum_order __read_mostly = UINT_MAX;
    
    __initdata LIST_HEAD(huge_boot_pages);
    
    /* for command line parsing */
    static struct hstate * __initdata parsed_hstate;
    static unsigned long __initdata default_hstate_max_huge_pages;
    static unsigned long __initdata default_hstate_size;
    static bool __initdata parsed_valid_hugepagesz = true;
    
    /*
     * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
     * free_huge_pages, and surplus_huge_pages.
     */
    DEFINE_SPINLOCK(hugetlb_lock);
    
    /*
     * Serializes faults on the same logical page.  This is used to
     * prevent spurious OOMs when the hugepage pool is fully utilized.
     */
    static int num_fault_mutexes;
    struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
    
    /* Forward declaration */
    static int hugetlb_acct_memory(struct hstate *h, long delta);
    
    static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
    {
    	bool free = (spool->count == 0) && (spool->used_hpages == 0);
    
    	spin_unlock(&spool->lock);
    
    	/* If no pages are used, and no other handles to the subpool
	 * remain, give up any reservations based on minimum size and
    	 * free the subpool */
    	if (free) {
    		if (spool->min_hpages != -1)
    			hugetlb_acct_memory(spool->hstate,
    						-spool->min_hpages);
    		kfree(spool);
    	}
    }
    
    struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
    						long min_hpages)
    {
    	struct hugepage_subpool *spool;
    
    	spool = kzalloc(sizeof(*spool), GFP_KERNEL);
    	if (!spool)
    		return NULL;
    
    	spin_lock_init(&spool->lock);
    	spool->count = 1;
    	spool->max_hpages = max_hpages;
    	spool->hstate = h;
    	spool->min_hpages = min_hpages;
    
    	if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
    		kfree(spool);
    		return NULL;
    	}
    	spool->rsv_hpages = min_hpages;
    
    	return spool;
    }
    
    void hugepage_put_subpool(struct hugepage_subpool *spool)
    {
    	spin_lock(&spool->lock);
    	BUG_ON(!spool->count);
    	spool->count--;
    	unlock_or_release_subpool(spool);
    }
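
/*
 * Illustrative sketch of the subpool lifecycle as a hugetlbfs-style caller
 * would use it: create a subpool per mount, hold a reference while the
 * mount is active, and drop it on unmount.  The function name and the
 * limit of 64 huge pages below are hypothetical, chosen only for the
 * example; passing -1 for either limit disables that form of accounting.
 */
#if 0	/* example only */
static void example_subpool_lifecycle(struct hstate *h)
{
	/* A mount limited to 64 huge pages, with no minimum reservation. */
	struct hugepage_subpool *spool = hugepage_new_subpool(h, 64, -1);

	if (!spool)
		return;

	/* ... the mount charges allocations against spool while active ... */

	/* Dropping the last reference releases the subpool. */
	hugepage_put_subpool(spool);
}
#endif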
    
    /*
     * Subpool accounting for allocating and reserving pages.
 * Return -ENOMEM if there are not enough resources to satisfy
 * the request.  Otherwise, return the number of pages by which the
 * global pools must be adjusted (upward).  The returned value may
 * only be different from the passed value (delta) in the case where
 * a subpool minimum size must be maintained.
     */
    static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
    				      long delta)
    {
    	long ret = delta;
    
    	if (!spool)
    		return ret;
    
    	spin_lock(&spool->lock);
    
    	if (spool->max_hpages != -1) {		/* maximum size accounting */
    		if ((spool->used_hpages + delta) <= spool->max_hpages)
    			spool->used_hpages += delta;
    		else {
    			ret = -ENOMEM;
    			goto unlock_ret;
    		}
    	}
    
    	/* minimum size accounting */
    	if (spool->min_hpages != -1 && spool->rsv_hpages) {
    		if (delta > spool->rsv_hpages) {
    			/*
    			 * Asking for more reserves than those already taken on
			 * behalf of the subpool.  Return the difference.
    			 */
    			ret = delta - spool->rsv_hpages;
    			spool->rsv_hpages = 0;
    		} else {
    			ret = 0;	/* reserves already accounted for */
    			spool->rsv_hpages -= delta;
    		}
    	}
    
    unlock_ret:
    	spin_unlock(&spool->lock);
    	return ret;
    }
    
    /*
     * Subpool accounting for freeing and unreserving pages.
     * Return the number of global page reservations that must be dropped.
     * The return value may only be different than the passed value (delta)
     * in the case where a subpool minimum size must be maintained.
     */
    static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
    				       long delta)
    {
    	long ret = delta;
    
    	if (!spool)
    		return delta;
    
    	spin_lock(&spool->lock);
    
    	if (spool->max_hpages != -1)		/* maximum size accounting */
    		spool->used_hpages -= delta;
    
    	 /* minimum size accounting */
    	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
    		if (spool->rsv_hpages + delta <= spool->min_hpages)
    			ret = 0;
    		else
    			ret = spool->rsv_hpages + delta - spool->min_hpages;
    
    		spool->rsv_hpages += delta;
    		if (spool->rsv_hpages > spool->min_hpages)
    			spool->rsv_hpages = spool->min_hpages;
    	}
    
    	/*
    	 * If hugetlbfs_put_super couldn't free spool due to an outstanding
    	 * quota reference, free it now.
    	 */
    	unlock_or_release_subpool(spool);
    
    	return ret;
    }
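
/*
 * Illustrative sketch of how the two helpers above translate a caller's
 * delta into the adjustment that must be applied to the global pool.  The
 * function name and the sizes used are hypothetical.
 */
#if 0	/* example only */
static void example_subpool_accounting(struct hstate *h)
{
	/* No maximum, but keep at least 8 huge pages reserved for this subpool. */
	struct hugepage_subpool *spool = hugepage_new_subpool(h, -1, 8);
	long gbl;

	if (!spool)
		return;

	/*
	 * Ask for 10 pages: the first 8 are covered by the subpool's own
	 * minimum reservation (rsv_hpages), so only 2 must be taken from
	 * the global pool.
	 */
	gbl = hugepage_subpool_get_pages(spool, 10);	/* returns 2 */

	/*
	 * Give the 10 pages back: the subpool refills its reserve up to
	 * min_hpages (8) and the remaining 2 go back to the global pool.
	 */
	gbl = hugepage_subpool_put_pages(spool, 10);	/* returns 2 */

	hugepage_put_subpool(spool);
}
#endif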
    
    static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
    {
    	return HUGETLBFS_SB(inode->i_sb)->spool;
    }
    
    static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
    {
    	return subpool_inode(file_inode(vma->vm_file));
    }
    
    /*
     * Region tracking -- allows tracking of reservations and instantiated pages
     *                    across the pages in a mapping.
     *
     * The region data structures are embedded into a resv_map and protected
     * by a resv_map's lock.  The set of regions within the resv_map represent
     * reservations for huge pages, or huge pages that have already been
     * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
     *
     * For example, a file region structure with from == 0 and to == 4 represents
     * four huge pages in a mapping.  It is important to note that the to element
     * represents the first element past the end of the region. This is used in
     * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
     *
     * Interval notation of the form [from, to) will be used to indicate that
     * the endpoint from is inclusive and to is exclusive.
     */
    struct file_region {
    	struct list_head link;
    	long from;
    	long to;
    };
    
    /*
     * Add the huge page range represented by [f, t) to the reserve
     * map.  In the normal case, existing regions will be expanded
     * to accommodate the specified range.  Sufficient regions should
     * exist for expansion due to the previous call to region_chg
     * with the same range.  However, it is possible that region_del
 * could have been called after region_chg and modified the map
     * in such a way that no region exists to be expanded.  In this
     * case, pull a region descriptor from the cache associated with
     * the map and use that for the new range.
     *
     * Return the number of new huge pages added to the map.  This
     * number is greater than or equal to zero.
     */
    static long region_add(struct resv_map *resv, long f, long t)
    {
    	struct list_head *head = &resv->regions;
    	struct file_region *rg, *nrg, *trg;
    	long add = 0;
    
    	spin_lock(&resv->lock);
    	/* Locate the region we are either in or before. */
    	list_for_each_entry(rg, head, link)
    		if (f <= rg->to)
    			break;
    
    	/*
    	 * If no region exists which can be expanded to include the
    	 * specified range, the list must have been modified by an
	 * interleaving call to region_del().  Pull a region descriptor
    	 * from the cache and use it for this range.
    	 */
    	if (&rg->link == head || t < rg->from) {
    		VM_BUG_ON(resv->region_cache_count <= 0);
    
    		resv->region_cache_count--;
    		nrg = list_first_entry(&resv->region_cache, struct file_region,
    					link);
    		list_del(&nrg->link);
    
    		nrg->from = f;
    		nrg->to = t;
    		list_add(&nrg->link, rg->link.prev);
    
    		add += t - f;
    		goto out_locked;
    	}
    
    	/* Round our left edge to the current segment if it encloses us. */
    	if (f > rg->from)
    		f = rg->from;
    
    	/* Check for and consume any regions we now overlap with. */
    	nrg = rg;
    	list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
    		if (&rg->link == head)
    			break;
    		if (rg->from > t)
    			break;
    
		/* If this area reaches higher than ours, extend our area to
    		 * include it completely.  If this is not the first area
    		 * which we intend to reuse, free it. */
    		if (rg->to > t)
    			t = rg->to;
    		if (rg != nrg) {
    			/* Decrement return value by the deleted range.
    			 * Another range will span this area so that by
    			 * end of routine add will be >= zero
    			 */
    			add -= (rg->to - rg->from);
    			list_del(&rg->link);
    			kfree(rg);
    		}
    	}
    
    	add += (nrg->from - f);		/* Added to beginning of region */
    	nrg->from = f;
    	add += t - nrg->to;		/* Added to end of region */
    	nrg->to = t;
    
    out_locked:
    	resv->adds_in_progress--;
    	spin_unlock(&resv->lock);
    	VM_BUG_ON(add < 0);
    	return add;
    }
    
    /*
     * Examine the existing reserve map and determine how many
     * huge pages in the specified range [f, t) are NOT currently
     * represented.  This routine is called before a subsequent
     * call to region_add that will actually modify the reserve
     * map to add the specified range [f, t).  region_chg does
     * not change the number of huge pages represented by the
     * map.  However, if the existing regions in the map can not
     * be expanded to represent the new range, a new file_region
     * structure is added to the map as a placeholder.  This is
     * so that the subsequent region_add call will have all the
     * regions it needs and will not fail.
     *
     * Upon entry, region_chg will also examine the cache of region descriptors
     * associated with the map.  If there are not enough descriptors cached, one
     * will be allocated for the in progress add operation.
     *
     * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t).  This number is greater than or equal to
     * zero.  -ENOMEM is returned if a new file_region structure or cache entry
     * is needed and can not be allocated.
     */
    static long region_chg(struct resv_map *resv, long f, long t)
    {
    	struct list_head *head = &resv->regions;
    	struct file_region *rg, *nrg = NULL;
    	long chg = 0;
    
    retry:
    	spin_lock(&resv->lock);
    retry_locked:
    	resv->adds_in_progress++;
    
    	/*
    	 * Check for sufficient descriptors in the cache to accommodate
    	 * the number of in progress add operations.
    	 */
    	if (resv->adds_in_progress > resv->region_cache_count) {
    		struct file_region *trg;
    
    		VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
    		/* Must drop lock to allocate a new descriptor. */
    		resv->adds_in_progress--;
    		spin_unlock(&resv->lock);
    
    		trg = kmalloc(sizeof(*trg), GFP_KERNEL);
    		if (!trg) {
    			kfree(nrg);
    			return -ENOMEM;
    		}
    
    		spin_lock(&resv->lock);
    		list_add(&trg->link, &resv->region_cache);
    		resv->region_cache_count++;
    		goto retry_locked;
    	}
    
    	/* Locate the region we are before or in. */
    	list_for_each_entry(rg, head, link)
    		if (f <= rg->to)
    			break;
    
    	/* If we are below the current region then a new region is required.
	 * Subtle: allocate a new region at the position, but make it zero
	 * size so that we are guaranteed to be able to record the reservation. */
    	if (&rg->link == head || t < rg->from) {
    		if (!nrg) {
    			resv->adds_in_progress--;
    			spin_unlock(&resv->lock);
    			nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
    			if (!nrg)
    				return -ENOMEM;
    
    			nrg->from = f;
    			nrg->to   = f;
    			INIT_LIST_HEAD(&nrg->link);
    			goto retry;
    		}
    
    		list_add(&nrg->link, rg->link.prev);
    		chg = t - f;
    		goto out_nrg;
    	}
    
    	/* Round our left edge to the current segment if it encloses us. */
    	if (f > rg->from)
    		f = rg->from;
    	chg = t - f;
    
    	/* Check for and consume any regions we now overlap with. */
    	list_for_each_entry(rg, rg->link.prev, link) {
    		if (&rg->link == head)
    			break;
    		if (rg->from > t)
    			goto out;
    
		/* We overlap with this area; if it extends further than
		 * we do, we must extend ourselves.  Account for its
    		 * existing reservation. */
    		if (rg->to > t) {
    			chg += rg->to - t;
    			t = rg->to;
    		}
    		chg -= rg->to - rg->from;
    	}
    
    out:
    	spin_unlock(&resv->lock);
    	/*  We already know we raced and no longer need the new region */
    	kfree(nrg);
    	return chg;
    out_nrg:
    	spin_unlock(&resv->lock);
    	return chg;
    }
    
    /*
     * Abort the in progress add operation.  The adds_in_progress field
     * of the resv_map keeps track of the operations in progress between
     * calls to region_chg and region_add.  Operations are sometimes
     * aborted after the call to region_chg.  In such cases, region_abort
     * is called to decrement the adds_in_progress counter.
     *
     * NOTE: The range arguments [f, t) are not needed or used in this
     * routine.  They are kept to make reading the calling code easier as
     * arguments will match the associated region_chg call.
     */
    static void region_abort(struct resv_map *resv, long f, long t)
    {
    	spin_lock(&resv->lock);
    	VM_BUG_ON(!resv->region_cache_count);
    	resv->adds_in_progress--;
    	spin_unlock(&resv->lock);
    }
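
/*
 * Illustrative sketch of the two-phase protocol followed by callers of the
 * region routines: region_chg() sizes the change and guarantees a cached
 * descriptor, the caller then charges the subpool/cgroup, and finally
 * either commits with region_add() or backs out with region_abort().  The
 * function name and the failure placeholder are hypothetical and greatly
 * simplified compared to the real callers.
 */
#if 0	/* example only */
static long example_reserve_range(struct resv_map *resv, long from, long to)
{
	long chg, add;

	chg = region_chg(resv, from, to);	/* pages not yet represented */
	if (chg < 0)
		return chg;

	if (0 /* charging the subpool/cgroup failed */) {
		region_abort(resv, from, to);
		return -ENOMEM;
	}

	/* Commit; may be less than chg if the map changed in the meantime. */
	add = region_add(resv, from, to);
	return add;
}
#endif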
    
    /*
     * Delete the specified range [f, t) from the reserve map.  If the
     * t parameter is LONG_MAX, this indicates that ALL regions after f
     * should be deleted.  Locate the regions which intersect [f, t)
     * and either trim, delete or split the existing regions.
     *
     * Returns the number of huge pages deleted from the reserve map.
     * In the normal case, the return value is zero or more.  In the
     * case where a region must be split, a new region descriptor must
     * be allocated.  If the allocation fails, -ENOMEM will be returned.
     * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and thus will never return -ENOMEM.  Callers specifying
     * t == LONG_MAX do not need to check for -ENOMEM error.
     */
    static long region_del(struct resv_map *resv, long f, long t)
    {
    	struct list_head *head = &resv->regions;
    	struct file_region *rg, *trg;
    	struct file_region *nrg = NULL;
    	long del = 0;
    
    retry:
    	spin_lock(&resv->lock);
    	list_for_each_entry_safe(rg, trg, head, link) {
    		/*
    		 * Skip regions before the range to be deleted.  file_region
    		 * ranges are normally of the form [from, to).  However, there
    		 * may be a "placeholder" entry in the map which is of the form
    		 * (from, to) with from == to.  Check for placeholder entries
    		 * at the beginning of the range to be deleted.
    		 */
    		if (rg->to <= f && (rg->to != rg->from || rg->to != f))
    			continue;
    
    		if (rg->from >= t)
    			break;
    
    		if (f > rg->from && t < rg->to) { /* Must split region */
    			/*
    			 * Check for an entry in the cache before dropping
    			 * lock and attempting allocation.
    			 */
    			if (!nrg &&
    			    resv->region_cache_count > resv->adds_in_progress) {
    				nrg = list_first_entry(&resv->region_cache,
    							struct file_region,
    							link);
    				list_del(&nrg->link);
    				resv->region_cache_count--;
    			}
    
    			if (!nrg) {
    				spin_unlock(&resv->lock);
    				nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
    				if (!nrg)
    					return -ENOMEM;
    				goto retry;
    			}
    
    			del += t - f;
    
    			/* New entry for end of split region */
    			nrg->from = t;
    			nrg->to = rg->to;
    			INIT_LIST_HEAD(&nrg->link);
    
    			/* Original entry is trimmed */
    			rg->to = f;
    
    			list_add(&nrg->link, &rg->link);
    			nrg = NULL;
    			break;
    		}
    
    		if (f <= rg->from && t >= rg->to) { /* Remove entire region */
    			del += rg->to - rg->from;
    			list_del(&rg->link);
    			kfree(rg);
    			continue;
    		}
    
    		if (f <= rg->from) {	/* Trim beginning of region */
    			del += t - rg->from;
    			rg->from = t;
    		} else {		/* Trim end of region */
    			del += rg->to - f;
    			rg->to = f;
    		}
    	}
    
    	spin_unlock(&resv->lock);
    	kfree(nrg);
    	return del;
    }
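
/*
 * Illustrative sketch of typical region_del() usage.  A truncate-style
 * caller passes t == LONG_MAX and therefore never needs to split a region
 * or handle -ENOMEM; a hole-punch style caller deletes a bounded range and
 * must cope with allocation failure.  The function name and offsets are
 * hypothetical.
 */
#if 0	/* example only */
static void example_remove_regions(struct resv_map *resv)
{
	long freed;

	/* Hole punch: may have to split an existing region. */
	freed = region_del(resv, 2, 4);
	if (freed == -ENOMEM)
		return;		/* real callers retry or fix up the counts */

	/* Truncate everything from offset 8 onwards: never splits, never fails. */
	freed = region_del(resv, 8, LONG_MAX);
}
#endif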
    
    /*
 * A rare out-of-memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was freed
     * and removed from the page cache.  This routine will adjust the subpool
     * usage count, and the global reserve count if needed.  By incrementing
     * these counts, the reserve map entry which could not be deleted will
     * appear as a "reserved" entry instead of simply dangling with incorrect
     * counts.
     */
    void hugetlb_fix_reserve_counts(struct inode *inode)
    {
    	struct hugepage_subpool *spool = subpool_inode(inode);
    	long rsv_adjust;
    
    	rsv_adjust = hugepage_subpool_get_pages(spool, 1);
    	if (rsv_adjust) {
    		struct hstate *h = hstate_inode(inode);
    
    		hugetlb_acct_memory(h, 1);
    	}
    }
    
    /*
     * Count and return the number of huge pages in the reserve map
     * that intersect with the range [f, t).
     */
    static long region_count(struct resv_map *resv, long f, long t)
    {
    	struct list_head *head = &resv->regions;
    	struct file_region *rg;
    	long chg = 0;
    
    	spin_lock(&resv->lock);
    	/* Locate each segment we overlap with, and count that overlap. */
    	list_for_each_entry(rg, head, link) {
    		long seg_from;
    		long seg_to;
    
    		if (rg->to <= f)
    			continue;
    		if (rg->from >= t)
    			break;
    
    		seg_from = max(rg->from, f);
    		seg_to = min(rg->to, t);
    
    		chg += seg_to - seg_from;
    	}
    	spin_unlock(&resv->lock);
    
    	return chg;
    }
    
    /*
     * Convert the address within this vma to the page offset within
     * the mapping, in pagecache page units; huge pages here.
     */
    static pgoff_t vma_hugecache_offset(struct hstate *h,
    			struct vm_area_struct *vma, unsigned long address)
    {
    	return ((address - vma->vm_start) >> huge_page_shift(h)) +
    			(vma->vm_pgoff >> huge_page_order(h));
    }
    
    pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
    				     unsigned long address)
    {
    	return vma_hugecache_offset(hstate_vma(vma), vma, address);
    }
    EXPORT_SYMBOL_GPL(linear_hugepage_index);
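
/*
 * Worked example for the offset calculation above, assuming 2MB huge pages
 * (huge_page_shift == 21, huge_page_order == 9) and hypothetical numbers:
 * for a VMA with vm_start == 0x40000000 and vm_pgoff == 0, the address
 * 0x40400000 maps to ((0x40400000 - 0x40000000) >> 21) + (0 >> 9) = 2,
 * i.e. the third huge-page-sized unit in the mapping's page cache.
 */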
    
    /*
     * Return the size of the pages allocated when backing a VMA. In the majority
 * of cases this will be the same size as that used by the page table entries.
     */
    unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
    {
    	struct hstate *hstate;
    
    	if (!is_vm_hugetlb_page(vma))
    		return PAGE_SIZE;
    
    	hstate = hstate_vma(vma);
    
    	return 1UL << huge_page_shift(hstate);
    }
    EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
    
    /*
     * Return the page size being used by the MMU to back a VMA. In the majority
     * of cases, the page size used by the kernel matches the MMU size. On
     * architectures where it differs, an architecture-specific version of this
     * function is required.
     */
    #ifndef vma_mmu_pagesize
    unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
    {
    	return vma_kernel_pagesize(vma);
    }
    #endif
    
    /*
     * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
     * bits of the reservation map pointer, which are always clear due to
     * alignment.
     */
    #define HPAGE_RESV_OWNER    (1UL << 0)
    #define HPAGE_RESV_UNMAPPED (1UL << 1)
    #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
    
    /*
     * These helpers are used to track how many pages are reserved for
     * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have its future faults succeed.
     *
     * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
     * the reserve counters are updated with the hugetlb_lock held. It is safe
     * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result.
     *
 * The private mapping reservation is represented in a subtly different
 * manner from a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and these entries persist
 * even after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs that
 * reference it; this region map represents those offsets which have consumed
 * a reservation, i.e. where pages have been instantiated.
     */
    static unsigned long get_vma_private_data(struct vm_area_struct *vma)
    {
    	return (unsigned long)vma->vm_private_data;
    }
    
    static void set_vma_private_data(struct vm_area_struct *vma,
    							unsigned long value)
    {
    	vma->vm_private_data = (void *)value;
    }
    
    struct resv_map *resv_map_alloc(void)
    {
    	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
    	struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
    
    	if (!resv_map || !rg) {
    		kfree(resv_map);
    		kfree(rg);
    		return NULL;
    	}
    
    	kref_init(&resv_map->refs);
    	spin_lock_init(&resv_map->lock);
    	INIT_LIST_HEAD(&resv_map->regions);
    
    	resv_map->adds_in_progress = 0;
    
    	INIT_LIST_HEAD(&resv_map->region_cache);
    	list_add(&rg->link, &resv_map->region_cache);
    	resv_map->region_cache_count = 1;
    
    	return resv_map;
    }
    
    void resv_map_release(struct kref *ref)
    {
    	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
    	struct list_head *head = &resv_map->region_cache;
    	struct file_region *rg, *trg;
    
    	/* Clear out any active regions before we release the map. */
    	region_del(resv_map, 0, LONG_MAX);
    
    	/* ... and any entries left in the cache */
    	list_for_each_entry_safe(rg, trg, head, link) {
    		list_del(&rg->link);
    		kfree(rg);
    	}
    
    	VM_BUG_ON(resv_map->adds_in_progress);
    
    	kfree(resv_map);
    }
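
/*
 * Illustrative sketch of the resv_map lifetime: the map is reference
 * counted, and resv_map_release() above runs only when the final kref is
 * dropped.  The function name is hypothetical.
 */
#if 0	/* example only */
static void example_resv_map_lifecycle(void)
{
	struct resv_map *resv = resv_map_alloc();	/* refcount == 1 */

	if (!resv)
		return;

	kref_get(&resv->refs);				/* second user */
	kref_put(&resv->refs, resv_map_release);	/* map still alive */
	kref_put(&resv->refs, resv_map_release);	/* freed here */
}
#endif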
    
    static inline struct resv_map *inode_resv_map(struct inode *inode)
    {
    	return inode->i_mapping->private_data;
    }
    
    static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
    {
    	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
    	if (vma->vm_flags & VM_MAYSHARE) {
    		struct address_space *mapping = vma->vm_file->f_mapping;
    		struct inode *inode = mapping->host;
    
    		return inode_resv_map(inode);
    
    	} else {
    		return (struct resv_map *)(get_vma_private_data(vma) &
    							~HPAGE_RESV_MASK);
    	}
    }
    
    static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
    {
    	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
    	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
    
    	set_vma_private_data(vma, (get_vma_private_data(vma) &
    				HPAGE_RESV_MASK) | (unsigned long)map);
    }
    
    static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
    {
    	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
    	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
    
    	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
    }
    
    static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
    {
    	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
    
    	return (get_vma_private_data(vma) & flag) != 0;
    }
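
/*
 * Illustrative sketch of how the reservation map pointer and HPAGE_RESV_*
 * flags share vm_private_data for a private mapping.  Because the resv_map
 * is kmalloc()ed its address is aligned, leaving the bottom two bits free
 * for the flags.  The function name is hypothetical; the pairing of
 * set_vma_resv_map() with HPAGE_RESV_OWNER mirrors what the reservation
 * setup code in this file does for MAP_PRIVATE mappings.
 */
#if 0	/* example only */
static void example_private_resv_setup(struct vm_area_struct *vma,
				       struct resv_map *resv)
{
	/* Store the map pointer in the upper bits of vm_private_data. */
	set_vma_resv_map(vma, resv);

	/* Mark this VMA as the owner of the reservation. */
	set_vma_resv_flags(vma, HPAGE_RESV_OWNER);

	/* Both pieces can be read back independently. */
	VM_BUG_ON(vma_resv_map(vma) != resv);
	VM_BUG_ON(!is_vma_resv_set(vma, HPAGE_RESV_OWNER));
}
#endif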
    
    /* Reset counters to 0 and clear all HPAGE_RESV_* flags */
    void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
    {
    	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
    	if (!(vma->vm_flags & VM_MAYSHARE))
    		vma->vm_private_data = (void *)0;
    }
    
    /* Returns true if the VMA has associated reserve pages */
    static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
    {
    	if (vma->vm_flags & VM_NORESERVE) {
    		/*
		 * This address is already reserved by another process (chg == 0),
		 * so we should decrement the reserved count.  Without decrementing,
		 * the reserve count remains after the inode is released, because this
		 * allocated page will go into the page cache and be regarded as
		 * coming from the reserved pool in the release step.  Currently, we
		 * don't have any other solution to deal with this situation
		 * properly, so add a work-around here.
    		 */
    		if (vma->vm_flags & VM_MAYSHARE && chg == 0)
    			return true;
    		else
    			return false;
    	}
    
    	/* Shared mappings always use reserves */
    	if (vma->vm_flags & VM_MAYSHARE) {
    		/*
    		 * We know VM_NORESERVE is not set.  Therefore, there SHOULD
    		 * be a region map for all pages.  The only situation where
    		 * there is no region map is if a hole was punched via
		 * fallocate.  In this case, there really are no reserves to
    		 * use.  This situation is indicated if chg != 0.
    		 */
    		if (chg)
    			return false;
    		else
    			return true;
    	}
    
    	/*
    	 * Only the process that called mmap() has reserves for
    	 * private mappings.
    	 */
    	if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
    		/*
    		 * Like the shared case above, a hole punch or truncate
    		 * could have been performed on the private mapping.
    		 * Examine the value of chg to determine if reserves
    		 * actually exist or were previously consumed.
    		 * Very Subtle - The value of chg comes from a previous
    		 * call to vma_needs_reserves().  The reserve map for
    		 * private mappings has different (opposite) semantics
    		 * than that of shared mappings.  vma_needs_reserves()
    		 * has already taken this difference in semantics into
    		 * account.  Therefore, the meaning of chg is the same
    		 * as in the shared case above.  Code could easily be
    		 * combined, but keeping it separate draws attention to
    		 * subtle differences.
    		 */
    		if (chg)
    			return false;
    		else
    			return true;
    	}
    
    	return false;
    }
    
    static void enqueue_huge_page(struct hstate *h, struct page *page)
    {
    	int nid = page_to_nid(page);
    	list_move(&page->lru, &h->hugepage_freelists[nid]);
    	h->free_huge_pages++;
    	h->free_huge_pages_node[nid]++;
    }
    
    static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
    {
    	struct page *page;
    
    	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
    		if (!is_migrate_isolate_page(page))
    			break;
    	/*
	 * If no non-isolated free hugepage was found on the list,
	 * the allocation fails.
    	 */
    	if (&h->hugepage_freelists[nid] == &page->lru)
    		return NULL;
    	list_move(&page->lru, &h->hugepage_activelist);
    	set_page_refcounted(page);
    	h->free_huge_pages--;
    	h->free_huge_pages_node[nid]--;
    	return page;
    }
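
/*
 * Illustrative sketch: dequeue_huge_page_node() manipulates the free lists
 * and counters protected by hugetlb_lock, so callers must hold that lock,
 * as below.  The function name is hypothetical.
 */
#if 0	/* example only */
static struct page *example_grab_free_page(struct hstate *h, int nid)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_node(h, nid);
	spin_unlock(&hugetlb_lock);

	return page;	/* NULL if the node has no usable free huge page */
}
#endif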
    
    /* Movability of hugepages depends on migration support. */
    static inline gfp_t htlb_alloc_mask(struct hstate *h)
    {
    	if (hugepages_treat_as_movable || hugepage_migration_supported(h))
    		return GFP_HIGHUSER_MOVABLE;
    	else
    		return GFP_HIGHUSER;
    }
    
    static struct page *dequeue_huge_page_vma(struct hstate *h,
    				struct vm_area_struct *vma,
    				unsigned long address, int avoid_reserve,
    				long chg)
    {
    	struct page *page = NULL;
    	struct mempolicy *mpol;
    	nodemask_t *nodemask;
    	struct zonelist *zonelist;
    	struct zone *zone;
    	struct zoneref *z;
    	unsigned int cpuset_mems_cookie;
    
    	/*
	 * A child process with MAP_PRIVATE mappings created by its parent
	 * has no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed.
    	 */
    	if (!vma_has_reserves(vma, chg) &&
    			h->free_huge_pages - h->resv_huge_pages == 0)
    		goto err;
    
    	/* If reserves cannot be used, ensure enough pages are in the pool */
    	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
    		goto err;
    
    retry_cpuset:
    	cpuset_mems_cookie = read_mems_allowed_begin();
    	zonelist = huge_zonelist(vma, address,
    					htlb_alloc_mask(h), &mpol, &nodemask);
    
    	for_each_zone_zonelist_nodemask(zone, z, zonelist,
    						MAX_NR_ZONES - 1, nodemask) {
    		if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
    			page = dequeue_huge_page_node(h, zone_to_nid(zone));
    			if (page) {
    				if (avoid_reserve)
    					break;
    				if (!vma_has_reserves(vma, chg))
    					break;
    
    				SetPagePrivate(page);
    				h->resv_huge_pages--;
    				break;
    			}
    		}
    	}
    
    	mpol_cond_put(mpol);
    	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
    		goto retry_cpuset;
    	return page;
    
    err:
    	return NULL;
    }
    
    /*
     * common helper functions for hstate_next_node_to_{alloc|free}.
     * We may have allocated or freed a huge page based on a different
     * nodes_allowed previously, so h->next_node_to_{alloc|free} might
     * be outside of *nodes_allowed.  Ensure that we use an allowed
     * node for alloc or free.
     */
    static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
    {
    	nid = next_node_in(nid, *nodes_allowed);
    	VM_BUG_ON(nid >= MAX_NUMNODES);
    
    	return nid;
    }
    
    static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
    {
    	if (!node_isset(nid, *nodes_allowed))
    		nid = next_node_allowed(nid, nodes_allowed);
    	return nid;
    }
    
    /*
     * returns the previously saved node ["this node"] from which to
     * allocate a persistent huge page for the pool and advance the
     * next node from which to allocate, handling wrap at end of node
     * mask.
     */
    static int hstate_next_node_to_alloc(struct hstate *h,
    					nodemask_t *nodes_allowed)
    {
    	int nid;
    
    	VM_BUG_ON(!nodes_allowed);
    
    	nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
    	h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
    
    	return nid;
    }
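
/*
 * Illustrative sketch of the round-robin behaviour: with nodes {0, 1}
 * allowed, successive calls to hstate_next_node_to_alloc() alternate
 * between node 0 and node 1, wrapping at the end of the node mask.  The
 * function name is hypothetical.
 */
#if 0	/* example only */
static void example_round_robin(struct hstate *h, nodemask_t *nodes_allowed)
{
	int i;

	for (i = 0; i < 4; i++) {
		int nid = hstate_next_node_to_alloc(h, nodes_allowed);

		pr_debug("next persistent huge page goes to node %d\n", nid);
	}
}
#endif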
    
    /*
     * helper for free_pool_huge_page() - return the previously saved
     * node ["this node"] from which to free a huge page.  Advance the
     * next node id whether or not we find a free huge page to free so
     * that the next attempt to free addresses the next node.
     */