1. 08 Oct, 2016 40 commits
    • Robert Ho's avatar
      mm, proc: fix region lost in /proc/self/smaps · 855af072
      Robert Ho authored
      Recently, Redhat reported that nvml test suite failed on QEMU/KVM,
      more detailed info please refer to:
      
         https://bugzilla.redhat.com/show_bug.cgi?id=1365721
      
      Actually, this bug is not only for NVDIMM/DAX but also for any other
      file systems.  This simple test case abstracted from nvml can easily
      reproduce this bug in common environment:
      
      -------------------------- testcase.c -----------------------------
      
      int
      is_pmem_proc(const void *addr, size_t len)
      {
              const char *caddr = addr;
      
              FILE *fp;
              if ((fp = fopen("/proc/self/smaps", "r")) == NULL) {
                      printf("!/proc/self/smaps");
                      return 0;
              }
      
              int retval = 0;         /* assume false until proven otherwise */
              char line[PROCMAXLEN];  /* for fgets() */
              char *lo = NULL;        /* beginning of current range in smaps file */
              char *hi = NULL;        /* end of current range in smaps file */
              int needmm = 0;         /* looking for mm flag for current range */
              while (fgets(line, PROCMAXLEN, fp) != NULL) {
                      static const char vmflags[] = "VmFlags:";
                      static const char mm[] = " wr";
      
                      /* check for range line */
                      if (sscanf(line, "%p-%p", &lo, &hi) == 2) {
                              if (needmm) {
                                      /* last range matched, but no mm flag found */
                                      printf("never found mm flag.\n");
                                      break;
                              } else if (caddr < lo) {
                                      /* never found the range for caddr */
                                      printf("#######no match for addr %p.\n", caddr);
                                      break;
                              } else if (caddr < hi) {
                                      /* start address is in this range */
                                      size_t rangelen = (size_t)(hi - caddr);
      
                                      /* remember that matching has started */
                                      needmm = 1;
      
                                      /* calculate remaining range to search for */
                                      if (len > rangelen) {
                                              len -= rangelen;
                                              caddr += rangelen;
                                              printf("matched %zu bytes in range "
                                                      "%p-%p, %zu left over.\n",
                                                              rangelen, lo, hi, len);
                                      } else {
                                              len = 0;
                                              printf("matched all bytes in range "
                                                              "%p-%p.\n", lo, hi);
                                      }
                              }
                      } else if (needmm && strncmp(line, vmflags,
                                              sizeof(vmflags) - 1) == 0) {
                              if (strstr(&line[sizeof(vmflags) - 1], mm) != NULL) {
                                      printf("mm flag found.\n");
                                      if (len == 0) {
                                              /* entire range matched */
                                              retval = 1;
                                              break;
                                      }
                                      needmm = 0;     /* saw what was needed */
                              } else {
                                      /* mm flag not set for some or all of range */
                                      printf("range has no mm flag.\n");
                                      break;
                              }
                      }
              }
      
              fclose(fp);
      
              printf("returning %d.\n", retval);
              return retval;
      }
      
      void *Addr;
      size_t Size;
      
      /*
       * worker -- the work each thread performs
       */
      static void *
      worker(void *arg)
      {
              int *ret = (int *)arg;
              *ret =  is_pmem_proc(Addr, Size);
              return NULL;
      }
      
      int main(int argc, char *argv[])
      {
              if (argc <  2 || argc > 3) {
                      printf("usage: %s file [env].\n", argv[0]);
                      return -1;
              }
      
              int fd = open(argv[1], O_RDWR);
      
              struct stat stbuf;
              fstat(fd, &stbuf);
      
              Size = stbuf.st_size;
              Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
      
              close(fd);
      
              pthread_t threads[NTHREAD];
              int ret[NTHREAD];
      
              /* kick off NTHREAD threads */
              for (int i = 0; i < NTHREAD; i++)
                      pthread_create(&threads[i], NULL, worker, &ret[i]);
      
              /* wait for all the threads to complete */
              for (int i = 0; i < NTHREAD; i++)
                      pthread_join(threads[i], NULL);
      
              /* verify that all the threads return the same value */
              for (int i = 1; i < NTHREAD; i++) {
                      if (ret[0] != ret[i]) {
                              printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
                                      ret[0], ret[i]);
                      }
              }
      
              printf("%d", ret[0]);
              return 0;
      }
      
      It failed as some threads can not find the memory region in
      "/proc/self/smaps" which is allocated in the main process
      
      It is caused by proc fs which uses 'file->version' to indicate the VMA that
      is the last one has already been handled by read() system call. When the
      next read() issues, it uses the 'version' to find the VMA, then the next
      VMA is what we want to handle, the related code is as follows:
      
              if (last_addr) {
                      vma = find_vma(mm, last_addr);
                      if (vma && (vma = m_next_vma(priv, vma)))
                              return vma;
              }
      
      However, VMA will be lost if the last VMA is gone, e.g:
      
      The process VMA list is A->B->C->D
      
      CPU 0                                  CPU 1
      read() system call
         handle VMA B
         version = B
      return to userspace
      
                                         unmap VMA B
      
      issue read() again to continue to get
      the region info
         find_vma(version) will get VMA C
         m_next_vma(C) will get VMA D
         handle D
         !!! VMA C is lost !!!
      
      In order to fix this bug, we make 'file->version' indicate the end address
      of the current VMA.  m_start will then look up a vma which with vma_start
      < last_vm_end and moves on to the next vma if we found the same or an
      overlapping vma.  This will guarantee that we will not miss an exclusive
      vma but we can still miss one if the previous vma was shrunk.  This is
      acceptable because guaranteeing "never miss a vma" is simply not feasible.
      User has to cope with some inconsistencies if the file is not read in one
      go.
      
      [mhocko@suse.com: changelog fixes]
      Link: http://lkml.kernel.org/r/1475296958-27652-1-git-send-email-robert.hu@intel.comAcked-by: default avatarDave Hansen <dave.hansen@intel.com>
      Signed-off-by: default avatarXiao Guangrong <guangrong.xiao@linux.intel.com>
      Signed-off-by: default avatarRobert Hu <robert.hu@intel.com>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Acked-by: default avatarOleg Nesterov <oleg@redhat.com>
      Cc: Paolo Bonzini <pbonzini@redhat.com>
      Cc: Dan Williams <dan.j.williams@intel.com>
      Cc: Gleb Natapov <gleb@kernel.org>
      Cc: Marcelo Tosatti <mtosatti@redhat.com>
      Cc: Stefan Hajnoczi <stefanha@redhat.com>
      Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      855af072
    • John Stultz's avatar
      proc: fix timerslack_ns CAP_SYS_NICE check when adjusting self · 4b2bd5fe
      John Stultz authored
      In changing from checking ptrace_may_access(p, PTRACE_MODE_ATTACH_FSCREDS)
      to capable(CAP_SYS_NICE), I missed that ptrace_my_access succeeds when p
      == current, but the CAP_SYS_NICE doesn't.
      
      Thus while the previous commit was intended to loosen the needed
      privileges to modify a processes timerslack, it needlessly restricted a
      task modifying its own timerslack via the proc/<tid>/timerslack_ns
      (which is permitted also via the PR_SET_TIMERSLACK method).
      
      This patch corrects this by checking if p == current before checking the
      CAP_SYS_NICE value.
      
      This patch applies on top of my two previous patches currently in -mm
      
      Link: http://lkml.kernel.org/r/1471906870-28624-1-git-send-email-john.stultz@linaro.orgSigned-off-by: default avatarJohn Stultz <john.stultz@linaro.org>
      Acked-by: default avatarKees Cook <keescook@chromium.org>
      Cc: "Serge E. Hallyn" <serge@hallyn.com>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: Arjan van de Ven <arjan@linux.intel.com>
      Cc: Oren Laadan <orenl@cellrox.com>
      Cc: Ruchi Kandoi <kandoiruchi@google.com>
      Cc: Rom Lemarchand <romlem@android.com>
      Cc: Todd Kjos <tkjos@google.com>
      Cc: Colin Cross <ccross@android.com>
      Cc: Nick Kralevich <nnk@google.com>
      Cc: Dmitry Shmidt <dimitrysh@google.com>
      Cc: Elliott Hughes <enh@google.com>
      Cc: Android Kernel Team <kernel-team@android.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      4b2bd5fe
    • John Stultz's avatar
      proc: add LSM hook checks to /proc/<tid>/timerslack_ns · 904763e1
      John Stultz authored
      As requested, this patch checks the existing LSM hooks
      task_getscheduler/task_setscheduler when reading or modifying the task's
      timerslack value.
      
      Previous versions added new get/settimerslack LSM hooks, but since they
      checked the same PROCESS__SET/GETSCHED values as existing hooks, it was
      suggested we just use the existing ones.
      
      Link: http://lkml.kernel.org/r/1469132667-17377-2-git-send-email-john.stultz@linaro.orgSigned-off-by: default avatarJohn Stultz <john.stultz@linaro.org>
      Cc: Kees Cook <keescook@chromium.org>
      Cc: "Serge E. Hallyn" <serge@hallyn.com>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: Arjan van de Ven <arjan@linux.intel.com>
      Cc: Oren Laadan <orenl@cellrox.com>
      Cc: Ruchi Kandoi <kandoiruchi@google.com>
      Cc: Rom Lemarchand <romlem@android.com>
      Cc: Todd Kjos <tkjos@google.com>
      Cc: Colin Cross <ccross@android.com>
      Cc: Nick Kralevich <nnk@google.com>
      Cc: Dmitry Shmidt <dimitrysh@google.com>
      Cc: Elliott Hughes <enh@google.com>
      Cc: James Morris <jmorris@namei.org>
      Cc: Android Kernel Team <kernel-team@android.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      904763e1
    • John Stultz's avatar
      proc: relax /proc/<tid>/timerslack_ns capability requirements · 7abbaf94
      John Stultz authored
      When an interface to allow a task to change another tasks timerslack was
      first proposed, it was suggested that something greater then
      CAP_SYS_NICE would be needed, as a task could be delayed further then
      what normally could be done with nice adjustments.
      
      So CAP_SYS_PTRACE was adopted instead for what became the
      /proc/<tid>/timerslack_ns interface.  However, for Android (where this
      feature originates), giving the system_server CAP_SYS_PTRACE would allow
      it to observe and modify all tasks memory.  This is considered too high
      a privilege level for only needing to change the timerslack.
      
      After some discussion, it was realized that a CAP_SYS_NICE process can
      set a task as SCHED_FIFO, so they could fork some spinning processes and
      set them all SCHED_FIFO 99, in effect delaying all other tasks for an
      infinite amount of time.
      
      So as a CAP_SYS_NICE task can already cause trouble for other tasks,
      using it as a required capability for accessing and modifying
      /proc/<tid>/timerslack_ns seems sufficient.
      
      Thus, this patch loosens the capability requirements to CAP_SYS_NICE and
      removes CAP_SYS_PTRACE, simplifying some of the code flow as well.
      
      This is technically an ABI change, but as the feature just landed in
      4.6, I suspect no one is yet using it.
      
      Link: http://lkml.kernel.org/r/1469132667-17377-1-git-send-email-john.stultz@linaro.orgSigned-off-by: default avatarJohn Stultz <john.stultz@linaro.org>
      Reviewed-by: default avatarNick Kralevich <nnk@google.com>
      Acked-by: default avatarSerge Hallyn <serge@hallyn.com>
      Acked-by: default avatarKees Cook <keescook@chromium.org>
      Cc: Kees Cook <keescook@chromium.org>
      Cc: "Serge E. Hallyn" <serge@hallyn.com>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: Arjan van de Ven <arjan@linux.intel.com>
      Cc: Oren Laadan <orenl@cellrox.com>
      Cc: Ruchi Kandoi <kandoiruchi@google.com>
      Cc: Rom Lemarchand <romlem@android.com>
      Cc: Todd Kjos <tkjos@google.com>
      Cc: Colin Cross <ccross@android.com>
      Cc: Nick Kralevich <nnk@google.com>
      Cc: Dmitry Shmidt <dimitrysh@google.com>
      Cc: Elliott Hughes <enh@google.com>
      Cc: Android Kernel Team <kernel-team@android.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      7abbaf94
    • Joe Perches's avatar
      meminfo: break apart a very long seq_printf with #ifdefs · e16e2d8e
      Joe Perches authored
      Use a specific routine to emit most lines so that the code is easier to
      read and maintain.
      
      akpm:
         text    data     bss     dec     hex filename
         2976       8       0    2984     ba8 fs/proc/meminfo.o before
         2669       8       0    2677     a75 fs/proc/meminfo.o after
      
      Link: http://lkml.kernel.org/r/8fce7fdef2ba081a4ef531594e97da8a9feebb58.1470810406.git.joe@perches.comSigned-off-by: default avatarJoe Perches <joe@perches.com>
      Cc: Andi Kleen <andi@firstfloor.org>
      Cc: Alexey Dobriyan <adobriyan@gmail.com>
      Cc: Al Viro <viro@zeniv.linux.org.uk>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      e16e2d8e
    • Joe Perches's avatar
      seq/proc: modify seq_put_decimal_[u]ll to take a const char *, not char · 75ba1d07
      Joe Perches authored
      Allow some seq_puts removals by taking a string instead of a single
      char.
      
      [akpm@linux-foundation.org: update vmstat_show(), per Joe]
      Link: http://lkml.kernel.org/r/667e1cf3d436de91a5698170a1e98d882905e956.1470704995.git.joe@perches.comSigned-off-by: default avatarJoe Perches <joe@perches.com>
      Cc: Joe Perches <joe@perches.com>
      Cc: Andi Kleen <andi@firstfloor.org>
      Cc: Al Viro <viro@zeniv.linux.org.uk>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      75ba1d07
    • Alexey Dobriyan's avatar
      proc: faster /proc/*/status · f7a5f132
      Alexey Dobriyan authored
      top(1) opens the following files for every PID:
      
      	/proc/*/stat
      	/proc/*/statm
      	/proc/*/status
      
      This patch switches /proc/*/status away from seq_printf().
      The result is 13.5% speedup.
      
      Benchmark is open("/proc/self/status")+read+close 1.000.000 million times.
      
      				BEFORE
      $ perf stat -r 10 taskset -c 3 ./proc-self-status
      
       Performance counter stats for 'taskset -c 3 ./proc-self-status' (10 runs):
      
            10748.474301      task-clock (msec)         #    0.954 CPUs utilized            ( +-  0.91% )
                      12      context-switches          #    0.001 K/sec                    ( +-  1.09% )
                       1      cpu-migrations            #    0.000 K/sec
                     104      page-faults               #    0.010 K/sec                    ( +-  0.45% )
          37,424,127,876      cycles                    #    3.482 GHz                      ( +-  0.04% )
           8,453,010,029      stalled-cycles-frontend   #   22.59% frontend cycles idle     ( +-  0.12% )
           3,747,609,427      stalled-cycles-backend    #  10.01% backend cycles idle       ( +-  0.68% )
          65,632,764,147      instructions              #    1.75  insn per cycle
                                                        #    0.13  stalled cycles per insn  ( +-  0.00% )
          13,981,324,775      branches                  # 1300.773 M/sec                    ( +-  0.00% )
             138,967,110      branch-misses             #    0.99% of all branches          ( +-  0.18% )
      
            11.263885428 seconds time elapsed                                          ( +-  0.04% )
            ^^^^^^^^^^^^
      
      				AFTER
      $ perf stat -r 10 taskset -c 3 ./proc-self-status
      
       Performance counter stats for 'taskset -c 3 ./proc-self-status' (10 runs):
      
             9010.521776      task-clock (msec)         #    0.925 CPUs utilized            ( +-  1.54% )
                      11      context-switches          #    0.001 K/sec                    ( +-  1.54% )
                       1      cpu-migrations            #    0.000 K/sec                    ( +- 11.11% )
                     103      page-faults               #    0.011 K/sec                    ( +-  0.60% )
          32,352,310,603      cycles                    #    3.591 GHz                      ( +-  0.07% )
           7,849,199,578      stalled-cycles-frontend   #   24.26% frontend cycles idle     ( +-  0.27% )
           3,269,738,842      stalled-cycles-backend    #  10.11% backend cycles idle       ( +-  0.73% )
          56,012,163,567      instructions              #    1.73  insn per cycle
                                                        #    0.14  stalled cycles per insn  ( +-  0.00% )
          11,735,778,795      branches                  # 1302.453 M/sec                    ( +-  0.00% )
              98,084,459      branch-misses             #    0.84% of all branches          ( +-  0.28% )
      
             9.741247736 seconds time elapsed                                          ( +-  0.07% )
             ^^^^^^^^^^^
      
      Link: http://lkml.kernel.org/r/20160806125608.GB1187@p183.telecom.bySigned-off-by: default avatarAlexey Dobriyan <adobriyan@gmail.com>
      Cc: Joe Perches <joe@perches.com>
      Cc: Andi Kleen <andi@firstfloor.org>
      Cc: Al Viro <viro@zeniv.linux.org.uk>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      f7a5f132
    • Alexey Dobriyan's avatar
      proc: much faster /proc/vmstat · 68ba0326
      Alexey Dobriyan authored
      Every current KDE system has process named ksysguardd polling files
      below once in several seconds:
      
      	$ strace -e trace=open -p $(pidof ksysguardd)
      	Process 1812 attached
      	open("/etc/mtab", O_RDONLY|O_CLOEXEC)   = 8
      	open("/etc/mtab", O_RDONLY|O_CLOEXEC)   = 8
      	open("/proc/net/dev", O_RDONLY)         = 8
      	open("/proc/net/wireless", O_RDONLY)    = -1 ENOENT (No such file or directory)
      	open("/proc/stat", O_RDONLY)            = 8
      	open("/proc/vmstat", O_RDONLY)          = 8
      
      Hell knows what it is doing but speed up reading /proc/vmstat by 33%!
      
      Benchmark is open+read+close 1.000.000 times.
      
      			BEFORE
      $ perf stat -r 10 taskset -c 3 ./proc-vmstat
      
       Performance counter stats for 'taskset -c 3 ./proc-vmstat' (10 runs):
      
            13146.768464      task-clock (msec)         #    0.960 CPUs utilized            ( +-  0.60% )
                      15      context-switches          #    0.001 K/sec                    ( +-  1.41% )
                       1      cpu-migrations            #    0.000 K/sec                    ( +- 11.11% )
                     104      page-faults               #    0.008 K/sec                    ( +-  0.57% )
          45,489,799,349      cycles                    #    3.460 GHz                      ( +-  0.03% )
           9,970,175,743      stalled-cycles-frontend   #   21.92% frontend cycles idle     ( +-  0.10% )
           2,800,298,015      stalled-cycles-backend    #   6.16% backend cycles idle       ( +-  0.32% )
          79,241,190,850      instructions              #    1.74  insn per cycle
                                                        #    0.13  stalled cycles per insn  ( +-  0.00% )
          17,616,096,146      branches                  # 1339.956 M/sec                    ( +-  0.00% )
             176,106,232      branch-misses             #    1.00% of all branches          ( +-  0.18% )
      
            13.691078109 seconds time elapsed                                          ( +-  0.03% )
            ^^^^^^^^^^^^
      
      			AFTER
      $ perf stat -r 10 taskset -c 3 ./proc-vmstat
      
       Performance counter stats for 'taskset -c 3 ./proc-vmstat' (10 runs):
      
             8688.353749      task-clock (msec)         #    0.950 CPUs utilized            ( +-  1.25% )
                      10      context-switches          #    0.001 K/sec                    ( +-  2.13% )
                       1      cpu-migrations            #    0.000 K/sec
                     104      page-faults               #    0.012 K/sec                    ( +-  0.56% )
          30,384,010,730      cycles                    #    3.497 GHz                      ( +-  0.07% )
          12,296,259,407      stalled-cycles-frontend   #   40.47% frontend cycles idle     ( +-  0.13% )
           3,370,668,651      stalled-cycles-backend    #  11.09% backend cycles idle       ( +-  0.69% )
          28,969,052,879      instructions              #    0.95  insn per cycle
                                                        #    0.42  stalled cycles per insn  ( +-  0.01% )
           6,308,245,891      branches                  #  726.058 M/sec                    ( +-  0.00% )
             214,685,502      branch-misses             #    3.40% of all branches          ( +-  0.26% )
      
             9.146081052 seconds time elapsed                                          ( +-  0.07% )
             ^^^^^^^^^^^
      
      vsnprintf() is slow because:
      
      1. format_decode() is busy looking for format specifier: 2 branches
         per character (not in this case, but in others)
      
      2. approximately million branches while parsing format mini language
         and everywhere
      
      3.  just look at what string() does /proc/vmstat is good case because
         most of its content are strings
      
      Link: http://lkml.kernel.org/r/20160806125455.GA1187@p183.telecom.bySigned-off-by: default avatarAlexey Dobriyan <adobriyan@gmail.com>
      Cc: Joe Perches <joe@perches.com>
      Cc: Andi Kleen <andi@firstfloor.org>
      Cc: Al Viro <viro@zeniv.linux.org.uk>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      68ba0326
    • Vineet Gupta's avatar
      atomic64: no need for CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE · 51a02124
      Vineet Gupta authored
      This came to light when implementing native 64-bit atomics for ARCv2.
      
      The atomic64 self-test code uses CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
      to check whether atomic64_dec_if_positive() is available.  It seems it
      was needed when not every arch defined it.  However as of current code
      the Kconfig option seems needless
      
       - for CONFIG_GENERIC_ATOMIC64 it is auto-enabled in lib/Kconfig and a
         generic definition of API is present lib/atomic64.c
       - arches with native 64-bit atomics select it in arch/*/Kconfig and
         define the API in their headers
      
      So I see no point in keeping the Kconfig option
      
      Compile tested for:
       - blackfin (CONFIG_GENERIC_ATOMIC64)
       - x86 (!CONFIG_GENERIC_ATOMIC64)
       - ia64
      
      Link: http://lkml.kernel.org/r/1473703083-8625-3-git-send-email-vgupta@synopsys.comSigned-off-by: default avatarVineet Gupta <vgupta@synopsys.com>
      Cc: Richard Henderson <rth@twiddle.net>
      Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
      Cc: Matt Turner <mattst88@gmail.com>
      Cc: Russell King <linux@armlinux.org.uk>
      Cc: Catalin Marinas <catalin.marinas@arm.com>
      Cc: Will Deacon <will.deacon@arm.com>
      Cc: Ralf Baechle <ralf@linux-mips.org>
      Cc: "James E.J. Bottomley" <jejb@parisc-linux.org>
      Cc: Helge Deller <deller@gmx.de>
      Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
      Cc: Paul Mackerras <paulus@samba.org>
      Cc: Michael Ellerman <mpe@ellerman.id.au>
      Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
      Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
      Cc: "David S. Miller" <davem@davemloft.net>
      Cc: Chris Metcalf <cmetcalf@mellanox.com>
      Cc: Thomas Gleixner <tglx@linutronix.de>
      Cc: Ingo Molnar <mingo@redhat.com>
      Cc: "H. Peter Anvin" <hpa@zytor.com>
      Cc: Vineet Gupta <vgupta@synopsys.com>
      Cc: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
      Cc: Linus Walleij <linus.walleij@linaro.org>
      Cc: Alexander Potapenko <glider@google.com>
      Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
      Cc: Herbert Xu <herbert@gondor.apana.org.au>
      Cc: Ming Lin <ming.l@ssi.samsung.com>
      Cc: Arnd Bergmann <arnd@arndb.de>
      Cc: Geert Uytterhoeven <geert@linux-m68k.org>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Borislav Petkov <bp@suse.de>
      Cc: Andi Kleen <ak@linux.intel.com>
      Cc: Boqun Feng <boqun.feng@gmail.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      51a02124
    • Vineet Gupta's avatar
      ia64: implement atomic64_dec_if_positive · 445ed0a0
      Vineet Gupta authored
      This is based on s390 version and needed to get rid of
      CONFIG_ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
      
      Link: http://lkml.kernel.org/r/1473703083-8625-2-git-send-email-vgupta@synopsys.comSigned-off-by: default avatarVineet Gupta <vgupta@synopsys.com>
      Reported-by: default avatarkbuild test robot <fengguang.wu@intel.com>
      Cc: Tony Luck <tony.luck@intel.com>
      Cc: Fenghua Yu <fenghua.yu@intel.com>
      Cc: Ingo Molnar <mingo@kernel.org>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      445ed0a0
    • zijun_hu's avatar
      linux/mm.h: canonicalize macro PAGE_ALIGNED() definition · 1061b0d2
      zijun_hu authored
      The macro PAGE_ALIGNED() is prone to cause error because it doesn't
      follow convention to parenthesize parameter @addr within macro body, for
      example unsigned long *ptr = kmalloc(...); PAGE_ALIGNED(ptr + 16); for
      the left parameter of macro IS_ALIGNED(), (unsigned long)(ptr + 16) is
      desired but the actual one is (unsigned long)ptr + 16.
      
      It is fixed by simply canonicalizing macro PAGE_ALIGNED() definition.
      
      Link: http://lkml.kernel.org/r/57EA6AE7.7090807@zoho.comSigned-off-by: default avatarzijun_hu <zijun_hu@htc.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      1061b0d2
    • zhong jiang's avatar
      mm: remove unnecessary condition in remove_inode_hugepages · 72e2936c
      zhong jiang authored
      When the huge page is added to the page cahce (huge_add_to_page_cache),
      the page private flag will be cleared.  since this code
      (remove_inode_hugepages) will only be called for pages in the page
      cahce, PagePrivate(page) will always be false.
      
      The patch remove the code without any functional change.
      
      Link: http://lkml.kernel.org/r/1475113323-29368-1-git-send-email-zhongjiang@huawei.comSigned-off-by: default avatarzhong jiang <zhongjiang@huawei.com>
      Reviewed-by: default avatarNaoya Horiguchi <n-horiguchi@ah.jp.nec.com>
      Reviewed-by: default avatarMike Kravetz <mike.kravetz@oracle.com>
      Tested-by: default avatarMike Kravetz <mike.kravetz@oracle.com>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      72e2936c
    • Michal Hocko's avatar
      mm: warn about allocations which stall for too long · 63f53dea
      Michal Hocko authored
      Currently we do warn only about allocation failures but small
      allocations are basically nofail and they might loop in the page
      allocator for a long time.  Especially when the reclaim cannot make any
      progress - e.g.  GFP_NOFS cannot invoke the oom killer and rely on a
      different context to make a forward progress in case there is a lot
      memory used by filesystems.
      
      Give us at least a clue when something like this happens and warn about
      allocations which take more than 10s.  Print the basic allocation
      context information along with the cumulative time spent in the
      allocation as well as the allocation stack.  Repeat the warning after
      every 10 seconds so that we know that the problem is permanent rather
      than ephemeral.
      
      Link: http://lkml.kernel.org/r/20160929084407.7004-3-mhocko@kernel.orgSigned-off-by: default avatarMichal Hocko <mhocko@suse.com>
      Cc: Vlastimil Babka <vbabka@suse.cz>
      Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
      Cc: Johannes Weiner <hannes@cmpxchg.org>
      Cc: Mel Gorman <mgorman@suse.de>
      Cc: Dave Hansen <dave.hansen@intel.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      63f53dea
    • Michal Hocko's avatar
      mm: consolidate warn_alloc_failed users · 7877cdcc
      Michal Hocko authored
      warn_alloc_failed is currently used from the page and vmalloc
      allocators.  This is a good reuse of the code except that vmalloc would
      appreciate a slightly different warning message.  This is already
      handled by the fmt parameter except that
      
        "%s: page allocation failure: order:%u, mode:%#x(%pGg)"
      
      is printed anyway.  This might be quite misleading because it might be a
      vmalloc failure which leads to the warning while the page allocator is
      not the culprit here.  Fix this by always using the fmt string and only
      print the context that makes sense for the particular context (e.g.
      order makes only very little sense for the vmalloc context).
      
      Rename the function to not miss any user and also because a later patch
      will reuse it also for !failure cases.
      
      Link: http://lkml.kernel.org/r/20160929084407.7004-2-mhocko@kernel.orgSigned-off-by: default avatarMichal Hocko <mhocko@suse.com>
      Acked-by: default avatarVlastimil Babka <vbabka@suse.cz>
      Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
      Cc: Johannes Weiner <hannes@cmpxchg.org>
      Cc: Mel Gorman <mgorman@suse.de>
      Cc: Dave Hansen <dave.hansen@intel.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      7877cdcc
    • Wei Fang's avatar
      vfs,mm: fix a dead loop in truncate_inode_pages_range() · c2a9737f
      Wei Fang authored
      We triggered a deadloop in truncate_inode_pages_range() on 32 bits
      architecture with the test case bellow:
      
      	...
      	fd = open();
      	write(fd, buf, 4096);
      	preadv64(fd, &iovec, 1, 0xffffffff000);
      	ftruncate(fd, 0);
      	...
      
      Then ftruncate() will not return forever.
      
      The filesystem used in this case is ubifs, but it can be triggered on
      many other filesystems.
      
      When preadv64() is called with offset=0xffffffff000, a page with
      index=0xffffffff will be added to the radix tree of ->mapping.  Then
      this page can be found in ->mapping with pagevec_lookup().  After that,
      truncate_inode_pages_range(), which is called in ftruncate(), will fall
      into an infinite loop:
      
       - find a page with index=0xffffffff, since index>=end, this page won't
         be truncated
      
       - index++, and index become 0
      
       - the page with index=0xffffffff will be found again
      
      The data type of index is unsigned long, so index won't overflow to 0 on
      64 bits architecture in this case, and the dead loop won't happen.
      
      Since truncate_inode_pages_range() is executed with holding lock of
      inode->i_rwsem, any operation related with this lock will be blocked,
      and a hung task will happen, e.g.:
      
        INFO: task truncate_test:3364 blocked for more than 120 seconds.
        ...
           call_rwsem_down_write_failed+0x17/0x30
           generic_file_write_iter+0x32/0x1c0
           ubifs_write_iter+0xcc/0x170
           __vfs_write+0xc4/0x120
           vfs_write+0xb2/0x1b0
           SyS_write+0x46/0xa0
      
      The page with index=0xffffffff added to ->mapping is useless.  Fix this
      by checking the read position before allocating pages.
      
      Link: http://lkml.kernel.org/r/1475151010-40166-1-git-send-email-fangwei1@huawei.comSigned-off-by: default avatarWei Fang <fangwei1@huawei.com>
      Cc: Christoph Hellwig <hch@infradead.org>
      Cc: Dave Chinner <david@fromorbit.com>
      Cc: Al Viro <viro@zeniv.linux.org.uk>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      c2a9737f
    • Yisheng Xie's avatar
      arm64 Kconfig: select gigantic page · 14f09910
      Yisheng Xie authored
      Arm64 supports gigantic pages after commit 084bd298 ("ARM64: mm:
      HugeTLB support.") however, it can only be allocated at boottime and
      can't be freed.
      
      This patch selects ARCH_HAS_GIGANTIC_PAGE to make gigantic pages can be
      allocated and freed at runtime for arch arm64.
      
      Link: http://lkml.kernel.org/r/1475227569-63446-3-git-send-email-xieyisheng1@huawei.comSigned-off-by: default avatarYisheng Xie <xieyisheng1@huawei.com>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Acked-by: default avatarCatalin Marinas <catalin.marinas@arm.com>
      Acked-by: default avatarHillf Danton <hillf.zj@alibaba-inc.com>
      Cc: Hanjun Guo <guohanjun@huawei.com>
      Cc: Will Deacon <will.deacon@arm.com>
      Cc: Dave Hansen <dave.hansen@intel.com>
      Cc: Sudeep Holla <sudeep.holla@arm.com>
      Cc: Mark Rutland <mark.rutland@arm.com>
      Cc: Rob Herring <robh+dt@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      14f09910
    • Yisheng Xie's avatar
      mm/hugetlb: introduce ARCH_HAS_GIGANTIC_PAGE · 461a7184
      Yisheng Xie authored
      Avoid making ifdef get pretty unwieldy if many ARCHs support gigantic
      page.  No functional change with this patch.
      
      Link: http://lkml.kernel.org/r/1475227569-63446-2-git-send-email-xieyisheng1@huawei.comSigned-off-by: default avatarYisheng Xie <xieyisheng1@huawei.com>
      Suggested-by: default avatarMichal Hocko <mhocko@suse.com>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Acked-by: default avatarNaoya Horiguchi <n-horiguchi@ah.jp.nec.com>
      Acked-by: default avatarHillf Danton <hillf.zj@alibaba-inc.com>
      Cc: Hanjun Guo <guohanjun@huawei.com>
      Cc: Will Deacon <will.deacon@arm.com>
      Cc: Dave Hansen <dave.hansen@intel.com>
      Cc: Sudeep Holla <sudeep.holla@arm.com>
      Cc: Catalin Marinas <catalin.marinas@arm.com>
      Cc: Mark Rutland <mark.rutland@arm.com>
      Cc: Rob Herring <robh+dt@kernel.org>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      461a7184
    • Michal Hocko's avatar
      oom: print nodemask in the oom report · 82e7d3ab
      Michal Hocko authored
      We have received a hard to explain oom report from a customer.  The oom
      triggered regardless there is a lot of free memory:
      
        PoolThread invoked oom-killer: gfp_mask=0x280da, order=0, oom_adj=0, oom_score_adj=0
        PoolThread cpuset=/ mems_allowed=0-7
        Pid: 30055, comm: PoolThread Tainted: G           E X 3.0.101-80-default #1
        Call Trace:
          dump_trace+0x75/0x300
          dump_stack+0x69/0x6f
          dump_header+0x8e/0x110
          oom_kill_process+0xa6/0x350
          out_of_memory+0x2b7/0x310
          __alloc_pages_slowpath+0x7dd/0x820
          __alloc_pages_nodemask+0x1e9/0x200
          alloc_pages_vma+0xe1/0x290
          do_anonymous_page+0x13e/0x300
          do_page_fault+0x1fd/0x4c0
          page_fault+0x25/0x30
        [...]
        active_anon:1135959151 inactive_anon:1051962 isolated_anon:0
         active_file:13093 inactive_file:222506 isolated_file:0
         unevictable:262144 dirty:2 writeback:0 unstable:0
         free:432672819 slab_reclaimable:7917 slab_unreclaimable:95308
         mapped:261139 shmem:166297 pagetables:2228282 bounce:0
        [...]
        Node 0 DMA free:15896kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15672kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
        lowmem_reserve[]: 0 2892 775542 775542
        Node 0 DMA32 free:2783784kB min:28kB low:32kB high:40kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:2961572kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes
        lowmem_reserve[]: 0 0 772650 772650
        Node 0 Normal free:8120kB min:8160kB low:10200kB high:12240kB active_anon:779334960kB inactive_anon:2198744kB active_file:0kB inactive_file:180kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:791193600kB mlocked:131072kB dirty:0kB writeback:0kB mapped:372940kB shmem:361480kB slab_reclaimable:4536kB slab_unreclaimable:68472kB kernel_stack:10104kB pagetables:1414820kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:2280 all_unreclaimable? yes
        lowmem_reserve[]: 0 0 0 0
        Node 1 Normal free:476718144kB min:8192kB low:10240kB high:12288kB active_anon:307623696kB inactive_anon:283620kB active_file:10392kB inactive_file:69908kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:4kB writeback:0kB mapped:257208kB shmem:189896kB slab_reclaimable:3868kB slab_unreclaimable:44756kB kernel_stack:1848kB pagetables:1369432kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
        lowmem_reserve[]: 0 0 0 0
        Node 2 Normal free:386002452kB min:8192kB low:10240kB high:12288kB active_anon:398563752kB inactive_anon:68184kB active_file:10292kB inactive_file:29936kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:32084kB shmem:776kB slab_reclaimable:6888kB slab_unreclaimable:60056kB kernel_stack:8208kB pagetables:1282880kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
        lowmem_reserve[]: 0 0 0 0
        Node 3 Normal free:196406760kB min:8192kB low:10240kB high:12288kB active_anon:587445640kB inactive_anon:164396kB active_file:5716kB inactive_file:709844kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:291776kB shmem:111416kB slab_reclaimable:5152kB slab_unreclaimable:44516kB kernel_stack:2168kB pagetables:1455956kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
        lowmem_reserve[]: 0 0 0 0
        Node 4 Normal free:425338880kB min:8192kB low:10240kB high:12288kB active_anon:359695204kB inactive_anon:43216kB active_file:5748kB inactive_file:14772kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:24708kB shmem:1120kB slab_reclaimable:1884kB slab_unreclaimable:41060kB kernel_stack:1856kB pagetables:1100208kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
        lowmem_reserve[]: 0 0 0 0
        Node 5 Normal free:11140kB min:8192kB low:10240kB high:12288kB active_anon:784240872kB inactive_anon:1217164kB active_file:28kB inactive_file:48kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:11408kB shmem:0kB slab_reclaimable:2008kB slab_unreclaimable:49220kB kernel_stack:1360kB pagetables:531600kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:1202 all_unreclaimable? yes
        lowmem_reserve[]: 0 0 0 0
        Node 6 Normal free:243395332kB min:8192kB low:10240kB high:12288kB active_anon:542015544kB inactive_anon:40208kB active_file:968kB inactive_file:8484kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:0kB writeback:0kB mapped:19992kB shmem:496kB slab_reclaimable:1672kB slab_unreclaimable:37052kB kernel_stack:2088kB pagetables:750264kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
        lowmem_reserve[]: 0 0 0 0
        Node 7 Normal free:10768kB min:8192kB low:10240kB high:12288kB active_anon:784916936kB inactive_anon:192316kB active_file:19228kB inactive_file:56852kB unevictable:131072kB isolated(anon):0kB isolated(file):0kB present:794296320kB mlocked:131072kB dirty:4kB writeback:0kB mapped:34440kB shmem:4kB slab_reclaimable:5660kB slab_unreclaimable:36100kB kernel_stack:1328kB pagetables:1007968kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
        lowmem_reserve[]: 0 0 0 0
      
      So all nodes but Node 0 have a lot of free memory which should suggest
      that there is an available memory especially when mems_allowed=0-7.  One
      could speculate that a massive process has managed to terminate and free
      up a lot of memory while racing with the above allocation request.
      Although this is highly unlikely it cannot be ruled out.
      
      A further debugging, however shown that the faulting process had
      mempolicy (not cpuset) to bind to Node 0.  We cannot see that
      information from the report though.  mems_allowed turned out to be more
      confusing than really helpful.
      
      Fix this by always priting the nodemask.  It is either mempolicy mask
      (and non-null) or the one defined by the cpusets.  The new output for
      the above oom report would be
      
        PoolThread invoked oom-killer: gfp_mask=0x280da(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), nodemask=0, order=0, oom_adj=0, oom_score_adj=0
      
      This patch doesn't touch show_mem and the node filtering based on the
      cpuset node mask because mempolicy is always a subset of cpusets and
      seeing the full cpuset oom context might be helpful for tunning more
      specific mempolicies inside cpusets (e.g.  when they turn out to be too
      restrictive).  To prevent from ugly ifdefs the mask is printed even for
      !NUMA configurations but this should be OK (a single node will be
      printed).
      
      Link: http://lkml.kernel.org/r/20160930214146.28600-1-mhocko@kernel.orgSigned-off-by: default avatarMichal Hocko <mhocko@suse.com>
      Reported-by: default avatarSellami Abdelkader <abdelkader.sellami@sap.com>
      Acked-by: default avatarVlastimil Babka <vbabka@suse.cz>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Sellami Abdelkader <abdelkader.sellami@sap.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      82e7d3ab
    • Kirill A. Shutemov's avatar
    • Andrea Arcangeli's avatar
      mm: vma_merge: correct false positive from __vma_unlink->validate_mm_rb · 8f26e0b1
      Andrea Arcangeli authored
      The old code was always doing:
      
         vma->vm_end = next->vm_end
         vma_rb_erase(next) // in __vma_unlink
         vma->vm_next = next->vm_next // in __vma_unlink
         next = vma->vm_next
         vma_gap_update(next)
      
      The new code still does the above for remove_next == 1 and 2, but for
      remove_next == 3 it has been changed and it does:
      
         next->vm_start = vma->vm_start
         vma_rb_erase(vma) // in __vma_unlink
         vma_gap_update(next)
      
      In the latter case, while unlinking "vma", validate_mm_rb() is told to
      ignore "vma" that is being removed, but next->vm_start was reduced
      instead. So for the new case, to avoid the false positive from
      validate_mm_rb, it should be "next" that is ignored when "vma" is
      being unlinked.
      
      "vma" and "next" in the above comment, considered pre-swap().
      
      Link: http://lkml.kernel.org/r/1474492522-2261-4-git-send-email-aarcange@redhat.comSigned-off-by: default avatarAndrea Arcangeli <aarcange@redhat.com>
      Tested-by: default avatarShaun Tancheff <shaun.tancheff@seagate.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Jan Vorlicek <janvorli@microsoft.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      8f26e0b1
    • Andrea Arcangeli's avatar
      mm: vma_adjust: minor comment correction · 86d12e47
      Andrea Arcangeli authored
      The cases are three not two.
      
      Link: http://lkml.kernel.org/r/1474492522-2261-3-git-send-email-aarcange@redhat.comSigned-off-by: default avatarAndrea Arcangeli <aarcange@redhat.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Jan Vorlicek <janvorli@microsoft.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      86d12e47
    • Andrea Arcangeli's avatar
      mm: vma_adjust: remove superfluous check for next not NULL · 97a42cd4
      Andrea Arcangeli authored
      If next would be NULL we couldn't reach such code path.
      
      Link: http://lkml.kernel.org/r/1474309513-20313-2-git-send-email-aarcange@redhat.comSigned-off-by: default avatarAndrea Arcangeli <aarcange@redhat.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Jan Vorlicek <janvorli@microsoft.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      97a42cd4
    • Andrea Arcangeli's avatar
      mm: vma_merge: fix vm_page_prot SMP race condition against rmap_walk · e86f15ee
      Andrea Arcangeli authored
      The rmap_walk can access vm_page_prot (and potentially vm_flags in the
      pte/pmd manipulations).  So it's not safe to wait the caller to update
      the vm_page_prot/vm_flags after vma_merge returned potentially removing
      the "next" vma and extending the "current" vma over the
      next->vm_start,vm_end range, but still with the "current" vma
      vm_page_prot, after releasing the rmap locks.
      
      The vm_page_prot/vm_flags must be transferred from the "next" vma to the
      current vma while vma_merge still holds the rmap locks.
      
      The side effect of this race condition is pte corruption during migrate
      as remove_migration_ptes when run on a address of the "next" vma that
      got removed, used the vm_page_prot of the current vma.
      
        migrate   	      	        mprotect
        ------------			-------------
        migrating in "next" vma
      				vma_merge() # removes "next" vma and
      			        	    # extends "current" vma
      					    # current vma is not with
      					    # vm_page_prot updated
        remove_migration_ptes
        read vm_page_prot of current "vma"
        establish pte with wrong permissions
      				vm_set_page_prot(vma) # too late!
      				change_protection in the old vma range
      				only, next range is not updated
      
      This caused segmentation faults and potentially memory corruption in
      heavy mprotect loads with some light page migration caused by compaction
      in the background.
      
      Hugh Dickins pointed out the comment about the Odd case 8 in vma_merge
      which confirms the case 8 is only buggy one where the race can trigger,
      in all other vma_merge cases the above cannot happen.
      
      This fix removes the oddness factor from case 8 and it converts it from:
      
            AAAA
        PPPPNNNNXXXX -> PPPPNNNNNNNN
      
      to:
      
            AAAA
        PPPPNNNNXXXX -> PPPPXXXXXXXX
      
      XXXX has the right vma properties for the whole merged vma returned by
      vma_adjust, so it solves the problem fully.  It has the added benefits
      that the callers could stop updating vma properties when vma_merge
      succeeds however the callers are not updated by this patch (there are
      bits like VM_SOFTDIRTY that still need special care for the whole range,
      as the vma merging ignores them, but as long as they're not processed by
      rmap walks and instead they're accessed with the mmap_sem at least for
      reading, they are fine not to be updated within vma_adjust before
      releasing the rmap_locks).
      
      Link: http://lkml.kernel.org/r/1474309513-20313-1-git-send-email-aarcange@redhat.comSigned-off-by: default avatarAndrea Arcangeli <aarcange@redhat.com>
      Reported-by: default avatarAditya Mandaleeka <adityam@microsoft.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Jan Vorlicek <janvorli@microsoft.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      e86f15ee
    • Andrea Arcangeli's avatar
      mm: vma_adjust: remove superfluous confusing update in remove_next == 1 case · fb8c41e9
      Andrea Arcangeli authored
      mm->highest_vm_end doesn't need any update.
      
      After finally removing the oddness from vma_merge case 8 that was
      causing:
      
      1) constant risk of trouble whenever anybody would check vma fields
         from rmap_walks, like it happened when page migration was
         introduced and it read the vma->vm_page_prot from a rmap_walk
      
      2) the callers of vma_merge to re-initialize any value different from
         the current vma, instead of vma_merge() more reliably returning a
         vma that already matches all fields passed as parameter
      
      .. it is also worth to take the opportunity of cleaning up superfluous
      code in vma_adjust(), that if not removed adds up to the hard
      readability of the function.
      
      Link: http://lkml.kernel.org/r/1474492522-2261-5-git-send-email-aarcange@redhat.comSigned-off-by: default avatarAndrea Arcangeli <aarcange@redhat.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Jan Vorlicek <janvorli@microsoft.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      fb8c41e9
    • Andrea Arcangeli's avatar
      mm: vm_page_prot: update with WRITE_ONCE/READ_ONCE · 6d2329f8
      Andrea Arcangeli authored
      vma->vm_page_prot is read lockless from the rmap_walk, it may be updated
      concurrently and this prevents the risk of reading intermediate values.
      
      Link: http://lkml.kernel.org/r/1474660305-19222-1-git-send-email-aarcange@redhat.comSigned-off-by: default avatarAndrea Arcangeli <aarcange@redhat.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Jan Vorlicek <janvorli@microsoft.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      6d2329f8
    • zhong jiang's avatar
      mm,ksm: add __GFP_HIGH to the allocation in alloc_stable_node() · 6213055f
      zhong jiang authored
      According to Hugh's suggestion, alloc_stable_node() with GFP_KERNEL can
      in rare cases cause a hung task warning.
      
      At present, if alloc_stable_node() allocation fails, two break_cows may
      want to allocate a couple of pages, and the issue will come up when free
      memory is under pressure.
      
      We fix it by adding __GFP_HIGH to GFP, to grant access to memory
      reserves, increasing the likelihood of allocation success.
      
      [akpm@linux-foundation.org: tweak comment]
      Link: http://lkml.kernel.org/r/1474354484-58233-1-git-send-email-zhongjiang@huawei.comSigned-off-by: default avatarzhong jiang <zhongjiang@huawei.com>
      Suggested-by: default avatarHugh Dickins <hughd@google.com>
      Acked-by: default avatarHugh Dickins <hughd@google.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      6213055f
    • Yisheng Xie's avatar
    • Gerald Schaefer's avatar
      mm/hugetlb: improve locking in dissolve_free_huge_pages() · eb03aa00
      Gerald Schaefer authored
      For every pfn aligned to minimum_order, dissolve_free_huge_pages() will
      call dissolve_free_huge_page() which takes the hugetlb spinlock, even if
      the page is not huge at all or a hugepage that is in-use.
      
      Improve this by doing the PageHuge() and page_count() checks already in
      dissolve_free_huge_pages() before calling dissolve_free_huge_page().  In
      dissolve_free_huge_page(), when holding the spinlock, those checks need
      to be revalidated.
      
      Link: http://lkml.kernel.org/r/20160926172811.94033-4-gerald.schaefer@de.ibm.comSigned-off-by: default avatarGerald Schaefer <gerald.schaefer@de.ibm.com>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Acked-by: default avatarNaoya Horiguchi <n-horiguchi@ah.jp.nec.com>
      Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Vlastimil Babka <vbabka@suse.cz>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
      Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
      Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
      Cc: Rui Teng <rui.teng@linux.vnet.ibm.com>
      Cc: Dave Hansen <dave.hansen@linux.intel.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      eb03aa00
    • Gerald Schaefer's avatar
      mm/hugetlb: check for reserved hugepages during memory offline · 082d5b6b
      Gerald Schaefer authored
      In dissolve_free_huge_pages(), free hugepages will be dissolved without
      making sure that there are enough of them left to satisfy hugepage
      reservations.
      
      Fix this by adding a return value to dissolve_free_huge_pages() and
      checking h->free_huge_pages vs.  h->resv_huge_pages.  Note that this may
      lead to the situation where dissolve_free_huge_page() returns an error
      and all free hugepages that were dissolved before that error are lost,
      while the memory block still cannot be set offline.
      
      Fixes: c8721bbb ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
      Link: http://lkml.kernel.org/r/20160926172811.94033-3-gerald.schaefer@de.ibm.comSigned-off-by: default avatarGerald Schaefer <gerald.schaefer@de.ibm.com>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Acked-by: default avatarNaoya Horiguchi <n-horiguchi@ah.jp.nec.com>
      Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Vlastimil Babka <vbabka@suse.cz>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
      Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
      Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
      Cc: Rui Teng <rui.teng@linux.vnet.ibm.com>
      Cc: Dave Hansen <dave.hansen@linux.intel.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      082d5b6b
    • Gerald Schaefer's avatar
      mm/hugetlb: fix memory offline with hugepage size > memory block size · 2247bb33
      Gerald Schaefer authored
      Patch series "mm/hugetlb: memory offline issues with hugepages", v4.
      
      This addresses several issues with hugepages and memory offline.  While
      the first patch fixes a panic, and is therefore rather important, the
      last patch is just a performance optimization.
      
      The second patch fixes a theoretical issue with reserved hugepages,
      while still leaving some ugly usability issue, see description.
      
      This patch (of 3):
      
      dissolve_free_huge_pages() will either run into the VM_BUG_ON() or a
      list corruption and addressing exception when trying to set a memory
      block offline that is part (but not the first part) of a "gigantic"
      hugetlb page with a size > memory block size.
      
      When no other smaller hugetlb page sizes are present, the VM_BUG_ON()
      will trigger directly.  In the other case we will run into an addressing
      exception later, because dissolve_free_huge_page() will not work on the
      head page of the compound hugetlb page which will result in a NULL
      hstate from page_hstate().
      
      To fix this, first remove the VM_BUG_ON() because it is wrong, and then
      use the compound head page in dissolve_free_huge_page().  This means
      that an unused pre-allocated gigantic page that has any part of itself
      inside the memory block that is going offline will be dissolved
      completely.  Losing an unused gigantic hugepage is preferable to failing
      the memory offline, for example in the situation where a (possibly
      faulty) memory DIMM needs to go offline.
      
      Fixes: c8721bbb ("mm: memory-hotplug: enable memory hotplug to handle hugepage")
      Link: http://lkml.kernel.org/r/20160926172811.94033-2-gerald.schaefer@de.ibm.comSigned-off-by: default avatarGerald Schaefer <gerald.schaefer@de.ibm.com>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Acked-by: default avatarNaoya Horiguchi <n-horiguchi@ah.jp.nec.com>
      Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
      Cc: Vlastimil Babka <vbabka@suse.cz>
      Cc: Mike Kravetz <mike.kravetz@oracle.com>
      Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
      Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
      Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
      Cc: Rui Teng <rui.teng@linux.vnet.ibm.com>
      Cc: Dave Hansen <dave.hansen@linux.intel.com>
      Cc: <stable@vger.kernel.org>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      2247bb33
    • Wanlong Gao's avatar
      mm: nobootmem: move the comment of free_all_bootmem · 914a0516
      Wanlong Gao authored
      Commit b4def350 ("mm, nobootmem: clean-up of free_low_memory_core_early()")
      removed the unnecessary nodeid argument, after that, this comment
      becomes more confused.  We should move it to the right place.
      
      Fixes: b4def350 ("mm, nobootmem: clean-up of free_low_memory_core_early()")
      Link: http://lkml.kernel.org/r/1473996082-14603-1-git-send-email-wanlong.gao@gmail.comSigned-off-by: default avatarWanlong Gao <wanlong.gao@gmail.com>
      Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      914a0516
    • Rasmus Villemoes's avatar
      mm/shmem.c: constify anon_ops · 19938e35
      Rasmus Villemoes authored
      Every other dentry_operations instance is const, and this one might as
      well be.
      
      Link: http://lkml.kernel.org/r/1473890528-7009-1-git-send-email-linux@rasmusvillemoes.dkSigned-off-by: default avatarRasmus Villemoes <linux@rasmusvillemoes.dk>
      Acked-by: default avatarHugh Dickins <hughd@google.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      19938e35
    • Johannes Weiner's avatar
      mm: memcontrol: consolidate cgroup socket tracking · 2d758073
      Johannes Weiner authored
      The cgroup core and the memory controller need to track socket ownership
      for different purposes, but the tracking sites being entirely different
      is kind of ugly.
      
      Be a better citizen and rename the memory controller callbacks to match
      the cgroup core callbacks, then move them to the same place.
      
      [akpm@linux-foundation.org: coding-style fixes]
      Link: http://lkml.kernel.org/r/20160914194846.11153-3-hannes@cmpxchg.orgSigned-off-by: default avatarJohannes Weiner <hannes@cmpxchg.org>
      Acked-by: default avatarTejun Heo <tj@kernel.org>
      Cc: "David S. Miller" <davem@davemloft.net>
      Cc: Michal Hocko <mhocko@suse.cz>
      Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      2d758073
    • Baoyou Xie's avatar
      mm: move phys_mem_access_prot_allowed() declaration to pgtable.h · 08ea8c07
      Baoyou Xie authored
      We get 1 warning when building kernel with W=1:
      
        drivers/char/mem.c:220:12: warning: no previous prototype for 'phys_mem_access_prot_allowed' [-Wmissing-prototypes]
         int __weak phys_mem_access_prot_allowed(struct file *file,
      
      In fact, its declaration is spreading to several header files in
      different architecture, but need to be declare in common header file.
      
      So this patch moves phys_mem_access_prot_allowed() to pgtable.h.
      
      Link: http://lkml.kernel.org/r/1473751597-12139-1-git-send-email-baoyou.xie@linaro.orgSigned-off-by: default avatarBaoyou Xie <baoyou.xie@linaro.org>
      Acked-by: default avatarThomas Gleixner <tglx@linutronix.de>
      Acked-by: default avatarRalf Baechle <ralf@linux-mips.org>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      08ea8c07
    • Andrew Morton's avatar
      mm/page_io.c: replace some BUG_ON()s with VM_BUG_ON_PAGE() · cc30c5d6
      Andrew Morton authored
      So they are CONFIG_DEBUG_VM-only and more informative.
      
      Cc: Al Viro <viro@zeniv.linux.org.uk>
      Cc: David S. Miller <davem@davemloft.net>
      Cc: Hugh Dickins <hughd@google.com>
      Cc: Jens Axboe <axboe@fb.com>
      Cc: Joe Perches <joe@perches.com>
      Cc: Mel Gorman <mgorman@suse.de>
      Cc: Michal Hocko <mhocko@suse.com>
      Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Santosh Shilimkar <santosh.shilimkar@oracle.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      cc30c5d6
    • Tetsuo Handa's avatar
      mm: don't emit warning from pagefault_out_of_memory() · a104808e
      Tetsuo Handa authored
      Commit c32b3cbe ("oom, PM: make OOM detection in the freezer path
      raceless") inserted a WARN_ON() into pagefault_out_of_memory() in order
      to warn when we raced with disabling the OOM killer.
      
      Now, patch "oom, suspend: fix oom_killer_disable vs.  pm suspend
      properly" introduced a timeout for oom_killer_disable().  Even if we
      raced with disabling the OOM killer and the system is OOM livelocked,
      the OOM killer will be enabled eventually (in 20 seconds by default) and
      the OOM livelock will be solved.  Therefore, we no longer need to warn
      when we raced with disabling the OOM killer.
      
      Link: http://lkml.kernel.org/r/1473442120-7246-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jpSigned-off-by: default avatarTetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
      Acked-by: default avatarMichal Hocko <mhocko@suse.cz>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Johannes Weiner <hannes@cmpxchg.org>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      a104808e
    • Vlastimil Babka's avatar
      mm, compaction: restrict fragindex to costly orders · 20311420
      Vlastimil Babka authored
      Fragmentation index and the vm.extfrag_threshold sysctl is meant as a
      heuristic to prevent excessive compaction for costly orders (i.e.  THP).
      It's unlikely to make any difference for non-costly orders, especially
      with the default threshold.  But we cannot afford any uncertainty for
      the non-costly orders where the only alternative to successful
      reclaim/compaction is OOM.  After the recent patches we are guaranteed
      maximum effort without heuristics from compaction before deciding OOM,
      and fragindex is the last remaining heuristic.  Therefore skip fragindex
      altogether for non-costly orders.
      Suggested-by: default avatarMichal Hocko <mhocko@suse.com>
      Link: http://lkml.kernel.org/r/20160926162025.21555-5-vbabka@suse.czSigned-off-by: default avatarVlastimil Babka <vbabka@suse.cz>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Rik van Riel <riel@redhat.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      20311420
    • Vlastimil Babka's avatar
      mm, compaction: ignore fragindex from compaction_zonelist_suitable() · cc5c9f09
      Vlastimil Babka authored
      The compaction_zonelist_suitable() function tries to determine if
      compaction will be able to proceed after sufficient reclaim, i.e.
      whether there are enough reclaimable pages to provide enough order-0
      freepages for compaction.
      
      This addition of reclaimable pages to the free pages works well for the
      order-0 watermark check, but in the fragmentation index check we only
      consider truly free pages.  Thus we can get fragindex value close to 0
      which indicates failure do to lack of memory, and wrongly decide that
      compaction won't be suitable even after reclaim.
      
      Instead of trying to somehow adjust fragindex for reclaimable pages,
      let's just skip it from compaction_zonelist_suitable().
      
      Link: http://lkml.kernel.org/r/20160926162025.21555-4-vbabka@suse.czSigned-off-by: default avatarVlastimil Babka <vbabka@suse.cz>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Rik van Riel <riel@redhat.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      cc5c9f09
    • Vlastimil Babka's avatar
      mm, page_alloc: pull no_progress_loops update to should_reclaim_retry() · 423b452e
      Vlastimil Babka authored
      The should_reclaim_retry() makes decisions based on no_progress_loops,
      so it makes sense to also update the counter there.  It will be also
      consistent with should_compact_retry() and compaction_retries.  No
      functional change.
      
      [hillf.zj@alibaba-inc.com: fix missing pointer dereferences]
      Link: http://lkml.kernel.org/r/20160926162025.21555-3-vbabka@suse.czSigned-off-by: default avatarVlastimil Babka <vbabka@suse.cz>
      Acked-by: default avatarHillf Danton <hillf.zj@alibaba-inc.com>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Rik van Riel <riel@redhat.com>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      423b452e
    • Vlastimil Babka's avatar
      mm, compaction: make full priority ignore pageblock suitability · 9f7e3387
      Vlastimil Babka authored
      Several people have reported premature OOMs for order-2 allocations
      (stack) due to OOM rework in 4.7.  In the scenario (parallel kernel
      build and dd writing to two drives) many pageblocks get marked as
      Unmovable and compaction free scanner struggles to isolate free pages.
      Joonsoo Kim pointed out that the free scanner skips pageblocks that are
      not movable to prevent filling them and forcing non-movable allocations
      to fallback to other pageblocks.  Such heuristic makes sense to help
      prevent long-term fragmentation, but premature OOMs are relatively more
      urgent problem.  As a compromise, this patch disables the heuristic only
      for the ultimate compaction priority.
      
      Link: http://lkml.kernel.org/r/20160906135258.18335-5-vbabka@suse.czReported-by: default avatarRalf-Peter Rohbeck <Ralf-Peter.Rohbeck@quantum.com>
      Reported-by: default avatarArkadiusz Miskiewicz <a.miskiewicz@gmail.com>
      Reported-by: default avatarOlaf Hering <olaf@aepfle.de>
      Suggested-by: default avatarJoonsoo Kim <iamjoonsoo.kim@lge.com>
      Signed-off-by: default avatarVlastimil Babka <vbabka@suse.cz>
      Acked-by: default avatarMichal Hocko <mhocko@suse.com>
      Cc: Michal Hocko <mhocko@kernel.org>
      Cc: Mel Gorman <mgorman@techsingularity.net>
      Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
      Cc: David Rientjes <rientjes@google.com>
      Cc: Rik van Riel <riel@redhat.com>
      Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
      Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
      Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
      9f7e3387