Commit 1d65bbd4 authored by scoder, committed by GitHub

Merge pull request #2465 from gabrieldemarmiesse/simplifying_memoryview_numpy

Changed the numpy tutorial to make it faster to understand.
parents 8cc8080b a0db0099
...@@ -28,6 +28,8 @@ Tools/*.elc
 *.swp
 *~
+.ipynb_checkpoints
 tags
 TAGS
 MANIFEST
......
# cython: infer_types=True
import numpy as np
cimport cython

ctypedef fused my_type:
    int
    double
    long long


cdef my_type clip(my_type a, my_type min_value, my_type max_value):
    return min(max(a, min_value), max_value)


@cython.boundscheck(False)
@cython.wraparound(False)
def compute(my_type[:, ::1] array_1, my_type[:, ::1] array_2, my_type a, my_type b, my_type c):

    x_max = array_1.shape[0]
    y_max = array_1.shape[1]

    assert tuple(array_1.shape) == tuple(array_2.shape)

    if my_type is int:
        dtype = np.intc
    elif my_type is double:
        dtype = np.double
    elif my_type is cython.longlong:
        dtype = np.longlong

    result = np.zeros((x_max, y_max), dtype=dtype)
    cdef my_type[:, ::1] result_view = result

    cdef my_type tmp
    cdef Py_ssize_t x, y

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result_view[x, y] = tmp + c

    return result
# cython: infer_types=True
import numpy as np
cimport cython

DTYPE = np.intc


cdef int clip(int a, int min_value, int max_value):
    return min(max(a, min_value), max_value)


@cython.boundscheck(False)
@cython.wraparound(False)
def compute(int[:, ::1] array_1, int[:, ::1] array_2, int a, int b, int c):

    x_max = array_1.shape[0]
    y_max = array_1.shape[1]

    assert tuple(array_1.shape) == tuple(array_2.shape)

    result = np.zeros((x_max, y_max), dtype=DTYPE)
    cdef int[:, ::1] result_view = result

    cdef int tmp
    cdef Py_ssize_t x, y

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result_view[x, y] = tmp + c

    return result
import numpy as np

DTYPE = np.intc


cdef int clip(int a, int min_value, int max_value):
    return min(max(a, min_value), max_value)


def compute(int[:, :] array_1, int[:, :] array_2, int a, int b, int c):

    cdef Py_ssize_t x_max = array_1.shape[0]
    cdef Py_ssize_t y_max = array_1.shape[1]

    # array_1.shape is now a C array, so it's not possible
    # to compare it simply by using == without a for-loop.
    # To be able to compare it to array_2.shape easily,
    # we convert them both to Python tuples.
    assert tuple(array_1.shape) == tuple(array_2.shape)

    result = np.zeros((x_max, y_max), dtype=DTYPE)
    cdef int[:, :] result_view = result

    cdef int tmp
    cdef Py_ssize_t x, y

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result_view[x, y] = tmp + c

    return result
# distutils: extra_compile_args=-fopenmp
# distutils: extra_link_args=-fopenmp

import numpy as np
cimport cython
from cython.parallel import prange

ctypedef fused my_type:
    int
    double
    long long


# We declare our plain C function nogil
cdef my_type clip(my_type a, my_type min_value, my_type max_value) nogil:
    return min(max(a, min_value), max_value)


@cython.boundscheck(False)
@cython.wraparound(False)
def compute(my_type[:, ::1] array_1, my_type[:, ::1] array_2, my_type a, my_type b, my_type c):

    cdef Py_ssize_t x_max = array_1.shape[0]
    cdef Py_ssize_t y_max = array_1.shape[1]

    assert tuple(array_1.shape) == tuple(array_2.shape)

    if my_type is int:
        dtype = np.intc
    elif my_type is double:
        dtype = np.double
    elif my_type is cython.longlong:
        dtype = np.longlong

    result = np.zeros((x_max, y_max), dtype=dtype)
    cdef my_type[:, ::1] result_view = result

    cdef my_type tmp
    cdef Py_ssize_t x, y

    # We use prange here.
    for x in prange(x_max, nogil=True):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result_view[x, y] = tmp + c

    return result
import numpy as np


def clip(a, min_value, max_value):
    return min(max(a, min_value), max_value)


def compute(array_1, array_2, a, b, c):
    """
    This function must implement the formula
    np.clip(array_1, 2, 10) * a + array_2 * b + c

    array_1 and array_2 are 2D.
    """
    x_max = array_1.shape[0]
    y_max = array_1.shape[1]

    assert array_1.shape == array_2.shape

    result = np.zeros((x_max, y_max), dtype=array_1.dtype)

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result[x, y] = tmp + c

    return result
...@@ -5,49 +5,46 @@ import numpy as np
# type info object.
DTYPE = np.intc

# cdef means here that this function is a plain C function (so faster).
# To get all the benefits, we type the arguments and the return value.
cdef int clip(int a, int min_value, int max_value):
    return min(max(a, min_value), max_value)


def compute(array_1, array_2, int a, int b, int c):

    # The "cdef" keyword is also used within functions to type variables. It
    # can only be used at the top indentation level (there are non-trivial
    # problems with allowing them in other places, though we'd love to see
    # good and thought out proposals for it).
    cdef Py_ssize_t x_max = array_1.shape[0]
    cdef Py_ssize_t y_max = array_1.shape[1]

    assert array_1.shape == array_2.shape
    assert array_1.dtype == DTYPE
    assert array_2.dtype == DTYPE

    result = np.zeros((x_max, y_max), dtype=DTYPE)

    # It is very important to type ALL your variables. You do not get any
    # warnings if not, only much slower code (they are implicitly typed as
    # Python objects).
    # For the "tmp" variable, we want to use the same data type as is
    # stored in the array, so we use int because it corresponds to np.intc.
    # NB! An important side-effect of this is that if "tmp" overflows its
    # datatype size, it will simply wrap around like in C, rather than raise
    # an error like in Python.
    cdef int tmp

    # Py_ssize_t is the proper C type for Python array indices.
    cdef Py_ssize_t x, y

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result[x, y] = tmp + c

    return result
\ No newline at end of file
# cython: infer_types=True
import numpy as np
cimport cython

ctypedef fused my_type:
    int
    double
    long


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef naive_convolve(my_type [:, :] f, my_type [:, :] g):
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")

    vmax = f.shape[0]
    wmax = f.shape[1]
    smax = g.shape[0]
    tmax = g.shape[1]
    smid = smax // 2
    tmid = tmax // 2
    xmax = vmax + 2*smid
    ymax = wmax + 2*tmid

    if my_type is int:
        dtype = np.intc
    elif my_type is double:
        dtype = np.double
    else:
        dtype = np.long

    h_np = np.zeros([xmax, ymax], dtype=dtype)
    cdef my_type [:, :] h = h_np

    cdef my_type value
    for x in range(xmax):
        for y in range(ymax):
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h_np
\ No newline at end of file
# cython: infer_types=True
import numpy as np
cimport cython

DTYPE = np.intc


@cython.boundscheck(False)
@cython.wraparound(False)
def naive_convolve(int [:, ::1] f, int [:, ::1] g):
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")

    vmax = f.shape[0]
    wmax = f.shape[1]
    smax = g.shape[0]
    tmax = g.shape[1]
    smid = smax // 2
    tmid = tmax // 2
    xmax = vmax + 2*smid
    ymax = wmax + 2*tmid

    h_np = np.zeros([xmax, ymax], dtype=DTYPE)
    cdef int [:, ::1] h = h_np

    cdef int value
    for x in range(xmax):
        for y in range(ymax):
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h_np
\ No newline at end of file
import numpy as np

DTYPE = np.intc


# It is possible to declare types in the function declaration.
def naive_convolve(int [:, :] f, int [:, :] g):
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")

    # We don't need to check for the type of NumPy array here because
    # a check is already performed when calling the function.
    cdef Py_ssize_t x, y, s, t, v, w, s_from, s_to, t_from, t_to
    cdef Py_ssize_t vmax = f.shape[0]
    cdef Py_ssize_t wmax = f.shape[1]
    cdef Py_ssize_t smax = g.shape[0]
    cdef Py_ssize_t tmax = g.shape[1]
    cdef Py_ssize_t smid = smax // 2
    cdef Py_ssize_t tmid = tmax // 2
    cdef Py_ssize_t xmax = vmax + 2*smid
    cdef Py_ssize_t ymax = wmax + 2*tmid

    h_np = np.zeros([xmax, ymax], dtype=DTYPE)
    cdef int [:, :] h = h_np

    cdef int value
    for x in range(xmax):
        for y in range(ymax):
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h_np
\ No newline at end of file
from __future__ import division
import numpy as np


def naive_convolve(f, g):
    # f is an image and is indexed by (v, w)
    # g is a filter kernel and is indexed by (s, t),
    #   it needs odd dimensions
    # h is the output image and is indexed by (x, y),
    #   it is not cropped
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")
    # smid and tmid are number of pixels between the center pixel
    # and the edge, ie for a 5x5 filter they will be 2.
    #
    # The output size is calculated by adding smid, tmid to each
    # side of the dimensions of the input image.
    vmax = f.shape[0]
    wmax = f.shape[1]
    smax = g.shape[0]
    tmax = g.shape[1]
    smid = smax // 2
    tmid = tmax // 2
    xmax = vmax + 2*smid
    ymax = wmax + 2*tmid
    # Allocate result image.
    h = np.zeros([xmax, ymax], dtype=f.dtype)
    # Do convolution
    for x in range(xmax):
        for y in range(ymax):
            # Calculate pixel value for h at (x,y). Sum one component
            # for each pixel (s, t) of the filter g.
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h
...@@ -138,14 +138,28 @@ Python by using a normal ``import yourmod`` statement.

The first Cython program
==========================

You can easily execute the code of this tutorial by
downloading `the Jupyter notebook <https://github.com/cython/cython/blob/master/docs/examples/userguide/numpy_tutorial/numpy_cython.ipynb>`_.

The code below does the equivalent of this function in numpy::

    def compute_np(array_1, array_2, a, b, c):
        return np.clip(array_1, 2, 10) * a + array_2 * b + c

We'll say that ``array_1`` and ``array_2`` are 2D NumPy arrays of integer type and
``a``, ``b`` and ``c`` are three Python integers.

This function uses NumPy and is already really fast, so it might be a bit overkill
to do it again with Cython. This is for demonstration purposes. Nonetheless, we
will show that we achieve a better speed and memory efficiency than NumPy at the
cost of more verbosity.

This code computes the function with the loops over the two dimensions being unrolled.
It is both valid Python and valid Cython code. I'll refer to it as both
:file:`compute_py.py` for the Python version and :file:`compute_cy.pyx` for the
Cython version -- Cython uses ``.pyx`` as its file suffix (but it can also compile
``.py`` files).

.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_py.py

This should be compiled to produce :file:`compute_cy.so` (for Linux systems,
on Windows systems, this will be a ``.pyd`` file). We
...@@ -155,27 +169,23 @@ run a Python session to test both the Python version (imported from

.. sourcecode:: ipython

    In [1]: import numpy as np

    In [2]: array_1 = np.random.uniform(0, 1000, size=(3000, 2000)).astype(np.intc)

    In [3]: array_2 = np.random.uniform(0, 1000, size=(3000, 2000)).astype(np.intc)

    In [4]: a = 4

    In [5]: b = 3

    In [6]: c = 9

    In [7]: def compute_np(array_1, array_2, a, b, c):
       ...:     return np.clip(array_1, 2, 10) * a + array_2 * b + c

    In [8]: %timeit compute_np(array_1, array_2, a, b, c)
    103 ms ± 4.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

    In [9]: import compute_py

    In [10]: %timeit compute_py.compute(array_1, array_2, a, b, c)
    1min 10s ± 844 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    In [11]: import compute_cy

    In [12]: %timeit compute_cy.compute(array_1, array_2, a, b, c)
    56.5 s ± 587 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

There's not such a huge difference yet, because the C code still does exactly
what the Python interpreter does (meaning, for instance, that a new object is
...@@ -184,53 +194,56 @@ allocated for each number used).

You can look at the Python interaction and the generated C
code by using ``-a`` when calling Cython from the command
line, ``%%cython -a`` when using a Jupyter Notebook, or by using
``cythonize('compute_cy.pyx', annotate=True)`` when using a ``setup.py``.
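For reference, a minimal ``setup.py`` along these lines is enough to build the
extension with annotation enabled (this is only a sketch; the module name
``compute_cy`` is just the example used in this tutorial)::

    from setuptools import setup
    from Cython.Build import cythonize

    setup(
        # annotate=True writes an HTML report next to the generated C file
        ext_modules=cythonize("compute_cy.pyx", annotate=True),
    )

Running ``python setup.py build_ext --inplace`` then produces both the extension
module and the annotated HTML file next to it.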
Look at the generated html file and see what
is needed for even the simplest statements. You get the point quickly. We need
to give Cython more information; we need to add types.

Adding types
=============

To add types we use custom Cython syntax, so we are now breaking Python source
compatibility. Here's :file:`compute_typed.pyx`. *Read the comments!*

.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_typed.pyx

.. figure:: compute_typed_html.jpg

At this point, have a look at the generated C code for :file:`compute_cy.pyx` and
:file:`compute_typed.pyx`. Click on the lines to expand them and see corresponding C.
Especially have a look at the ``for-loops``: In :file:`compute_cy.c`, these are ~20 lines
of C code to set up while in :file:`compute_typed.c` a normal C for loop is used.
After building this and continuing my (very informal) benchmarks, I get:

.. sourcecode:: ipython

    In [13]: %timeit compute_typed.compute(array_1, array_2, a, b, c)
    26.5 s ± 422 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

So adding types does make the code faster, but nowhere
near the speed of NumPy?

What happened is that most of the time spent in this code is spent in the following lines,
and those lines are slower to execute than in pure Python::

    tmp = clip(array_1[x, y], 2, 10)
    tmp = tmp * a + array_2[x, y] * b
    result[x, y] = tmp + c
So what made those lines so much slower than in the pure Python version?

``array_1`` and ``array_2`` are still NumPy arrays, so Python objects, and expect
Python integers as indexes. Here we pass C int values. So every time
Cython reaches this line, it has to convert all the C integers to Python
int objects. Since this line is called very often, it outweighs the speed
benefits of the pure C loops that were created from the ``range()`` earlier.

Furthermore, ``tmp * a + array_2[x, y] * b`` returns a Python integer
and ``tmp`` is a C integer, so Cython has to do type conversions again.
In the end those type conversions add up and make our computation really
slow. But this problem can be solved easily by using memoryviews.

Efficient indexing with memoryviews
...@@ -263,25 +276,25 @@ Here is how to declare a memoryview of integers::

No data is copied from the NumPy array to the memoryview in our example.
As the name implies, it is only a "view" of the memory. So we can use the
view ``result_view`` for efficient indexing and at the end return the real NumPy
array ``result`` that holds the data that we operated on.
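As a quick illustration of these "view" semantics (a small sketch, not part of the
original tutorial; the names are only for illustration and it can be run in a
``%%cython`` cell), writing through a memoryview is visible in the NumPy array it
was created from::

    import numpy as np

    arr = np.zeros((2, 2), dtype=np.intc)
    cdef int[:, :] arr_view = arr   # no copy happens here
    arr_view[0, 0] = 42
    print(arr[0, 0])                # prints 42: the view shares the array's memory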
Here is how to use them in our code:

:file:`compute_memview.pyx`

.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_memview.pyx

Let's see how much faster accessing is now.

.. sourcecode:: ipython

    In [22]: %timeit compute_memview.compute(array_1, array_2, a, b, c)
    22.9 ms ± 197 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Note the importance of this change.
We're now 3081 times faster than an interpreted version of Python and 4.5 times
faster than NumPy.

Memoryviews can be used with slices too, or even
with Python arrays. Check out the :ref:`memoryview page <memoryviews>` to
...@@ -297,14 +310,14 @@ The array lookups are still slowed down by two factors:

   explicitly coded so that it doesn't use negative indices, and it
   (hopefully) always accesses within bounds.

With decorators, we can deactivate those checks::

    ...
    cimport cython
    @cython.boundscheck(False)  # Deactivate bounds checking
    @cython.wraparound(False)   # Deactivate negative indexing.
    def compute(int[:, :] array_1, int[:, :] array_2, int a, int b, int c):
    ...
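As an aside (not shown in the tutorial files at this point), the same directives can
also be set for a whole module through the special comment at the top of the file,
instead of decorating each function::

    # cython: boundscheck=False, wraparound=False

The decorator form keeps the effect limited to a single function, which is why it is
used here.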
Now bounds checking is not performed (and, as a side-effect, if you *do*
happen to access out of bounds you will in the best case crash your program
...@@ -315,15 +328,19 @@ information.

.. sourcecode:: ipython

    In [23]: %timeit compute_index.compute(array_1, array_2, a, b, c)
    16.8 ms ± 25.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

We're faster than the NumPy version (6.2x). NumPy is really well written,
but does not perform operations lazily, resulting in a lot
of intermediate copy operations in memory. Our version is
very memory efficient and cache friendly because we
can execute the operations in a single run over the data.
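To see where those intermediate copies come from, it helps to spell out what the
NumPy one-liner does under the hood (a rough sketch; the exact temporaries depend
on the NumPy version)::

    tmp1 = np.clip(array_1, 2, 10)   # allocates a full temporary array
    tmp2 = tmp1 * a                  # another temporary
    tmp3 = array_2 * b               # another temporary
    result = tmp2 + tmp3 + c         # two more allocations before the final result

Our typed loop touches each element once and writes it straight into ``result``.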
.. Warning::

    Speed comes with some cost. In particular, it can be dangerous to set typed
    objects (like ``array_1``, ``array_2`` and ``result_view`` in our sample code) to ``None``.
    Setting such objects to ``None`` is entirely legal, but all you can do with them
    is check whether they are None. All other use (attribute lookup or indexing)
    can potentially segfault or corrupt data (rather than raising exceptions as
...@@ -350,8 +367,9 @@ you have to declare the memoryview like this::

    cdef int [::1, :, :] a

If all this makes no sense to you, you can skip this part; just know that declaring
arrays as contiguous constrains the usage of your functions, as it rejects array slices as input.
If you still want to understand what contiguous arrays are
all about, you can see `this answer on StackOverflow
<https://stackoverflow.com/questions/26998223/what-is-the-difference-between-contiguous-and-non-contiguous-arrays>`_.
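For example (a hypothetical session, assuming the module names used in this
tutorial), a strided slice is accepted by the relaxed ``int[:, :]`` signature but
rejected by the contiguous one::

    # Accepted: compute_memview.compute() only requires int[:, :]
    compute_memview.compute(array_1[::2, ::2], array_2[::2, ::2], a, b, c)

    # Raises ValueError at call time: a strided slice is not C-contiguous,
    # so it cannot be bound to an int[:, ::1] argument
    compute_contiguous.compute(array_1[::2, ::2], array_2[::2, ::2], a, b, c)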
...@@ -360,10 +378,11 @@ get by declaring the memoryviews as contiguous:

.. sourcecode:: ipython

    In [23]: %timeit compute_contiguous.compute(array_1, array_2, a, b, c)
    11.1 ms ± 30.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

We're now around nine times faster than the NumPy version, and 6300 times
faster than the pure Python version!

Making the function cleaner
===========================
...@@ -377,22 +396,21 @@ Note that since type declarations must happen at the top indentation level,

Cython won't infer the type of variables declared for the first time
in other indentation levels. It would change the meaning of
our code too much. This is why we must still manually declare the types of the
``tmp``, ``x`` and ``y`` variables.

And actually, manually giving the type of the ``tmp`` variable will
be useful when using fused types.

.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_infer_types.pyx

We now do a speed test:

.. sourcecode:: ipython

    In [24]: %timeit compute_infer_types.compute(array_1, array_2, a, b, c)
    11.5 ms ± 261 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

Lo and behold, the speed has not changed.
More generic code
==================
...@@ -417,31 +435,58 @@ know what NumPy data type we should use for our output array.

In this case, our function now works for ints, doubles and floats.

.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_fused_types.pyx

We can check that the output type is the right one::

    >>> compute(array_1, array_2, a, b, c).dtype
    dtype('int32')

    >>> compute(array_1.astype(np.double), array_2.astype(np.double), a, b, c).dtype
    dtype('float64')

We now do a speed test:

.. sourcecode:: ipython

    In [25]: %timeit compute_fused_types.compute(array_1, array_2, a, b, c)
    11.5 ms ± 258 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

More versions of the function are created at compile time, so it makes
sense that the speed doesn't change when executing this function with
integers, as before.
Using multiple threads
======================

Cython has support for OpenMP. It also has some nice wrappers around it,
like the function :func:`prange`. You can find more information about Cython and
parallelism in :ref:`parallel`. Since we do elementwise operations, we can easily
distribute the work among multiple threads. It's important not to forget to pass the
correct arguments to the compiler to enable OpenMP. When using the Jupyter notebook,
you should use the cell magic like this::

    %%cython --force
    # distutils: extra_compile_args=-fopenmp
    # distutils: extra_link_args=-fopenmp
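Outside the notebook, when building through a ``setup.py``, the equivalent is to pass
the OpenMP flags to the extension (a sketch assuming GCC-style flags on Linux; MSVC
uses ``/openmp`` instead)::

    from setuptools import Extension, setup
    from Cython.Build import cythonize

    ext = Extension(
        "compute_prange",
        ["compute_prange.pyx"],
        extra_compile_args=["-fopenmp"],
        extra_link_args=["-fopenmp"],
    )
    setup(ext_modules=cythonize(ext))

The ``# distutils:`` comments at the top of :file:`compute_prange.pyx` achieve the
same thing automatically when the file is compiled with ``cythonize``.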
The GIL must be released (see :ref:`Releasing the GIL <nogil>`), so this is why we
declare our :func:`clip` function ``nogil``.

.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_prange.pyx

We can have substantial speed gains for minimal effort:

.. sourcecode:: ipython

    In [25]: %timeit compute_prange.compute(array_1, array_2, a, b, c)
    9.33 ms ± 412 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

We're now 7558 times faster than the pure Python version and 11.1 times
faster than NumPy!
Where to go from here?
======================

* If you want to learn how to make use of `BLAS <http://www.netlib.org/blas/>`_
  or `LAPACK <http://www.netlib.org/lapack/>`_ with Cython, you can watch
  `the presentation of Ian Henriksen at SciPy 2015
......