Commit 20547723 authored by gabrieldemarmiesse's avatar gabrieldemarmiesse

Changed the numpy tutorial to make it faster to understand. Added prange example.

parent 084a25f5
# cython: infer_types=True
import numpy as np
cimport cython

ctypedef fused my_type:
    int
    double
    long


cdef my_type clip(my_type a, my_type min_value, my_type max_value):
    return min(max(a, min_value), max_value)


@cython.boundscheck(False)
@cython.wraparound(False)
def compute(my_type[:, ::1] array_1, my_type[:, ::1] array_2, my_type a, my_type b, my_type c):
    x_max = array_1.shape[0]
    y_max = array_1.shape[1]

    assert tuple(array_1.shape) == tuple(array_2.shape)

    if my_type is int:
        dtype = np.intc
    elif my_type is double:
        dtype = np.double
    else:
        dtype = np.long

    result = np.zeros((x_max, y_max), dtype=dtype)
    cdef my_type[:, ::1] result_view = result

    cdef my_type tmp
    cdef Py_ssize_t x, y

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result_view[x, y] = tmp + c

    return result
# cython: infer_types=True
import numpy as np
cimport cython

DTYPE = np.intc


cdef int clip(int a, int min_value, int max_value):
    return min(max(a, min_value), max_value)


@cython.boundscheck(False)
@cython.wraparound(False)
def compute(int[:, ::1] array_1, int[:, ::1] array_2, int a, int b, int c):
    x_max = array_1.shape[0]
    y_max = array_1.shape[1]

    assert tuple(array_1.shape) == tuple(array_2.shape)

    result = np.zeros((x_max, y_max), dtype=DTYPE)
    cdef int[:, ::1] result_view = result

    cdef int tmp
    cdef Py_ssize_t x, y

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result_view[x, y] = tmp + c

    return result
import numpy as np

DTYPE = np.intc


cdef int clip(int a, int min_value, int max_value):
    return min(max(a, min_value), max_value)


def compute(int[:, :] array_1, int[:, :] array_2, int a, int b, int c):
    cdef Py_ssize_t x_max = array_1.shape[0]
    cdef Py_ssize_t y_max = array_1.shape[1]

    # array_1.shape is now a C array, so it's not possible
    # to compare it simply by using == without a for-loop.
    # To be able to compare it to array_2.shape easily,
    # we convert them both to Python tuples.
    assert tuple(array_1.shape) == tuple(array_2.shape)

    result = np.zeros((x_max, y_max), dtype=DTYPE)
    cdef int[:, :] result_view = result

    cdef int tmp
    cdef Py_ssize_t x, y

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result_view[x, y] = tmp + c

    return result
import numpy as np
cimport cython
from cython.parallel import prange

ctypedef fused my_type:
    int
    double
    long


# We declare our plain C function nogil.
cdef my_type clip(my_type a, my_type min_value, my_type max_value) nogil:
    return min(max(a, min_value), max_value)


@cython.boundscheck(False)
@cython.wraparound(False)
def compute(my_type[:, ::1] array_1, my_type[:, ::1] array_2, my_type a, my_type b, my_type c):
    cdef Py_ssize_t x_max = array_1.shape[0]
    cdef Py_ssize_t y_max = array_1.shape[1]

    assert tuple(array_1.shape) == tuple(array_2.shape)

    if my_type is int:
        dtype = np.intc
    elif my_type is double:
        dtype = np.double
    else:
        dtype = np.long

    result = np.zeros((x_max, y_max), dtype=dtype)
    cdef my_type[:, ::1] result_view = result

    cdef my_type tmp
    cdef Py_ssize_t x, y

    # We use prange here.
    for x in prange(x_max, nogil=True):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result_view[x, y] = tmp + c

    return result
import numpy as np


def clip(a, min_value, max_value):
    return min(max(a, min_value), max_value)


def compute(array_1, array_2, a, b, c):
    """
    This function must implement the formula
    np.clip(array_1, 2, 10) * a + array_2 * b + c

    array_1 and array_2 are 2D.
    """
    x_max = array_1.shape[0]
    y_max = array_1.shape[1]

    assert array_1.shape == array_2.shape

    result = np.zeros((x_max, y_max), dtype=array_1.dtype)

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result[x, y] = tmp + c

    return result
@@ -5,49 +5,46 @@ import numpy as np
# type info object.
DTYPE = np.intc

def naive_convolve(f, g):
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")
    assert f.dtype == DTYPE and g.dtype == DTYPE

# cdef means here that this function is a plain C function (so faster).
# To get all the benefits, we type the arguments and the return value.
cdef int clip(int a, int min_value, int max_value):
    return min(max(a, min_value), max_value)

def compute(array_1, array_2, int a, int b, int c):
    # The "cdef" keyword is also used within functions to type variables. It
    # can only be used at the top indentation level (there are non-trivial
    # problems with allowing them in other places, though we'd love to see
    # good and thought out proposals for it).
    cdef Py_ssize_t x_max = array_1.shape[0]
    cdef Py_ssize_t y_max = array_1.shape[1]

    assert array_1.shape == array_2.shape
    assert array_1.dtype == DTYPE
    assert array_2.dtype == DTYPE

    # Py_ssize_t is the proper C type for Python array indices.
    cdef Py_ssize_t x, y, s, t, v, w, s_from, s_to, t_from, t_to
    cdef Py_ssize_t vmax = f.shape[0]
    cdef Py_ssize_t wmax = f.shape[1]
    cdef Py_ssize_t smax = g.shape[0]
    cdef Py_ssize_t tmax = g.shape[1]
    cdef Py_ssize_t smid = smax // 2
    cdef Py_ssize_t tmid = tmax // 2
    cdef Py_ssize_t xmax = vmax + 2*smid
    cdef Py_ssize_t ymax = wmax + 2*tmid
    h = np.zeros([xmax, ymax], dtype=DTYPE)
    result = np.zeros((x_max, y_max), dtype=DTYPE)

    # It is very important to type ALL your variables. You do not get any
    # warnings if not, only much slower code (they are implicitly typed as
    # Python objects).
    # For the value variable, we want to use the same data type as is
    # For the "tmp" variable, we want to use the same data type as is
    # stored in the array, so we use int because it corresponds to np.intc.
    # NB! An important side-effect of this is that if "value" overflows its
    # NB! An important side-effect of this is that if "tmp" overflows its
    # datatype size, it will simply wrap around like in C, rather than raise
    # an error like in Python.
    cdef int value
    for x in range(xmax):
        for y in range(ymax):
            # Cython has built-in C functions for min and max.
            # This makes the following lines very fast.
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h
    cdef int tmp

    # Py_ssize_t is the proper C type for Python array indices.
    cdef Py_ssize_t x, y

    for x in range(x_max):
        for y in range(y_max):
            tmp = clip(array_1[x, y], 2, 10)
            tmp = tmp * a + array_2[x, y] * b
            result[x, y] = tmp + c

    return result
# cython: infer_types=True
import numpy as np
cimport cython

ctypedef fused my_type:
    int
    double
    long


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef naive_convolve(my_type [:,:] f, my_type [:,:] g):
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")

    vmax = f.shape[0]
    wmax = f.shape[1]
    smax = g.shape[0]
    tmax = g.shape[1]
    smid = smax // 2
    tmid = tmax // 2
    xmax = vmax + 2*smid
    ymax = wmax + 2*tmid

    if my_type is int:
        dtype = np.intc
    elif my_type is double:
        dtype = np.double
    else:
        dtype = np.long

    h_np = np.zeros([xmax, ymax], dtype=dtype)
    cdef my_type [:,:] h = h_np

    cdef my_type value
    for x in range(xmax):
        for y in range(ymax):
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h_np
# cython: infer_types=True
import numpy as np
cimport cython

DTYPE = np.intc


@cython.boundscheck(False)
@cython.wraparound(False)
def naive_convolve(int [:,::1] f, int [:,::1] g):
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")

    vmax = f.shape[0]
    wmax = f.shape[1]
    smax = g.shape[0]
    tmax = g.shape[1]
    smid = smax // 2
    tmid = tmax // 2
    xmax = vmax + 2*smid
    ymax = wmax + 2*tmid

    h_np = np.zeros([xmax, ymax], dtype=DTYPE)
    cdef int [:,::1] h = h_np

    cdef int value
    for x in range(xmax):
        for y in range(ymax):
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h_np
import numpy as np

DTYPE = np.intc


# It is possible to declare types in the function declaration.
def naive_convolve(int [:,:] f, int [:,:] g):
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")

    # We don't need to check for the type of NumPy array here because
    # a check is already performed when calling the function.
    cdef Py_ssize_t x, y, s, t, v, w, s_from, s_to, t_from, t_to
    cdef Py_ssize_t vmax = f.shape[0]
    cdef Py_ssize_t wmax = f.shape[1]
    cdef Py_ssize_t smax = g.shape[0]
    cdef Py_ssize_t tmax = g.shape[1]
    cdef Py_ssize_t smid = smax // 2
    cdef Py_ssize_t tmid = tmax // 2
    cdef Py_ssize_t xmax = vmax + 2*smid
    cdef Py_ssize_t ymax = wmax + 2*tmid

    h_np = np.zeros([xmax, ymax], dtype=DTYPE)
    cdef int [:,:] h = h_np

    cdef int value
    for x in range(xmax):
        for y in range(ymax):
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h_np
from __future__ import division
import numpy as np


def naive_convolve(f, g):
    # f is an image and is indexed by (v, w)
    # g is a filter kernel and is indexed by (s, t),
    # it needs odd dimensions
    # h is the output image and is indexed by (x, y),
    # it is not cropped
    if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
        raise ValueError("Only odd dimensions on filter supported")
    # smid and tmid are number of pixels between the center pixel
    # and the edge, ie for a 5x5 filter they will be 2.
    #
    # The output size is calculated by adding smid, tmid to each
    # side of the dimensions of the input image.
    vmax = f.shape[0]
    wmax = f.shape[1]
    smax = g.shape[0]
    tmax = g.shape[1]
    smid = smax // 2
    tmid = tmax // 2
    xmax = vmax + 2*smid
    ymax = wmax + 2*tmid
    # Allocate result image.
    h = np.zeros([xmax, ymax], dtype=f.dtype)
    # Do convolution
    for x in range(xmax):
        for y in range(ymax):
            # Calculate pixel value for h at (x,y). Sum one component
            # for each pixel (s, t) of the filter g.
            s_from = max(smid - x, -smid)
            s_to = min((xmax - x) - smid, smid + 1)
            t_from = max(tmid - y, -tmid)
            t_to = min((ymax - y) - tmid, tmid + 1)
            value = 0
            for s in range(s_from, s_to):
                for t in range(t_from, t_to):
                    v = x - smid + s
                    w = y - tmid + t
                    value += g[smid - s, tmid - t] * f[v, w]
            h[x, y] = value
    return h
@@ -138,43 +138,52 @@ Python by using a normal ``import yourmod`` statement.
The first Cython program
==========================
The code below does 2D discrete convolution of an image with a filter (and I'm
sure you can do better!; let it serve for demonstration purposes). It is both
valid Python and valid Cython code. I'll refer to it as both
:file:`convolve_py.py` for the Python version and :file:`convolve_cy.pyx` for the
Cython version -- Cython uses ".pyx" as its file suffix.
You can easily execute the code of this tutorial by
downloading `the Jupyter notebook <https://github.com/cython/cython/blob/master/docs/examples/userguide/numpy_tutorial/numpy_cython.ipynb>`_.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_py.py
    :linenos:
The code below does the equivalent of this function in numpy::
This should be compiled to produce :file:`convolve_cy.so` (for Linux systems). We
    def compute_np(array_1, array_2, a, b, c):
        return np.clip(array_1, 2, 10) * a + array_2 * b + c
We'll say that ``array_1`` and ``array_2`` are 2D NumPy arrays of integer type and
``a``, ``b`` and ``c`` are three Python integers.
This function uses NumPy and is already really fast, so it might be a bit overkill
to do it again with Cython. This is for demonstration purposes. Nonetheless, we
will show that we achieve a better speed and memory efficiency than NumPy at the cost of more verbosity.
This code presents the function with the loops over the two dimensions written out explicitly.
It is both valid Python and valid Cython code. I'll refer to it as both
:file:`compute_py.py` for the Python version and :file:`compute_cy.pyx` for the
Cython version -- Cython uses ``.pyx`` as its file suffix.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_py.py
This should be compiled to produce :file:`compute_cy.so` (for Linux systems). We
run a Python session to test both the Python version (imported from
the ``.py`` file) and the compiled Cython module.
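
One way to build the Cython module is with a minimal :file:`setup.py`; this is
only a sketch, and the file name :file:`compute_cy.pyx` is simply the one used
in this tutorial::

    from setuptools import setup
    from Cython.Build import cythonize

    setup(ext_modules=cythonize("compute_cy.pyx"))

After running ``python setup.py build_ext --inplace``, the module can be
imported in the session below.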
.. sourcecode:: ipython
    In [1]: import numpy as np

    In [2]: import convolve_py

    In [3]: convolve_py.naive_convolve(np.array([[1, 1, 1]], dtype=np.int),
       ...:     np.array([[1],[2],[1]], dtype=np.int))
    Out [3]:
    array([[1, 1, 1],
           [2, 2, 2],
           [1, 1, 1]])

    In [4]: import convolve_cy

    In [5]: convolve_cy.naive_convolve(np.array([[1, 1, 1]], dtype=np.int),
       ...:     np.array([[1],[2],[1]], dtype=np.int))
    Out [5]:
    array([[1, 1, 1],
           [2, 2, 2],
           [1, 1, 1]])

    In [11]: N = 600

    In [12]: f = np.arange(N*N, dtype=np.int).reshape((N,N))

    In [13]: g = np.arange(81, dtype=np.int).reshape((9, 9))

    In [19]: %timeit convolve_py.naive_convolve(f, g)
    16 s ± 70.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    In [20]: %timeit convolve_cy.naive_convolve(f, g)
    13.5 s ± 99.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    In [2]: array_1 = np.random.uniform(0, 1000, size=(1000, 2000)).astype(np.intc)

    In [3]: array_2 = np.random.uniform(0, 1000, size=(1000, 2000)).astype(np.intc)

    In [4]: a = 4

    In [5]: b = 3

    In [6]: c = 9

    In [7]: def compute_np(array_1, array_2, a, b, c):
       ...:     return np.clip(array_1, 2, 10) * a + array_2 * b + c

    In [8]: %timeit compute_np(array_1, array_2, a, b, c)
    8.69 ms ± 297 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

    In [9]: import compute_py

    In [10]: %timeit compute_py.compute(array_1, array_2, a, b, c)
    25.6 s ± 225 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    In [11]: import compute_cy

    In [12]: %timeit compute_cy.compute(array_1, array_2, a, b, c)
    21.9 s ± 398 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
There's not such a huge difference yet, because the C code still does exactly
what the Python interpreter does (meaning, for instance, that a new object is
@@ -183,7 +192,7 @@ allocated for each number used).
You can look at the Python interaction and the generated C
code by using ``-a`` when calling Cython from the command
line, ``%%cython -a`` when using a Jupyter Notebook, or by using
``cythonize('convolve_cy.pyx', annotate=True)`` when using a ``setup.py``.
``cythonize('compute_cy.pyx', annotate=True)`` when using a ``setup.py``.
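
For example, from the command line (again assuming the file is named
:file:`compute_cy.pyx`)::

    $ cython -a compute_cy.pyx

This writes :file:`compute_cy.html` next to the source file.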
Look at the generated html file and see what
is needed for even the simplest statements. You get the point quickly. We need
to give Cython more information; we need to add types.
@@ -192,44 +201,46 @@ Adding types
=============
To add types we use custom Cython syntax, so we are now breaking Python source
compatibility. Here's :file:`convolve_typed.pyx`. *Read the comments!*
compatibility. Here's :file:`compute_typed.pyx`. *Read the comments!*
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_typed.pyx
    :linenos:
.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_typed.pyx
.. figure:: convolve_types_html.png
.. figure:: compute_typed_html.jpg
At this point, have a look at the generated C code for :file:`convolve_cy.pyx` and
:file:`convolve_typed.pyx`. Click on the lines to expand them and see corresponding C.
At this point, have a look at the generated C code for :file:`compute_cy.pyx` and
:file:`compute_typed.pyx`. Click on the lines to expand them and see corresponding C.
Especially have a look at the ``for-loops``: In :file:`convolve_cy.c`, these are ~20 lines
of C code to set up while in :file:`convolve_typed.c` a normal C for loop is used.
Especially have a look at the ``for-loops``: In :file:`compute_cy.c`, these are ~20 lines
of C code to set up while in :file:`compute_typed.c` a normal C for loop is used.
After building this and continuing my (very informal) benchmarks, I get:
.. sourcecode:: ipython
    In [22]: %timeit convolve_typed.naive_convolve(f, g)
    55.8 s ± 199 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    In [13]: %timeit compute_typed.compute(array_1, array_2, a, b, c)
    10.5 s ± 301 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
So in the end, adding types makes the Cython code slower?
So adding types does make the code faster, but nowhere
near the speed of NumPy?
What happened is that most of the time spent in this code is spent on line
54. ::

What happened is that most of the time spent in this code is spent in those lines,
and those lines are slower to execute than in pure Python::
    value += g[smid - s, tmid - t] * f[v, w]

    tmp = clip(array_1[x, y], 2, 10)
    tmp = tmp * a + array_2[x, y] * b
    result[x, y] = tmp + c
So what made this line so much slower than in the pure Python version?
So what made those lines so much slower than in the pure Python version?
``g`` and ``f`` are still NumPy arrays, so Python objects, and expect
``array_1`` and ``array_2`` are still NumPy arrays, so Python objects, and expect
Python integers as indexes. Here we pass C int values. So every time
Cython reaches this line, it has to convert all the C integers to Python
int objects. Since this line is called very often, it outweighs the speed
benefits of the pure C loops that were created from the ``range()`` earlier.
Furthermore, ``g[smid - s, tmid - t] * f[v, w]`` returns a Python integer
and ``value`` is a C integer, so Cython has to do type conversions again.
In the end those type conversions add up and make our convolution really
Furthermore, ``tmp * a + array_2[x, y] * b`` returns a Python integer
and ``tmp`` is a C integer, so Cython has to do type conversions again.
In the end those type conversions add up and make our computation really
slow. But this problem can be solved easily by using memoryviews.
Efficient indexing with memoryviews
@@ -262,25 +273,25 @@ Here is how to declare a memoryview of integers::
No data is copied from the NumPy array to the memoryview in our example.
As the name implies, it is only a "view" of the memory. So we can use the
view ``h`` for efficient indexing and at the end return the real NumPy
array ``h_np`` that holds the data that we operated on.
view ``result_view`` for efficient indexing and at the end return the real NumPy
array ``result`` that holds the data that we operated on.
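
As a small illustration (this snippet is not one of the tutorial files), writing
through a memoryview modifies the NumPy array it was created from, because both
share the same memory::

    import numpy as np

    arr = np.zeros((2, 3), dtype=np.intc)
    cdef int[:, :] view = arr   # no copy is made here
    view[0, 0] = 5              # arr[0, 0] is now 5 as well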
Here is how to use them in our code:
:file:`convolve_memview.pyx`
:file:`compute_memview.pyx`
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_memview.pyx
    :linenos:
.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_memview.pyx
Let's see how much faster accessing is now.
.. sourcecode:: ipython
    In [22]: %timeit convolve_memview.naive_convolve(f, g)
    57.1 ms ± 268 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

    In [22]: %timeit compute_memview.compute(array_1, array_2, a, b, c)
    9.56 ms ± 139 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Note the importance of this change.
We're now 280 times faster than an interpreted version of Python.
We're now 2700 times faster than an interpreted version of Python and close
to NumPy speed.
Memoryviews can be used with slices too, or even
with Python arrays. Check out the :ref:`memoryview page <memoryviews>` to
@@ -296,14 +307,14 @@ The array lookups are still slowed down by two factors:
explicitly coded so that it doesn't use negative indices, and it
(hopefully) always accesses within bounds.
With decorators, we can deactivate those checks::
With decorators, we can deactivate those checks::
    ...
    cimport cython
    @cython.boundscheck(False) # Deactivate bounds checking
    @cython.wraparound(False)  # Deactivate negative indexing.
    def naive_convolve(int [:, :] f, int [:, :] g):
        ...

    ...
    cimport cython
    @cython.boundscheck(False) # Deactivate bounds checking
    @cython.wraparound(False)  # Deactivate negative indexing.
    def compute(int[:, :] array_1, int[:, :] array_2, int a, int b, int c):
        ...
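
The same two directives can also be enabled for the whole file with a directive
comment at the top of the module, which is equivalent to decorating every
function (a sketch)::

    # cython: boundscheck=False, wraparound=False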
Now bounds checking is not performed (and, as a side-effect, if you ''do''
happen to access out of bounds you will in the best case crash your program
@@ -314,15 +325,18 @@ information.
.. sourcecode:: ipython
    In [23]: %timeit convolve_index.naive_convolve(f, g)
    19.8 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

    In [23]: %timeit compute_index.compute(array_1, array_2, a, b, c)
    6.1 ms ± 103 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We're now 800 times faster than the interpreted Python version.
We're now faster than the NumPy version. NumPy is really well written,
but does not perform operations lazily, which means a lot
of back and forth in memory. Our version is very memory efficient and
cache friendly because we know the operations in advance.
.. Warning::
    Speed comes with some cost. Especially it can be dangerous to set typed
    objects (like ``f``, ``g`` and ``h`` in our sample code) to ``None``.
    objects (like ``array_1``, ``array_2`` and ``result_view`` in our sample code) to ``None``.
    Setting such objects to ``None`` is entirely legal, but all you can do with them
    is check whether they are None. All other use (attribute lookup or indexing)
    can potentially segfault or corrupt data (rather than raising exceptions as
@@ -349,8 +363,9 @@ you have to declare the memoryview like this::
    cdef int [::1, :, :] a
If all this makes no sense to you, you can skip this part, the performance gains are
not that important. If you still want to understand what contiguous arrays are
If all this makes no sense to you, you can skip this part; declaring
arrays as contiguous constrains the usage of your function.
If you still want to understand what contiguous arrays are
all about, you can see `this answer on StackOverflow
<https://stackoverflow.com/questions/26998223/what-is-the-difference-between-contiguous-and-non-contiguous-arrays>`_.
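
The constraint is easy to see in practice: a function that declares its
arguments as ``int[:, ::1]`` will reject arrays that are not C-contiguous, such
as a transposed array. A hypothetical session (the exact error message may
differ)::

    >>> compute_contiguous.compute(array_1.T, array_2.T, a, b, c)
    ValueError: ndarray is not C-contiguous

    >>> # Making contiguous copies first works, at the cost of the copies.
    >>> compute_contiguous.compute(np.ascontiguousarray(array_1.T),
    ...                            np.ascontiguousarray(array_2.T), a, b, c)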
@@ -359,10 +374,10 @@ get by declaring the memoryviews as contiguous:
.. sourcecode:: ipython
    In [23]: %timeit convolve_contiguous.naive_convolve(f, g)
    21.3 ms ± 489 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

    In [23]: %timeit compute_contiguous.compute(array_1, array_2, a, b, c)
    4.13 ms ± 87.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We're still around 800 times faster than the interpreted Python version.
We're now around two times faster than the NumPy version.
Making the function cleaner
===========================
@@ -376,22 +391,21 @@ Note that since type declarations must happen at the top indentation level,
Cython won't infer the type of variables declared for the first time
in other indentation levels. It would change the meaning of
our code too much. This is why we must still manually declare the type of the
``value`` variable.
``tmp``, ``x`` and ``y`` variables.
And actually, manually giving the type of the ``value`` variable will
And actually, manually giving the type of the ``tmp`` variable will
be useful when using fused types.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_infer_types.pyx
    :linenos:
.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_infer_types.pyx
We now do a speed test:
.. sourcecode:: ipython
    In [24]: %timeit convolve_infer_types.naive_convolve(f, g)
    21.3 ms ± 344 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

    In [24]: %timeit compute_infer_types.compute(array_1, array_2, a, b, c)
    4.1 ms ± 54.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We're still around 800 times faster than the interpreted Python version.
Lo and behold, the speed has not changed.
More generic code
==================
@@ -416,31 +430,52 @@ know what NumPy data type we should use for our output array.
In this case, our function now works for ints, doubles and floats.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/convolve_fused_types.pyx
    :linenos:
.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_fused_types.pyx
We can check that the output type is the right one::
    >>> naive_convolve_fused_types(f, g).dtype
    >>> compute(array_1, array_2, a, b, c).dtype
    dtype('int32')
    >>> naive_convolve_fused_types(f.astype(np.double), g.astype(np.double)).dtype
    >>> compute(array_1.astype(np.double), array_2.astype(np.double), a, b, c).dtype
    dtype('float64')
We now do a speed test:
.. sourcecode:: ipython
    In [25]: %timeit convolve_fused_types.naive_convolve(f, g)
    20 ms ± 392 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

    In [25]: %timeit compute_fused_types.compute(array_1, array_2, a, b, c)
    6 ms ± 70.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
We're a bit slower than before, because the right call to the clip function
must be found at runtime, which adds a bit of overhead.
Using multiple threads
======================
Cython has support for OpenMP. It also has some nice wrappers around it,
like the function :func:`prange`. You can see more information about Cython and
parallelism in :ref:`parallel`. Since we do elementwise operations, we can easily
distribute the work among multiple threads. It's important not to forget to pass the
correct arguments to the compiler to enable OpenMP. When using the Jupyter notebook,
you should use the cell magic like this::

    %%cython --compile-args=-fopenmp --link-args=-fopenmp --force
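
If you build with a :file:`setup.py` instead of the notebook, the same flags can
be passed through the extension definition. A minimal sketch (GCC/clang-style
flags are assumed; MSVC uses ``/openmp``)::

    from setuptools import Extension, setup
    from Cython.Build import cythonize

    ext = Extension(
        "compute_prange",
        ["compute_prange.pyx"],
        extra_compile_args=["-fopenmp"],
        extra_link_args=["-fopenmp"],
    )

    setup(ext_modules=cythonize([ext]))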
The GIL must be released (see :ref:`Releasing the GIL <nogil>`), so this is why we
declare our :func:`clip` function ``nogil``.
.. literalinclude:: ../../examples/userguide/numpy_tutorial/compute_prange.pyx
We can have substantial speed gains for minimal effort:
.. sourcecode:: ipython
We're still around 800 times faster than the interpreted Python version.
    In [25]: %timeit compute_prange.compute(array_1, array_2, a, b, c)
    3.41 ms ± 93.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Where to go from here?
======================
* Since there is no Python interaction in the loops, it is possible with Cython
to release the GIL and use multiple cores easily. To learn how to do that,
you can see :ref:`using parallelism in Cython <parallel>`.
* If you want to learn how to make use of `BLAS <http://www.netlib.org/blas/>`_
or `LAPACK <http://www.netlib.org/lapack/>`_ with Cython, you can watch
`the presentation of Ian Henriksen at SciPy 2015
......