diff --git a/ci/lint.sh b/ci/lint.sh
index 144febcfcece5..3adfa8d1e3d33 100755
--- a/ci/lint.sh
+++ b/ci/lint.sh
@@ -23,7 +23,7 @@ if [ "$LINT" ]; then
     for path in 'window.pyx'
     do
         echo "linting -> pandas/$path"
-        flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126,E128
+        flake8 pandas/$path --filename '*.pyx' --select=E501,E302,E203,E226,E111,E114,E221,E303,E128,E231,E126
         if [ $? -ne "0" ]; then
             RET=1
         fi
@@ -31,6 +31,18 @@ if [ "$LINT" ]; then
     done
     echo "Linting *.pyx DONE"
 
+    echo "Linting *.pxi.in"
+    for path in 'src'
+    do
+        echo "linting -> pandas/$path"
+        # Same checks as the *.pyx run above, minus E226 and E128.
+        flake8 pandas/$path --filename '*.pxi.in' --select=E501,E302,E203,E111,E114,E221,E303,E231,E126
+        if [ $? -ne "0" ]; then
+            RET=1
+        fi
+    done
+    echo "Linting *.pxi.in DONE"
+
     echo "Check for invalid testing"
     grep -r -E --include '*.py' --exclude nosetester.py --exclude testing.py '(numpy|np)\.testing' pandas
     if [ $? = "0" ]; then
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index 8e659a8566adb..cccc5377d0dec 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -13,6 +13,7 @@ cdef float64_t FP_ERR = 1e-13
 cimport util
 
 from libc.stdlib cimport malloc, free
+from libc.string cimport memmove
 
 from numpy cimport NPY_INT8 as NPY_int8
 from numpy cimport NPY_INT16 as NPY_int16
@@ -41,10 +42,14 @@ cdef extern from "src/headers/math.h":
     double fabs(double) nogil
 
 # this is our util.pxd
-from util cimport numeric
+from util cimport numeric, get_nat
 
+cimport lib
+from lib cimport is_null_datetimelike
 from pandas import lib
 
+cdef int64_t iNaT = get_nat()
+
 cdef:
     int TIEBREAK_AVERAGE = 0
     int TIEBREAK_MIN = 1
@@ -1334,5 +1339,11 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
 
     return result
 
+
 include "join.pyx"
-include "generated.pyx"
+
+# generated from template
+include "algos_common_helper.pxi"
+include "algos_groupby_helper.pxi"
+include "algos_join_helper.pxi"
+include "algos_take_helper.pxi"
diff --git a/pandas/src/algos_common_helper.pxi b/pandas/src/algos_common_helper.pxi
new file mode 100644
index 0000000000000..59b3ddff46dec
--- /dev/null
+++ b/pandas/src/algos_common_helper.pxi
@@ -0,0 +1,2925 @@
+"""
+Template for each `dtype` helper function using 1-d template
+
+# 1-d template
+- map_indices
+- pad
+- pad_inplace
+- pad_2d_inplace
+- backfill
+- backfill_inplace
+- backfill_2d_inplace
+- is_monotonic
+- groupby
+- arrmap
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+#----------------------------------------------------------------------
+# 1-d template
+#----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_float64(ndarray[float64_t] index):
+    """
+    Build a dict that maps each value of `index` to its position.
+
+    When a value occurs more than once the last position wins, because
+    later assignments overwrite earlier ones.
+
+    Example:
+        array(['hi', 'there']) --> {'hi': 0, 'there': 1}
+    """
+    cdef:
+        Py_ssize_t pos, n = len(index)
+        dict mapping = {}
+
+    # Bounds and wraparound checks are switched off by the decorators.
+    for pos in range(n):
+        mapping[index[pos]] = pos
+
+    return mapping
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new,
+                 limit=None):
+    """Forward-fill indexer into `old` for each `new` value (-1: no fill)."""
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef float64_t cur, next
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+    # With limit=None the cap is nright, the total number of slots
+    if limit is None:
+        lim = nright
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+    # Empty input, or new's last value is below old[0]: all slots stay -1
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+        return indexer
+    # NOTE(review): presumably both arrays are sorted ascending -- confirm
+    i = j = 0
+    cur = old[0]
+    # Skip leading new values smaller than old[0]; they keep -1
+    while j <= nright - 1 and new[j] < cur:
+        j += 1
+
+    while True:
+        if j == nright:
+            break
+        # Last old value reached: match the remaining new values against it
+        if i == nleft - 1:
+            while j < nright:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] > cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j += 1
+            break
+
+        next = old[i + 1]
+        # Consume new values in [cur, next); exact hits never count as fills
+        while j < nright and cur <= new[j] < next:
+            if new[j] == cur:
+                indexer[j] = i
+            elif fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j += 1
+        # The fill limit is tracked per old value: reset before moving on
+        fill_count = 0
+        i += 1
+        cur = next
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_float64(ndarray[float64_t] values,
+                        ndarray[uint8_t, cast=True] mask,
+                        limit=None):
+    """
+    Forward-fill `values` in place wherever `mask` is set.
+
+    Each masked slot receives the most recent unmasked value (initially
+    values[0]); at most `limit` consecutive masked slots are filled.
+    """
+    cdef:
+        Py_ssize_t pos, n = len(values)
+        float64_t carried
+        int max_fill, run = 0
+
+    if n == 0:  # GH 2778: empty input, nothing to fill
+        return
+    if limit is None:
+        max_fill = n
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+    carried = values[0]
+    for pos in range(n):
+        if not mask[pos]:
+            run = 0
+            carried = values[pos]
+        elif run < max_fill:
+            run += 1
+            values[pos] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
+                           ndarray[uint8_t, ndim=2] mask,
+                           limit=None):
+    """
+    Forward-fill each row of `values` in place where `mask` is set; the
+    carried value and the fill counter restart on every row.
+    """
+    cdef:
+        Py_ssize_t row, col, n_rows, n_cols
+        float64_t carried
+        int max_fill, run = 0
+
+    n_rows, n_cols = (<object> values).shape
+    if n_cols == 0:  # GH 2778: no columns, nothing to fill
+        return
+    if limit is None:
+        max_fill = n_cols
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+
+    for row in range(n_rows):
+        run = 0
+        carried = values[row, 0]
+        for col in range(n_cols):
+            if not mask[row, col]:
+                run = 0
+                carried = values[row, col]
+            elif run < max_fill:
+                run += 1
+                values[row, col] = carried
+
+"""
+Backfilling logic for generating fill vector
+
+Diagram of what's going on
+
+Old      New    Fill vector    Mask
+         .        0               1
+         .        0               1
+         .        0               1
+A        A        0               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+B        B        1               1
+         .        2               1
+         .        2               1
+         .        2               1
+C        C        2               1
+         .                        0
+         .                        0
+D
+"""
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new,
+                      limit=None):
+    """Backward-fill indexer into `old` for each `new` value (-1: no fill)."""
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef float64_t cur, prev
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+    # With limit=None the cap is nright, the total number of slots
+    if limit is None:
+        lim = nright
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+    # Empty input, or new[0] is above old's last value: all slots stay -1
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
+    # NOTE(review): presumably both arrays are sorted ascending -- confirm
+    i = nleft - 1
+    j = nright - 1
+    cur = old[nleft - 1]
+    # Skip trailing new values greater than old's last value; they keep -1
+    while j >= 0 and new[j] > cur:
+        j -= 1
+
+    while True:
+        if j < 0:
+            break
+        # First old value reached: match the remaining new values against it
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]
+        # Consume new values in (prev, cur]; exact hits never count as fills
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+        # The fill limit is tracked per old value: reset before moving on
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_inplace_float64(ndarray[float64_t] values,
+                             ndarray[uint8_t, cast=True] mask,
+                             limit=None):
+    """
+    Backward-fill `values` in place wherever `mask` is set.
+
+    Scanning from the end, each masked slot takes the value carried from
+    the right (initially the last element); at most `limit` in a row.
+    """
+    cdef:
+        Py_ssize_t pos, n = len(values)
+        float64_t carried
+        int max_fill, run = 0
+
+    if n == 0:  # GH 2778: empty input, nothing to fill
+        return
+    if limit is None:
+        max_fill = n
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+    carried = values[n - 1]
+    for pos in range(n - 1, -1, -1):
+        if not mask[pos]:
+            run = 0
+            carried = values[pos]
+        elif run < max_fill:
+            run += 1
+            values[pos] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
+                                ndarray[uint8_t, ndim=2] mask,
+                                limit=None):
+    """
+    Backward-fill each row of `values` in place where `mask` is set; the
+    carried value and the fill counter restart on every row.
+    """
+    cdef:
+        Py_ssize_t row, col, n_rows, n_cols
+        float64_t carried
+        int max_fill, run = 0
+
+    n_rows, n_cols = (<object> values).shape
+    if n_cols == 0:  # GH 2778: no columns, nothing to fill
+        return
+    if limit is None:
+        max_fill = n_cols
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+
+    for row in range(n_rows):
+        run = 0
+        carried = values[row, n_cols - 1]
+        for col in range(n_cols - 1, -1, -1):
+            if not mask[row, col]:
+                run = 0
+                carried = values[row, col]
+            elif run < max_fill:
+                run += 1
+                values[row, col] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_float64(ndarray[float64_t] arr, bint timelike):
+    """
+    Check whether `arr` is monotonically increasing and/or decreasing.
+
+    Equal neighbours keep both flags.  Both flags are false when a value is
+    unordered against its predecessor (NaN) or, if `timelike`, equals iNaT.
+
+    Returns
+    -------
+    is_monotonic_inc, is_monotonic_dec
+    """
+    cdef:
+        Py_ssize_t pos, n = len(arr)
+        float64_t before, here
+        bint increasing = 1
+        bint decreasing = 1
+
+    if n < 2:
+        if n == 1 and (arr[0] != arr[0] or (timelike and arr[0] == iNaT)):
+            # the only value is NaN, or iNaT for timelike data
+            return False, False
+        return True, True
+
+    if timelike and arr[0] == iNaT:
+        return False, False
+
+    # The scan needs no Python objects, so it runs with the GIL released.
+    with nogil:
+        before = arr[0]
+        for pos in range(1, n):
+            here = arr[pos]
+            if timelike and here == iNaT:
+                increasing = 0
+                decreasing = 0
+                break
+            if here < before:
+                increasing = 0
+            elif here > before:
+                decreasing = 0
+            elif here == before:
+                pass  # ties break neither direction
+            else:
+                # no ordering holds: here or before is NaN
+                increasing = 0
+                decreasing = 0
+                break
+            if not (increasing or decreasing):
+                # neither direction can recover, stop scanning
+                break
+            before = here
+    return increasing, decreasing
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_float64(ndarray[float64_t] index, ndarray labels):
+    """
+    Map each label to the list of `index` values found at its positions.
+
+    Positions whose label satisfies is_null_datetimelike are skipped.
+    """
+    cdef:
+        dict groups = {}
+        list bucket
+        object label
+        Py_ssize_t pos, n = len(index)
+
+    if n != len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    for pos in range(n):
+        label = util.get_value_1d(labels, pos)
+        if is_null_datetimelike(label):
+            continue
+        bucket = groups.get(label)
+        if bucket is None:
+            groups[label] = [index[pos]]
+        else:
+            bucket.append(index[pos])
+    return groups
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_float64(ndarray[float64_t] index, object func):
+    """Map `func` over `index`; result goes through maybe_convert_objects."""
+    cdef Py_ssize_t pos, n = index.shape[0]
+    cdef ndarray[object] mapped = np.empty(n, dtype=np.object_)
+
+    # Function-level import, resolved each time arrmap is called.
+    from pandas.lib import maybe_convert_objects
+
+    for pos in range(n):
+        mapped[pos] = func(index[pos])
+
+    return maybe_convert_objects(mapped)
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_float32(ndarray[float32_t] index):
+    """
+    Build a dict that maps each value of `index` to its position.
+
+    When a value occurs more than once the last position wins, because
+    later assignments overwrite earlier ones.
+
+    Example:
+        array(['hi', 'there']) --> {'hi': 0, 'there': 1}
+    """
+    cdef:
+        Py_ssize_t pos, n = len(index)
+        dict mapping = {}
+
+    # Bounds and wraparound checks are switched off by the decorators.
+    for pos in range(n):
+        mapping[index[pos]] = pos
+
+    return mapping
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new,
+                 limit=None):
+    """Forward-fill indexer into `old` for each `new` value (-1: no fill)."""
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef float32_t cur, next
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+    # With limit=None the cap is nright, the total number of slots
+    if limit is None:
+        lim = nright
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+    # Empty input, or new's last value is below old[0]: all slots stay -1
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+        return indexer
+    # NOTE(review): presumably both arrays are sorted ascending -- confirm
+    i = j = 0
+    cur = old[0]
+    # Skip leading new values smaller than old[0]; they keep -1
+    while j <= nright - 1 and new[j] < cur:
+        j += 1
+
+    while True:
+        if j == nright:
+            break
+        # Last old value reached: match the remaining new values against it
+        if i == nleft - 1:
+            while j < nright:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] > cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j += 1
+            break
+
+        next = old[i + 1]
+        # Consume new values in [cur, next); exact hits never count as fills
+        while j < nright and cur <= new[j] < next:
+            if new[j] == cur:
+                indexer[j] = i
+            elif fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j += 1
+        # The fill limit is tracked per old value: reset before moving on
+        fill_count = 0
+        i += 1
+        cur = next
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_float32(ndarray[float32_t] values,
+                        ndarray[uint8_t, cast=True] mask,
+                        limit=None):
+    """
+    Forward-fill `values` in place wherever `mask` is set.
+
+    Each masked slot receives the most recent unmasked value (initially
+    values[0]); at most `limit` consecutive masked slots are filled.
+    """
+    cdef:
+        Py_ssize_t pos, n = len(values)
+        float32_t carried
+        int max_fill, run = 0
+
+    if n == 0:  # GH 2778: empty input, nothing to fill
+        return
+    if limit is None:
+        max_fill = n
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+    carried = values[0]
+    for pos in range(n):
+        if not mask[pos]:
+            run = 0
+            carried = values[pos]
+        elif run < max_fill:
+            run += 1
+            values[pos] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values,
+                           ndarray[uint8_t, ndim=2] mask,
+                           limit=None):
+    """
+    Forward-fill each row of `values` in place where `mask` is set; the
+    carried value and the fill counter restart on every row.
+    """
+    cdef:
+        Py_ssize_t row, col, n_rows, n_cols
+        float32_t carried
+        int max_fill, run = 0
+
+    n_rows, n_cols = (<object> values).shape
+    if n_cols == 0:  # GH 2778: no columns, nothing to fill
+        return
+    if limit is None:
+        max_fill = n_cols
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+
+    for row in range(n_rows):
+        run = 0
+        carried = values[row, 0]
+        for col in range(n_cols):
+            if not mask[row, col]:
+                run = 0
+                carried = values[row, col]
+            elif run < max_fill:
+                run += 1
+                values[row, col] = carried
+
+"""
+Backfilling logic for generating fill vector
+
+Diagram of what's going on
+
+Old      New    Fill vector    Mask
+         .        0               1
+         .        0               1
+         .        0               1
+A        A        0               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+B        B        1               1
+         .        2               1
+         .        2               1
+         .        2               1
+C        C        2               1
+         .                        0
+         .                        0
+D
+"""
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new,
+                      limit=None):
+    """Backward-fill indexer into `old` for each `new` value (-1: no fill)."""
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef float32_t cur, prev
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+    # With limit=None the cap is nright, the total number of slots
+    if limit is None:
+        lim = nright
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+    # Empty input, or new[0] is above old's last value: all slots stay -1
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
+    # NOTE(review): presumably both arrays are sorted ascending -- confirm
+    i = nleft - 1
+    j = nright - 1
+    cur = old[nleft - 1]
+    # Skip trailing new values greater than old's last value; they keep -1
+    while j >= 0 and new[j] > cur:
+        j -= 1
+
+    while True:
+        if j < 0:
+            break
+        # First old value reached: match the remaining new values against it
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]
+        # Consume new values in (prev, cur]; exact hits never count as fills
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+        # The fill limit is tracked per old value: reset before moving on
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_inplace_float32(ndarray[float32_t] values,
+                             ndarray[uint8_t, cast=True] mask,
+                             limit=None):
+    """
+    Backward-fill `values` in place wherever `mask` is set.
+
+    Scanning from the end, each masked slot takes the value carried from
+    the right (initially the last element); at most `limit` in a row.
+    """
+    cdef:
+        Py_ssize_t pos, n = len(values)
+        float32_t carried
+        int max_fill, run = 0
+
+    if n == 0:  # GH 2778: empty input, nothing to fill
+        return
+    if limit is None:
+        max_fill = n
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+    carried = values[n - 1]
+    for pos in range(n - 1, -1, -1):
+        if not mask[pos]:
+            run = 0
+            carried = values[pos]
+        elif run < max_fill:
+            run += 1
+            values[pos] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values,
+                                ndarray[uint8_t, ndim=2] mask,
+                                limit=None):
+    """
+    Backward-fill each row of `values` in place where `mask` is set; the
+    carried value and the fill counter restart on every row.
+    """
+    cdef:
+        Py_ssize_t row, col, n_rows, n_cols
+        float32_t carried
+        int max_fill, run = 0
+
+    n_rows, n_cols = (<object> values).shape
+    if n_cols == 0:  # GH 2778: no columns, nothing to fill
+        return
+    if limit is None:
+        max_fill = n_cols
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+
+    for row in range(n_rows):
+        run = 0
+        carried = values[row, n_cols - 1]
+        for col in range(n_cols - 1, -1, -1):
+            if not mask[row, col]:
+                run = 0
+                carried = values[row, col]
+            elif run < max_fill:
+                run += 1
+                values[row, col] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_float32(ndarray[float32_t] arr, bint timelike):
+    """
+    Check whether `arr` is monotonically increasing and/or decreasing.
+
+    Equal neighbours keep both flags.  Both flags are false when a value is
+    unordered against its predecessor (NaN) or, if `timelike`, equals iNaT.
+
+    Returns
+    -------
+    is_monotonic_inc, is_monotonic_dec
+    """
+    cdef:
+        Py_ssize_t pos, n = len(arr)
+        float32_t before, here
+        bint increasing = 1
+        bint decreasing = 1
+
+    if n < 2:
+        if n == 1 and (arr[0] != arr[0] or (timelike and arr[0] == iNaT)):
+            # the only value is NaN, or iNaT for timelike data
+            return False, False
+        return True, True
+
+    if timelike and arr[0] == iNaT:
+        return False, False
+
+    # The scan needs no Python objects, so it runs with the GIL released.
+    with nogil:
+        before = arr[0]
+        for pos in range(1, n):
+            here = arr[pos]
+            if timelike and here == iNaT:
+                increasing = 0
+                decreasing = 0
+                break
+            if here < before:
+                increasing = 0
+            elif here > before:
+                decreasing = 0
+            elif here == before:
+                pass  # ties break neither direction
+            else:
+                # no ordering holds: here or before is NaN
+                increasing = 0
+                decreasing = 0
+                break
+            if not (increasing or decreasing):
+                # neither direction can recover, stop scanning
+                break
+            before = here
+    return increasing, decreasing
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_float32(ndarray[float32_t] index, ndarray labels):
+    """
+    Map each label to the list of `index` values found at its positions.
+
+    Positions whose label satisfies is_null_datetimelike are skipped.
+    """
+    cdef:
+        dict groups = {}
+        list bucket
+        object label
+        Py_ssize_t pos, n = len(index)
+
+    if n != len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    for pos in range(n):
+        label = util.get_value_1d(labels, pos)
+        if is_null_datetimelike(label):
+            continue
+        bucket = groups.get(label)
+        if bucket is None:
+            groups[label] = [index[pos]]
+        else:
+            bucket.append(index[pos])
+    return groups
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_float32(ndarray[float32_t] index, object func):
+    """Map `func` over `index`; result goes through maybe_convert_objects."""
+    cdef Py_ssize_t pos, n = index.shape[0]
+    cdef ndarray[object] mapped = np.empty(n, dtype=np.object_)
+
+    # Function-level import, resolved each time arrmap is called.
+    from pandas.lib import maybe_convert_objects
+
+    for pos in range(n):
+        mapped[pos] = func(index[pos])
+
+    return maybe_convert_objects(mapped)
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_object(ndarray[object] index):
+    """
+    Build a dict that maps each value of `index` to its position.
+
+    When a value occurs more than once the last position wins, because
+    later assignments overwrite earlier ones.
+
+    Example:
+        array(['hi', 'there']) --> {'hi': 0, 'there': 1}
+    """
+    cdef:
+        Py_ssize_t pos, n = len(index)
+        dict mapping = {}
+
+    # Bounds and wraparound checks are switched off by the decorators.
+    for pos in range(n):
+        mapping[index[pos]] = pos
+
+    return mapping
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_object(ndarray[object] old, ndarray[object] new,
+                 limit=None):
+    """Forward-fill indexer into `old` for each `new` value (-1: no fill)."""
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef object cur, next
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+    # With limit=None the cap is nright, the total number of slots
+    if limit is None:
+        lim = nright
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+    # Empty input, or new's last value is below old[0]: all slots stay -1
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+        return indexer
+    # NOTE(review): presumably both arrays are sorted ascending -- confirm
+    i = j = 0
+    cur = old[0]
+    # Skip leading new values smaller than old[0]; they keep -1
+    while j <= nright - 1 and new[j] < cur:
+        j += 1
+
+    while True:
+        if j == nright:
+            break
+        # Last old value reached: match the remaining new values against it
+        if i == nleft - 1:
+            while j < nright:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] > cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j += 1
+            break
+
+        next = old[i + 1]
+        # Consume new values in [cur, next); exact hits never count as fills
+        while j < nright and cur <= new[j] < next:
+            if new[j] == cur:
+                indexer[j] = i
+            elif fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j += 1
+        # The fill limit is tracked per old value: reset before moving on
+        fill_count = 0
+        i += 1
+        cur = next
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_object(ndarray[object] values,
+                       ndarray[uint8_t, cast=True] mask,
+                       limit=None):
+    """
+    Forward-fill `values` in place wherever `mask` is set.
+
+    Each masked slot receives the most recent unmasked value (initially
+    values[0]); at most `limit` consecutive masked slots are filled.
+    """
+    cdef:
+        Py_ssize_t pos, n = len(values)
+        object carried
+        int max_fill, run = 0
+
+    if n == 0:  # GH 2778: empty input, nothing to fill
+        return
+    if limit is None:
+        max_fill = n
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+    carried = values[0]
+    for pos in range(n):
+        if not mask[pos]:
+            run = 0
+            carried = values[pos]
+        elif run < max_fill:
+            run += 1
+            values[pos] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_2d_inplace_object(ndarray[object, ndim=2] values,
+                          ndarray[uint8_t, ndim=2] mask,
+                          limit=None):
+    """
+    Forward-fill each row of `values` in place where `mask` is set; the
+    carried value and the fill counter restart on every row.
+    """
+    cdef:
+        Py_ssize_t row, col, n_rows, n_cols
+        object carried
+        int max_fill, run = 0
+
+    n_rows, n_cols = (<object> values).shape
+    if n_cols == 0:  # GH 2778: no columns, nothing to fill
+        return
+    if limit is None:
+        max_fill = n_cols
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+
+    for row in range(n_rows):
+        run = 0
+        carried = values[row, 0]
+        for col in range(n_cols):
+            if not mask[row, col]:
+                run = 0
+                carried = values[row, col]
+            elif run < max_fill:
+                run += 1
+                values[row, col] = carried
+
+"""
+Backfilling logic for generating fill vector
+
+Diagram of what's going on
+
+Old      New    Fill vector    Mask
+         .        0               1
+         .        0               1
+         .        0               1
+A        A        0               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+B        B        1               1
+         .        2               1
+         .        2               1
+         .        2               1
+C        C        2               1
+         .                        0
+         .                        0
+D
+"""
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_object(ndarray[object] old, ndarray[object] new,
+                      limit=None):
+    """Backward-fill indexer into `old` for each `new` value (-1: no fill)."""
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef object cur, prev
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+    # With limit=None the cap is nright, the total number of slots
+    if limit is None:
+        lim = nright
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+    # Empty input, or new[0] is above old's last value: all slots stay -1
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
+    # NOTE(review): presumably both arrays are sorted ascending -- confirm
+    i = nleft - 1
+    j = nright - 1
+    cur = old[nleft - 1]
+    # Skip trailing new values greater than old's last value; they keep -1
+    while j >= 0 and new[j] > cur:
+        j -= 1
+
+    while True:
+        if j < 0:
+            break
+        # First old value reached: match the remaining new values against it
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]
+        # Consume new values in (prev, cur]; exact hits never count as fills
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+        # The fill limit is tracked per old value: reset before moving on
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_inplace_object(ndarray[object] values,
+                            ndarray[uint8_t, cast=True] mask,
+                            limit=None):
+    """
+    Backward-fill `values` in place wherever `mask` is set.
+
+    Scanning from the end, each masked slot takes the value carried from
+    the right (initially the last element); at most `limit` in a row.
+    """
+    cdef:
+        Py_ssize_t pos, n = len(values)
+        object carried
+        int max_fill, run = 0
+
+    if n == 0:  # GH 2778: empty input, nothing to fill
+        return
+    if limit is None:
+        max_fill = n
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+    carried = values[n - 1]
+    for pos in range(n - 1, -1, -1):
+        if not mask[pos]:
+            run = 0
+            carried = values[pos]
+        elif run < max_fill:
+            run += 1
+            values[pos] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_2d_inplace_object(ndarray[object, ndim=2] values,
+                               ndarray[uint8_t, ndim=2] mask,
+                               limit=None):
+    """
+    Backward-fill each row of `values` in place where `mask` is set; the
+    carried value and the fill counter restart on every row.
+    """
+    cdef:
+        Py_ssize_t row, col, n_rows, n_cols
+        object carried
+        int max_fill, run = 0
+
+    n_rows, n_cols = (<object> values).shape
+    if n_cols == 0:  # GH 2778: no columns, nothing to fill
+        return
+    if limit is None:
+        max_fill = n_cols
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        max_fill = limit
+
+    for row in range(n_rows):
+        run = 0
+        carried = values[row, n_cols - 1]
+        for col in range(n_cols - 1, -1, -1):
+            if not mask[row, col]:
+                run = 0
+                carried = values[row, col]
+            elif run < max_fill:
+                run += 1
+                values[row, col] = carried
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_object(ndarray[object] arr, bint timelike):
+    """
+    Check whether `arr` is monotonically increasing and/or decreasing.
+
+    Equal neighbours keep both flags.  Both flags are false when a value
+    is unordered against its predecessor (e.g. NaN) or, if `timelike`,
+    equals iNaT.
+
+    Returns
+    -------
+    is_monotonic_inc, is_monotonic_dec
+    """
+    cdef:
+        Py_ssize_t pos, n = len(arr)
+        object before, here
+        bint increasing = 1, decreasing = 1
+
+    if n < 2:
+        if n == 1 and (arr[0] != arr[0] or (timelike and arr[0] == iNaT)):
+            # the only value is NaN, or iNaT for timelike data
+            return False, False
+        return True, True
+
+    if timelike and arr[0] == iNaT:
+        return False, False
+
+    # Elements are Python objects: the comparisons below are ordinary
+    # Python comparisons on those objects.
+    before = arr[0]
+    for pos in range(1, n):
+        here = arr[pos]
+        if timelike and here == iNaT:
+            increasing = 0
+            decreasing = 0
+            break
+        if here < before:
+            increasing = 0
+        elif here > before:
+            decreasing = 0
+        elif here == before:
+            pass  # ties break neither direction
+        else:
+            # no ordering holds: e.g. here or before is NaN
+            increasing = 0
+            decreasing = 0
+            break
+        if not (increasing or decreasing):
+            # neither direction can recover, stop scanning
+            break
+        before = here
+    return increasing, decreasing
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_object(ndarray[object] index, ndarray labels):  # group the values of index by their label
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    if not length == len(labels):  # index and labels are paired by position
+        raise AssertionError("len(index) != len(labels)")
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)  # label at position i, fetched through the util helper
+
+        if is_null_datetimelike(key):  # positions whose label tests as null are left out
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]  # first occurrence of this label starts its list
+
+    return result  # dict: label -> list of index values in encounter order
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_object(ndarray[object] index, object func):  # apply func to every element of index
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)  # object buffer: func may return anything
+
+    from pandas.lib import maybe_convert_objects  # resolved at call time, not at module import
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)  # presumably narrows the object array to a tighter dtype - confirm in pandas.lib
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_int32(ndarray[int32_t] index):  # dict of value -> position in index
+    """
+    Produce a dict mapping the values of the input array to their respective
+    locations.
+
+    Example:
+        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
+
+    Better to do this with Cython because of the enormous speed boost.
+    """
+    cdef Py_ssize_t i, length
+    cdef dict result = {}
+
+    length = len(index)
+
+    for i in range(length):
+        result[index[i]] = i  # a repeated value keeps its last position
+
+    return result
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new,  # forward-fill indexer: for each new[j], position in old of the last value <= new[j]
+                 limit=None):  # cap on padded (non-exact) matches per old value; None = no cap
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef int32_t cur, next
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)  # -1 marks a new value with no match
+
+    if limit is None:
+        lim = nright  # at most nright fills can happen, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:  # an input is empty, or even the last new value precedes old[0]
+        return indexer
+
+    i = j = 0  # two-pointer walk; presumably relies on old/new being sorted ascending - confirm with callers
+
+    cur = old[0]
+
+    while j <= nright - 1 and new[j] < cur:  # skip new values before old[0]; they stay -1
+        j += 1
+
+    while True:
+        if j == nright:  # every new value has been handled
+            break
+
+        if i == nleft - 1:  # last old value: it covers all remaining new values
+            while j < nright:
+                if new[j] == cur:  # exact match, not counted against the limit
+                    indexer[j] = i
+                elif new[j] > cur and fill_count < lim:  # padded match, counted against the limit
+                    indexer[j] = i
+                    fill_count += 1
+                j += 1
+            break
+
+        next = old[i + 1]  # upper bound of the interval owned by old[i]
+
+        while j < nright and cur <= new[j] < next:  # new values in [cur, next) map to i
+            if new[j] == cur:
+                indexer[j] = i
+            elif fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j += 1
+
+        fill_count = 0  # the fill budget restarts for the next old value
+        i += 1
+        cur = next
+
+    return indexer  # int64 array, one entry per element of new
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_int32(ndarray[int32_t] values,  # forward-fill masked elements of values, in place
+                         ndarray[uint8_t, cast=True] mask,  # nonzero marks an element to be filled
+                         limit=None):  # max fills per run of masked elements; None = no cap
+    cdef Py_ssize_t i, N
+    cdef int32_t val
+    cdef int lim, fill_count = 0
+
+    N = len(values)
+
+    # GH 2778
+    if N == 0:  # guard: the values[0] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N elements, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    val = values[0]  # seed: used when the array starts with masked elements
+    for i in range(N):  # left-to-right, so a gap takes the value that precedes it
+        if mask[i]:
+            if fill_count >= lim:  # cap reached for this run: leave the element untouched
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0  # an unmasked element ends the run and becomes the fill value
+            val = values[i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values,  # forward-fill masked cells of each row of values, in place
+                            ndarray[uint8_t, ndim=2] mask,  # nonzero marks a cell to be filled
+                            limit=None):  # max fills per run of masked cells; None = no cap
+    cdef Py_ssize_t i, j, N, K
+    cdef int32_t val
+    cdef int lim, fill_count = 0
+
+    K, N = (<object> values).shape  # K rows, N columns
+
+    # GH 2778
+    if N == 0:  # guard: the values[j, 0] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N cells, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0  # the fill budget restarts for every row
+        val = values[j, 0]  # seed: used when the row starts with masked cells
+        for i in range(N):  # left-to-right, so a gap takes the value that precedes it
+            if mask[j, i]:
+                if fill_count >= lim:  # cap reached for this run: leave the cell untouched
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0  # an unmasked cell ends the run and becomes the fill value
+                val = values[j, i]
+
+"""
+Backfilling logic for generating fill vector
+
+Diagram of what's going on
+
+Old      New    Fill vector    Mask
+         .        0               1
+         .        0               1
+         .        0               1
+A        A        0               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+B        B        1               1
+         .        2               1
+         .        2               1
+         .        2               1
+C        C        2               1
+         .                        0
+         .                        0
+D
+"""
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new,  # backward-fill indexer: for each new[j], position in old of the first value >= new[j]
+                      limit=None):  # cap on filled (non-exact) matches per old value; None = no cap
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef int32_t cur, prev
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)  # -1 marks a new value with no match
+
+    if limit is None:
+        lim = nright  # at most nright fills can happen, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:  # an input is empty, or even the first new value is past the last old value
+        return indexer
+
+    i = nleft - 1  # two-pointer walk from the right; presumably relies on old/new being sorted ascending - confirm with callers
+    j = nright - 1
+
+    cur = old[nleft - 1]
+
+    while j >= 0 and new[j] > cur:  # skip new values beyond the last old value; they stay -1
+        j -= 1
+
+    while True:
+        if j < 0:  # every new value has been handled
+            break
+
+        if i == 0:  # first old value: it covers all remaining new values
+            while j >= 0:
+                if new[j] == cur:  # exact match, not counted against the limit
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:  # filled match, counted against the limit
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]  # lower bound of the interval owned by old[i]
+
+        while j >= 0 and prev < new[j] <= cur:  # new values in (prev, cur] map to i
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+
+        fill_count = 0  # the fill budget restarts for the next old value
+        i -= 1
+        cur = prev
+
+    return indexer  # int64 array, one entry per element of new
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_inplace_int32(ndarray[int32_t] values,  # back-fill masked elements of values, in place
+                              ndarray[uint8_t, cast=True] mask,  # nonzero marks an element to be filled
+                              limit=None):  # max fills per run of masked elements; None = no cap
+    cdef Py_ssize_t i, N
+    cdef int32_t val
+    cdef int lim, fill_count = 0
+
+    N = len(values)
+
+    # GH 2778
+    if N == 0:  # guard: the values[N - 1] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N elements, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    val = values[N - 1]  # seed: used when the array ends in masked elements
+    for i in range(N - 1, -1, -1):  # right-to-left, so a gap takes the value that follows it
+        if mask[i]:
+            if fill_count >= lim:  # cap reached for this run: leave the element untouched
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0  # an unmasked element ends the run and becomes the fill value
+            val = values[i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values,  # back-fill masked cells of each row of values, in place
+                                 ndarray[uint8_t, ndim=2] mask,  # nonzero marks a cell to be filled
+                                 limit=None):  # max fills per run of masked cells; None = no cap
+    cdef Py_ssize_t i, j, N, K
+    cdef int32_t val
+    cdef int lim, fill_count = 0
+
+    K, N = (<object> values).shape  # K rows, N columns
+
+    # GH 2778
+    if N == 0:  # guard: the values[j, N - 1] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N cells, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0  # the fill budget restarts for every row
+        val = values[j, N - 1]  # seed: used when the row ends in masked cells
+        for i in range(N - 1, -1, -1):  # right-to-left, so a gap takes the value that follows it
+            if mask[j, i]:
+                if fill_count >= lim:  # cap reached for this run: leave the cell untouched
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0  # an unmasked cell ends the run and becomes the fill value
+                val = values[j, i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_int32(ndarray[int32_t] arr, bint timelike):  # one pass over arr; timelike additionally treats iNaT as a null
+    """
+    Returns
+    -------
+    is_monotonic_inc, is_monotonic_dec
+    """
+    cdef:
+        Py_ssize_t i, n
+        int32_t prev, cur
+        bint is_monotonic_inc = 1  # stays set while no element is smaller than its predecessor
+        bint is_monotonic_dec = 1  # stays set while no element is larger than its predecessor
+
+    n = len(arr)
+
+    if n == 1:
+        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):  # x != x is the NaN test
+            # single value is NaN
+            return False, False
+        else:
+            return True, True
+    elif n < 2:  # n == 0: an empty array counts as monotonic both ways
+        return True, True
+
+    if timelike and arr[0] == iNaT:  # a leading iNaT rules out both directions
+        return False, False
+
+    with nogil:  # the scan runs with the GIL released
+        prev = arr[0]
+        for i in range(1, n):
+            cur = arr[i]
+            if timelike and cur == iNaT:  # iNaT anywhere: neither direction holds
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            if cur < prev:  # a drop breaks "increasing"
+                is_monotonic_inc = 0
+            elif cur > prev:  # a rise breaks "decreasing"
+                is_monotonic_dec = 0
+            elif cur == prev:
+                pass # is_unique = 0
+            else:
+                # cur or prev is NaN
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            if not is_monotonic_inc and not is_monotonic_dec:  # neither direction can hold any more: stop scanning
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            prev = cur
+    return is_monotonic_inc, is_monotonic_dec
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_int32(ndarray[int32_t] index, ndarray labels):  # group the values of index by their label
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    if not length == len(labels):  # index and labels are paired by position
+        raise AssertionError("len(index) != len(labels)")
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)  # label at position i, fetched through the util helper
+
+        if is_null_datetimelike(key):  # positions whose label tests as null are left out
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]  # first occurrence of this label starts its list
+
+    return result  # dict: label -> list of index values in encounter order
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_int32(ndarray[int32_t] index, object func):  # apply func to every element of index
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)  # object buffer: func may return anything
+
+    from pandas.lib import maybe_convert_objects  # resolved at call time, not at module import
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)  # presumably narrows the object array to a tighter dtype - confirm in pandas.lib
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_int64(ndarray[int64_t] index):  # dict of value -> position in index
+    """
+    Produce a dict mapping the values of the input array to their respective
+    locations.
+
+    Example:
+        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
+
+    Better to do this with Cython because of the enormous speed boost.
+    """
+    cdef Py_ssize_t i, length
+    cdef dict result = {}
+
+    length = len(index)
+
+    for i in range(length):
+        result[index[i]] = i  # a repeated value keeps its last position
+
+    return result
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new,  # forward-fill indexer: for each new[j], position in old of the last value <= new[j]
+                 limit=None):  # cap on padded (non-exact) matches per old value; None = no cap
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef int64_t cur, next
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)  # -1 marks a new value with no match
+
+    if limit is None:
+        lim = nright  # at most nright fills can happen, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:  # an input is empty, or even the last new value precedes old[0]
+        return indexer
+
+    i = j = 0  # two-pointer walk; presumably relies on old/new being sorted ascending - confirm with callers
+
+    cur = old[0]
+
+    while j <= nright - 1 and new[j] < cur:  # skip new values before old[0]; they stay -1
+        j += 1
+
+    while True:
+        if j == nright:  # every new value has been handled
+            break
+
+        if i == nleft - 1:  # last old value: it covers all remaining new values
+            while j < nright:
+                if new[j] == cur:  # exact match, not counted against the limit
+                    indexer[j] = i
+                elif new[j] > cur and fill_count < lim:  # padded match, counted against the limit
+                    indexer[j] = i
+                    fill_count += 1
+                j += 1
+            break
+
+        next = old[i + 1]  # upper bound of the interval owned by old[i]
+
+        while j < nright and cur <= new[j] < next:  # new values in [cur, next) map to i
+            if new[j] == cur:
+                indexer[j] = i
+            elif fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j += 1
+
+        fill_count = 0  # the fill budget restarts for the next old value
+        i += 1
+        cur = next
+
+    return indexer  # int64 array, one entry per element of new
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_int64(ndarray[int64_t] values,  # forward-fill masked elements of values, in place
+                         ndarray[uint8_t, cast=True] mask,  # nonzero marks an element to be filled
+                         limit=None):  # max fills per run of masked elements; None = no cap
+    cdef Py_ssize_t i, N
+    cdef int64_t val
+    cdef int lim, fill_count = 0
+
+    N = len(values)
+
+    # GH 2778
+    if N == 0:  # guard: the values[0] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N elements, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    val = values[0]  # seed: used when the array starts with masked elements
+    for i in range(N):  # left-to-right, so a gap takes the value that precedes it
+        if mask[i]:
+            if fill_count >= lim:  # cap reached for this run: leave the element untouched
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0  # an unmasked element ends the run and becomes the fill value
+            val = values[i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values,  # forward-fill masked cells of each row of values, in place
+                            ndarray[uint8_t, ndim=2] mask,  # nonzero marks a cell to be filled
+                            limit=None):  # max fills per run of masked cells; None = no cap
+    cdef Py_ssize_t i, j, N, K
+    cdef int64_t val
+    cdef int lim, fill_count = 0
+
+    K, N = (<object> values).shape  # K rows, N columns
+
+    # GH 2778
+    if N == 0:  # guard: the values[j, 0] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N cells, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0  # the fill budget restarts for every row
+        val = values[j, 0]  # seed: used when the row starts with masked cells
+        for i in range(N):  # left-to-right, so a gap takes the value that precedes it
+            if mask[j, i]:
+                if fill_count >= lim:  # cap reached for this run: leave the cell untouched
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0  # an unmasked cell ends the run and becomes the fill value
+                val = values[j, i]
+
+"""
+Backfilling logic for generating fill vector
+
+Diagram of what's going on
+
+Old      New    Fill vector    Mask
+         .        0               1
+         .        0               1
+         .        0               1
+A        A        0               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+B        B        1               1
+         .        2               1
+         .        2               1
+         .        2               1
+C        C        2               1
+         .                        0
+         .                        0
+D
+"""
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new,  # backward-fill indexer: for each new[j], position in old of the first value >= new[j]
+                      limit=None):  # cap on filled (non-exact) matches per old value; None = no cap
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef int64_t cur, prev
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)  # -1 marks a new value with no match
+
+    if limit is None:
+        lim = nright  # at most nright fills can happen, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:  # an input is empty, or even the first new value is past the last old value
+        return indexer
+
+    i = nleft - 1  # two-pointer walk from the right; presumably relies on old/new being sorted ascending - confirm with callers
+    j = nright - 1
+
+    cur = old[nleft - 1]
+
+    while j >= 0 and new[j] > cur:  # skip new values beyond the last old value; they stay -1
+        j -= 1
+
+    while True:
+        if j < 0:  # every new value has been handled
+            break
+
+        if i == 0:  # first old value: it covers all remaining new values
+            while j >= 0:
+                if new[j] == cur:  # exact match, not counted against the limit
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:  # filled match, counted against the limit
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]  # lower bound of the interval owned by old[i]
+
+        while j >= 0 and prev < new[j] <= cur:  # new values in (prev, cur] map to i
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+
+        fill_count = 0  # the fill budget restarts for the next old value
+        i -= 1
+        cur = prev
+
+    return indexer  # int64 array, one entry per element of new
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_inplace_int64(ndarray[int64_t] values,  # back-fill masked elements of values, in place
+                              ndarray[uint8_t, cast=True] mask,  # nonzero marks an element to be filled
+                              limit=None):  # max fills per run of masked elements; None = no cap
+    cdef Py_ssize_t i, N
+    cdef int64_t val
+    cdef int lim, fill_count = 0
+
+    N = len(values)
+
+    # GH 2778
+    if N == 0:  # guard: the values[N - 1] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N elements, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    val = values[N - 1]  # seed: used when the array ends in masked elements
+    for i in range(N - 1, -1, -1):  # right-to-left, so a gap takes the value that follows it
+        if mask[i]:
+            if fill_count >= lim:  # cap reached for this run: leave the element untouched
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0  # an unmasked element ends the run and becomes the fill value
+            val = values[i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values,  # back-fill masked cells of each row of values, in place
+                                 ndarray[uint8_t, ndim=2] mask,  # nonzero marks a cell to be filled
+                                 limit=None):  # max fills per run of masked cells; None = no cap
+    cdef Py_ssize_t i, j, N, K
+    cdef int64_t val
+    cdef int lim, fill_count = 0
+
+    K, N = (<object> values).shape  # K rows, N columns
+
+    # GH 2778
+    if N == 0:  # guard: the values[j, N - 1] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N cells, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0  # the fill budget restarts for every row
+        val = values[j, N - 1]  # seed: used when the row ends in masked cells
+        for i in range(N - 1, -1, -1):  # right-to-left, so a gap takes the value that follows it
+            if mask[j, i]:
+                if fill_count >= lim:  # cap reached for this run: leave the cell untouched
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0  # an unmasked cell ends the run and becomes the fill value
+                val = values[j, i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_int64(ndarray[int64_t] arr, bint timelike):  # one pass over arr; timelike additionally treats iNaT as a null
+    """
+    Returns
+    -------
+    is_monotonic_inc, is_monotonic_dec
+    """
+    cdef:
+        Py_ssize_t i, n
+        int64_t prev, cur
+        bint is_monotonic_inc = 1  # stays set while no element is smaller than its predecessor
+        bint is_monotonic_dec = 1  # stays set while no element is larger than its predecessor
+
+    n = len(arr)
+
+    if n == 1:
+        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):  # x != x is the NaN test
+            # single value is NaN
+            return False, False
+        else:
+            return True, True
+    elif n < 2:  # n == 0: an empty array counts as monotonic both ways
+        return True, True
+
+    if timelike and arr[0] == iNaT:  # a leading iNaT rules out both directions
+        return False, False
+
+    with nogil:  # the scan runs with the GIL released
+        prev = arr[0]
+        for i in range(1, n):
+            cur = arr[i]
+            if timelike and cur == iNaT:  # iNaT anywhere: neither direction holds
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            if cur < prev:  # a drop breaks "increasing"
+                is_monotonic_inc = 0
+            elif cur > prev:  # a rise breaks "decreasing"
+                is_monotonic_dec = 0
+            elif cur == prev:
+                pass # is_unique = 0
+            else:
+                # cur or prev is NaN
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            if not is_monotonic_inc and not is_monotonic_dec:  # neither direction can hold any more: stop scanning
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            prev = cur
+    return is_monotonic_inc, is_monotonic_dec
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_int64(ndarray[int64_t] index, ndarray labels):  # group the values of index by their label
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    if not length == len(labels):  # index and labels are paired by position
+        raise AssertionError("len(index) != len(labels)")
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)  # label at position i, fetched through the util helper
+
+        if is_null_datetimelike(key):  # positions whose label tests as null are left out
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]  # first occurrence of this label starts its list
+
+    return result  # dict: label -> list of index values in encounter order
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_int64(ndarray[int64_t] index, object func):  # apply func to every element of index
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)  # object buffer: func may return anything
+
+    from pandas.lib import maybe_convert_objects  # resolved at call time, not at module import
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)  # presumably narrows the object array to a tighter dtype - confirm in pandas.lib
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_bool(ndarray[uint8_t] index):  # dict of value -> position in index
+    """
+    Produce a dict mapping the values of the input array to their respective
+    locations.
+
+    Example:
+        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
+
+    Better to do this with Cython because of the enormous speed boost.
+    """
+    cdef Py_ssize_t i, length
+    cdef dict result = {}
+
+    length = len(index)
+
+    for i in range(length):
+        result[index[i]] = i  # a repeated value keeps its last position
+
+    return result
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,  # forward-fill indexer: for each new[j], position in old of the last value <= new[j]
+                 limit=None):  # cap on padded (non-exact) matches per old value; None = no cap
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef uint8_t cur, next
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)  # -1 marks a new value with no match
+
+    if limit is None:
+        lim = nright  # at most nright fills can happen, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:  # an input is empty, or even the last new value precedes old[0]
+        return indexer
+
+    i = j = 0  # two-pointer walk; presumably relies on old/new being sorted ascending - confirm with callers
+
+    cur = old[0]
+
+    while j <= nright - 1 and new[j] < cur:  # skip new values before old[0]; they stay -1
+        j += 1
+
+    while True:
+        if j == nright:  # every new value has been handled
+            break
+
+        if i == nleft - 1:  # last old value: it covers all remaining new values
+            while j < nright:
+                if new[j] == cur:  # exact match, not counted against the limit
+                    indexer[j] = i
+                elif new[j] > cur and fill_count < lim:  # padded match, counted against the limit
+                    indexer[j] = i
+                    fill_count += 1
+                j += 1
+            break
+
+        next = old[i + 1]  # upper bound of the interval owned by old[i]
+
+        while j < nright and cur <= new[j] < next:  # new values in [cur, next) map to i
+            if new[j] == cur:
+                indexer[j] = i
+            elif fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j += 1
+
+        fill_count = 0  # the fill budget restarts for the next old value
+        i += 1
+        cur = next
+
+    return indexer  # int64 array, one entry per element of new
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_bool(ndarray[uint8_t] values,  # forward-fill masked elements of values, in place
+                         ndarray[uint8_t, cast=True] mask,  # nonzero marks an element to be filled
+                         limit=None):  # max fills per run of masked elements; None = no cap
+    cdef Py_ssize_t i, N
+    cdef uint8_t val
+    cdef int lim, fill_count = 0
+
+    N = len(values)
+
+    # GH 2778
+    if N == 0:  # guard: the values[0] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N elements, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    val = values[0]  # seed: used when the array starts with masked elements
+    for i in range(N):  # left-to-right, so a gap takes the value that precedes it
+        if mask[i]:
+            if fill_count >= lim:  # cap reached for this run: leave the element untouched
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0  # an unmasked element ends the run and becomes the fill value
+            val = values[i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,  # forward-fill masked cells of each row of values, in place
+                            ndarray[uint8_t, ndim=2] mask,  # nonzero marks a cell to be filled
+                            limit=None):  # max fills per run of masked cells; None = no cap
+    cdef Py_ssize_t i, j, N, K
+    cdef uint8_t val
+    cdef int lim, fill_count = 0
+
+    K, N = (<object> values).shape  # K rows, N columns
+
+    # GH 2778
+    if N == 0:  # guard: the values[j, 0] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N cells, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0  # the fill budget restarts for every row
+        val = values[j, 0]  # seed: used when the row starts with masked cells
+        for i in range(N):  # left-to-right, so a gap takes the value that precedes it
+            if mask[j, i]:
+                if fill_count >= lim:  # cap reached for this run: leave the cell untouched
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0  # an unmasked cell ends the run and becomes the fill value
+                val = values[j, i]
+
+"""
+Backfilling logic for generating fill vector
+
+Diagram of what's going on
+
+Old      New    Fill vector    Mask
+         .        0               1
+         .        0               1
+         .        0               1
+A        A        0               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+B        B        1               1
+         .        2               1
+         .        2               1
+         .        2               1
+C        C        2               1
+         .                        0
+         .                        0
+D
+"""
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,  # backward-fill indexer: for each new[j], position in old of the first value >= new[j]
+                      limit=None):  # cap on filled (non-exact) matches per old value; None = no cap
+    cdef Py_ssize_t i, j, nleft, nright
+    cdef ndarray[int64_t, ndim=1] indexer
+    cdef uint8_t cur, prev
+    cdef int lim, fill_count = 0
+
+    nleft = len(old)
+    nright = len(new)
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)  # -1 marks a new value with no match
+
+    if limit is None:
+        lim = nright  # at most nright fills can happen, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:  # an input is empty, or even the first new value is past the last old value
+        return indexer
+
+    i = nleft - 1  # two-pointer walk from the right; presumably relies on old/new being sorted ascending - confirm with callers
+    j = nright - 1
+
+    cur = old[nleft - 1]
+
+    while j >= 0 and new[j] > cur:  # skip new values beyond the last old value; they stay -1
+        j -= 1
+
+    while True:
+        if j < 0:  # every new value has been handled
+            break
+
+        if i == 0:  # first old value: it covers all remaining new values
+            while j >= 0:
+                if new[j] == cur:  # exact match, not counted against the limit
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:  # filled match, counted against the limit
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]  # lower bound of the interval owned by old[i]
+
+        while j >= 0 and prev < new[j] <= cur:  # new values in (prev, cur] map to i
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+
+        fill_count = 0  # the fill budget restarts for the next old value
+        i -= 1
+        cur = prev
+
+    return indexer  # int64 array, one entry per element of new
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_inplace_bool(ndarray[uint8_t] values,  # back-fill masked elements of values, in place
+                              ndarray[uint8_t, cast=True] mask,  # nonzero marks an element to be filled
+                              limit=None):  # max fills per run of masked elements; None = no cap
+    cdef Py_ssize_t i, N
+    cdef uint8_t val
+    cdef int lim, fill_count = 0
+
+    N = len(values)
+
+    # GH 2778
+    if N == 0:  # guard: the values[N - 1] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N elements, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    val = values[N - 1]  # seed: used when the array ends in masked elements
+    for i in range(N - 1, -1, -1):  # right-to-left, so a gap takes the value that follows it
+        if mask[i]:
+            if fill_count >= lim:  # cap reached for this run: leave the element untouched
+                continue
+            fill_count += 1
+            values[i] = val
+        else:
+            fill_count = 0  # an unmasked element ends the run and becomes the fill value
+            val = values[i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,  # back-fill masked cells of each row of values, in place
+                                 ndarray[uint8_t, ndim=2] mask,  # nonzero marks a cell to be filled
+                                 limit=None):  # max fills per run of masked cells; None = no cap
+    cdef Py_ssize_t i, j, N, K
+    cdef uint8_t val
+    cdef int lim, fill_count = 0
+
+    K, N = (<object> values).shape  # K rows, N columns
+
+    # GH 2778
+    if N == 0:  # guard: the values[j, N - 1] seed below would index out of bounds
+        return
+
+    if limit is None:
+        lim = N  # a run cannot exceed N cells, so this is effectively unlimited
+    else:
+        if limit < 0:
+            raise ValueError('Limit must be non-negative')
+        lim = limit
+
+    for j in range(K):
+        fill_count = 0  # the fill budget restarts for every row
+        val = values[j, N - 1]  # seed: used when the row ends in masked cells
+        for i in range(N - 1, -1, -1):  # right-to-left, so a gap takes the value that follows it
+            if mask[j, i]:
+                if fill_count >= lim:  # cap reached for this run: leave the cell untouched
+                    continue
+                fill_count += 1
+                values[j, i] = val
+            else:
+                fill_count = 0  # an unmasked cell ends the run and becomes the fill value
+                val = values[j, i]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike):  # one pass over arr; timelike additionally treats iNaT as a null
+    """
+    Returns
+    -------
+    is_monotonic_inc, is_monotonic_dec
+    """
+    cdef:
+        Py_ssize_t i, n
+        uint8_t prev, cur
+        bint is_monotonic_inc = 1  # stays set while no element is smaller than its predecessor
+        bint is_monotonic_dec = 1  # stays set while no element is larger than its predecessor
+
+    n = len(arr)
+
+    if n == 1:
+        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):  # x != x is the NaN test
+            # single value is NaN
+            return False, False
+        else:
+            return True, True
+    elif n < 2:  # n == 0: an empty array counts as monotonic both ways
+        return True, True
+
+    if timelike and arr[0] == iNaT:  # a leading iNaT rules out both directions
+        return False, False
+
+    with nogil:  # the scan runs with the GIL released
+        prev = arr[0]
+        for i in range(1, n):
+            cur = arr[i]
+            if timelike and cur == iNaT:  # iNaT anywhere: neither direction holds
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            if cur < prev:  # a drop breaks "increasing"
+                is_monotonic_inc = 0
+            elif cur > prev:  # a rise breaks "decreasing"
+                is_monotonic_dec = 0
+            elif cur == prev:
+                pass # is_unique = 0
+            else:
+                # cur or prev is NaN
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            if not is_monotonic_inc and not is_monotonic_dec:  # neither direction can hold any more: stop scanning
+                is_monotonic_inc = 0
+                is_monotonic_dec = 0
+                break
+            prev = cur
+    return is_monotonic_inc, is_monotonic_dec
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_bool(ndarray[uint8_t] index, ndarray labels):  # group the values of index by their label
+    cdef dict result = {}
+    cdef Py_ssize_t i, length
+    cdef list members
+    cdef object idx, key
+
+    length = len(index)
+
+    if not length == len(labels):  # index and labels are paired by position
+        raise AssertionError("len(index) != len(labels)")
+
+    for i in range(length):
+        key = util.get_value_1d(labels, i)  # label at position i, fetched through the util helper
+
+        if is_null_datetimelike(key):  # positions whose label tests as null are left out
+            continue
+
+        idx = index[i]
+        if key in result:
+            members = result[key]
+            members.append(idx)
+        else:
+            result[key] = [idx]  # first occurrence of this label starts its list
+
+    return result  # dict: label -> list of index values in encounter order
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_bool(ndarray[uint8_t] index, object func):
+    cdef Py_ssize_t length = index.shape[0]
+    cdef Py_ssize_t i = 0
+
+    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    for i in range(length):
+        result[i] = func(index[i])
+
+    return maybe_convert_objects(result)
+
+#----------------------------------------------------------------------
+# put template
+#----------------------------------------------------------------------
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_float64(ndarray[float64_t, ndim=2] arr,
+                     ndarray[float64_t, ndim=2] out,
+                     Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def put2d_float64_float64(ndarray[float64_t, ndim=2, cast=True] values,
+                                 ndarray[int64_t] indexer, Py_ssize_t loc,
+                                 ndarray[float64_t] out):
+    cdef:
+        Py_ssize_t i, j, k
+
+    k = len(values)
+    for j from 0 <= j < k:
+        i = indexer[j]
+        out[i] = values[j, loc]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_float32(ndarray[float32_t, ndim=2] arr,
+                     ndarray[float32_t, ndim=2] out,
+                     Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def put2d_float32_float32(ndarray[float32_t, ndim=2, cast=True] values,
+                                 ndarray[int64_t] indexer, Py_ssize_t loc,
+                                 ndarray[float32_t] out):
+    cdef:
+        Py_ssize_t i, j, k
+
+    k = len(values)
+    for j from 0 <= j < k:
+        i = indexer[j]
+        out[i] = values[j, loc]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int8(ndarray[int8_t, ndim=2] arr,
+                     ndarray[float32_t, ndim=2] out,
+                     Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def put2d_int8_float32(ndarray[int8_t, ndim=2, cast=True] values,
+                                 ndarray[int64_t] indexer, Py_ssize_t loc,
+                                 ndarray[float32_t] out):
+    cdef:
+        Py_ssize_t i, j, k
+
+    k = len(values)
+    for j from 0 <= j < k:
+        i = indexer[j]
+        out[i] = values[j, loc]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int16(ndarray[int16_t, ndim=2] arr,
+                     ndarray[float32_t, ndim=2] out,
+                     Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def put2d_int16_float32(ndarray[int16_t, ndim=2, cast=True] values,
+                                 ndarray[int64_t] indexer, Py_ssize_t loc,
+                                 ndarray[float32_t] out):
+    cdef:
+        Py_ssize_t i, j, k
+
+    k = len(values)
+    for j from 0 <= j < k:
+        i = indexer[j]
+        out[i] = values[j, loc]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int32(ndarray[int32_t, ndim=2] arr,
+                     ndarray[float64_t, ndim=2] out,
+                     Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def put2d_int32_float64(ndarray[int32_t, ndim=2, cast=True] values,
+                                 ndarray[int64_t] indexer, Py_ssize_t loc,
+                                 ndarray[float64_t] out):
+    cdef:
+        Py_ssize_t i, j, k
+
+    k = len(values)
+    for j from 0 <= j < k:
+        i = indexer[j]
+        out[i] = values[j, loc]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_int64(ndarray[int64_t, ndim=2] arr,
+                     ndarray[float64_t, ndim=2] out,
+                     Py_ssize_t periods, int axis):
+    cdef:
+        Py_ssize_t i, j, sx, sy
+
+    sx, sy = (<object> arr).shape
+    if arr.flags.f_contiguous:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            if periods >= 0:
+                start, stop = periods, sx
+            else:
+                start, stop = 0, sx + periods
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            if periods >= 0:
+                start, stop = periods, sy
+            else:
+                start, stop = 0, sy + periods
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def put2d_int64_float64(ndarray[int64_t, ndim=2, cast=True] values,
+                                 ndarray[int64_t] indexer, Py_ssize_t loc,
+                                 ndarray[float64_t] out):
+    cdef:
+        Py_ssize_t i, j, k
+
+    k = len(values)
+    for j from 0 <= j < k:
+        i = indexer[j]
+        out[i] = values[j, loc]
+
+#----------------------------------------------------------------------
+# ensure_dtype
+#----------------------------------------------------------------------
+
+cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
+
+cpdef ensure_platform_int(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == PLATFORM_INT:
+            return arr
+        else:
+            return arr.astype(np.int_)
+    else:
+        return np.array(arr, dtype=np.int_)
+
+cpdef ensure_object(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_OBJECT:
+            return arr
+        else:
+            return arr.astype(np.object_)
+    elif hasattr(arr, 'asobject'):
+        return arr.asobject
+    else:
+        return np.array(arr, dtype=np.object_)
+
+cpdef ensure_float64(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_FLOAT64:
+            return arr
+        else:
+            return arr.astype(np.float64)
+    else:
+        return np.array(arr, dtype=np.float64)
+
+cpdef ensure_float32(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_FLOAT32:
+            return arr
+        else:
+            return arr.astype(np.float32)
+    else:
+        return np.array(arr, dtype=np.float32)
+
+cpdef ensure_int8(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT8:
+            return arr
+        else:
+            return arr.astype(np.int8)
+    else:
+        return np.array(arr, dtype=np.int8)
+
+cpdef ensure_int16(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT16:
+            return arr
+        else:
+            return arr.astype(np.int16)
+    else:
+        return np.array(arr, dtype=np.int16)
+
+cpdef ensure_int32(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT32:
+            return arr
+        else:
+            return arr.astype(np.int32)
+    else:
+        return np.array(arr, dtype=np.int32)
+
+cpdef ensure_int64(object arr):
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num == NPY_INT64:
+            return arr
+        else:
+            return arr.astype(np.int64)
+    else:
+        return np.array(arr, dtype=np.int64)
diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in
new file mode 100644
index 0000000000000..2327f10389cb5
--- /dev/null
+++ b/pandas/src/algos_common_helper.pxi.in
@@ -0,0 +1,603 @@
+"""
+Template for each `dtype` helper function using 1-d template
+
+# 1-d template
+- map_indices
+- pad
+- pad_inplace
+- pad_2d_inplace
+- backfill
+- backfill_inplace
+- backfill_2d_inplace
+- is_monotonic
+- groupby
+- arrmap
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+#----------------------------------------------------------------------
+# 1-d template
+#----------------------------------------------------------------------
+
+{{py:
+
+# name, c_type, dtype, can_hold_na, nogil
+dtypes = [('float64', 'float64_t', 'np.float64', True, True),
+          ('float32', 'float32_t', 'np.float32', True, True),
+          ('object', 'object', 'object', True, False),
+          ('int32', 'int32_t', 'np.int32', False, True),
+          ('int64', 'int64_t', 'np.int64', False, True),
+          ('bool', 'uint8_t', 'np.bool', False, True)]
+
+def get_dispatch(dtypes):
+
+    for name, c_type, dtype, can_hold_na, nogil in dtypes:
+
+        nogil_str = 'with nogil:' if nogil else ''
+        tab = '    ' if nogil else ''
+        yield name, c_type, dtype, can_hold_na, nogil_str, tab
+}}
+
+{{for name, c_type, dtype, can_hold_na, nogil_str, tab
+      in get_dispatch(dtypes)}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef map_indices_{{name}}(ndarray[{{c_type}}] index):
+    """
+    Build a dict that maps every value of ``index`` to its position.
+
+    When a value occurs more than once, the last position wins.
+
+    Example:
+        array(['hi', 'there']) --> {'hi': 0, 'there': 1}
+
+    Implemented in Cython because it is much faster than a Python loop.
+    """
+    cdef:
+        Py_ssize_t pos, n = len(index)
+        dict mapping = {}
+
+    for pos in range(n):
+        mapping[index[pos]] = pos
+
+    return mapping
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
+                 limit=None):
+    """
+    Compute the forward-fill indexer from ``old`` onto ``new``.
+
+    indexer[j] is the i with old[i] <= new[j] < old[i + 1] (no upper bound
+    for the last i), or -1. At most ``limit`` inexact matches are filled
+    per i. Presumably both inputs are sorted ascending -- TODO confirm.
+    """
+    cdef:
+        Py_ssize_t i = 0, j = 0, nleft = len(old), nright = len(new)
+        ndarray[int64_t, ndim=1] indexer
+        {{c_type}} cur, next
+        int lim, fill_count = 0
+
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+    if limit is None:
+        lim = nright
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        lim = limit
+
+    # nothing to fill: an input is empty or new ends before old starts
+    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
+        return indexer
+
+    cur = old[0]
+    # skip the leading entries of new that precede old[0]
+    while j < nright and new[j] < cur:
+        j += 1
+
+    while j < nright:
+        if i == nleft - 1:
+            while j < nright:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] > cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j += 1
+            break
+
+        next = old[i + 1]
+        # consume the entries of new that fall in [cur, next)
+        while j < nright and cur <= new[j] < next:
+            if new[j] == cur:
+                indexer[j] = i
+            elif fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j += 1
+        fill_count = 0
+        i += 1
+        cur = next
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_inplace_{{name}}(ndarray[{{c_type}}] values,
+                         ndarray[uint8_t, cast=True] mask,
+                         limit=None):
+    """
+    Forward-fill the masked entries of ``values`` in place, filling at
+    most ``limit`` consecutive entries of each masked run.
+    """
+    cdef Py_ssize_t i, N = len(values)
+    cdef {{c_type}} val
+    cdef int lim, fill_count = 0
+
+    # GH 2778 -- bail out on empty input before values[0] is read
+    if N == 0:
+        return
+
+    if limit is None:
+        lim = N
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        lim = limit
+
+    val = values[0]
+    for i in range(N):
+        if not mask[i]:
+            fill_count = 0
+            val = values[i]
+        elif fill_count < lim:
+            fill_count += 1
+            values[i] = val
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def pad_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values,
+                            ndarray[uint8_t, ndim=2] mask,
+                            limit=None):
+    """Forward-fill masked entries along each row of ``values`` in place."""
+    cdef Py_ssize_t i, j, N, K
+    cdef {{c_type}} val
+    cdef int lim, fill_count = 0
+
+    K, N = (<object> values).shape
+
+    # GH 2778 -- bail out on zero-width input before values[j, 0] is read
+    if N == 0:
+        return
+
+    if limit is None:
+        lim = N
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        lim = limit
+
+    # every row starts over: fresh fill counter, seed from its first entry
+    for j in range(K):
+        fill_count = 0
+        val = values[j, 0]
+        for i in range(N):
+            if not mask[j, i]:
+                fill_count = 0
+                val = values[j, i]
+            elif fill_count < lim:
+                fill_count += 1
+                values[j, i] = val
+
+"""
+Backfilling logic for generating fill vector
+
+Diagram of what's going on
+
+Old      New    Fill vector    Mask
+         .        0               1
+         .        0               1
+         .        0               1
+A        A        0               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+         .        1               1
+B        B        1               1
+         .        2               1
+         .        2               1
+         .        2               1
+C        C        2               1
+         .                        0
+         .                        0
+D
+"""
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_{{name}}(ndarray[{{c_type}}] old, ndarray[{{c_type}}] new,
+                      limit=None):
+    """
+    Compute the backward-fill indexer from ``old`` onto ``new``.
+
+    indexer[j] is the i with old[i - 1] < new[j] <= old[i] (no lower bound
+    for i == 0), or -1. At most ``limit`` inexact matches are filled
+    per i. Presumably both inputs are sorted ascending -- TODO confirm.
+    """
+    cdef:
+        Py_ssize_t i, j, nleft = len(old), nright = len(new)
+        ndarray[int64_t, ndim=1] indexer
+        {{c_type}} cur, prev
+        int lim, fill_count = 0
+
+    indexer = np.empty(nright, dtype=np.int64)
+    indexer.fill(-1)
+    if limit is None:
+        lim = nright
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        lim = limit
+
+    # nothing to fill: an input is empty or new starts after old ends
+    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
+        return indexer
+
+    i, j = nleft - 1, nright - 1
+    cur = old[i]
+    # skip the trailing entries of new that exceed the last value of old
+    while j >= 0 and new[j] > cur:
+        j -= 1
+
+    while j >= 0:
+        if i == 0:
+            while j >= 0:
+                if new[j] == cur:
+                    indexer[j] = i
+                elif new[j] < cur and fill_count < lim:
+                    indexer[j] = i
+                    fill_count += 1
+                j -= 1
+            break
+
+        prev = old[i - 1]
+        # consume the entries of new that fall in (prev, cur]
+        while j >= 0 and prev < new[j] <= cur:
+            if new[j] == cur:
+                indexer[j] = i
+            elif new[j] < cur and fill_count < lim:
+                indexer[j] = i
+                fill_count += 1
+            j -= 1
+        fill_count = 0
+        i -= 1
+        cur = prev
+
+    return indexer
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_inplace_{{name}}(ndarray[{{c_type}}] values,
+                              ndarray[uint8_t, cast=True] mask,
+                              limit=None):
+    """
+    Backward-fill the masked entries of ``values`` in place, filling at
+    most ``limit`` consecutive entries of each masked run.
+    """
+    cdef Py_ssize_t i, N = len(values)
+    cdef {{c_type}} val
+    cdef int lim, fill_count = 0
+
+    # GH 2778 -- bail out on empty input before values[N - 1] is read
+    if N == 0:
+        return
+
+    if limit is None:
+        lim = N
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        lim = limit
+
+    val = values[N - 1]
+    for i in range(N - 1, -1, -1):
+        if not mask[i]:
+            fill_count = 0
+            val = values[i]
+        elif fill_count < lim:
+            fill_count += 1
+            values[i] = val
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def backfill_2d_inplace_{{name}}(ndarray[{{c_type}}, ndim=2] values,
+                                 ndarray[uint8_t, ndim=2] mask,
+                                 limit=None):
+    """Backward-fill masked entries along each row of ``values`` in place."""
+    cdef Py_ssize_t i, j, N, K
+    cdef {{c_type}} val
+    cdef int lim, fill_count = 0
+
+    K, N = (<object> values).shape
+
+    # GH 2778 -- bail out on zero-width input before values[j, N - 1] is read
+    if N == 0:
+        return
+
+    if limit is None:
+        lim = N
+    elif limit < 0:
+        raise ValueError('Limit must be non-negative')
+    else:
+        lim = limit
+
+    # every row starts over: fresh fill counter, seed from its last entry
+    for j in range(K):
+        fill_count = 0
+        val = values[j, N - 1]
+        for i in range(N - 1, -1, -1):
+            if not mask[j, i]:
+                fill_count = 0
+                val = values[j, i]
+            elif fill_count < lim:
+                fill_count += 1
+                values[j, i] = val
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def is_monotonic_{{name}}(ndarray[{{c_type}}] arr, bint timelike):
+    """
+    Check whether ``arr`` is non-decreasing and/or non-increasing.
+
+    NaN (or iNaT when ``timelike``) anywhere makes both flags False.
+
+    Returns
+    -------
+    is_monotonic_inc, is_monotonic_dec
+    """
+    cdef:
+        Py_ssize_t i, n = len(arr)
+        {{c_type}} prev, cur
+        bint is_monotonic_inc = 1, is_monotonic_dec = 1
+
+    if n == 1:
+        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
+            # single value is NaN
+            return False, False
+        else:
+            return True, True
+    elif n < 2:
+        return True, True
+
+    if timelike and arr[0] == iNaT:
+        return False, False
+
+    {{nogil_str}}
+    {{tab}}prev = arr[0]
+    {{tab}}for i in range(1, n):
+    {{tab}}    cur = arr[i]
+    {{tab}}    if timelike and cur == iNaT:
+    {{tab}}        is_monotonic_inc = 0
+    {{tab}}        is_monotonic_dec = 0
+    {{tab}}        break
+    {{tab}}    if cur < prev:
+    {{tab}}        is_monotonic_inc = 0
+    {{tab}}    elif cur > prev:
+    {{tab}}        is_monotonic_dec = 0
+    {{tab}}    elif cur == prev:
+    {{tab}}        pass  # equal values rule out neither direction
+    {{tab}}    else:
+    {{tab}}        # cur or prev is NaN
+    {{tab}}        is_monotonic_inc = 0
+    {{tab}}        is_monotonic_dec = 0
+    {{tab}}        break
+    {{tab}}    if not is_monotonic_inc and not is_monotonic_dec:
+    {{tab}}        # both directions already ruled out; stop scanning
+    {{tab}}        break
+    {{tab}}    prev = cur
+    return is_monotonic_inc, is_monotonic_dec
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def groupby_{{name}}(ndarray[{{c_type}}] index, ndarray labels):
+    """
+    Map each label to the list of ``index`` values found at its positions;
+    positions whose label satisfies is_null_datetimelike() are skipped.
+    """
+    cdef:
+        dict groups = {}
+        list members
+        Py_ssize_t pos, n = len(index)
+        object key
+
+    if n != len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    for pos in range(n):
+        key = util.get_value_1d(labels, pos)
+        if is_null_datetimelike(key):
+            continue
+        members = groups.get(key)
+        if members is None:
+            groups[key] = [index[pos]]
+        else:
+            members.append(index[pos])
+
+    return groups
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def arrmap_{{name}}(ndarray[{{c_type}}] index, object func):
+    """Apply ``func`` elementwise and return maybe_convert_objects(result)."""
+    cdef Py_ssize_t pos, n = index.shape[0]
+    cdef ndarray[object] mapped = np.empty(n, dtype=np.object_)
+
+    from pandas.lib import maybe_convert_objects
+
+    # gather the raw return values of func in an object array first
+    for pos in range(n):
+        mapped[pos] = func(index[pos])
+
+    return maybe_convert_objects(mapped)
+
+{{endfor}}
+
+#----------------------------------------------------------------------
+# diff_2d / put2d template
+#----------------------------------------------------------------------
+
+{{py:
+
+# name, c_type, dest_type, dest_dtype
+dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'),
+          ('float32', 'float32_t', 'float32_t', 'np.float32'),
+          ('int8',  'int8_t',  'float32_t', 'np.float32'),
+          ('int16', 'int16_t', 'float32_t', 'np.float32'),
+          ('int32', 'int32_t', 'float64_t', 'np.float64'),
+          ('int64', 'int64_t', 'float64_t', 'np.float64')]
+
+def get_dispatch(dtypes):
+
+    for name, c_type, dest_type, dest_dtype, in dtypes:
+
+        dest_type2 = dest_type
+        dest_type = dest_type.replace('_t', '')
+
+        yield name, c_type, dest_type, dest_type2, dest_dtype
+
+}}
+
+{{for name, c_type, dest_type, dest_type2, dest_dtype
+      in get_dispatch(dtypes)}}
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def diff_2d_{{name}}(ndarray[{{c_type}}, ndim=2] arr,
+                     ndarray[{{dest_type2}}, ndim=2] out,
+                     Py_ssize_t periods, int axis):
+    """
+    Store ``arr[k] - arr[k - periods]``, taken along ``axis``, in ``out``.
+
+    axis == 0 differences along the first dimension, any other value
+    along the second. Entries of ``out`` whose counterpart ``periods``
+    steps away would fall outside ``arr`` are left unwritten.
+    """
+    cdef:
+        # start/stop are typed so the loop bounds stay plain C integers
+        Py_ssize_t i, j, sx, sy, n, start, stop
+
+    sx, sy = (<object> arr).shape
+    # index range along the differenced axis that has a valid counterpart
+    n = sx if axis == 0 else sy
+    if periods >= 0:
+        start, stop = periods, n
+    else:
+        start, stop = 0, n + periods
+
+    if arr.flags.f_contiguous:
+        # column-major input: iterate the first index innermost
+        if axis == 0:
+            for j in range(sy):
+                for i in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            for j in range(start, stop):
+                for i in range(sx):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+    else:
+        if axis == 0:
+            for i in range(start, stop):
+                for j in range(sy):
+                    out[i, j] = arr[i, j] - arr[i - periods, j]
+        else:
+            for i in range(sx):
+                for j in range(start, stop):
+                    out[i, j] = arr[i, j] - arr[i, j - periods]
+
+
+def put2d_{{name}}_{{dest_type}}(ndarray[{{c_type}}, ndim=2, cast=True] values,
+                                 ndarray[int64_t] indexer, Py_ssize_t loc,
+                                 ndarray[{{dest_type2}}] out):
+    """Scatter column ``loc`` of ``values`` into ``out`` at ``indexer``."""
+    cdef Py_ssize_t i, j, k = len(values)
+
+    # out[indexer[j]] = values[j, loc] for every row j of values
+    for j in range(k):
+        i = indexer[j]
+        out[i] = values[j, loc]
+
+{{endfor}}
+
+#----------------------------------------------------------------------
+# ensure_dtype
+#----------------------------------------------------------------------
+
+cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
+
+cpdef ensure_platform_int(object arr):
+    """Return ``arr`` as an np.int_ ndarray (itself if it already is one)."""
+    if not util.is_array(arr):
+        return np.array(arr, dtype=np.int_)
+    if (<ndarray> arr).descr.type_num != PLATFORM_INT:
+        # ndarray of another dtype: convert with astype
+        return arr.astype(np.int_)
+    return arr
+
+cpdef ensure_object(object arr):
+    """Coerce ``arr`` to object dtype; an object ndarray is returned as is."""
+    if util.is_array(arr):
+        if (<ndarray> arr).descr.type_num != NPY_OBJECT:
+            return arr.astype(np.object_)
+        return arr
+    # non-ndarray inputs that expose ``asobject`` are converted through it
+    if hasattr(arr, 'asobject'):
+        return arr.asobject
+    return np.array(arr, dtype=np.object_)
+
+{{py:
+
+# name, c_type, dtype
+dtypes = [('float64', 'FLOAT64', 'float64'),
+          ('float32', 'FLOAT32', 'float32'),
+          ('int8', 'INT8', 'int8'),
+          ('int16', 'INT16', 'int16'),
+          ('int32', 'INT32', 'int32'),
+          ('int64', 'INT64', 'int64'),
+          # ('platform_int', 'INT', 'int_'),
+          # ('object', 'OBJECT', 'object_'),
+]
+
+def get_dispatch(dtypes):
+
+    for name, c_type, dtype in dtypes:
+        yield name, c_type, dtype
+}}
+
+{{for name, c_type, dtype in get_dispatch(dtypes)}}
+
+cpdef ensure_{{name}}(object arr):
+    """Return ``arr`` as an np.{{dtype}} ndarray (itself if already one)."""
+    if not util.is_array(arr):
+        return np.array(arr, dtype=np.{{dtype}})
+    if (<ndarray> arr).descr.type_num != NPY_{{c_type}}:
+        return arr.astype(np.{{dtype}})
+    # already the requested dtype: hand back the same object, no copy
+    return arr
+
+{{endfor}}
\ No newline at end of file
diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi
new file mode 100644
index 0000000000000..fb86c4efb7314
--- /dev/null
+++ b/pandas/src/algos_groupby_helper.pxi
@@ -0,0 +1,1369 @@
+"""
+Template for each `dtype` helper function using groupby
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+cdef extern from "numpy/npy_math.h":
+    double NAN "NPY_NAN"  # C macro NPY_NAN exposed under the name NAN
+_int64_max = np.iinfo(np.int64).max  # largest representable int64
+
+#----------------------------------------------------------------------
+# group_add, group_prod, group_var, group_mean, group_ohlc
+#----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_add_float64(ndarray[float64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float64_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Per-label sum of non-NaN `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] sumx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+    sumx = np.zeros_like(out)  # running sum per (label, column)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+
+        if K > 1:
+
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+
+                    # NaN != NaN, so this holds only for non-NaN values
+                    if val == val:
+                        nobs[lab, j] += 1
+                        sumx[lab, j] += val
+
+        else:  # single column: same logic with the column fixed at 0
+
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+
+                # NaN != NaN, so this holds only for non-NaN values
+                if val == val:
+                    nobs[lab, 0] += 1
+                    sumx[lab, 0] += val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_prod_float64(ndarray[float64_t, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[float64_t, ndim=2] values,
+                        ndarray[int64_t] labels):
+    """
+    Per-label product of non-NaN `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] prodx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+    prodx = np.ones_like(out)  # running product, seeded with 1
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+
+                    # NaN != NaN, so this holds only for non-NaN values
+                    if val == val:
+                        nobs[lab, j] += 1
+                        prodx[lab, j] *= val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+
+                # NaN != NaN, so this holds only for non-NaN values
+                if val == val:
+                    nobs[lab, 0] += 1
+                    prodx[lab, 0] *= val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = prodx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.cdivision(True)
+def group_var_float64(ndarray[float64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float64_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, ct, oldmean
+        ndarray[float64_t, ndim=2] nobs, mean
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+    mean = np.zeros_like(out)  # running mean per (label, column)
+
+    N, K = (<object> values).shape
+
+    out[:, :] = 0.0  # out first accumulates the sum of squared deviations
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped
+                continue
+
+            counts[lab] += 1  # row counted even when its values are NaN
+
+            for j in range(K):
+                val = values[i, j]
+                # one-pass (Welford-style) update of the running mean and the
+                # sum of squared deviations; NaN != NaN, so NaN is skipped
+                if val == val:
+                    nobs[lab, j] += 1
+                    oldmean = mean[lab, j]
+                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
+                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
+
+        for i in range(ncounts):
+            for j in range(K):
+                ct = nobs[i, j]
+                if ct < 2:  # fewer than two observations: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] /= (ct - 1)  # divide by n - 1
+# add passing bin edges, instead of labels
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_mean_float64(ndarray[float64_t, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[float64_t, ndim=2] values,
+                        ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] sumx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+    sumx = np.zeros_like(out)  # running sum per (label, column)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+                    # NaN != NaN, so this holds only for non-NaN values
+                    if val == val:
+                        nobs[lab, j] += 1
+                        sumx[lab, j] += val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+                # NaN != NaN, so this holds only for non-NaN values
+                if val == val:
+                    nobs[lab, 0] += 1
+                    sumx[lab, 0] += val
+
+        for i in range(ncounts):
+            for j in range(K):
+                count = nobs[i, j]
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j] / count  # sum / non-NaN count
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_ohlc_float64(ndarray[float64_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float64_t, ndim=2] values,
+                  ndarray[int64_t] labels):
+    """
+    First, max, min and last non-NaN value per label (out columns 0-3).
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float64_t val, count
+        Py_ssize_t ngroups = len(counts)
+
+    if len(labels) == 0:
+        return
+
+    N, K = (<object> values).shape
+
+    if out.shape[1] != 4:
+        raise ValueError('Output array must have 4 columns')
+
+    if K > 1:
+        raise NotImplementedError("Argument 'values' must have only "
+                                  "one dimension")
+    out.fill(np.nan)  # NaN in column 0 marks "no value seen yet"
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            # any negative label is skipped: with bounds checks disabled a
+            # negative index would read and write outside counts/out
+            if lab < 0:
+                continue
+            counts[lab] += 1  # row counted even when val is NaN
+            val = values[i, 0]
+            if val != val:  # NaN values do not contribute
+                continue
+
+            if out[lab, 0] != out[lab, 0]:  # still NaN: first value for lab
+                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
+            else:
+                out[lab, 1] = max(out[lab, 1], val)  # high
+                out[lab, 2] = min(out[lab, 2], val)  # low
+                out[lab, 3] = val  # close: latest non-NaN value
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_add_float32(ndarray[float32_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float32_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Per-label sum of non-NaN `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float32_t val, count
+        ndarray[float32_t, ndim=2] sumx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+    sumx = np.zeros_like(out)  # running sum per (label, column)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+
+        if K > 1:
+
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+
+                    # NaN != NaN, so this holds only for non-NaN values
+                    if val == val:
+                        nobs[lab, j] += 1
+                        sumx[lab, j] += val
+
+        else:  # single column: same logic with the column fixed at 0
+
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+
+                # NaN != NaN, so this holds only for non-NaN values
+                if val == val:
+                    nobs[lab, 0] += 1
+                    sumx[lab, 0] += val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_prod_float32(ndarray[float32_t, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[float32_t, ndim=2] values,
+                        ndarray[int64_t] labels):
+    """
+    Per-label product of non-NaN `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float32_t val, count
+        ndarray[float32_t, ndim=2] prodx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+    prodx = np.ones_like(out)  # running product, seeded with 1
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+
+                    # NaN != NaN, so this holds only for non-NaN values
+                    if val == val:
+                        nobs[lab, j] += 1
+                        prodx[lab, j] *= val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+
+                # NaN != NaN, so this holds only for non-NaN values
+                if val == val:
+                    nobs[lab, 0] += 1
+                    prodx[lab, 0] *= val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = prodx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.cdivision(True)
+def group_var_float32(ndarray[float32_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float32_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float32_t val, ct, oldmean
+        ndarray[float32_t, ndim=2] nobs, mean
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+    mean = np.zeros_like(out)  # running mean per (label, column)
+
+    N, K = (<object> values).shape
+
+    out[:, :] = 0.0  # out first accumulates the sum of squared deviations
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped
+                continue
+
+            counts[lab] += 1  # row counted even when its values are NaN
+
+            for j in range(K):
+                val = values[i, j]
+                # one-pass (Welford-style) update of the running mean and the
+                # sum of squared deviations; NaN != NaN, so NaN is skipped
+                if val == val:
+                    nobs[lab, j] += 1
+                    oldmean = mean[lab, j]
+                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
+                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
+
+        for i in range(ncounts):
+            for j in range(K):
+                ct = nobs[i, j]
+                if ct < 2:  # fewer than two observations: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] /= (ct - 1)  # divide by n - 1
+# add passing bin edges, instead of labels
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_mean_float32(ndarray[float32_t, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[float32_t, ndim=2] values,
+                        ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float32_t val, count
+        ndarray[float32_t, ndim=2] sumx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+    sumx = np.zeros_like(out)  # running sum per (label, column)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+                    # NaN != NaN, so this holds only for non-NaN values
+                    if val == val:
+                        nobs[lab, j] += 1
+                        sumx[lab, j] += val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+                # NaN != NaN, so this holds only for non-NaN values
+                if val == val:
+                    nobs[lab, 0] += 1
+                    sumx[lab, 0] += val
+
+        for i in range(ncounts):
+            for j in range(K):
+                count = nobs[i, j]
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j] / count  # sum / non-NaN count
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_ohlc_float32(ndarray[float32_t, ndim=2] out,
+                  ndarray[int64_t] counts,
+                  ndarray[float32_t, ndim=2] values,
+                  ndarray[int64_t] labels):
+    """
+    First, max, min and last non-NaN value per label (out columns 0-3).
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        float32_t val, count
+        Py_ssize_t ngroups = len(counts)
+
+    if len(labels) == 0:
+        return
+
+    N, K = (<object> values).shape
+
+    if out.shape[1] != 4:
+        raise ValueError('Output array must have 4 columns')
+
+    if K > 1:
+        raise NotImplementedError("Argument 'values' must have only "
+                                  "one dimension")
+    out.fill(np.nan)  # NaN in column 0 marks "no value seen yet"
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            # any negative label is skipped: with bounds checks disabled a
+            # negative index would read and write outside counts/out
+            if lab < 0:
+                continue
+            counts[lab] += 1  # row counted even when val is NaN
+            val = values[i, 0]
+            if val != val:  # NaN values do not contribute
+                continue
+
+            if out[lab, 0] != out[lab, 0]:  # still NaN: first value for lab
+                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
+            else:
+                out[lab, 1] = max(out[lab, 1], val)  # high
+                out[lab, 2] = min(out[lab, 2], val)  # low
+                out[lab, 3] = val  # close: latest non-NaN value
+
+#----------------------------------------------------------------------
+# group_nth, group_last
+#----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_last_float64(ndarray[float64_t, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[float64_t, ndim=2] values,
+                        ndarray[int64_t] labels):
+    """
+    Last non-NaN value per label and column, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)  # uninitialised; only read where nobs > 0
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped
+                continue
+
+            counts[lab] += 1  # row counted even when its values are NaN
+            for j in range(K):
+                val = values[i, j]
+
+                # val == val rejects NaN; val != NAN is then always true
+                if val == val and val != NAN:
+                    nobs[lab, j] += 1
+                    resx[lab, j] = val  # later rows overwrite earlier ones
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = resx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_nth_float64(ndarray[float64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float64_t, ndim=2] values,
+                       ndarray[int64_t] labels, int64_t rank):
+    """
+    The `rank`-th (1-based) non-NaN value per label and column; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)  # uninitialised until the rank-th value is seen
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped
+                continue
+
+            counts[lab] += 1  # row counted even when its values are NaN
+            for j in range(K):
+                val = values[i, j]
+
+                # val == val rejects NaN; val != NAN is then always true
+                if val == val and val != NAN:
+                    nobs[lab, j] += 1
+                    if nobs[lab, j] == rank:
+                        resx[lab, j] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if rank < 1 or nobs[i, j] < rank:  # rank-th value never seen
+                    out[i, j] = NAN  # never copy the uninitialised resx slot
+                else:
+                    out[i, j] = resx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_last_float32(ndarray[float32_t, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[float32_t, ndim=2] values,
+                        ndarray[int64_t] labels):
+    """
+    Last non-NaN value per label and column, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)  # uninitialised; only read where nobs > 0
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped
+                continue
+
+            counts[lab] += 1  # row counted even when its values are NaN
+            for j in range(K):
+                val = values[i, j]
+
+                # val == val rejects NaN; val != NAN is then always true
+                if val == val and val != NAN:
+                    nobs[lab, j] += 1
+                    resx[lab, j] = val  # later rows overwrite earlier ones
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = resx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_nth_float32(ndarray[float32_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float32_t, ndim=2] values,
+                       ndarray[int64_t] labels, int64_t rank):
+    """
+    The `rank`-th (1-based) non-NaN value per label and column; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float32_t val, count
+        ndarray[float32_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)  # uninitialised until the rank-th value is seen
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped
+                continue
+
+            counts[lab] += 1  # row counted even when its values are NaN
+            for j in range(K):
+                val = values[i, j]
+
+                # val == val rejects NaN; val != NAN is then always true
+                if val == val and val != NAN:
+                    nobs[lab, j] += 1
+                    if nobs[lab, j] == rank:
+                        resx[lab, j] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if rank < 1 or nobs[i, j] < rank:  # rank-th value never seen
+                    out[i, j] = NAN  # never copy the uninitialised resx slot
+                else:
+                    out[i, j] = resx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_last_int64(ndarray[int64_t, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[int64_t, ndim=2] values,
+                        ndarray[int64_t] labels):
+    """
+    Last non-iNaT value per label and column, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        int64_t val, count
+        ndarray[int64_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)  # uninitialised; only read where nobs > 0
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped
+                continue
+
+            counts[lab] += 1  # row counted even when its values are iNaT
+            for j in range(K):
+                val = values[i, j]
+
+                # skip iNaT; val == val is always true for integers
+                if val == val and val != iNaT:
+                    nobs[lab, j] += 1
+                    resx[lab, j] = val  # later rows overwrite earlier ones
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is iNaT
+                    out[i, j] = iNaT
+                else:
+                    out[i, j] = resx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_nth_int64(ndarray[int64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[int64_t, ndim=2] values,
+                       ndarray[int64_t] labels, int64_t rank):
+    """
+    The `rank`-th (1-based) non-iNaT value per label and column; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        int64_t val, count
+        ndarray[int64_t, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)  # uninitialised until the rank-th value is seen
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:  # rows with a negative label are skipped
+                continue
+
+            counts[lab] += 1  # row counted even when its values are iNaT
+            for j in range(K):
+                val = values[i, j]
+
+                # skip iNaT; val == val is always true for integers
+                if val == val and val != iNaT:
+                    nobs[lab, j] += 1
+                    if nobs[lab, j] == rank:
+                        resx[lab, j] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if rank < 1 or nobs[i, j] < rank:  # rank-th value never seen
+                    out[i, j] = iNaT  # never copy the uninitialised resx slot
+                else:
+                    out[i, j] = resx[i, j]
+
+#----------------------------------------------------------------------
+# group_min, group_max
+#----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_float64(ndarray[float64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float64_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Per-label maximum of non-NaN `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] maxx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+
+    maxx = np.empty_like(out)
+    maxx.fill(-np.inf)  # seed: every finite value compares greater
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+
+                    # val == val rejects NaN; val != NAN is then always true
+                    if val == val and val != NAN:
+                        nobs[lab, j] += 1
+                        if val > maxx[lab, j]:
+                            maxx[lab, j] = val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+
+                # val == val rejects NaN; val != NAN is then always true
+                if val == val and val != NAN:
+                    nobs[lab, 0] += 1
+                    if val > maxx[lab, 0]:
+                        maxx[lab, 0] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = maxx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_float64(ndarray[float64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float64_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Per-label minimum of non-NaN `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float64_t val, count
+        ndarray[float64_t, ndim=2] minx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+
+    minx = np.empty_like(out)
+    minx.fill(np.inf)  # seed: every finite value compares smaller
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+
+                    # val == val rejects NaN; val != NAN is then always true
+                    if val == val and val != NAN:
+
+                        nobs[lab, j] += 1
+                        if val < minx[lab, j]:
+                            minx[lab, j] = val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+
+                # val == val rejects NaN; val != NAN is then always true
+                if val == val and val != NAN:
+                    nobs[lab, 0] += 1
+                    if val < minx[lab, 0]:
+                        minx[lab, 0] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = minx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_float32(ndarray[float32_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float32_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Per-label maximum of non-NaN `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float32_t val, count
+        ndarray[float32_t, ndim=2] maxx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+
+    maxx = np.empty_like(out)
+    maxx.fill(-np.inf)  # seed: every finite value compares greater
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+
+                    # val == val rejects NaN; val != NAN is then always true
+                    if val == val and val != NAN:
+                        nobs[lab, j] += 1
+                        if val > maxx[lab, j]:
+                            maxx[lab, j] = val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+
+                # val == val rejects NaN; val != NAN is then always true
+                if val == val and val != NAN:
+                    nobs[lab, 0] += 1
+                    if val > maxx[lab, 0]:
+                        maxx[lab, 0] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = maxx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_float32(ndarray[float32_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[float32_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Per-label minimum of non-NaN `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        float32_t val, count
+        ndarray[float32_t, ndim=2] minx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-NaN observations per (label, column)
+
+    minx = np.empty_like(out)
+    minx.fill(np.inf)  # seed: every finite value compares smaller
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                for j in range(K):
+                    val = values[i, j]
+
+                    # val == val rejects NaN; val != NAN is then always true
+                    if val == val and val != NAN:
+
+                        nobs[lab, j] += 1
+                        if val < minx[lab, j]:
+                            minx[lab, j] = val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is NaN
+                val = values[i, 0]
+
+                # val == val rejects NaN; val != NAN is then always true
+                if val == val and val != NAN:
+                    nobs[lab, 0] += 1
+                    if val < minx[lab, 0]:
+                        minx[lab, 0] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is NaN
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = minx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_int64(ndarray[int64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[int64_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Per-label maximum of non-iNaT `values`, stored in `out`; axis=0 only.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        int64_t val, count
+        ndarray[int64_t, ndim=2] maxx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)  # non-iNaT observations per (label, column)
+
+    maxx = np.empty_like(out)
+    maxx.fill(-_int64_max)  # seed: -(2**63 - 1), one above the int64 minimum
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is iNaT
+                for j in range(K):
+                    val = values[i, j]
+
+                    # skip iNaT; val == val is always true for integers
+                    if val == val and val != iNaT:
+                        nobs[lab, j] += 1
+                        if val > maxx[lab, j]:
+                            maxx[lab, j] = val
+        else:  # single column: same logic with the column fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:  # rows with a negative label are skipped
+                    continue
+
+                counts[lab] += 1  # row counted even when val is iNaT
+                val = values[i, 0]
+
+                # skip iNaT; val == val is always true for integers
+                if val == val and val != iNaT:
+                    nobs[lab, 0] += 1
+                    if val > maxx[lab, 0]:
+                        maxx[lab, 0] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:  # nothing observed: result is iNaT
+                    out[i, j] = iNaT
+                else:
+                    out[i, j] = maxx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_int64(ndarray[int64_t, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[int64_t, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: per-label, per-column minimum ignoring iNaT.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        int64_t val, count
+        ndarray[int64_t, ndim=2] minx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    # running minimum per (label, column), seeded with _int64_max
+    minx = np.empty_like(out)
+    minx.fill(_int64_max)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+                # counts tallies every labelled row, iNaT or not
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    # not nan: for int64 only the iNaT comparison can fail
+                    if val == val and val != iNaT:
+
+                        nobs[lab, j] += 1
+                        if val < minx[lab, j]:
+                            minx[lab, j] = val
+        else:  # K <= 1: same scan with the column index fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                val = values[i, 0]
+
+                # not nan
+                if val == val and val != iNaT:
+                    nobs[lab, 0] += 1
+                    if val < minx[lab, 0]:
+                        minx[lab, 0] = val
+        # cells that saw no non-iNaT value are reported as iNaT
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:
+                    out[i, j] = iNaT
+                else:
+                    out[i, j] = minx[i, j]
+
+#----------------------------------------------------------------------
+# other grouping functions not needing a template
+#----------------------------------------------------------------------
+
+
+def group_median_float64(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[float64_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: out[g, k] = median of column k for label g.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, size
+        ndarray[int64_t] _counts
+        ndarray data
+        float64_t* ptr
+    ngroups = len(counts)
+    N, K = (<object> values).shape
+    # NOTE(review): presumably indexer sorts rows by label, NA group first
+    indexer, _counts = groupsort_indexer(labels, ngroups)
+    counts[:] = _counts[1:]
+    # (K, N) scratch buffer, walked below through a raw pointer
+    data = np.empty((K, N), dtype=np.float64)
+    ptr = <float64_t*> data.data
+    # presumably fills data with values.T reordered by indexer - TODO confirm
+    take_2d_axis1_float64_float64(values.T, indexer, out=data)
+
+    for i in range(K):
+        # exclude NA group: skip its _counts[0] entries (assumed to lead)
+        ptr += _counts[0]
+        for j in range(ngroups):
+            size = _counts[j + 1]
+            out[j, i] = _median_linear(ptr, size)
+            ptr += size
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cumprod_float64(float64_t[:, :] out,
+                          float64_t[:, :] values,
+                          int64_t[:] labels,
+                          float64_t[:, :] accum):
+    """
+    Only transforms on axis=0: running product of values within each label.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, size
+        float64_t val
+        int64_t lab
+    # accum is rebound to a fresh array of ones; its contents are not read
+    N, K = (<object> values).shape
+    accum = np.ones_like(accum)
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            # rows with a negative label leave out[i, :] untouched
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:  # NaN is skipped: out[i, j] is not written
+                    accum[lab, j] *= val
+                    out[i, j] = accum[lab, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cumsum(numeric[:, :] out,
+                 numeric[:, :] values,
+                 int64_t[:] labels,
+                 numeric[:, :] accum):
+    """
+    Only transforms on axis=0: running sum of values within each label.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, size
+        numeric val
+        int64_t lab
+    # accum is rebound to a fresh array of zeros; its contents are not read
+    N, K = (<object> values).shape
+    accum = np.zeros_like(accum)
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            # rows with a negative label leave out[i, :] untouched
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:  # NaN is skipped: out[i, j] is not written
+                    accum[lab, j] += val
+                    out[i, j] = accum[lab, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
+                        int ngroups, int periods):
+    """Fill `out` with the within-group shifted row positions (-1 = none)."""
+    cdef:
+        Py_ssize_t N, i, ii
+        int offset, sign
+        int64_t lab, idxer, idxer_slot
+        int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64)
+        int64_t[:, :] label_indexer
+
+    N, = (<object> labels).shape
+    if periods < 0:
+        periods = -periods
+        offset = N - 1
+        sign = -1
+    elif periods > 0:
+        offset = 0
+        sign = 1
+    if periods == 0:
+        with nogil:
+            for i in range(N):
+                out[i] = i
+    else:
+        # per-label ring buffer of the last `periods` positions seen
+        label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
+        with nogil:
+            for i in range(N):
+                # walk the rows back-to-front when shifting backwards
+                ii = offset + sign * i
+                lab = labels[ii]
+                # skip null keys: a negative label must not index the buffers
+                if lab < 0:
+                    out[ii] = -1
+                    continue
+                label_seen[lab] += 1
+                idxer_slot = label_seen[lab] % periods
+                idxer = label_indexer[lab, idxer_slot]
+                if label_seen[lab] > periods:
+                    out[ii] = idxer
+                else:
+                    out[ii] = -1
+                label_indexer[lab, idxer_slot] = ii
diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in
new file mode 100644
index 0000000000000..6b9d8f07587bc
--- /dev/null
+++ b/pandas/src/algos_groupby_helper.pxi.in
@@ -0,0 +1,713 @@
+"""
+Template for each `dtype` helper function using groupby
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+cdef extern from "numpy/npy_math.h":
+    double NAN "NPY_NAN"
+_int64_max = np.iinfo(np.int64).max
+
+#----------------------------------------------------------------------
+# group_add, group_prod, group_var, group_mean, group_ohlc
+#----------------------------------------------------------------------
+
+{{py:
+# dtype table driving the group_add/prod/var/mean/ohlc templates below
+# name, c_type, dest_type, dest_dtype
+dtypes = [('float64', 'float64_t', 'float64_t', 'np.float64'),
+          ('float32', 'float32_t', 'float32_t', 'np.float32')]
+
+def get_dispatch(dtypes):
+    # yields (name, c_type, dest_type, dest_type2, dest_dtype) per entry
+    for name, c_type, dest_type, dest_dtype in dtypes:
+        # dest_type2 keeps the name as given; dest_type has '_t' removed
+        dest_type2 = dest_type
+        dest_type = dest_type.replace('_t', '')
+
+        yield name, c_type, dest_type, dest_type2, dest_dtype
+}}
+
+{{for name, c_type, dest_type, dest_type2, dest_dtype in get_dispatch(dtypes)}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_add_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[{{c_type}}, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: per-label, per-column sum ignoring NaN.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        {{dest_type2}} val, count
+        ndarray[{{dest_type2}}, ndim=2] sumx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        # K > 1: accumulate every column of each labelled row
+        if K > 1:
+
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+                # counts tallies every labelled row, NaN or not
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    # not nan: val == val is false only for NaN
+                    if val == val:
+                        nobs[lab, j] += 1
+                        sumx[lab, j] += val
+
+        else:
+            # K <= 1: same accumulation with the column index fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                val = values[i, 0]
+
+                # not nan
+                if val == val:
+                    nobs[lab, 0] += 1
+                    sumx[lab, 0] += val
+        # a cell with no non-NaN observation yields NaN rather than 0
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_prod_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[{{c_type}}, ndim=2] values,
+                        ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: per-label, per-column product ignoring NaN.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        {{dest_type2}} val, count
+        ndarray[{{dest_type2}}, ndim=2] prodx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    prodx = np.ones_like(out)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+                # counts tallies every labelled row, NaN or not
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    # not nan: val == val is false only for NaN
+                    if val == val:
+                        nobs[lab, j] += 1
+                        prodx[lab, j] *= val
+        else:  # K <= 1: same accumulation with the column index fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                val = values[i, 0]
+
+                # not nan
+                if val == val:
+                    nobs[lab, 0] += 1
+                    prodx[lab, 0] *= val
+        # a cell with no non-NaN observation yields NaN rather than 1
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = prodx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+@cython.cdivision(True)
+def group_var_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[{{dest_type2}}, ndim=2] values,
+                       ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        {{dest_type2}} val, ct, oldmean
+        ndarray[{{dest_type2}}, ndim=2] nobs, mean
+    # Per-label sample variance (ddof=1) of each column; NaN values skipped.
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    mean = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+    # out doubles as the running sum of squared deviations from the mean
+    out[:, :] = 0.0
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+            # counts tallies every labelled row, NaN or not
+            counts[lab] += 1
+            # one-pass (Welford) update of mean and sum of squared deviations
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan
+                if val == val:
+                    nobs[lab, j] += 1
+                    oldmean = mean[lab, j]
+                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
+                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
+        # divide by (n - 1); fewer than two observations gives NaN
+        for i in range(ncounts):
+            for j in range(K):
+                ct = nobs[i, j]
+                if ct < 2:
+                    out[i, j] = NAN
+                else:
+                    out[i, j] /= (ct - 1)
+# add passing bin edges, instead of labels
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_mean_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[{{dest_type2}}, ndim=2] values,
+                        ndarray[int64_t] labels):
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        {{dest_type2}} val, count
+        ndarray[{{dest_type2}}, ndim=2] sumx, nobs
+    # Per-label mean of each column along axis=0; NaN values are skipped.
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    sumx = np.zeros_like(out)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+                # counts tallies every labelled row, NaN or not
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+                    # not nan: val == val is false only for NaN
+                    if val == val:
+                        nobs[lab, j] += 1
+                        sumx[lab, j] += val
+        else:  # K <= 1: same accumulation with the column index fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                val = values[i, 0]
+                # not nan
+                if val == val:
+                    nobs[lab, 0] += 1
+                    sumx[lab, 0] += val
+        # mean = sum / number of non-NaN observations; NaN when there are none
+        for i in range(ncounts):
+            for j in range(K):
+                count = nobs[i, j]
+                if nobs[i, j] == 0:
+                    out[i, j] = NAN
+                else:
+                    out[i, j] = sumx[i, j] / count
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[{{dest_type2}}, ndim=2] values,
+                        ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: out[g] = (open, high, low, close) = first,
+    max, min and last non-NaN value labelled g; all-NaN if there is none.
+    """
+    cdef:
+        Py_ssize_t i, N, K, lab
+        {{dest_type2}} val
+        Py_ssize_t ngroups = len(counts)
+
+    if len(labels) == 0:
+        return
+
+    N, K = (<object> values).shape
+    if out.shape[1] != 4:
+        raise ValueError('Output array must have 4 columns')
+
+    if K > 1:
+        raise NotImplementedError("Argument 'values' must have only "
+                                  "one dimension")
+    out.fill(np.nan)
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            # skip every negative label (not only -1): bounds checks are off
+            if lab < 0:
+                continue
+            counts[lab] += 1
+            val = values[i, 0]
+            if val != val:
+                continue
+            # a NaN open means this is the label's first valid value
+            if out[lab, 0] != out[lab, 0]:
+                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
+            else:
+                out[lab, 1] = max(out[lab, 1], val)
+                out[lab, 2] = min(out[lab, 2], val)
+                out[lab, 3] = val
+
+{{endfor}}
+
+#----------------------------------------------------------------------
+# group_nth, group_last
+#----------------------------------------------------------------------
+
+{{py:
+# dtype table driving the group_last / group_nth templates below
+# name, c_type, dest_type2, nan_val
+dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'),
+          ('float32', 'float32_t', 'float32_t', 'NAN'),
+          ('int64', 'int64_t', 'int64_t', 'iNaT')]
+
+def get_dispatch(dtypes):
+    # each tuple is passed through unchanged
+    for name, c_type, dest_type2, nan_val in dtypes:
+
+        yield name, c_type, dest_type2, nan_val
+}}
+
+
+{{for name, c_type, dest_type2, nan_val in get_dispatch(dtypes)}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                        ndarray[int64_t] counts,
+                        ndarray[{{c_type}}, ndim=2] values,
+                        ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: last non-null value per label and column.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        {{dest_type2}} val, count
+        ndarray[{{dest_type2}}, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+    # resx is left uninitialised; it is only read where nobs > 0
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+            # counts tallies every labelled row, null or not
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+
+                # not nan: NaN fails val == val, other nulls equal nan_val
+                if val == val and val != {{nan_val}}:
+                    nobs[lab, j] += 1
+                    resx[lab, j] = val
+        # cells that saw no valid value get the dtype's null marker
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:
+                    out[i, j] = {{nan_val}}
+                else:
+                    out[i, j] = resx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[{{c_type}}, ndim=2] values,
+                       ndarray[int64_t] labels, int64_t rank):
+    """
+    Only aggregates on axis=0: out[g, k] is the `rank`-th (1-based) non-null
+    value of column k for label g, or the null marker when fewer than `rank`
+    such values exist. counts[g] is incremented for every row labelled g.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        {{dest_type2}} val
+        ndarray[{{dest_type2}}, ndim=2] resx
+        ndarray[int64_t, ndim=2] nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty_like(out)
+    N, K = (<object> values).shape
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            if lab < 0:
+                continue
+            counts[lab] += 1
+            for j in range(K):
+                val = values[i, j]
+                # not nan
+                if val == val and val != {{nan_val}}:
+                    nobs[lab, j] += 1
+                    if nobs[lab, j] == rank:
+                        resx[lab, j] = val
+
+        for i in range(ncounts):
+            for j in range(K):
+                # resx is uninitialised unless the rank-th value was seen
+                if rank < 1 or nobs[i, j] < rank:
+                    out[i, j] = {{nan_val}}
+                else:
+                    out[i, j] = resx[i, j]
+
+{{endfor}}
+
+#----------------------------------------------------------------------
+# group_min, group_max
+#----------------------------------------------------------------------
+
+{{py:
+# dtype table driving the group_max / group_min templates below
+# name, dest_type2, nan_val, inf_val
+dtypes = [('float64', 'float64_t', 'NAN', 'np.inf'),
+          ('float32', 'float32_t', 'NAN', 'np.inf'),
+          ('int64', 'int64_t', 'iNaT', '_int64_max')]
+
+def get_dispatch(dtypes):
+    # each tuple is passed through unchanged
+    for name, dest_type2, nan_val, inf_val in dtypes:
+        yield name, dest_type2, nan_val, inf_val
+}}
+
+
+{{for name, dest_type2, nan_val, inf_val in get_dispatch(dtypes)}}
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_max_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[{{dest_type2}}, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: per-label, per-column maximum ignoring nulls.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        {{dest_type2}} val, count
+        ndarray[{{dest_type2}}, ndim=2] maxx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    # running maximum per (label, column), seeded with minus inf_val
+    maxx = np.empty_like(out)
+    maxx.fill(-{{inf_val}})
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+                # counts tallies every labelled row, null or not
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    # not nan: NaN fails val == val, other nulls equal nan_val
+                    if val == val and val != {{nan_val}}:
+                        nobs[lab, j] += 1
+                        if val > maxx[lab, j]:
+                            maxx[lab, j] = val
+        else:  # K <= 1: same scan with the column index fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                val = values[i, 0]
+
+                # not nan
+                if val == val and val != {{nan_val}}:
+                    nobs[lab, 0] += 1
+                    if val > maxx[lab, 0]:
+                        maxx[lab, 0] = val
+        # cells that saw no valid value get the dtype's null marker
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:
+                    out[i, j] = {{nan_val}}
+                else:
+                    out[i, j] = maxx[i, j]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
+                       ndarray[int64_t] counts,
+                       ndarray[{{dest_type2}}, ndim=2] values,
+                       ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: per-label, per-column minimum ignoring nulls.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
+        {{dest_type2}} val, count
+        ndarray[{{dest_type2}}, ndim=2] minx, nobs
+
+    if not len(values) == len(labels):
+        raise AssertionError("len(index) != len(labels)")
+
+    nobs = np.zeros_like(out)
+    # running minimum per (label, column), seeded with inf_val
+    minx = np.empty_like(out)
+    minx.fill({{inf_val}})
+
+    N, K = (<object> values).shape
+
+    with nogil:
+        if K > 1:
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+                # counts tallies every labelled row, null or not
+                counts[lab] += 1
+                for j in range(K):
+                    val = values[i, j]
+
+                    # not nan: NaN fails val == val, other nulls equal nan_val
+                    if val == val and val != {{nan_val}}:
+
+                        nobs[lab, j] += 1
+                        if val < minx[lab, j]:
+                            minx[lab, j] = val
+        else:  # K <= 1: same scan with the column index fixed at 0
+            for i in range(N):
+                lab = labels[i]
+                if lab < 0:
+                    continue
+
+                counts[lab] += 1
+                val = values[i, 0]
+
+                # not nan
+                if val == val and val != {{nan_val}}:
+                    nobs[lab, 0] += 1
+                    if val < minx[lab, 0]:
+                        minx[lab, 0] = val
+        # cells that saw no valid value get the dtype's null marker
+        for i in range(ncounts):
+            for j in range(K):
+                if nobs[i, j] == 0:
+                    out[i, j] = {{nan_val}}
+                else:
+                    out[i, j] = minx[i, j]
+
+{{endfor}}
+
+#----------------------------------------------------------------------
+# other grouping functions not needing a template
+#----------------------------------------------------------------------
+
+
+def group_median_float64(ndarray[float64_t, ndim=2] out,
+                         ndarray[int64_t] counts,
+                         ndarray[float64_t, ndim=2] values,
+                         ndarray[int64_t] labels):
+    """
+    Only aggregates on axis=0: out[g, k] = median of column k for label g.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, ngroups, size
+        ndarray[int64_t] _counts
+        ndarray data
+        float64_t* ptr
+    ngroups = len(counts)
+    N, K = (<object> values).shape
+    # NOTE(review): presumably indexer sorts rows by label, NA group first
+    indexer, _counts = groupsort_indexer(labels, ngroups)
+    counts[:] = _counts[1:]
+    # (K, N) scratch buffer, walked below through a raw pointer
+    data = np.empty((K, N), dtype=np.float64)
+    ptr = <float64_t*> data.data
+    # presumably fills data with values.T reordered by indexer - TODO confirm
+    take_2d_axis1_float64_float64(values.T, indexer, out=data)
+
+    for i in range(K):
+        # exclude NA group: skip its _counts[0] entries (assumed to lead)
+        ptr += _counts[0]
+        for j in range(ngroups):
+            size = _counts[j + 1]
+            out[j, i] = _median_linear(ptr, size)
+            ptr += size
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cumprod_float64(float64_t[:, :] out,
+                          float64_t[:, :] values,
+                          int64_t[:] labels,
+                          float64_t[:, :] accum):
+    """
+    Only transforms on axis=0: running product of values within each label.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, size
+        float64_t val
+        int64_t lab
+    # accum is rebound to a fresh array of ones; its contents are not read
+    N, K = (<object> values).shape
+    accum = np.ones_like(accum)
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            # rows with a negative label leave out[i, :] untouched
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:  # NaN is skipped: out[i, j] is not written
+                    accum[lab, j] *= val
+                    out[i, j] = accum[lab, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_cumsum(numeric[:, :] out,
+                 numeric[:, :] values,
+                 int64_t[:] labels,
+                 numeric[:, :] accum):
+    """
+    Only transforms on axis=0: running sum of values within each label.
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, size
+        numeric val
+        int64_t lab
+    # accum is rebound to a fresh array of zeros; its contents are not read
+    N, K = (<object> values).shape
+    accum = np.zeros_like(accum)
+
+    with nogil:
+        for i in range(N):
+            lab = labels[i]
+            # rows with a negative label leave out[i, :] untouched
+            if lab < 0:
+                continue
+            for j in range(K):
+                val = values[i, j]
+                if val == val:  # NaN is skipped: out[i, j] is not written
+                    accum[lab, j] += val
+                    out[i, j] = accum[lab, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
+                        int ngroups, int periods):
+    """Fill `out` with the within-group shifted row positions (-1 = none)."""
+    cdef:
+        Py_ssize_t N, i, ii
+        int offset, sign
+        int64_t lab, idxer, idxer_slot
+        int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64)
+        int64_t[:, :] label_indexer
+
+    N, = (<object> labels).shape
+    if periods < 0:
+        periods = -periods
+        offset = N - 1
+        sign = -1
+    elif periods > 0:
+        offset = 0
+        sign = 1
+    if periods == 0:
+        with nogil:
+            for i in range(N):
+                out[i] = i
+    else:
+        # per-label ring buffer of the last `periods` positions seen
+        label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
+        with nogil:
+            for i in range(N):
+                # walk the rows back-to-front when shifting backwards
+                ii = offset + sign * i
+                lab = labels[ii]
+                # skip null keys: a negative label must not index the buffers
+                if lab < 0:
+                    out[ii] = -1
+                    continue
+                label_seen[lab] += 1
+                idxer_slot = label_seen[lab] % periods
+                idxer = label_indexer[lab, idxer_slot]
+                if label_seen[lab] > periods:
+                    out[ii] = idxer
+                else:
+                    out[ii] = -1
+                label_indexer[lab, idxer_slot] = ii
diff --git a/pandas/src/algos_join_helper.pxi b/pandas/src/algos_join_helper.pxi
new file mode 100644
index 0000000000000..44b8159351492
--- /dev/null
+++ b/pandas/src/algos_join_helper.pxi
@@ -0,0 +1,1899 @@
+"""
+Template for each `dtype` helper function for join
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+#----------------------------------------------------------------------
+# left_join_indexer, inner_join_indexer, outer_join_indexer
+#----------------------------------------------------------------------
+
+# Joins on ordered, unique indices
+
+# right might contain non-unique values
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_float64(ndarray[float64_t] left,
+                                      ndarray[float64_t] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        float64_t lval, rval  # NOTE(review): lval is never used below
+    # indexer[i] becomes the position in right equal to left[i], else -1
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+    # NOTE(review): the i/j advance looks like it assumes sorted inputs
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+        # right is used up: every remaining left value is unmatched
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+        # left values equal to rval all map to j (loop stops at last slot)
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1  # provisional: i is not advanced in this branch
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+def left_join_indexer_float64(ndarray[float64_t] left,
+                               ndarray[float64_t] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    """
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count  # k is never used
+        float64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float64_t] result
+
+    nleft = len(left)
+    nright = len(right)
+    # pass 1: walk both inputs only to count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i  # rest of left has no match in right
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1  # left-only value still yields a row
+                i += 1
+            else:
+                j += 1  # right-only value is skipped
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float64)
+    # pass 2: the same walk, now filling the three output arrays
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1  # -1: no matching row in right
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
+    # joined values, positions in left, positions in right (-1 = no match)
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def inner_join_indexer_float64(ndarray[float64_t] left,
+                                ndarray[float64_t] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    """
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count  # k is never used
+        float64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float64_t] result
+
+    nleft = len(left)
+    nright = len(right)
+    # pass 1: count the rows whose value occurs in both inputs
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1  # left-only value: produces no row
+            else:
+                j += 1  # right-only value: produces no row
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float64)
+    # pass 2: the same walk, now filling the three output arrays
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1
+            else:
+                j += 1
+    # shared values, their positions in left, their positions in right
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_float64(ndarray[float64_t] left,
+                                ndarray[float64_t] right):
+    cdef:
+        Py_ssize_t i, j, nright, nleft, count
+        float64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float64_t] result
+    # two passes over both inputs: count the output rows, then fill them
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        count = nright  # nothing on the left: one row per right value
+    elif nright == 0:
+        count = nleft  # nothing on the right: one row per left value
+    else:
+        while True:
+            if i == nleft:
+                count += nright - j  # left used up: rest of right remains
+                break
+            if j == nright:
+                count += nleft - i  # right used up: rest of left remains
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1  # value present only in left
+                i += 1
+            else:
+                count += 1  # value present only in right
+                j += 1
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float64)
+
+    # do it again, but populate the indexers / result
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        for j in range(nright):
+            lindexer[j] = -1  # -1: this value has no row in left
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        for i in range(nleft):
+            lindexer[i] = i
+            rindexer[i] = -1  # -1: this value has no row in right
+            result[i] = left[i]
+    else:
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    count += 1
+                    i += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = lval
+                count += 1
+                i += 1
+            else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                j += 1
+    # values, then positions in left and in right (-1 where absent)
+    return result, lindexer, rindexer
+
+# Joins on ordered, unique indices
+
+# right might contain non-unique values
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_float32(ndarray[float32_t] left,
+                                      ndarray[float32_t] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        float32_t lval, rval  # NOTE(review): lval is never used below
+    # indexer[i] becomes the position in right equal to left[i], else -1
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+    # NOTE(review): the i/j advance looks like it assumes sorted inputs
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+        # right is used up: every remaining left value is unmatched
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+        # left values equal to rval all map to j (loop stops at last slot)
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1  # provisional: i is not advanced in this branch
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+def left_join_indexer_float32(ndarray[float32_t] left,
+                               ndarray[float32_t] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    """
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count  # k is never used
+        float32_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float32_t] result
+
+    nleft = len(left)
+    nright = len(right)
+    # pass 1: walk both inputs only to count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i  # rest of left has no match in right
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1  # left-only value still yields a row
+                i += 1
+            else:
+                j += 1  # right-only value is skipped
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float32)
+    # pass 2: the same walk, now filling the three output arrays
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1  # -1: no matching row in right
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
+    # joined values, positions in left, positions in right (-1 = no match)
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def inner_join_indexer_float32(ndarray[float32_t] left,
+                                ndarray[float32_t] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    """
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count  # k is never used
+        float32_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float32_t] result
+
+    nleft = len(left)
+    nright = len(right)
+    # pass 1: count the rows whose value occurs in both inputs
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1  # left-only value: produces no row
+            else:
+                j += 1  # right-only value: produces no row
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float32)
+    # pass 2: the same walk, now filling the three output arrays
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1
+            else:
+                j += 1
+    # shared values, their positions in left, their positions in right
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_float32(ndarray[float32_t] left,
+                                ndarray[float32_t] right):
+    cdef:
+        Py_ssize_t i, j, nright, nleft, count
+        float32_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[float32_t] result
+    # two passes over both inputs: count the output rows, then fill them
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        count = nright  # nothing on the left: one row per right value
+    elif nright == 0:
+        count = nleft  # nothing on the right: one row per left value
+    else:
+        while True:
+            if i == nleft:
+                count += nright - j  # left used up: rest of right remains
+                break
+            if j == nright:
+                count += nleft - i  # right used up: rest of left remains
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1  # value present only in left
+                i += 1
+            else:
+                count += 1  # value present only in right
+                j += 1
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.float32)
+
+    # do it again, but populate the indexers / result
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        for j in range(nright):
+            lindexer[j] = -1  # -1: this value has no row in left
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        for i in range(nleft):
+            lindexer[i] = i
+            rindexer[i] = -1  # -1: this value has no row in right
+            result[i] = left[i]
+    else:
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    count += 1
+                    i += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = lval
+                count += 1
+                i += 1
+            else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                j += 1
+    # values, then positions in left and in right (-1 where absent)
+    return result, lindexer, rindexer
+
+# Joins on ordered, unique indices
+
+# right might contain non-unique values
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_object(ndarray[object] left,
+                                      ndarray[object] right):
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        object lval, rval  # NOTE(review): lval is never used below
+    # indexer[i] becomes the position in right equal to left[i], else -1
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+    # NOTE(review): the i/j advance looks like it assumes sorted inputs
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+        # right is used up: every remaining left value is unmatched
+        if j == nright:
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+        # left values equal to rval all map to j (loop stops at last slot)
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            indexer[i] = -1  # provisional: i is not advanced in this branch
+            j += 1
+        else:
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+def left_join_indexer_object(ndarray[object] left,
+                               ndarray[object] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    """
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count  # k is never used
+        object lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[object] result
+
+    nleft = len(left)
+    nright = len(right)
+    # pass 1: walk both inputs only to count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                count += nleft - i  # rest of left has no match in right
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1  # left-only value still yields a row
+                i += 1
+            else:
+                j += 1  # right-only value is skipped
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=object)
+    # pass 2: the same walk, now filling the three output arrays
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1  # -1: no matching row in right
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
+    # joined values, positions in left, positions in right (-1 = no match)
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def inner_join_indexer_object(ndarray[object] left,
+                                ndarray[object] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+    """
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count  # k is never used
+        object lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[object] result
+
+    nleft = len(left)
+    nright = len(right)
+    # pass 1: count the rows whose value occurs in both inputs
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1  # left-only value: produces no row
+            else:
+                j += 1  # right-only value: produces no row
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=object)
+    # pass 2: the same walk, now filling the three output arrays
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1
+            else:
+                j += 1
+    # shared values, their positions in left, their positions in right
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_object(ndarray[object] left,
+                                ndarray[object] right):
+    cdef:
+        Py_ssize_t i, j, nright, nleft, count
+        object lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[object] result
+    # two passes over both inputs: count the output rows, then fill them
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        count = nright  # nothing on the left: one row per right value
+    elif nright == 0:
+        count = nleft  # nothing on the right: one row per left value
+    else:
+        while True:
+            if i == nleft:
+                count += nright - j  # left used up: rest of right remains
+                break
+            if j == nright:
+                count += nleft - i  # right used up: rest of left remains
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1  # same left row, next duplicate in right
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1  # on the last left row: keep scanning right
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                count += 1  # value present only in left
+                i += 1
+            else:
+                count += 1  # value present only in right
+                j += 1
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=object)
+
+    # do it again, but populate the indexers / result
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        for j in range(nright):
+            lindexer[j] = -1  # -1: this value has no row in left
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        for i in range(nleft):
+            lindexer[i] = i
+            rindexer[i] = -1  # -1: this value has no row in right
+            result[i] = left[i]
+    else:
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    count += 1
+                    i += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = lval
+                count += 1
+                i += 1
+            else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                j += 1
+    # values, then positions in left and in right (-1 where absent)
+    return result, lindexer, rindexer
+
+# Joins on ordered, unique indices
+
+# right might contain non-unique values
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_int32(ndarray[int32_t] left,
+                                      ndarray[int32_t] right):
+    """
+    For each element of `left`, locate an equal element in `right`.
+
+    Both arrays are walked once with forward-only cursors, so the
+    matching relies on them being sorted in ascending order.
+
+    Returns
+    -------
+    indexer : ndarray[int64], same length as `left`
+        indexer[i] is a position in `right` holding a value equal to
+        left[i], or -1 when no match was found.
+    """
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        # NOTE(review): lval is declared but never used below
+        int32_t lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            # right is exhausted: the remaining left values are unmatched
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        # consume a run of left values equal to rval; the `nleft - 1`
+        # bound keeps i a valid index for the comparisons that follow
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            # right value is too small: mark i unmatched for now (it is
+            # rewritten if a later right value matches) and advance right
+            indexer[i] = -1
+            j += 1
+        else:
+            # left value is smaller than rval: record no match, move on
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+# NOTE(review): the two directives above are commented out, so this
+# function does not switch off bounds checking or wraparound itself.
+def left_join_indexer_int32(ndarray[int32_t] left,
+                               ndarray[int32_t] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+
+    The first pass only counts the output rows so that the output arrays
+    can be allocated; the second pass repeats the walk and fills them.
+    Every element of `left` produces at least one row; values found only
+    in `right` are skipped.
+
+    Returns
+    -------
+    result : ndarray[int32_t]
+        Value of each output row, copied from `left`.
+    lindexer : ndarray[int64]
+        Position in `left` of each output row.
+    rindexer : ndarray[int64]
+        Position in `right` of each output row, or -1 when the left
+        value was not matched.
+    """
+    cdef:
+        # NOTE(review): k is declared but never used in this function
+        Py_ssize_t i, j, k, nright, nleft, count
+        int32_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int32_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                # right exhausted: one row per remaining left value
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # left-only value: still emitted by a left join
+                count += 1
+                i += 1
+            else:
+                # right-only value: contributes no row
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int32)
+
+    # second pass: same cursor logic as the counting pass, but each
+    # counted row is written at position `count`
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def inner_join_indexer_int32(ndarray[int32_t] left,
+                                ndarray[int32_t] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+
+    Only values present in both arrays produce output rows. The first
+    pass counts those rows so the outputs can be allocated; the second
+    pass repeats the walk and fills them.
+
+    Returns
+    -------
+    result : ndarray[int32_t]
+        The matched value of each output row.
+    lindexer, rindexer : ndarray[int64]
+        Positions in `left` and `right` of each output row.
+    """
+    cdef:
+        # NOTE(review): k is declared but never used in this function
+        Py_ssize_t i, j, k, nright, nleft, count
+        int32_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int32_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the matching rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            # once either side is exhausted no further match can occur
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # unmatched values on either side are dropped
+                i += 1
+            else:
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int32)
+
+    # second pass: same cursor logic as the counting pass, but each
+    # match is written at position `count`
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_int32(ndarray[int32_t] left,
+                                ndarray[int32_t] right):
+    """
+    Two-pass outer join of two arrays walked with forward-only cursors,
+    which relies on both being sorted in ascending order.
+
+    Every value from either array produces an output row. The first pass
+    counts the rows; the second pass fills the outputs.
+
+    Returns
+    -------
+    result : ndarray[int32_t]
+        Value of each output row.
+    lindexer : ndarray[int64]
+        Position in `left` of each row, or -1 for right-only values.
+    rindexer : ndarray[int64]
+        Position in `right` of each row, or -1 for left-only values.
+    """
+    cdef:
+        Py_ssize_t i, j, nright, nleft, count
+        int32_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int32_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        count = nright
+    elif nright == 0:
+        count = nleft
+    else:
+        while True:
+            if i == nleft:
+                # left exhausted: one row per remaining right value
+                count += nright - j
+                break
+            if j == nright:
+                # right exhausted: one row per remaining left value
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # left-only value
+                count += 1
+                i += 1
+            else:
+                # right-only value
+                count += 1
+                j += 1
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int32)
+
+    # do it again, but populate the indexers / result
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        # only right values exist: copy them through unmatched
+        for j in range(nright):
+            lindexer[j] = -1
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        # only left values exist: copy them through unmatched
+        for i in range(nleft):
+            lindexer[i] = i
+            rindexer[i] = -1
+            result[i] = left[i]
+    else:
+        # same cursor logic as the counting pass, writing at `count`
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    count += 1
+                    i += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = lval
+                count += 1
+                i += 1
+            else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                j += 1
+
+    return result, lindexer, rindexer
+
+# Joins on ordered, unique indices
+
+# right might contain non-unique values
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_int64(ndarray[int64_t] left,
+                                      ndarray[int64_t] right):
+    """
+    For each element of `left`, locate an equal element in `right`.
+
+    Both arrays are walked once with forward-only cursors, so the
+    matching relies on them being sorted in ascending order.
+
+    Returns
+    -------
+    indexer : ndarray[int64], same length as `left`
+        indexer[i] is a position in `right` holding a value equal to
+        left[i], or -1 when no match was found.
+    """
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        # NOTE(review): lval is declared but never used below
+        int64_t lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            # right is exhausted: the remaining left values are unmatched
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        # consume a run of left values equal to rval; the `nleft - 1`
+        # bound keeps i a valid index for the comparisons that follow
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            # right value is too small: mark i unmatched for now (it is
+            # rewritten if a later right value matches) and advance right
+            indexer[i] = -1
+            j += 1
+        else:
+            # left value is smaller than rval: record no match, move on
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+# NOTE(review): the two directives above are commented out, so this
+# function does not switch off bounds checking or wraparound itself.
+def left_join_indexer_int64(ndarray[int64_t] left,
+                               ndarray[int64_t] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+
+    The first pass only counts the output rows so that the output arrays
+    can be allocated; the second pass repeats the walk and fills them.
+    Every element of `left` produces at least one row; values found only
+    in `right` are skipped.
+
+    Returns
+    -------
+    result : ndarray[int64_t]
+        Value of each output row, copied from `left`.
+    lindexer : ndarray[int64]
+        Position in `left` of each output row.
+    rindexer : ndarray[int64]
+        Position in `right` of each output row, or -1 when the left
+        value was not matched.
+    """
+    cdef:
+        # NOTE(review): k is declared but never used in this function
+        Py_ssize_t i, j, k, nright, nleft, count
+        int64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int64_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                # right exhausted: one row per remaining left value
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # left-only value: still emitted by a left join
+                count += 1
+                i += 1
+            else:
+                # right-only value: contributes no row
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int64)
+
+    # second pass: same cursor logic as the counting pass, but each
+    # counted row is written at position `count`
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def inner_join_indexer_int64(ndarray[int64_t] left,
+                                ndarray[int64_t] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+
+    Only values present in both arrays produce output rows. The first
+    pass counts those rows so the outputs can be allocated; the second
+    pass repeats the walk and fills them.
+
+    Returns
+    -------
+    result : ndarray[int64_t]
+        The matched value of each output row.
+    lindexer, rindexer : ndarray[int64]
+        Positions in `left` and `right` of each output row.
+    """
+    cdef:
+        # NOTE(review): k is declared but never used in this function
+        Py_ssize_t i, j, k, nright, nleft, count
+        int64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int64_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the matching rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            # once either side is exhausted no further match can occur
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # unmatched values on either side are dropped
+                i += 1
+            else:
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int64)
+
+    # second pass: same cursor logic as the counting pass, but each
+    # match is written at position `count`
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_int64(ndarray[int64_t] left,
+                                ndarray[int64_t] right):
+    """
+    Two-pass outer join of two arrays walked with forward-only cursors,
+    which relies on both being sorted in ascending order.
+
+    Every value from either array produces an output row. The first pass
+    counts the rows; the second pass fills the outputs.
+
+    Returns
+    -------
+    result : ndarray[int64_t]
+        Value of each output row.
+    lindexer : ndarray[int64]
+        Position in `left` of each row, or -1 for right-only values.
+    rindexer : ndarray[int64]
+        Position in `right` of each row, or -1 for left-only values.
+    """
+    cdef:
+        Py_ssize_t i, j, nright, nleft, count
+        int64_t lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[int64_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        count = nright
+    elif nright == 0:
+        count = nleft
+    else:
+        while True:
+            if i == nleft:
+                # left exhausted: one row per remaining right value
+                count += nright - j
+                break
+            if j == nright:
+                # right exhausted: one row per remaining left value
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # left-only value
+                count += 1
+                i += 1
+            else:
+                # right-only value
+                count += 1
+                j += 1
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype=np.int64)
+
+    # do it again, but populate the indexers / result
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        # only right values exist: copy them through unmatched
+        for j in range(nright):
+            lindexer[j] = -1
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        # only left values exist: copy them through unmatched
+        for i in range(nleft):
+            lindexer[i] = i
+            rindexer[i] = -1
+            result[i] = left[i]
+    else:
+        # same cursor logic as the counting pass, writing at `count`
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    count += 1
+                    i += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = lval
+                count += 1
+                i += 1
+            else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                j += 1
+
+    return result, lindexer, rindexer
diff --git a/pandas/src/algos_join_helper.pxi.in b/pandas/src/algos_join_helper.pxi.in
new file mode 100644
index 0000000000000..5b55ec2b1bf6d
--- /dev/null
+++ b/pandas/src/algos_join_helper.pxi.in
@@ -0,0 +1,407 @@
+"""
+Template for each `dtype` helper function for join
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+#----------------------------------------------------------------------
+# left_join_indexer_unique, left_join_indexer, inner_join_indexer,
+# outer_join_indexer
+#----------------------------------------------------------------------
+
+{{py:
+
+# name, c_type, dtype
+#   name   : suffix appended to each generated function name
+#   c_type : element type of the ndarray arguments and of lval / rval
+#   dtype  : dtype expression passed to np.empty for the `result` array
+dtypes = [('float64', 'float64_t', 'np.float64'),
+          ('float32', 'float32_t', 'np.float32'),
+          ('object', 'object', 'object'),
+          ('int32', 'int32_t', 'np.int32'),
+          ('int64', 'int64_t', 'np.int64')]
+
+def get_dispatch(dtypes):
+    # yield each (name, c_type, dtype) triple unchanged, in list order
+
+    for name, c_type, dtype in dtypes:
+        yield name, c_type, dtype
+
+}}
+
+{{for name, c_type, dtype in get_dispatch(dtypes)}}
+
+# Joins on ordered, unique indices
+
+# right might contain non-unique values
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_indexer_unique_{{name}}(ndarray[{{c_type}}] left,
+                                      ndarray[{{c_type}}] right):
+    """
+    For each element of `left`, locate an equal element in `right`.
+
+    Both arrays are walked once with forward-only cursors, so the
+    matching relies on them being sorted in ascending order.
+
+    Returns
+    -------
+    indexer : ndarray[int64], same length as `left`
+        indexer[i] is a position in `right` holding a value equal to
+        left[i], or -1 when no match was found.
+    """
+    cdef:
+        Py_ssize_t i, j, nleft, nright
+        ndarray[int64_t] indexer
+        # NOTE(review): lval is declared but never used below
+        {{c_type}} lval, rval
+
+    i = 0
+    j = 0
+    nleft = len(left)
+    nright = len(right)
+
+    indexer = np.empty(nleft, dtype=np.int64)
+    while True:
+        if i == nleft:
+            break
+
+        if j == nright:
+            # right is exhausted: the remaining left values are unmatched
+            indexer[i] = -1
+            i += 1
+            continue
+
+        rval = right[j]
+
+        # consume a run of left values equal to rval; the `nleft - 1`
+        # bound keeps i a valid index for the comparisons that follow
+        while i < nleft - 1 and left[i] == rval:
+            indexer[i] = j
+            i += 1
+
+        if left[i] == right[j]:
+            indexer[i] = j
+            i += 1
+            while i < nleft - 1 and left[i] == rval:
+                indexer[i] = j
+                i += 1
+            j += 1
+        elif left[i] > rval:
+            # right value is too small: mark i unmatched for now (it is
+            # rewritten if a later right value matches) and advance right
+            indexer[i] = -1
+            j += 1
+        else:
+            # left value is smaller than rval: record no match, move on
+            indexer[i] = -1
+            i += 1
+    return indexer
+
+
+# @cython.wraparound(False)
+# @cython.boundscheck(False)
+# NOTE(review): the two directives above are commented out, so this
+# function does not switch off bounds checking or wraparound itself.
+def left_join_indexer_{{name}}(ndarray[{{c_type}}] left,
+                               ndarray[{{c_type}}] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+
+    The first pass only counts the output rows so that the output arrays
+    can be allocated; the second pass repeats the walk and fills them.
+    Every element of `left` produces at least one row; values found only
+    in `right` are skipped.
+
+    Returns
+    -------
+    result : ndarray[{{c_type}}]
+        Value of each output row, copied from `left`.
+    lindexer : ndarray[int64]
+        Position in `left` of each output row.
+    rindexer : ndarray[int64]
+        Position in `right` of each output row, or -1 when the left
+        value was not matched.
+    """
+    cdef:
+        # NOTE(review): k is declared but never used in this function
+        Py_ssize_t i, j, k, nright, nleft, count
+        {{c_type}} lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[{{c_type}}] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                # right exhausted: one row per remaining left value
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # left-only value: still emitted by a left join
+                count += 1
+                i += 1
+            else:
+                # right-only value: contributes no row
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype={{dtype}})
+
+    # second pass: same cursor logic as the counting pass, but each
+    # counted row is written at position `count`
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0:
+        while i < nleft:
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    i += 1
+                    count += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                count += 1
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def inner_join_indexer_{{name}}(ndarray[{{c_type}}] left,
+                                ndarray[{{c_type}}] right):
+    """
+    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
+
+    Only values present in both arrays produce output rows. The first
+    pass counts those rows so the outputs can be allocated; the second
+    pass repeats the walk and fills them.
+
+    Returns
+    -------
+    result : ndarray[{{c_type}}]
+        The matched value of each output row.
+    lindexer, rindexer : ndarray[int64]
+        Positions in `left` and `right` of each output row.
+    """
+    cdef:
+        # NOTE(review): k is declared but never used in this function
+        Py_ssize_t i, j, k, nright, nleft, count
+        {{c_type}} lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[{{c_type}}] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the matching rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            # once either side is exhausted no further match can occur
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # unmatched values on either side are dropped
+                i += 1
+            else:
+                j += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype={{dtype}})
+
+    # second pass: same cursor logic as the counting pass, but each
+    # match is written at position `count`
+    i = 0
+    j = 0
+    count = 0
+    if nleft > 0 and nright > 0:
+        while True:
+            if i == nleft:
+                break
+            if j == nright:
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                i += 1
+            else:
+                j += 1
+
+    return result, lindexer, rindexer
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer_{{name}}(ndarray[{{c_type}}] left,
+                                ndarray[{{c_type}}] right):
+    """
+    Two-pass outer join of two arrays walked with forward-only cursors,
+    which relies on both being sorted in ascending order.
+
+    Every value from either array produces an output row. The first pass
+    counts the rows; the second pass fills the outputs.
+
+    Returns
+    -------
+    result : ndarray[{{c_type}}]
+        Value of each output row.
+    lindexer : ndarray[int64]
+        Position in `left` of each row, or -1 for right-only values.
+    rindexer : ndarray[int64]
+        Position in `right` of each row, or -1 for left-only values.
+    """
+    cdef:
+        Py_ssize_t i, j, nright, nleft, count
+        {{c_type}} lval, rval
+        ndarray[int64_t] lindexer, rindexer
+        ndarray[{{c_type}}] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    # first pass: count the output rows
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        count = nright
+    elif nright == 0:
+        count = nleft
+    else:
+        while True:
+            if i == nleft:
+                # left exhausted: one row per remaining right value
+                count += nright - j
+                break
+            if j == nright:
+                # right exhausted: one row per remaining left value
+                count += nleft - i
+                break
+
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        # next right value repeats: pair it with same i
+                        j += 1
+                    else:
+                        i += 1
+                        # keep j if the next left value repeats the match
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    # last left value, but right still has entries
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                # left-only value
+                count += 1
+                i += 1
+            else:
+                # right-only value
+                count += 1
+                j += 1
+
+    lindexer = np.empty(count, dtype=np.int64)
+    rindexer = np.empty(count, dtype=np.int64)
+    result = np.empty(count, dtype={{dtype}})
+
+    # do it again, but populate the indexers / result
+
+    i = 0
+    j = 0
+    count = 0
+    if nleft == 0:
+        # only right values exist: copy them through unmatched
+        for j in range(nright):
+            lindexer[j] = -1
+            rindexer[j] = j
+            result[j] = right[j]
+    elif nright == 0:
+        # only left values exist: copy them through unmatched
+        for i in range(nleft):
+            lindexer[i] = i
+            rindexer[i] = -1
+            result[i] = left[i]
+    else:
+        # same cursor logic as the counting pass, writing at `count`
+        while True:
+            if i == nleft:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    count += 1
+                    j += 1
+                break
+            if j == nright:
+                while i < nleft:
+                    lindexer[count] = i
+                    rindexer[count] = -1
+                    result[count] = left[i]
+                    count += 1
+                    i += 1
+                break
+
+            lval = left[i]
+            rval = right[j]
+
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                count += 1
+                if i < nleft - 1:
+                    if j < nright - 1 and right[j + 1] == rval:
+                        j += 1
+                    else:
+                        i += 1
+                        if left[i] != rval:
+                            j += 1
+                elif j < nright - 1:
+                    j += 1
+                    if lval != right[j]:
+                        i += 1
+                else:
+                    # end of the road
+                    break
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = lval
+                count += 1
+                i += 1
+            else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                count += 1
+                j += 1
+
+    return result, lindexer, rindexer
+
+{{endfor}}
\ No newline at end of file
diff --git a/pandas/src/algos_take_helper.pxi b/pandas/src/algos_take_helper.pxi
new file mode 100644
index 0000000000000..d8fb05804d4e5
--- /dev/null
+++ b/pandas/src/algos_take_helper.pxi
@@ -0,0 +1,4949 @@
+"""
+Template for each `dtype` helper function for take
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+#----------------------------------------------------------------------
+# take_1d, take_2d
+#----------------------------------------------------------------------
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_bool_bool_memview(uint8_t[:] values,
+                                              int64_t[:] indexer,
+                                              uint8_t[:] out,
+                                              fill_value=np.nan):
+    # Typed-memoryview kernel: out[i] = values[indexer[i]] for every i;
+    # an indexer entry of -1 stores fill_value (coerced to uint8_t).
+    # Bounds and wraparound checks are disabled by the decorators above.
+    cdef:
+        Py_ssize_t i, n, idx
+        uint8_t fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              uint8_t[:] out,
+                              fill_value=np.nan):
+    # Python-callable wrapper around take_1d_bool_bool_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_bool_bool_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    # The loop below repeats the kernel's logic on the ndarray buffer.
+    cdef:
+        Py_ssize_t i, n, idx
+        uint8_t fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    uint8_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        uint8_t fv
+    # out[i, :] = values[indexer[i], :]; rows with indexer -1 get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled in for this dtype pair.
+    IF True:
+        cdef:
+            uint8_t *v
+            uint8_t *o
+
+        # GH3130: memmove whole rows when axis 1 is unit-stride in both arrays
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(uint8_t) and
+            sizeof(uint8_t) * n >= 256):
+            # NOTE(review): gate uses n (rows) though k elements are copied
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(uint8_t) * k))
+            return
+    # Element-wise path, used when the fast path above is not taken.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    uint8_t[:, :] out,
+                                    fill_value=np.nan):
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_bool_bool_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        uint8_t fv
+    # out[i, :] = values[indexer[i], :]; rows with indexer -1 get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled in for this dtype pair.
+    IF True:
+        cdef:
+            uint8_t *v
+            uint8_t *o
+
+        # GH3130: memmove whole rows when axis 1 is unit-stride in both arrays
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(uint8_t) and
+            sizeof(uint8_t) * n >= 256):
+            # NOTE(review): gate uses n (rows) though k elements are copied
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(uint8_t) * k))
+            return
+    # Element-wise path, used when the fast path above is not taken.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    uint8_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        uint8_t fv
+    # out[:, j] = values[:, indexer[j]]; -1 entries yield fill_value.
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+    # fv is assigned only after the empty-input check above.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    uint8_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable wrapper around take_2d_axis1_bool_bool_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_bool_bool_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        uint8_t fv
+    # out[:, j] = values[:, indexer[j]]; -1 entries yield fill_value.
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+    # fv is assigned only after the empty-input check above.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[uint8_t, ndim=2] out,
+                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        uint8_t fv
+    # out[i, j] = values[idx0[i], idx1[j]]; a -1 in either gives fill_value.
+    n = len(idx0)
+    k = len(idx1)
+
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_bool_object_memview(uint8_t[:] values,
+                                              int64_t[:] indexer,
+                                              object[:] out,
+                                              fill_value=np.nan):
+    # Typed-memoryview kernel: out[i] = (values[indexer[i]] > 0) stored as
+    # a Python True/False; an indexer entry of -1 stores fill_value as is.
+    # Bounds and wraparound checks are disabled by the decorators above.
+    cdef:
+        Py_ssize_t i, n, idx
+        object fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    # Storing Python objects needs the GIL, so there is no nogil block.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            out[i] = fv
+        else:
+            out[i] = True if values[idx] > 0 else False
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_bool_object(ndarray[uint8_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              object[:] out,
+                              fill_value=np.nan):
+    # Python-callable wrapper around take_1d_bool_object_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_bool_object_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    # The loop below repeats the kernel's logic on the ndarray buffer.
+    cdef:
+        Py_ssize_t i, n, idx
+        object fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    # Storing Python objects needs the GIL, so there is no nogil block.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            out[i] = fv
+        else:
+            out[i] = True if values[idx] > 0 else False
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    object[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        object fv
+    # out[i, j] = (values[indexer[i], j] > 0); -1 rows get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled out for this dtype pair.
+    IF False:
+        cdef:
+            object *v
+            object *o
+
+        # GH3130: memmove fast path (not compiled for this dtype pair)
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(object) and
+            sizeof(object) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(object) * k))
+            return
+    # Element-wise path; the memmove branch above is compiled out.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = True if values[idx, j] > 0 else False
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    object[:, :] out,
+                                    fill_value=np.nan):
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_bool_object_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        object fv
+    # out[i, j] = (values[indexer[i], j] > 0); -1 rows get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled out for this dtype pair.
+    IF False:
+        cdef:
+            object *v
+            object *o
+
+        # GH3130: memmove fast path (not compiled for this dtype pair)
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(object) and
+            sizeof(object) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(object) * k))
+            return
+    # Element-wise path; the memmove branch above is compiled out.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = True if values[idx, j] > 0 else False
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    object[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        object fv
+    # out[i, j] = (values[i, indexer[j]] > 0); -1 entries yield fill_value.
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+    # fv is assigned only after the empty-input check above.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = True if values[i, idx] > 0 else False
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    object[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable wrapper around take_2d_axis1_bool_object_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_bool_object_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        object fv
+    # out[i, j] = (values[i, indexer[j]] > 0); -1 entries yield fill_value.
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+    # fv is assigned only after the empty-input check above.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = True if values[i, idx] > 0 else False
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[object, ndim=2] out,
+                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        object fv
+    # out[i, j] = (values[idx0[i], idx1[j]] > 0); -1 in either -> fill_value.
+    n = len(idx0)
+    k = len(idx1)
+
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = True if values[idx, idx1[j]] > 0 else False
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int8_int8_memview(int8_t[:] values,
+                                              int64_t[:] indexer,
+                                              int8_t[:] out,
+                                              fill_value=np.nan):
+    # Typed-memoryview kernel: out[i] = values[indexer[i]] for every i;
+    # an indexer entry of -1 stores fill_value (coerced to int8_t).
+    # Bounds and wraparound checks are disabled by the decorators above.
+    cdef:
+        Py_ssize_t i, n, idx
+        int8_t fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    # NOTE(review): no nogil here, unlike e.g. take_1d_bool_bool -- confirm.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            out[i] = fv
+        else:
+            out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int8_int8(ndarray[int8_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              int8_t[:] out,
+                              fill_value=np.nan):
+    # Python-callable wrapper around take_1d_int8_int8_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_int8_int8_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    # The loop below repeats the kernel's logic on the ndarray buffer.
+    cdef:
+        Py_ssize_t i, n, idx
+        int8_t fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    # NOTE(review): no nogil here, unlike e.g. take_1d_bool_bool -- confirm.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            out[i] = fv
+        else:
+            out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int8_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int8_t fv
+    # out[i, :] = values[indexer[i], :]; rows with indexer -1 get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled in for this dtype pair.
+    IF True:
+        cdef:
+            int8_t *v
+            int8_t *o
+
+        # GH3130: memmove whole rows when axis 1 is unit-stride in both arrays
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int8_t) and
+            sizeof(int8_t) * n >= 256):
+            # NOTE(review): gate uses n (rows) though k elements are copied
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int8_t) * k))
+            return
+    # Element-wise path, used when the fast path above is not taken.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int8_t[:, :] out,
+                                    fill_value=np.nan):
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int8_int8_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int8_t fv
+    # out[i, :] = values[indexer[i], :]; rows with indexer -1 get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled in for this dtype pair.
+    IF True:
+        cdef:
+            int8_t *v
+            int8_t *o
+
+        # GH3130: memmove whole rows when axis 1 is unit-stride in both arrays
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int8_t) and
+            sizeof(int8_t) * n >= 256):
+            # NOTE(review): gate uses n (rows) though k elements are copied
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int8_t) * k))
+            return
+    # Element-wise path, used when the fast path above is not taken.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int8_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int8_t fv
+    # out[:, j] = values[:, indexer[j]]; -1 entries yield fill_value.
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+    # fv is assigned only after the empty-input check above.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int8_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable wrapper around take_2d_axis1_int8_int8_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_int8_int8_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int8_t fv
+    # out[:, j] = values[:, indexer[j]]; -1 entries yield fill_value.
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+    # fv is assigned only after the empty-input check above.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[int8_t, ndim=2] out,
+                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        int8_t fv
+    # out[i, j] = values[idx0[i], idx1[j]]; a -1 in either gives fill_value.
+    n = len(idx0)
+    k = len(idx1)
+
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int8_int32_memview(int8_t[:] values,
+                                              int64_t[:] indexer,
+                                              int32_t[:] out,
+                                              fill_value=np.nan):
+    # Typed-memoryview kernel: out[i] = values[indexer[i]] for every i;
+    # an indexer entry of -1 stores fill_value (coerced to int32_t).
+    # Bounds and wraparound checks are disabled by the decorators above.
+    cdef:
+        Py_ssize_t i, n, idx
+        int32_t fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int8_int32(ndarray[int8_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              int32_t[:] out,
+                              fill_value=np.nan):
+    # Python-callable wrapper around take_1d_int8_int32_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_int8_int32_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    # The loop below repeats the kernel's logic on the ndarray buffer.
+    cdef:
+        Py_ssize_t i, n, idx
+        int32_t fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int32_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+    # out[i, :] = values[indexer[i], :]; rows with indexer -1 get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled out for this dtype pair.
+    IF False:
+        cdef:
+            int32_t *v
+            int32_t *o
+
+        # GH3130: memmove fast path (not compiled for this dtype pair)
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int32_t) and
+            sizeof(int32_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
+            return
+    # Element-wise path; the memmove branch above is compiled out.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int32_t[:, :] out,
+                                    fill_value=np.nan):
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int8_int32_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+    # out[i, :] = values[indexer[i], :]; rows with indexer -1 get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled out for this dtype pair.
+    IF False:
+        cdef:
+            int32_t *v
+            int32_t *o
+
+        # GH3130: memmove fast path (not compiled for this dtype pair)
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int32_t) and
+            sizeof(int32_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
+            return
+    # Element-wise path; the memmove branch above is compiled out.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int32_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+    # out[:, j] = values[:, indexer[j]]; -1 entries yield fill_value.
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+    # fv is assigned only after the empty-input check above.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int32_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable wrapper around take_2d_axis1_int8_int32_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_int8_int32_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+    # out[:, j] = values[:, indexer[j]]; -1 entries yield fill_value.
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+    # fv is assigned only after the empty-input check above.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[int32_t, ndim=2] out,
+                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        int32_t fv
+    # out[i, j] = values[idx0[i], idx1[j]]; a -1 in either gives fill_value.
+    n = len(idx0)
+    k = len(idx1)
+
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int8_int64_memview(int8_t[:] values,
+                                              int64_t[:] indexer,
+                                              int64_t[:] out,
+                                              fill_value=np.nan):
+    # Typed-memoryview kernel: out[i] = values[indexer[i]] for every i;
+    # an indexer entry of -1 stores fill_value (coerced to int64_t).
+    # Bounds and wraparound checks are disabled by the decorators above.
+    cdef:
+        Py_ssize_t i, n, idx
+        int64_t fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int8_int64(ndarray[int8_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              int64_t[:] out,
+                              fill_value=np.nan):
+    # Python-callable wrapper around take_1d_int8_int64_memview.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_int8_int64_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    # The loop below repeats the kernel's logic on the ndarray buffer.
+    cdef:
+        Py_ssize_t i, n, idx
+        int64_t fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int64_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int64_t fv
+    # out[i, :] = values[indexer[i], :]; rows with indexer -1 get fill_value.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+    # Cython compile-time IF: this branch is compiled out for this dtype pair.
+    IF False:
+        cdef:
+            int64_t *v
+            int64_t *o
+
+        # GH3130: memmove fast path (not compiled for this dtype pair)
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int64_t) and
+            sizeof(int64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
+            return
+    # Element-wise path; the memmove branch above is compiled out.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int64_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable row-wise 2-D take: row i of `out` is row indexer[i]
+    # of `values` (elements stored as int64), or entirely fill_value where
+    # indexer[i] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int8_int64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int64_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int64 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    # Compile-time conditional: False for this dtype pair, so the memmove
+    # fast path in this block is not compiled and only the element-wise
+    # loop after it is used.
+    IF False:
+        cdef:
+            int64_t *v
+            int64_t *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int64_t) and
+            sizeof(int64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
+            return
+
+    # Element-wise copy of each selected row.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int64_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Column-wise 2-D take on typed memoryviews:
+    # out[i, j] = values[i, indexer[j]] (stored as int64), or fill_value
+    # where indexer[j] == -1. Writes into `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int64_t fv
+
+    # n = number of rows, k = number of output columns
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write for an empty input or an empty indexer.
+    if n == 0 or k == 0:
+        return
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int64 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int64_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable column-wise 2-D take:
+    # out[i, j] = values[i, indexer[j]] (stored as int64), or fill_value
+    # where indexer[j] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper.
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_int8_int64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int64_t fv
+
+    # n = number of rows, k = number of output columns
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write for an empty input or an empty indexer.
+    if n == 0 or k == 0:
+        return
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int64 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[int64_t, ndim=2] out,
+                                    fill_value=np.nan):
+    # 2-D take along both axes: indexer[0] selects rows and indexer[1]
+    # selects columns, so out[i, j] = values[idx0[i], idx1[j]] (stored as
+    # int64). A -1 in either indexer yields fill_value for that cell.
+    # Fills `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        int64_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int64 -- confirm callers always pass fill_value.
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            # Missing row: the whole output row is the fill value.
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int8_float64_memview(int8_t[:] values,
+                                              int64_t[:] indexer,
+                                              float64_t[:] out,
+                                              fill_value=np.nan):
+    # 1-D take on typed memoryviews: out[i] = values[indexer[i]], stored as
+    # float64, or fill_value where indexer[i] == -1. Writes into `out` in
+    # place. Bounds checks and negative-index wraparound are disabled by the
+    # decorators, so indices other than -1 are used unchecked.
+
+
+
+    cdef:
+        Py_ssize_t i, n, idx
+        float64_t fv
+
+    n = indexer.shape[0]
+
+    # Convert the Python-level fill_value to the C output type before the
+    # GIL is released.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                # -1 marks a missing position
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int8_float64(ndarray[int8_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              float64_t[:] out,
+                              fill_value=np.nan):
+    # Python-callable 1-D take: out[i] = values[indexer[i]], stored as
+    # float64, or fill_value where indexer[i] == -1. Fills `out` in place.
+    # Writeable inputs are delegated to the memoryview helper; read-only
+    # inputs run the same loop on the ndarray buffer below.
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_int8_float64_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+
+    cdef:
+        Py_ssize_t i, n, idx
+        float64_t fv
+
+    n = indexer.shape[0]
+
+    # Coerced to the C output type while the GIL is still held.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                # -1 marks a missing position
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    float64_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Row-wise 2-D take on typed memoryviews: row i of `out` is row
+    # indexer[i] of `values` (elements stored as float64), or entirely
+    # fill_value where indexer[i] == -1. Writes into `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+
+    # Compile-time conditional: False for this dtype pair, so the memmove
+    # fast path in this block is not compiled and only the element-wise
+    # loop after it is used.
+    IF False:
+        cdef:
+            float64_t *v
+            float64_t *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(float64_t) and
+            sizeof(float64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
+            return
+
+    # Element-wise copy of each selected row.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    float64_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable row-wise 2-D take: row i of `out` is row indexer[i]
+    # of `values` (elements stored as float64), or entirely fill_value
+    # where indexer[i] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int8_float64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+
+    # Compile-time conditional: False for this dtype pair, so the memmove
+    # fast path in this block is not compiled and only the element-wise
+    # loop after it is used.
+    IF False:
+        cdef:
+            float64_t *v
+            float64_t *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(float64_t) and
+            sizeof(float64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
+            return
+
+    # Element-wise copy of each selected row.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    float64_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Column-wise 2-D take on typed memoryviews:
+    # out[i, j] = values[i, indexer[j]] (stored as float64), or fill_value
+    # where indexer[j] == -1. Writes into `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n = number of rows, k = number of output columns
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write for an empty input or an empty indexer.
+    if n == 0 or k == 0:
+        return
+
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    float64_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable column-wise 2-D take:
+    # out[i, j] = values[i, indexer[j]] (stored as float64), or fill_value
+    # where indexer[j] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper.
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_int8_float64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n = number of rows, k = number of output columns
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write for an empty input or an empty indexer.
+    if n == 0 or k == 0:
+        return
+
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[float64_t, ndim=2] out,
+                                    fill_value=np.nan):
+    # 2-D take along both axes: indexer[0] selects rows and indexer[1]
+    # selects columns, so out[i, j] = values[idx0[i], idx1[j]] (stored as
+    # float64). A -1 in either indexer yields fill_value for that cell.
+    # Fills `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        float64_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            # Missing row: the whole output row is the fill value.
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int16_int16_memview(int16_t[:] values,
+                                              int64_t[:] indexer,
+                                              int16_t[:] out,
+                                              fill_value=np.nan):
+    # 1-D take on typed memoryviews: out[i] = values[indexer[i]], or
+    # fill_value where indexer[i] == -1. Writes into `out` in place.
+    # Bounds checks and negative-index wraparound are disabled by the
+    # decorators, so indices other than -1 are used unchecked.
+
+
+
+    cdef:
+        Py_ssize_t i, n, idx
+        int16_t fv
+
+    n = indexer.shape[0]
+
+    # Convert the Python-level fill_value to the C output type before the
+    # GIL is released.
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int16 here, so callers likely always pass fill_value -- confirm.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                # -1 marks a missing position
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int16_int16(ndarray[int16_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              int16_t[:] out,
+                              fill_value=np.nan):
+    # Python-callable 1-D take: out[i] = values[indexer[i]], or fill_value
+    # where indexer[i] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper; read-only inputs run the same
+    # loop on the ndarray buffer below.
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_int16_int16_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+
+    cdef:
+        Py_ssize_t i, n, idx
+        int16_t fv
+
+    n = indexer.shape[0]
+
+    # Coerced to the C output type while the GIL is still held.
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int16 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                # -1 marks a missing position
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int16_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Row-wise 2-D take on typed memoryviews: row i of `out` is row
+    # indexer[i] of `values`, or entirely fill_value where
+    # indexer[i] == -1. Writes into `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int16_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int16 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    # Compile-time conditional: True for this same-dtype pair, so the
+    # memmove fast path in this block is compiled in.
+    IF True:
+        cdef:
+            int16_t *v
+            int16_t *o
+
+        #GH3130
+        # Fast path: taken when both arrays have the element size as their
+        # axis-1 stride and sizeof(int16_t) * n >= 256; each selected row
+        # is then copied with a single memmove of k elements.
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int16_t) and
+            sizeof(int16_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int16_t) * k))
+            return
+
+    # Fallback when the fast-path conditions do not hold: element-wise copy.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int16_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable row-wise 2-D take: row i of `out` is row indexer[i]
+    # of `values`, or entirely fill_value where indexer[i] == -1. Fills
+    # `out` in place. Writeable inputs are delegated to the memoryview
+    # helper.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int16_int16_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int16_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int16 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    # Compile-time conditional: True for this same-dtype pair, so the
+    # memmove fast path in this block is compiled in.
+    IF True:
+        cdef:
+            int16_t *v
+            int16_t *o
+
+        #GH3130
+        # Fast path: taken when both arrays have the element size as their
+        # axis-1 stride and sizeof(int16_t) * n >= 256; each selected row
+        # is then copied with a single memmove of k elements.
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int16_t) and
+            sizeof(int16_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int16_t) * k))
+            return
+
+    # Fallback when the fast-path conditions do not hold: element-wise copy.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int16_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Column-wise 2-D take on typed memoryviews:
+    # out[i, j] = values[i, indexer[j]], or fill_value where
+    # indexer[j] == -1. Writes into `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int16_t fv
+
+    # n = number of rows, k = number of output columns
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write for an empty input or an empty indexer.
+    if n == 0 or k == 0:
+        return
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int16 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int16_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable column-wise 2-D take:
+    # out[i, j] = values[i, indexer[j]], or fill_value where
+    # indexer[j] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper.
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_int16_int16_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int16_t fv
+
+    # n = number of rows, k = number of output columns
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write for an empty input or an empty indexer.
+    if n == 0 or k == 0:
+        return
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int16 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[int16_t, ndim=2] out,
+                                    fill_value=np.nan):
+    # 2-D take along both axes: indexer[0] selects rows and indexer[1]
+    # selects columns, so out[i, j] = values[idx0[i], idx1[j]]. A -1 in
+    # either indexer yields fill_value for that cell. Fills `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        int16_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int16 -- confirm callers always pass fill_value.
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            # Missing row: the whole output row is the fill value.
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int16_int32_memview(int16_t[:] values,
+                                              int64_t[:] indexer,
+                                              int32_t[:] out,
+                                              fill_value=np.nan):
+    # 1-D take on typed memoryviews: out[i] = values[indexer[i]], stored as
+    # int32, or fill_value where indexer[i] == -1. Writes into `out` in
+    # place. Bounds checks and negative-index wraparound are disabled by the
+    # decorators, so indices other than -1 are used unchecked.
+
+
+
+    cdef:
+        Py_ssize_t i, n, idx
+        int32_t fv
+
+    n = indexer.shape[0]
+
+    # Convert the Python-level fill_value to the C output type before the
+    # GIL is released.
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int32 here, so callers likely always pass fill_value -- confirm.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                # -1 marks a missing position
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int16_int32(ndarray[int16_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              int32_t[:] out,
+                              fill_value=np.nan):
+    # Python-callable 1-D take: out[i] = values[indexer[i]], stored as
+    # int32, or fill_value where indexer[i] == -1. Fills `out` in place.
+    # Writeable inputs are delegated to the memoryview helper; read-only
+    # inputs run the same loop on the ndarray buffer below.
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_int16_int32_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+
+    cdef:
+        Py_ssize_t i, n, idx
+        int32_t fv
+
+    n = indexer.shape[0]
+
+    # Coerced to the C output type while the GIL is still held.
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int32 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                # -1 marks a missing position
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int32_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Row-wise 2-D take on typed memoryviews: row i of `out` is row
+    # indexer[i] of `values` (elements stored as int32), or entirely
+    # fill_value where indexer[i] == -1. Writes into `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int32 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    # Compile-time conditional: False for this dtype pair, so the memmove
+    # fast path in this block is not compiled and only the element-wise
+    # loop after it is used.
+    IF False:
+        cdef:
+            int32_t *v
+            int32_t *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int32_t) and
+            sizeof(int32_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
+            return
+
+    # Element-wise copy of each selected row.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int32_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable row-wise 2-D take: row i of `out` is row indexer[i]
+    # of `values` (elements stored as int32), or entirely fill_value where
+    # indexer[i] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int16_int32_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int32 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    # Compile-time conditional: False for this dtype pair, so the memmove
+    # fast path in this block is not compiled and only the element-wise
+    # loop after it is used.
+    IF False:
+        cdef:
+            int32_t *v
+            int32_t *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int32_t) and
+            sizeof(int32_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
+            return
+
+    # Element-wise copy of each selected row.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int32_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Column-wise 2-D take on typed memoryviews:
+    # out[i, j] = values[i, indexer[j]] (stored as int32), or fill_value
+    # where indexer[j] == -1. Writes into `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+
+    # n = number of rows, k = number of output columns
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write for an empty input or an empty indexer.
+    if n == 0 or k == 0:
+        return
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int32 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int32_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable column-wise 2-D take:
+    # out[i, j] = values[i, indexer[j]] (stored as int32), or fill_value
+    # where indexer[j] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper.
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_int16_int32_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+
+    # n = number of rows, k = number of output columns
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write for an empty input or an empty indexer.
+    if n == 0 or k == 0:
+        return
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int32 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[int32_t, ndim=2] out,
+                                    fill_value=np.nan):
+    # 2-D take along both axes: indexer[0] selects rows and indexer[1]
+    # selects columns, so out[i, j] = values[idx0[i], idx1[j]] (stored as
+    # int32). A -1 in either indexer yields fill_value for that cell.
+    # Fills `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        int32_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int32 -- confirm callers always pass fill_value.
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            # Missing row: the whole output row is the fill value.
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int16_int64_memview(int16_t[:] values,
+                                              int64_t[:] indexer,
+                                              int64_t[:] out,
+                                              fill_value=np.nan):
+    # 1-D take on typed memoryviews: out[i] = values[indexer[i]], stored as
+    # int64, or fill_value where indexer[i] == -1. Writes into `out` in
+    # place. Bounds checks and negative-index wraparound are disabled by the
+    # decorators, so indices other than -1 are used unchecked.
+
+
+
+    cdef:
+        Py_ssize_t i, n, idx
+        int64_t fv
+
+    n = indexer.shape[0]
+
+    # Convert the Python-level fill_value to the C output type before the
+    # GIL is released.
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int64 here, so callers likely always pass fill_value -- confirm.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                # -1 marks a missing position
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int16_int64(ndarray[int16_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              int64_t[:] out,
+                              fill_value=np.nan):
+    # Python-callable 1-D take: out[i] = values[indexer[i]], stored as
+    # int64, or fill_value where indexer[i] == -1. Fills `out` in place.
+    # Writeable inputs are delegated to the memoryview helper; read-only
+    # inputs run the same loop on the ndarray buffer below.
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_int16_int64_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+
+    cdef:
+        Py_ssize_t i, n, idx
+        int64_t fv
+
+    n = indexer.shape[0]
+
+    # Coerced to the C output type while the GIL is still held.
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int64 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                # -1 marks a missing position
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int64_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Row-wise 2-D take on typed memoryviews: row i of `out` is row
+    # indexer[i] of `values` (elements stored as int64), or entirely
+    # fill_value where indexer[i] == -1. Writes into `out` in place.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int64_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int64 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    # Compile-time conditional: False for this dtype pair, so the memmove
+    # fast path in this block is not compiled and only the element-wise
+    # loop after it is used.
+    IF False:
+        cdef:
+            int64_t *v
+            int64_t *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int64_t) and
+            sizeof(int64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
+            return
+
+    # Element-wise copy of each selected row.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int64_t[:, :] out,
+                                    fill_value=np.nan):
+    # Python-callable row-wise 2-D take: row i of `out` is row indexer[i]
+    # of `values` (elements stored as int64), or entirely fill_value where
+    # indexer[i] == -1. Fills `out` in place. Writeable inputs are
+    # delegated to the memoryview helper.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int16_int64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int64_t fv
+
+    # n = number of output rows, k = number of columns copied per row
+    n = len(indexer)
+    k = values.shape[1]
+
+    # NOTE(review): the np.nan default presumably cannot be coerced to
+    # int64 -- confirm callers always pass fill_value.
+    fv = fill_value
+
+    # Compile-time conditional: False for this dtype pair, so the memmove
+    # fast path in this block is not compiled and only the element-wise
+    # loop after it is used.
+    IF False:
+        cdef:
+            int64_t *v
+            int64_t *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int64_t) and
+            sizeof(int64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
+            return
+
+    # Element-wise copy of each selected row.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int16_int64_memview(
+        int16_t[:, :] values,
+        int64_t[:] indexer,
+        int64_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take (typed-memoryview variant).
+
+    For every row ``r`` and output column ``c``, ``out[r, c]`` receives
+    ``values[r, indexer[c]]``, or ``fill_value`` (coerced to int64_t) when
+    ``indexer[c]`` is -1.  Nothing is written if ``values`` has no rows or
+    ``indexer`` is empty.  Bounds checking and wraparound are disabled, so
+    indices are not validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int16_int64(
+        ndarray[int16_t, ndim=2] values,
+        ndarray[int64_t] indexer,
+        int64_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take: ``out[i, j] = values[i, indexer[j]]``.
+
+    Positions where ``indexer[j]`` is -1 receive ``fill_value`` coerced to
+    int64_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface.  Nothing is written if ``values`` has no rows or
+    ``indexer`` is empty.  Indices are not bounds-checked.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int64_t fill
+
+    if values.flags.writeable:
+        # Writable input: delegate to the typed-memoryview helper.
+        take_2d_axis1_int16_int64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffers.
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int16_int64(
+        ndarray[int16_t, ndim=2] values,
+        indexer,
+        ndarray[int64_t, ndim=2] out,
+        fill_value=np.nan):
+    """
+    Two-axis take: ``out[i, j] = values[indexer[0][i], indexer[1][j]]``.
+
+    ``indexer`` supplies the row indexer at position 0 and the column
+    indexer at position 1 (both bound as int64 arrays).  Wherever either
+    index is -1, ``out[i, j]`` receives ``fill_value`` coerced to int64_t.
+    Bounds checking and wraparound are disabled, so indices are not
+    validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src_row, nrows, ncols
+        int64_t src_col
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        int64_t fill
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+
+    fill = fill_value
+    for row in range(nrows):
+        src_row = row_idx[row]
+        if src_row == -1:
+            # Missing row: the whole output row is filled.
+            for col in range(ncols):
+                out[row, col] = fill
+            continue
+        for col in range(ncols):
+            src_col = col_idx[col]
+            if src_col != -1:
+                out[row, col] = values[src_row, src_col]
+            else:
+                out[row, col] = fill
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int16_float64_memview(
+        int16_t[:] values,
+        int64_t[:] indexer,
+        float64_t[:] out,
+        fill_value=np.nan):
+    """
+    Gather ``values`` at the positions in ``indexer`` into ``out``.
+
+    ``out[i]`` receives ``values[indexer[i]]``, or ``fill_value`` (coerced
+    to float64_t) when ``indexer[i]`` is -1.  The copy loop runs with the
+    GIL released.  Bounds checking and wraparound are disabled, so indices
+    are not validated here.
+    """
+    cdef:
+        Py_ssize_t pos, src, count
+        float64_t fill
+
+    count = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(count):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int16_float64(
+        ndarray[int16_t, ndim=1] values,
+        int64_t[:] indexer,
+        float64_t[:] out,
+        fill_value=np.nan):
+    """
+    One-dimensional take: ``out[i] = values[indexer[i]]``.
+
+    Positions where ``indexer[i]`` is -1 receive ``fill_value`` coerced to
+    float64_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface with the GIL released.  Indices are not
+    bounds-checked.
+    """
+    cdef:
+        Py_ssize_t pos, src, count
+        float64_t fill
+
+    if values.flags.writeable:
+        # Writable input: hand off to the typed-memoryview helper.
+        take_1d_int16_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffer.
+    count = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(count):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    float64_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Row-wise take (typed-memoryview variant): for each i, out[i, :]
+    # receives values[indexer[i], :]; an index of -1 fills the whole output
+    # row with fill_value instead.  boundscheck/wraparound are disabled
+    # above, so indices are not validated here.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n: number of output rows; k: number of columns copied per row.
+    n = len(indexer)
+    k = values.shape[1]
+
+    # Coerce the Python-level fill value to the output C type once.
+    fv = fill_value
+
+    # Compile-time conditional: with ``IF False`` Cython excludes this
+    # memmove fast path from the compiled function.
+    # NOTE(review): presumably disabled because the input (int16) and
+    # output (float64) element types differ, so rows cannot be byte-copied
+    # -- confirm against the generating template.
+    IF False:
+        cdef:
+            float64_t *v
+            float64_t *o
+
+        # GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(float64_t) and
+            sizeof(float64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
+            return
+
+    # Element-by-element copy, one output row per indexer entry.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    float64_t[:, :] out,
+                                    fill_value=np.nan):
+    # Row-wise take: for each i, out[i, :] receives values[indexer[i], :];
+    # an index of -1 fills the whole output row with fill_value instead.
+    # boundscheck/wraparound are disabled above, so indices are not
+    # validated here.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int16_float64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n: number of output rows; k: number of columns copied per row.
+    n = len(indexer)
+    k = values.shape[1]
+
+    # Coerce the Python-level fill value to the output C type once.
+    fv = fill_value
+
+    # Compile-time conditional: with ``IF False`` Cython excludes this
+    # memmove fast path from the compiled function.
+    # NOTE(review): presumably disabled because the input (int16) and
+    # output (float64) element types differ, so rows cannot be byte-copied
+    # -- confirm against the generating template.
+    IF False:
+        cdef:
+            float64_t *v
+            float64_t *o
+
+        # GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(float64_t) and
+            sizeof(float64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
+            return
+
+    # Element-by-element copy, one output row per indexer entry.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int16_float64_memview(
+        int16_t[:, :] values,
+        int64_t[:] indexer,
+        float64_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take (typed-memoryview variant).
+
+    For every row ``r`` and output column ``c``, ``out[r, c]`` receives
+    ``values[r, indexer[c]]``, or ``fill_value`` (coerced to float64_t)
+    when ``indexer[c]`` is -1.  Nothing is written if ``values`` has no
+    rows or ``indexer`` is empty.  Bounds checking and wraparound are
+    disabled, so indices are not validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int16_float64(
+        ndarray[int16_t, ndim=2] values,
+        ndarray[int64_t] indexer,
+        float64_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take: ``out[i, j] = values[i, indexer[j]]``.
+
+    Positions where ``indexer[j]`` is -1 receive ``fill_value`` coerced to
+    float64_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface.  Nothing is written if ``values`` has no rows or
+    ``indexer`` is empty.  Indices are not bounds-checked.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    if values.flags.writeable:
+        # Writable input: delegate to the typed-memoryview helper.
+        take_2d_axis1_int16_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffers.
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int16_float64(
+        ndarray[int16_t, ndim=2] values,
+        indexer,
+        ndarray[float64_t, ndim=2] out,
+        fill_value=np.nan):
+    """
+    Two-axis take: ``out[i, j] = values[indexer[0][i], indexer[1][j]]``.
+
+    ``indexer`` supplies the row indexer at position 0 and the column
+    indexer at position 1 (both bound as int64 arrays).  Wherever either
+    index is -1, ``out[i, j]`` receives ``fill_value`` coerced to
+    float64_t.  Bounds checking and wraparound are disabled, so indices
+    are not validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src_row, nrows, ncols
+        int64_t src_col
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        float64_t fill
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+
+    fill = fill_value
+    for row in range(nrows):
+        src_row = row_idx[row]
+        if src_row == -1:
+            # Missing row: the whole output row is filled.
+            for col in range(ncols):
+                out[row, col] = fill
+            continue
+        for col in range(ncols):
+            src_col = col_idx[col]
+            if src_col != -1:
+                out[row, col] = values[src_row, src_col]
+            else:
+                out[row, col] = fill
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int32_int32_memview(
+        int32_t[:] values,
+        int64_t[:] indexer,
+        int32_t[:] out,
+        fill_value=np.nan):
+    """
+    Gather ``values`` at the positions in ``indexer`` into ``out``.
+
+    ``out[i]`` receives ``values[indexer[i]]``, or ``fill_value`` (coerced
+    to int32_t) when ``indexer[i]`` is -1.  The copy loop runs with the
+    GIL released.  Bounds checking and wraparound are disabled, so indices
+    are not validated here.
+    """
+    cdef:
+        Py_ssize_t pos, src, count
+        int32_t fill
+
+    count = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(count):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int32_int32(
+        ndarray[int32_t, ndim=1] values,
+        int64_t[:] indexer,
+        int32_t[:] out,
+        fill_value=np.nan):
+    """
+    One-dimensional take: ``out[i] = values[indexer[i]]``.
+
+    Positions where ``indexer[i]`` is -1 receive ``fill_value`` coerced to
+    int32_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface with the GIL released.  Indices are not
+    bounds-checked.
+    """
+    cdef:
+        Py_ssize_t pos, src, count
+        int32_t fill
+
+    if values.flags.writeable:
+        # Writable input: hand off to the typed-memoryview helper.
+        take_1d_int32_int32_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffer.
+    count = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(count):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int32_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Row-wise take (typed-memoryview variant): for each i, out[i, :]
+    # receives values[indexer[i], :]; an index of -1 fills the whole output
+    # row with fill_value instead.  boundscheck/wraparound are disabled
+    # above, so indices are not validated here.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+
+    # n: number of output rows; k: number of columns copied per row.
+    n = len(indexer)
+    k = values.shape[1]
+
+    # Coerce the Python-level fill value to the output C type once.
+    fv = fill_value
+
+    # Compile-time conditional: with ``IF True`` the memmove fast path
+    # below is compiled into this function.
+    IF True:
+        cdef:
+            int32_t *v
+            int32_t *o
+
+        # GH3130: when both arrays have an axis-1 stride equal to the
+        # element size (rows are contiguous) and the size threshold is met,
+        # each selected row is copied with a single memmove.
+        # NOTE(review): the threshold multiplies by n (rows in indexer),
+        # not by the row width k -- confirm this is intended.
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int32_t) and
+            sizeof(int32_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    # Raw pointers to the start of the source and
+                    # destination rows; k elements are copied.
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
+            return
+
+    # Fallback: element-by-element copy, one output row per indexer entry.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int32_t[:, :] out,
+                                    fill_value=np.nan):
+    # Row-wise take: for each i, out[i, :] receives values[indexer[i], :];
+    # an index of -1 fills the whole output row with fill_value instead.
+    # boundscheck/wraparound are disabled above, so indices are not
+    # validated here.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int32_int32_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int32_t fv
+
+    # n: number of output rows; k: number of columns copied per row.
+    n = len(indexer)
+    k = values.shape[1]
+
+    # Coerce the Python-level fill value to the output C type once.
+    fv = fill_value
+
+    # Compile-time conditional: with ``IF True`` the memmove fast path
+    # below is compiled into this function.
+    IF True:
+        cdef:
+            int32_t *v
+            int32_t *o
+
+        # GH3130: when both arrays have an axis-1 stride equal to the
+        # element size (rows are contiguous) and the size threshold is met,
+        # each selected row is copied with a single memmove.
+        # NOTE(review): the threshold multiplies by n (rows in indexer),
+        # not by the row width k -- confirm this is intended.
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int32_t) and
+            sizeof(int32_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    # Raw pointers to the start of the source and
+                    # destination rows; k elements are copied.
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
+            return
+
+    # Fallback: element-by-element copy, one output row per indexer entry.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int32_int32_memview(
+        int32_t[:, :] values,
+        int64_t[:] indexer,
+        int32_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take (typed-memoryview variant).
+
+    For every row ``r`` and output column ``c``, ``out[r, c]`` receives
+    ``values[r, indexer[c]]``, or ``fill_value`` (coerced to int32_t) when
+    ``indexer[c]`` is -1.  Nothing is written if ``values`` has no rows or
+    ``indexer`` is empty.  Bounds checking and wraparound are disabled, so
+    indices are not validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int32_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int32_int32(
+        ndarray[int32_t, ndim=2] values,
+        ndarray[int64_t] indexer,
+        int32_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take: ``out[i, j] = values[i, indexer[j]]``.
+
+    Positions where ``indexer[j]`` is -1 receive ``fill_value`` coerced to
+    int32_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface.  Nothing is written if ``values`` has no rows or
+    ``indexer`` is empty.  Indices are not bounds-checked.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int32_t fill
+
+    if values.flags.writeable:
+        # Writable input: delegate to the typed-memoryview helper.
+        take_2d_axis1_int32_int32_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffers.
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int32_int32(
+        ndarray[int32_t, ndim=2] values,
+        indexer,
+        ndarray[int32_t, ndim=2] out,
+        fill_value=np.nan):
+    """
+    Two-axis take: ``out[i, j] = values[indexer[0][i], indexer[1][j]]``.
+
+    ``indexer`` supplies the row indexer at position 0 and the column
+    indexer at position 1 (both bound as int64 arrays).  Wherever either
+    index is -1, ``out[i, j]`` receives ``fill_value`` coerced to int32_t.
+    Bounds checking and wraparound are disabled, so indices are not
+    validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src_row, nrows, ncols
+        int64_t src_col
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        int32_t fill
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+
+    fill = fill_value
+    for row in range(nrows):
+        src_row = row_idx[row]
+        if src_row == -1:
+            # Missing row: the whole output row is filled.
+            for col in range(ncols):
+                out[row, col] = fill
+            continue
+        for col in range(ncols):
+            src_col = col_idx[col]
+            if src_col != -1:
+                out[row, col] = values[src_row, src_col]
+            else:
+                out[row, col] = fill
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int32_int64_memview(
+        int32_t[:] values,
+        int64_t[:] indexer,
+        int64_t[:] out,
+        fill_value=np.nan):
+    """
+    Gather ``values`` at the positions in ``indexer`` into ``out``.
+
+    ``out[i]`` receives ``values[indexer[i]]``, or ``fill_value`` (coerced
+    to int64_t) when ``indexer[i]`` is -1.  The copy loop runs with the
+    GIL released.  Bounds checking and wraparound are disabled, so indices
+    are not validated here.
+    """
+    cdef:
+        Py_ssize_t pos, src, count
+        int64_t fill
+
+    count = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(count):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int32_int64(
+        ndarray[int32_t, ndim=1] values,
+        int64_t[:] indexer,
+        int64_t[:] out,
+        fill_value=np.nan):
+    """
+    One-dimensional take: ``out[i] = values[indexer[i]]``.
+
+    Positions where ``indexer[i]`` is -1 receive ``fill_value`` coerced to
+    int64_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface with the GIL released.  Indices are not
+    bounds-checked.
+    """
+    cdef:
+        Py_ssize_t pos, src, count
+        int64_t fill
+
+    if values.flags.writeable:
+        # Writable input: hand off to the typed-memoryview helper.
+        take_1d_int32_int64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffer.
+    count = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(count):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    int64_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Row-wise take (typed-memoryview variant): for each i, out[i, :]
+    # receives values[indexer[i], :]; an index of -1 fills the whole output
+    # row with fill_value instead.  boundscheck/wraparound are disabled
+    # above, so indices are not validated here.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int64_t fv
+
+    # n: number of output rows; k: number of columns copied per row.
+    n = len(indexer)
+    k = values.shape[1]
+
+    # Coerce the Python-level fill value to the output C type once.
+    fv = fill_value
+
+    # Compile-time conditional: with ``IF False`` Cython excludes this
+    # memmove fast path from the compiled function.
+    # NOTE(review): presumably disabled because the input (int32) and
+    # output (int64) element types differ, so rows cannot be byte-copied
+    # -- confirm against the generating template.
+    IF False:
+        cdef:
+            int64_t *v
+            int64_t *o
+
+        # GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int64_t) and
+            sizeof(int64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
+            return
+
+    # Element-by-element copy, one output row per indexer entry.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    int64_t[:, :] out,
+                                    fill_value=np.nan):
+    # Row-wise take: for each i, out[i, :] receives values[indexer[i], :];
+    # an index of -1 fills the whole output row with fill_value instead.
+    # boundscheck/wraparound are disabled above, so indices are not
+    # validated here.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int32_int64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        int64_t fv
+
+    # n: number of output rows; k: number of columns copied per row.
+    n = len(indexer)
+    k = values.shape[1]
+
+    # Coerce the Python-level fill value to the output C type once.
+    fv = fill_value
+
+    # Compile-time conditional: with ``IF False`` Cython excludes this
+    # memmove fast path from the compiled function.
+    # NOTE(review): presumably disabled because the input (int32) and
+    # output (int64) element types differ, so rows cannot be byte-copied
+    # -- confirm against the generating template.
+    IF False:
+        cdef:
+            int64_t *v
+            int64_t *o
+
+        # GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(int64_t) and
+            sizeof(int64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
+            return
+
+    # Element-by-element copy, one output row per indexer entry.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int32_int64_memview(
+        int32_t[:, :] values,
+        int64_t[:] indexer,
+        int64_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take (typed-memoryview variant).
+
+    For every row ``r`` and output column ``c``, ``out[r, c]`` receives
+    ``values[r, indexer[c]]``, or ``fill_value`` (coerced to int64_t) when
+    ``indexer[c]`` is -1.  Nothing is written if ``values`` has no rows or
+    ``indexer`` is empty.  Bounds checking and wraparound are disabled, so
+    indices are not validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int32_int64(
+        ndarray[int32_t, ndim=2] values,
+        ndarray[int64_t] indexer,
+        int64_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take: ``out[i, j] = values[i, indexer[j]]``.
+
+    Positions where ``indexer[j]`` is -1 receive ``fill_value`` coerced to
+    int64_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface.  Nothing is written if ``values`` has no rows or
+    ``indexer`` is empty.  Indices are not bounds-checked.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int64_t fill
+
+    if values.flags.writeable:
+        # Writable input: delegate to the typed-memoryview helper.
+        take_2d_axis1_int32_int64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffers.
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_int32_int64(
+        ndarray[int32_t, ndim=2] values,
+        indexer,
+        ndarray[int64_t, ndim=2] out,
+        fill_value=np.nan):
+    """
+    Two-axis take: ``out[i, j] = values[indexer[0][i], indexer[1][j]]``.
+
+    ``indexer`` supplies the row indexer at position 0 and the column
+    indexer at position 1 (both bound as int64 arrays).  Wherever either
+    index is -1, ``out[i, j]`` receives ``fill_value`` coerced to int64_t.
+    Bounds checking and wraparound are disabled, so indices are not
+    validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src_row, nrows, ncols
+        int64_t src_col
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        int64_t fill
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+
+    fill = fill_value
+    for row in range(nrows):
+        src_row = row_idx[row]
+        if src_row == -1:
+            # Missing row: the whole output row is filled.
+            for col in range(ncols):
+                out[row, col] = fill
+            continue
+        for col in range(ncols):
+            src_col = col_idx[col]
+            if src_col != -1:
+                out[row, col] = values[src_row, src_col]
+            else:
+                out[row, col] = fill
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_int32_float64_memview(
+        int32_t[:] values,
+        int64_t[:] indexer,
+        float64_t[:] out,
+        fill_value=np.nan):
+    """
+    Gather ``values`` at the positions in ``indexer`` into ``out``.
+
+    ``out[i]`` receives ``values[indexer[i]]``, or ``fill_value`` (coerced
+    to float64_t) when ``indexer[i]`` is -1.  The copy loop runs with the
+    GIL released.  Bounds checking and wraparound are disabled, so indices
+    are not validated here.
+    """
+    cdef:
+        Py_ssize_t pos, src, count
+        float64_t fill
+
+    count = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(count):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_int32_float64(
+        ndarray[int32_t, ndim=1] values,
+        int64_t[:] indexer,
+        float64_t[:] out,
+        fill_value=np.nan):
+    """
+    One-dimensional take: ``out[i] = values[indexer[i]]``.
+
+    Positions where ``indexer[i]`` is -1 receive ``fill_value`` coerced to
+    float64_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface with the GIL released.  Indices are not
+    bounds-checked.
+    """
+    cdef:
+        Py_ssize_t pos, src, count
+        float64_t fill
+
+    if values.flags.writeable:
+        # Writable input: hand off to the typed-memoryview helper.
+        take_1d_int32_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffer.
+    count = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(count):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    float64_t[:, :] out,
+                                                    fill_value=np.nan):
+    # Row-wise take (typed-memoryview variant): for each i, out[i, :]
+    # receives values[indexer[i], :]; an index of -1 fills the whole output
+    # row with fill_value instead.  boundscheck/wraparound are disabled
+    # above, so indices are not validated here.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n: number of output rows; k: number of columns copied per row.
+    n = len(indexer)
+    k = values.shape[1]
+
+    # Coerce the Python-level fill value to the output C type once.
+    fv = fill_value
+
+    # Compile-time conditional: with ``IF False`` Cython excludes this
+    # memmove fast path from the compiled function.
+    # NOTE(review): presumably disabled because the input (int32) and
+    # output (float64) element types differ, so rows cannot be byte-copied
+    # -- confirm against the generating template.
+    IF False:
+        cdef:
+            float64_t *v
+            float64_t *o
+
+        # GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(float64_t) and
+            sizeof(float64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
+            return
+
+    # Element-by-element copy, one output row per indexer entry.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    float64_t[:, :] out,
+                                    fill_value=np.nan):
+    # Row-wise take: for each i, out[i, :] receives values[indexer[i], :];
+    # an index of -1 fills the whole output row with fill_value instead.
+    # boundscheck/wraparound are disabled above, so indices are not
+    # validated here.
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_int32_float64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n: number of output rows; k: number of columns copied per row.
+    n = len(indexer)
+    k = values.shape[1]
+
+    # Coerce the Python-level fill value to the output C type once.
+    fv = fill_value
+
+    # Compile-time conditional: with ``IF False`` Cython excludes this
+    # memmove fast path from the compiled function.
+    # NOTE(review): presumably disabled because the input (int32) and
+    # output (float64) element types differ, so rows cannot be byte-copied
+    # -- confirm against the generating template.
+    IF False:
+        cdef:
+            float64_t *v
+            float64_t *o
+
+        # GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(float64_t) and
+            sizeof(float64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
+            return
+
+    # Element-by-element copy, one output row per indexer entry.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_int32_float64_memview(
+        int32_t[:, :] values,
+        int64_t[:] indexer,
+        float64_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take (typed-memoryview variant).
+
+    For every row ``r`` and output column ``c``, ``out[r, c]`` receives
+    ``values[r, indexer[c]]``, or ``fill_value`` (coerced to float64_t)
+    when ``indexer[c]`` is -1.  Nothing is written if ``values`` has no
+    rows or ``indexer`` is empty.  Bounds checking and wraparound are
+    disabled, so indices are not validated here.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_int32_float64(
+        ndarray[int32_t, ndim=2] values,
+        ndarray[int64_t] indexer,
+        float64_t[:, :] out,
+        fill_value=np.nan):
+    """
+    Column-wise take: ``out[i, j] = values[i, indexer[j]]``.
+
+    Positions where ``indexer[j]`` is -1 receive ``fill_value`` coerced to
+    float64_t.  Writable ``values`` are forwarded to the typed-memoryview
+    helper; read-only ``values`` are processed here through the ndarray
+    buffer interface.  Nothing is written if ``values`` has no rows or
+    ``indexer`` is empty.  Indices are not bounds-checked.
+    """
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    if values.flags.writeable:
+        # Writable input: delegate to the typed-memoryview helper.
+        take_2d_axis1_int32_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same loop runs here on the slightly slower ndarray buffers.
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values,
+                                indexer,
+                                ndarray[float64_t, ndim=2] out,
+                                fill_value=np.nan):
+    """Gather ``values[indexer[0][i], indexer[1][j]]`` into ``out[i, j]``."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        float64_t fill  # stored wherever a row/column index is -1
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+    fill = fill_value
+    for row in range(nrows):
+        src = row_idx[row]
+        if src != -1:
+            for col in range(ncols):
+                if col_idx[col] != -1:
+                    out[row, col] = values[src, col_idx[col]]
+                else:
+                    out[row, col] = fill
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_1d_int64_int64_memview(int64_t[:] values,
+                                        int64_t[:] indexer,
+                                        int64_t[:] out,
+                                        fill_value=np.nan):
+    """
+    Typed-memoryview kernel: ``out[i] = values[indexer[i]]`` for each
+    position of ``indexer``; an index of -1 stores ``fill_value`` instead.
+    """
+    cdef:
+        Py_ssize_t pos, src, length
+        int64_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_1d_int64_int64(ndarray[int64_t, ndim=1] values,
+                        int64_t[:] indexer,
+                        int64_t[:] out,
+                        fill_value=np.nan):
+    """
+    Fill ``out[i]`` with ``values[indexer[i]]`` in place, writing
+    ``fill_value`` wherever ``indexer[i]`` is -1.
+    """
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_1d_int64_int64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is done here through the ndarray buffer interface.
+    cdef:
+        Py_ssize_t pos, src, length
+        int64_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values,
+                                              int64_t[:] indexer,
+                                              int64_t[:, :] out,
+                                              fill_value=np.nan):
+    """``out[i, :] = values[indexer[i], :]``; a -1 index gives fill_value."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int64_t fill
+
+    nrows = len(indexer)
+    ncols = values.shape[1]
+    fill = fill_value
+
+    IF True:
+        cdef:
+            int64_t *vrow
+            int64_t *orow
+
+        # GH3130: with itemsize column strides in both buffers and a large
+        # enough indexer, each selected row is copied with one memmove
+        if (sizeof(int64_t) * nrows >= 256 and
+                values.strides[1] == sizeof(int64_t) and
+                out.strides[1] == values.strides[1]):
+            for row in range(nrows):
+                src = indexer[row]
+                if src != -1:
+                    vrow = &values[src, 0]
+                    orow = &out[row, 0]
+                    memmove(orow, vrow, <size_t>(sizeof(int64_t) * ncols))
+                else:
+                    for col in range(ncols):
+                        out[row, col] = fill
+            return
+
+    for row in range(nrows):
+        src = indexer[row]
+        if src != -1:
+            for col in range(ncols):
+                out[row, col] = values[src, col]
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values,
+                              ndarray[int64_t] indexer,
+                              int64_t[:, :] out,
+                              fill_value=np.nan):
+    """``out[i, :] = values[indexer[i], :]``; a -1 index gives fill_value."""
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_2d_axis0_int64_int64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is repeated below through the (slightly slower)
+    # ndarray buffer interface.
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int64_t fill
+
+    nrows = len(indexer)
+    ncols = values.shape[1]
+    fill = fill_value
+
+    IF True:
+        cdef:
+            int64_t *vrow
+            int64_t *orow
+
+        # GH3130: with itemsize column strides in both buffers and a large
+        # enough indexer, each selected row is copied with one memmove
+        if (sizeof(int64_t) * nrows >= 256 and
+                values.strides[1] == sizeof(int64_t) and
+                out.strides[1] == values.strides[1]):
+            for row in range(nrows):
+                src = indexer[row]
+                if src != -1:
+                    vrow = &values[src, 0]
+                    orow = &out[row, 0]
+                    memmove(orow, vrow, <size_t>(sizeof(int64_t) * ncols))
+                else:
+                    for col in range(ncols):
+                        out[row, col] = fill
+            return
+
+    for row in range(nrows):
+        src = indexer[row]
+        if src != -1:
+            for col in range(ncols):
+                out[row, col] = values[src, col]
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values,
+                                              int64_t[:] indexer,
+                                              int64_t[:, :] out,
+                                              fill_value=np.nan):
+    """``out[:, j] = values[:, indexer[j]]``; a -1 index gives fill_value."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        # empty input: return before fill_value is converted
+        return
+
+    fill = fill_value
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values,
+                              ndarray[int64_t] indexer,
+                              int64_t[:, :] out,
+                              fill_value=np.nan):
+    """``out[:, j] = values[:, indexer[j]]``; a -1 index gives fill_value."""
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_2d_axis1_int64_int64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is repeated below through the (slightly slower)
+    # ndarray buffer interface.
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        int64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        # empty input: return before fill_value is converted
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values,
+                              indexer,
+                              ndarray[int64_t, ndim=2] out,
+                              fill_value=np.nan):
+    """Gather ``values[indexer[0][i], indexer[1][j]]`` into ``out[i, j]``."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        int64_t fill  # stored wherever a row/column index is -1
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+    fill = fill_value
+    for row in range(nrows):
+        src = row_idx[row]
+        if src != -1:
+            for col in range(ncols):
+                if col_idx[col] != -1:
+                    out[row, col] = values[src, col_idx[col]]
+                else:
+                    out[row, col] = fill
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_1d_int64_float64_memview(int64_t[:] values,
+                                          int64_t[:] indexer,
+                                          float64_t[:] out,
+                                          fill_value=np.nan):
+    """
+    Typed-memoryview kernel: ``out[i] = values[indexer[i]]`` for each
+    position of ``indexer``; an index of -1 stores ``fill_value`` instead.
+    """
+    cdef:
+        Py_ssize_t pos, src, length
+        float64_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_1d_int64_float64(ndarray[int64_t, ndim=1] values,
+                          int64_t[:] indexer,
+                          float64_t[:] out,
+                          fill_value=np.nan):
+    """
+    Fill ``out[i]`` with ``values[indexer[i]]`` in place, writing
+    ``fill_value`` wherever ``indexer[i]`` is -1.
+    """
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_1d_int64_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is done here through the ndarray buffer interface.
+    cdef:
+        Py_ssize_t pos, src, length
+        float64_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values,
+                                                int64_t[:] indexer,
+                                                float64_t[:, :] out,
+                                                fill_value=np.nan):
+    """``out[i, :] = values[indexer[i], :]``; a -1 index gives fill_value."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(indexer)
+    ncols = values.shape[1]
+    fill = fill_value
+
+    IF False:
+        cdef:
+            float64_t *vrow
+            float64_t *orow
+
+        # GH3130: memmove fast path for itemsize-strided columns; it is
+        # compiled out (IF False) for this dtype combination
+        if (sizeof(float64_t) * nrows >= 256 and
+                values.strides[1] == sizeof(float64_t) and
+                out.strides[1] == values.strides[1]):
+            for row in range(nrows):
+                src = indexer[row]
+                if src != -1:
+                    vrow = &values[src, 0]
+                    orow = &out[row, 0]
+                    memmove(orow, vrow, <size_t>(sizeof(float64_t) * ncols))
+                else:
+                    for col in range(ncols):
+                        out[row, col] = fill
+            return
+
+    for row in range(nrows):
+        src = indexer[row]
+        if src != -1:
+            for col in range(ncols):
+                out[row, col] = values[src, col]
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values,
+                                ndarray[int64_t] indexer,
+                                float64_t[:, :] out,
+                                fill_value=np.nan):
+    """``out[i, :] = values[indexer[i], :]``; a -1 index gives fill_value."""
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_2d_axis0_int64_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is repeated below through the (slightly slower)
+    # ndarray buffer interface.
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(indexer)
+    ncols = values.shape[1]
+    fill = fill_value
+
+    IF False:
+        cdef:
+            float64_t *vrow
+            float64_t *orow
+
+        # GH3130: memmove fast path for itemsize-strided columns; it is
+        # compiled out (IF False) for this dtype combination
+        if (sizeof(float64_t) * nrows >= 256 and
+                values.strides[1] == sizeof(float64_t) and
+                out.strides[1] == values.strides[1]):
+            for row in range(nrows):
+                src = indexer[row]
+                if src != -1:
+                    vrow = &values[src, 0]
+                    orow = &out[row, 0]
+                    memmove(orow, vrow, <size_t>(sizeof(float64_t) * ncols))
+                else:
+                    for col in range(ncols):
+                        out[row, col] = fill
+            return
+
+    for row in range(nrows):
+        src = indexer[row]
+        if src != -1:
+            for col in range(ncols):
+                out[row, col] = values[src, col]
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values,
+                                                int64_t[:] indexer,
+                                                float64_t[:, :] out,
+                                                fill_value=np.nan):
+    """``out[:, j] = values[:, indexer[j]]``; a -1 index gives fill_value."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        # empty input: return before fill_value is converted
+        return
+
+    fill = fill_value
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values,
+                                ndarray[int64_t] indexer,
+                                float64_t[:, :] out,
+                                fill_value=np.nan):
+    """``out[:, j] = values[:, indexer[j]]``; a -1 index gives fill_value."""
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_2d_axis1_int64_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is repeated below through the (slightly slower)
+    # ndarray buffer interface.
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        # empty input: return before fill_value is converted
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values,
+                                indexer,
+                                ndarray[float64_t, ndim=2] out,
+                                fill_value=np.nan):
+    """Gather ``values[indexer[0][i], indexer[1][j]]`` into ``out[i, j]``."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        float64_t fill  # stored wherever a row/column index is -1
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+    fill = fill_value
+    for row in range(nrows):
+        src = row_idx[row]
+        if src != -1:
+            for col in range(ncols):
+                if col_idx[col] != -1:
+                    out[row, col] = values[src, col_idx[col]]
+                else:
+                    out[row, col] = fill
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_1d_float32_float32_memview(float32_t[:] values,
+                                            int64_t[:] indexer,
+                                            float32_t[:] out,
+                                            fill_value=np.nan):
+    """
+    Typed-memoryview kernel: ``out[i] = values[indexer[i]]`` for each
+    position of ``indexer``; an index of -1 stores ``fill_value`` instead.
+    """
+    cdef:
+        Py_ssize_t pos, src, length
+        float32_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_1d_float32_float32(ndarray[float32_t, ndim=1] values,
+                            int64_t[:] indexer,
+                            float32_t[:] out,
+                            fill_value=np.nan):
+    """
+    Fill ``out[i]`` with ``values[indexer[i]]`` in place, writing
+    ``fill_value`` wherever ``indexer[i]`` is -1.
+    """
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_1d_float32_float32_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is done here through the ndarray buffer interface.
+    cdef:
+        Py_ssize_t pos, src, length
+        float32_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values,
+                                                  int64_t[:] indexer,
+                                                  float32_t[:, :] out,
+                                                  fill_value=np.nan):
+    """``out[i, :] = values[indexer[i], :]``; a -1 index gives fill_value."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float32_t fill
+
+    nrows = len(indexer)
+    ncols = values.shape[1]
+    fill = fill_value
+
+    IF True:
+        cdef:
+            float32_t *vrow
+            float32_t *orow
+
+        # GH3130: with itemsize column strides in both buffers and a large
+        # enough indexer, each selected row is copied with one memmove
+        if (sizeof(float32_t) * nrows >= 256 and
+                values.strides[1] == sizeof(float32_t) and
+                out.strides[1] == values.strides[1]):
+            for row in range(nrows):
+                src = indexer[row]
+                if src != -1:
+                    vrow = &values[src, 0]
+                    orow = &out[row, 0]
+                    memmove(orow, vrow, <size_t>(sizeof(float32_t) * ncols))
+                else:
+                    for col in range(ncols):
+                        out[row, col] = fill
+            return
+
+    for row in range(nrows):
+        src = indexer[row]
+        if src != -1:
+            for col in range(ncols):
+                out[row, col] = values[src, col]
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values,
+                                  ndarray[int64_t] indexer,
+                                  float32_t[:, :] out,
+                                  fill_value=np.nan):
+    """``out[i, :] = values[indexer[i], :]``; a -1 index gives fill_value."""
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_2d_axis0_float32_float32_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is repeated below through the (slightly slower)
+    # ndarray buffer interface.
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float32_t fill
+
+    nrows = len(indexer)
+    ncols = values.shape[1]
+    fill = fill_value
+
+    IF True:
+        cdef:
+            float32_t *vrow
+            float32_t *orow
+
+        # GH3130: with itemsize column strides in both buffers and a large
+        # enough indexer, each selected row is copied with one memmove
+        if (sizeof(float32_t) * nrows >= 256 and
+                values.strides[1] == sizeof(float32_t) and
+                out.strides[1] == values.strides[1]):
+            for row in range(nrows):
+                src = indexer[row]
+                if src != -1:
+                    vrow = &values[src, 0]
+                    orow = &out[row, 0]
+                    memmove(orow, vrow, <size_t>(sizeof(float32_t) * ncols))
+                else:
+                    for col in range(ncols):
+                        out[row, col] = fill
+            return
+
+    for row in range(nrows):
+        src = indexer[row]
+        if src != -1:
+            for col in range(ncols):
+                out[row, col] = values[src, col]
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values,
+                                                  int64_t[:] indexer,
+                                                  float32_t[:, :] out,
+                                                  fill_value=np.nan):
+    """``out[:, j] = values[:, indexer[j]]``; a -1 index gives fill_value."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float32_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        # empty input: return before fill_value is converted
+        return
+
+    fill = fill_value
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values,
+                                  ndarray[int64_t] indexer,
+                                  float32_t[:, :] out,
+                                  fill_value=np.nan):
+    """``out[:, j] = values[:, indexer[j]]``; a -1 index gives fill_value."""
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_2d_axis1_float32_float32_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is repeated below through the (slightly slower)
+    # ndarray buffer interface.
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float32_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        # empty input: return before fill_value is converted
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values,
+                                  indexer,
+                                  ndarray[float32_t, ndim=2] out,
+                                  fill_value=np.nan):
+    """Gather ``values[indexer[0][i], indexer[1][j]]`` into ``out[i, j]``."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        float32_t fill  # stored wherever a row/column index is -1
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+    fill = fill_value
+    for row in range(nrows):
+        src = row_idx[row]
+        if src != -1:
+            for col in range(ncols):
+                if col_idx[col] != -1:
+                    out[row, col] = values[src, col_idx[col]]
+                else:
+                    out[row, col] = fill
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_1d_float32_float64_memview(float32_t[:] values,
+                                            int64_t[:] indexer,
+                                            float64_t[:] out,
+                                            fill_value=np.nan):
+    """
+    Typed-memoryview kernel: ``out[i] = values[indexer[i]]`` for each
+    position of ``indexer``; an index of -1 stores ``fill_value`` instead.
+    """
+    cdef:
+        Py_ssize_t pos, src, length
+        float64_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_1d_float32_float64(ndarray[float32_t, ndim=1] values,
+                            int64_t[:] indexer,
+                            float64_t[:] out,
+                            fill_value=np.nan):
+    """
+    Fill ``out[i]`` with ``values[indexer[i]]`` in place, writing
+    ``fill_value`` wherever ``indexer[i]`` is -1.
+    """
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_1d_float32_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is done here through the ndarray buffer interface.
+    cdef:
+        Py_ssize_t pos, src, length
+        float64_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values,
+                                                  int64_t[:] indexer,
+                                                  float64_t[:, :] out,
+                                                  fill_value=np.nan):
+    """``out[i, :] = values[indexer[i], :]``; a -1 index gives fill_value."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(indexer)
+    ncols = values.shape[1]
+    fill = fill_value
+
+    IF False:
+        cdef:
+            float64_t *vrow
+            float64_t *orow
+
+        # GH3130: memmove fast path for itemsize-strided columns; it is
+        # compiled out (IF False) for this dtype combination
+        if (sizeof(float64_t) * nrows >= 256 and
+                values.strides[1] == sizeof(float64_t) and
+                out.strides[1] == values.strides[1]):
+            for row in range(nrows):
+                src = indexer[row]
+                if src != -1:
+                    vrow = &values[src, 0]
+                    orow = &out[row, 0]
+                    memmove(orow, vrow, <size_t>(sizeof(float64_t) * ncols))
+                else:
+                    for col in range(ncols):
+                        out[row, col] = fill
+            return
+
+    for row in range(nrows):
+        src = indexer[row]
+        if src != -1:
+            for col in range(ncols):
+                out[row, col] = values[src, col]
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values,
+                                  ndarray[int64_t] indexer,
+                                  float64_t[:, :] out,
+                                  fill_value=np.nan):
+    """``out[i, :] = values[indexer[i], :]``; a -1 index gives fill_value."""
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_2d_axis0_float32_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is repeated below through the (slightly slower)
+    # ndarray buffer interface.
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(indexer)
+    ncols = values.shape[1]
+    fill = fill_value
+
+    IF False:
+        cdef:
+            float64_t *vrow
+            float64_t *orow
+
+        # GH3130: memmove fast path for itemsize-strided columns; it is
+        # compiled out (IF False) for this dtype combination
+        if (sizeof(float64_t) * nrows >= 256 and
+                values.strides[1] == sizeof(float64_t) and
+                out.strides[1] == values.strides[1]):
+            for row in range(nrows):
+                src = indexer[row]
+                if src != -1:
+                    vrow = &values[src, 0]
+                    orow = &out[row, 0]
+                    memmove(orow, vrow, <size_t>(sizeof(float64_t) * ncols))
+                else:
+                    for col in range(ncols):
+                        out[row, col] = fill
+            return
+
+    for row in range(nrows):
+        src = indexer[row]
+        if src != -1:
+            for col in range(ncols):
+                out[row, col] = values[src, col]
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values,
+                                                  int64_t[:] indexer,
+                                                  float64_t[:, :] out,
+                                                  fill_value=np.nan):
+    """``out[:, j] = values[:, indexer[j]]``; a -1 index gives fill_value."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        # empty input: return before fill_value is converted
+        return
+
+    fill = fill_value
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values,
+                                  ndarray[int64_t] indexer,
+                                  float64_t[:, :] out,
+                                  fill_value=np.nan):
+    """``out[:, j] = values[:, indexer[j]]``; a -1 index gives fill_value."""
+    if values.flags.writeable:
+        # writeable input: delegate to the typed-memoryview kernel
+        take_2d_axis1_float32_float64_memview(
+            values, indexer, out, fill_value=fill_value)
+        return
+
+    # Read-only buffers cannot be bound to Cython typed memoryviews, so
+    # the same gather is repeated below through the (slightly slower)
+    # ndarray buffer interface.
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        float64_t fill
+
+    nrows = len(values)
+    ncols = len(indexer)
+    if nrows == 0 or ncols == 0:
+        # empty input: return before fill_value is converted
+        return
+
+    fill = fill_value
+
+    for row in range(nrows):
+        for col in range(ncols):
+            src = indexer[col]
+            if src != -1:
+                out[row, col] = values[row, src]
+            else:
+                out[row, col] = fill
+
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values,
+                                  indexer,
+                                  ndarray[float64_t, ndim=2] out,
+                                  fill_value=np.nan):
+    """Gather ``values[indexer[0][i], indexer[1][j]]`` into ``out[i, j]``."""
+    cdef:
+        Py_ssize_t row, col, src, nrows, ncols
+        ndarray[int64_t] row_idx = indexer[0]
+        ndarray[int64_t] col_idx = indexer[1]
+        float64_t fill  # stored wherever a row/column index is -1
+
+    nrows = len(row_idx)
+    ncols = len(col_idx)
+    fill = fill_value
+    for row in range(nrows):
+        src = row_idx[row]
+        if src != -1:
+            for col in range(ncols):
+                if col_idx[col] != -1:
+                    out[row, col] = values[src, col_idx[col]]
+                else:
+                    out[row, col] = fill
+        else:
+            for col in range(ncols):
+                out[row, col] = fill
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+cdef inline take_1d_float64_float64_memview(float64_t[:] values,
+                                            int64_t[:] indexer,
+                                            float64_t[:] out,
+                                            fill_value=np.nan):
+    """
+    Typed-memoryview kernel: ``out[i] = values[indexer[i]]`` for each
+    position of ``indexer``; an index of -1 stores ``fill_value`` instead.
+    """
+    cdef:
+        Py_ssize_t pos, src, length
+        float64_t fill
+
+    length = indexer.shape[0]
+    fill = fill_value
+
+    with nogil:
+        for pos in range(length):
+            src = indexer[pos]
+            if src != -1:
+                out[pos] = values[src]
+            else:
+                out[pos] = fill
+
+
+
+# Python-visible 1-D take (float64 -> float64):
+#     out[i] = values[indexer[i]], or fill_value where indexer[i] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same loop runs below with `values` accessed through the ndarray buffer
+# type (`indexer` and `out` are typed memoryviews in both paths).
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_float64_float64(ndarray[float64_t, ndim=1] values,
+                              int64_t[:] indexer,
+                              float64_t[:] out,
+                              fill_value=np.nan):
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_float64_float64_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+
+    cdef:
+        Py_ssize_t i, n, idx
+        float64_t fv
+
+    n = indexer.shape[0]
+
+    # Convert to a C double before entering the nogil section.
+    fv = fill_value
+
+    with nogil:
+        for i from 0 <= i < n:
+            idx = indexer[i]
+            if idx == -1:
+                out[i] = fv
+            else:
+                out[i] = values[idx]
+
+
+
+# Row take over typed memoryviews (float64 -> float64):
+#     out[i, :] = values[indexer[i], :], or fill_value where
+#     indexer[i] == -1.
+# n is the number of rows to produce (len(indexer)); k is the row length
+# (values.shape[1]).  Bounds checking and wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    float64_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+
+    # Compile-time switch: the raw-pointer fast path is compiled in for
+    # this dtype pair.
+    IF True:
+        cdef:
+            float64_t *v
+            float64_t *o
+
+        #GH3130
+        # Fast path: both arrays have an axis-1 stride equal to the item
+        # size, so each row is one contiguous run that memmove can copy.
+        # NOTE(review): the size threshold is computed from n (rows taken),
+        # not k (row length) -- confirm this is intended.
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(float64_t) and
+            sizeof(float64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
+            return
+
+    # General path: element-by-element copy.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+# Python-visible row take (float64 -> float64):
+#     out[i, :] = values[indexer[i], :], or fill_value where
+#     indexer[i] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same logic runs below with `values` accessed as an ndarray buffer.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    float64_t[:, :] out,
+                                    fill_value=np.nan):
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_float64_float64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n: rows to produce; k: row length.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+
+    # Compile-time switch: the raw-pointer fast path is compiled in for
+    # this dtype pair.
+    IF True:
+        cdef:
+            float64_t *v
+            float64_t *o
+
+        #GH3130
+        # Fast path: both arrays have an axis-1 stride equal to the item
+        # size, so each row is one contiguous run that memmove can copy.
+        # NOTE(review): the size threshold is computed from n (rows taken),
+        # not k (row length) -- confirm this is intended.
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(float64_t) and
+            sizeof(float64_t) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
+            return
+
+    # General path: element-by-element copy.
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+# Column take over typed memoryviews (float64 -> float64):
+#     out[i, j] = values[i, indexer[j]], or fill_value where
+#     indexer[j] == -1.
+# n is len(values) (rows); k is len(indexer) (columns to produce).
+# Bounds checking and wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    float64_t[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write when either dimension is empty.
+    if n == 0 or k == 0:
+        return
+
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+# Python-visible column take (float64 -> float64):
+#     out[i, j] = values[i, indexer[j]], or fill_value where
+#     indexer[j] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same loop runs below with `values` accessed as an ndarray buffer.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    float64_t[:, :] out,
+                                    fill_value=np.nan):
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_float64_float64_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        float64_t fv
+
+    # n: rows of `values`; k: columns to produce.
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write when either dimension is empty.
+    if n == 0 or k == 0:
+        return
+
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+# Two-axis take (float64 -> float64):
+#     out[i, j] = values[indexer[0][i], indexer[1][j]]
+# `indexer` is indexed with 0 and 1 to obtain the row and column indexers.
+# A -1 in either indexer marks a missing position and gets `fill_value`.
+# Bounds checking and negative-index wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values,
+                                    indexer,
+                                    ndarray[float64_t, ndim=2] out,
+                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        float64_t fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    # Convert the Python-level fill value to the output C type once.
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            # Missing row: the whole output row is filled.
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
+
+
+# 1-D take over typed memoryviews (object -> object):
+#     out[i] = values[indexer[i]], or fill_value where indexer[i] == -1.
+# Unlike the float64 variant, the loop is not wrapped in `with nogil`.
+# Bounds checking and wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_object_object_memview(object[:] values,
+                                              int64_t[:] indexer,
+                                              object[:] out,
+                                              fill_value=np.nan):
+
+
+
+    cdef:
+        Py_ssize_t i, n, idx
+        object fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            out[i] = fv
+        else:
+            out[i] = values[idx]
+
+
+
+# Python-visible 1-D take (object -> object):
+#     out[i] = values[indexer[i]], or fill_value where indexer[i] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same loop runs below with `values` accessed through the ndarray buffer
+# type (`indexer` and `out` are typed memoryviews in both paths).
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_object_object(ndarray[object, ndim=1] values,
+                              int64_t[:] indexer,
+                              object[:] out,
+                              fill_value=np.nan):
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_object_object_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+
+    cdef:
+        Py_ssize_t i, n, idx
+        object fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+    
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            out[i] = fv
+        else:
+            out[i] = values[idx]
+
+
+
+# Row take over typed memoryviews (object -> object):
+#     out[i, :] = values[indexer[i], :], or fill_value where
+#     indexer[i] == -1.
+# n is the number of rows to produce (len(indexer)); k is the row length
+# (values.shape[1]).  Bounds checking and wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_object_object_memview(object[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    object[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        object fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+
+    # Compile-time switch is False for this dtype pair, so the raw-pointer
+    # memmove branch below is not compiled (presumably because a raw copy
+    # of object pointers would skip reference counting -- confirm).
+    IF False:
+        cdef:
+            object *v
+            object *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(object) and
+            sizeof(object) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(object) * k))
+            return
+
+    # Element-by-element copy (the only path compiled for objects).
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+# Python-visible row take (object -> object):
+#     out[i, :] = values[indexer[i], :], or fill_value where
+#     indexer[i] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same logic runs below with `values` accessed as an ndarray buffer.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_object_object(ndarray[object, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    object[:, :] out,
+                                    fill_value=np.nan):
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_object_object_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        object fv
+
+    # n: rows to produce; k: row length.
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+
+    # Compile-time switch is False for this dtype pair, so the raw-pointer
+    # memmove branch below is not compiled (presumably because a raw copy
+    # of object pointers would skip reference counting -- confirm).
+    IF False:
+        cdef:
+            object *v
+            object *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(object) and
+            sizeof(object) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(object) * k))
+            return
+
+    # Element-by-element copy (the only path compiled for objects).
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = values[idx, j]
+
+
+
+# Column take over typed memoryviews (object -> object):
+#     out[i, j] = values[i, indexer[j]], or fill_value where
+#     indexer[j] == -1.
+# n is len(values) (rows); k is len(indexer) (columns to produce).
+# Bounds checking and wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_object_object_memview(object[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    object[:, :] out,
+                                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        object fv
+
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write when either dimension is empty.
+    if n == 0 or k == 0:
+        return
+
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+# Python-visible column take (object -> object):
+#     out[i, j] = values[i, indexer[j]], or fill_value where
+#     indexer[j] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same loop runs below with `values` accessed as an ndarray buffer.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_object_object(ndarray[object, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    object[:, :] out,
+                                    fill_value=np.nan):
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_object_object_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        object fv
+
+    # n: rows of `values`; k: columns to produce.
+    n = len(values)
+    k = len(indexer)
+
+    # Nothing to write when either dimension is empty.
+    if n == 0 or k == 0:
+        return
+
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = values[i, idx]
+
+
+
+# Two-axis take (object -> object):
+#     out[i, j] = values[indexer[0][i], indexer[1][j]]
+# `indexer` is indexed with 0 and 1 to obtain the row and column indexers.
+# A -1 in either indexer marks a missing position and gets `fill_value`.
+# Bounds checking and negative-index wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_object_object(ndarray[object, ndim=2] values,
+                                    indexer,
+                                    ndarray[object, ndim=2] out,
+                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        object fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            # Missing row: the whole output row is filled.
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = values[idx, idx1[j]]
diff --git a/pandas/src/algos_take_helper.pxi.in b/pandas/src/algos_take_helper.pxi.in
new file mode 100644
index 0000000000000..e9abbcd13f499
--- /dev/null
+++ b/pandas/src/algos_take_helper.pxi.in
@@ -0,0 +1,261 @@
+"""
+Template for each `dtype` helper function for take
+
+WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
+"""
+
+#----------------------------------------------------------------------
+# take_1d, take_2d
+#----------------------------------------------------------------------
+
+{{py:
+
+# One row per (input dtype, output dtype) pair to generate take_* for.
+# Columns:
+#   name, dest   -- dtype labels spliced into the generated function names
+#   c_type_in    -- C element type of `values`
+#   c_type_out   -- C element type of `out` and of the fill value
+#   preval,
+#   postval      -- text placed before/after the `values[...]` expression
+#                   (used to convert uint8 to a Python bool for bool->object)
+#   can_copy     -- enables the compile-time memmove branch in take_2d_axis0
+#   nogil        -- wraps the take_1d loop in `with nogil:`
+# NOTE(review): int8 -> int8 is the only non-object pair with nogil=False;
+# this looks unintentional -- confirm before changing.
+# name, dest, c_type_in, c_type_out, preval, postval, can_copy, nogil
+dtypes = [
+    ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True),
+    ('bool', 'object', 'uint8_t', 'object',
+     'True if ', ' > 0 else False', False, False),
+    ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False),
+    ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True),
+    ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True),
+    ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True),
+    ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True),
+    ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True),
+    ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True),
+    ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True),
+    ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True),
+    ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True),
+    ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True),
+    ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True),
+    ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True),
+    ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True),
+    ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True),
+    ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True),
+    ('object', 'object', 'object', 'object', '', '', False, False)]
+
+
+def get_dispatch(dtypes):
+    """
+    Render the take_* function bodies for every dtype pair.
+
+    Parameters
+    ----------
+    dtypes : iterable of 8-tuples
+        (name, dest, c_type_in, c_type_out, preval, postval, can_copy,
+        nogil), one per function family to generate.
+
+    Yields
+    ------
+    tuple
+        (name, dest, c_type_in, c_type_out, preval, postval, can_copy,
+        inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1), where the
+        last three items are the rendered Cython bodies that the template
+        below pastes under each function signature.
+    """
+
+    # All three bodies open with a line continuation so the rendered text
+    # starts directly at `cdef:` instead of with a stray blank line.
+    #
+    # nogil_str is either empty or a complete, indented "with nogil:" line
+    # (newline included), so the non-nogil variant does not leave a
+    # whitespace-only line in the generated file; tab is the extra indent
+    # the loop needs when it is nested under that line.
+    inner_take_1d_template = """\
+    cdef:
+        Py_ssize_t i, n, idx
+        %(c_type_out)s fv
+
+    n = indexer.shape[0]
+
+    fv = fill_value
+
+%(nogil_str)s    %(tab)sfor i from 0 <= i < n:
+    %(tab)s    idx = indexer[i]
+    %(tab)s    if idx == -1:
+    %(tab)s        out[i] = fv
+    %(tab)s    else:
+    %(tab)s        out[i] = %(preval)svalues[idx]%(postval)s
+"""
+
+    inner_take_2d_axis0_template = """\
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        %(c_type_out)s fv
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    fv = fill_value
+
+    IF %(can_copy)s:
+        cdef:
+            %(c_type_out)s *v
+            %(c_type_out)s *o
+
+        #GH3130
+        if (values.strides[1] == out.strides[1] and
+            values.strides[1] == sizeof(%(c_type_out)s) and
+            sizeof(%(c_type_out)s) * n >= 256):
+
+            for i from 0 <= i < n:
+                idx = indexer[i]
+                if idx == -1:
+                    for j from 0 <= j < k:
+                        out[i, j] = fv
+                else:
+                    v = &values[idx, 0]
+                    o = &out[i, 0]
+                    memmove(o, v, <size_t>(sizeof(%(c_type_out)s) * k))
+            return
+
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                out[i, j] = %(preval)svalues[idx, j]%(postval)s
+"""
+
+    inner_take_2d_axis1_template = """\
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        %(c_type_out)s fv
+
+    n = len(values)
+    k = len(indexer)
+
+    if n == 0 or k == 0:
+        return
+
+    fv = fill_value
+
+    for i from 0 <= i < n:
+        for j from 0 <= j < k:
+            idx = indexer[j]
+            if idx == -1:
+                out[i, j] = fv
+            else:
+                out[i, j] = %(preval)svalues[i, idx]%(postval)s
+"""
+
+    for (name, dest, c_type_in, c_type_out, preval, postval,
+         can_copy, nogil) in dtypes:
+        if nogil:
+            nogil_str = "    with nogil:\n"
+            tab = '    '
+        else:
+            nogil_str = ''
+            tab = ''
+
+        args = dict(name=name, dest=dest, c_type_in=c_type_in,
+                    c_type_out=c_type_out, preval=preval, postval=postval,
+                    can_copy=can_copy, nogil_str=nogil_str, tab=tab)
+
+        inner_take_1d = inner_take_1d_template % args
+        inner_take_2d_axis0 = inner_take_2d_axis0_template % args
+        inner_take_2d_axis1 = inner_take_2d_axis1_template % args
+
+        yield (name, dest, c_type_in, c_type_out, preval, postval, can_copy,
+               inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1)
+
+}}
+
+
+{{for name, dest, c_type_in, c_type_out, preval, postval, can_copy,
+      inner_take_1d, inner_take_2d_axis0, inner_take_2d_axis1
+      in get_dispatch(dtypes)}}
+
+
+# 1-D take over typed memoryviews:
+#     out[i] = values[indexer[i]], or fill_value where indexer[i] == -1.
+# The body is the rendered inner_take_1d text from get_dispatch.
+# Bounds checking and negative-index wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_1d_{{name}}_{{dest}}_memview({{c_type_in}}[:] values,
+                                              int64_t[:] indexer,
+                                              {{c_type_out}}[:] out,
+                                              fill_value=np.nan):
+
+
+{{inner_take_1d}}
+
+
+# Python-visible 1-D take:
+#     out[i] = values[indexer[i]], or fill_value where indexer[i] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same rendered body (inner_take_1d) runs here with `values` accessed
+# through the ndarray buffer type.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=1] values,
+                              int64_t[:] indexer,
+                              {{c_type_out}}[:] out,
+                              fill_value=np.nan):
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_1d_{{name}}_{{dest}}_memview(values, indexer, out,
+                                          fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+{{inner_take_1d}}
+
+
+# Row take over typed memoryviews:
+#     out[i, :] = values[indexer[i], :], or fill_value where
+#     indexer[i] == -1.
+# The body is the rendered inner_take_2d_axis0 text from get_dispatch,
+# including its compile-time memmove branch.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis0_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    {{c_type_out}}[:, :] out,
+                                                    fill_value=np.nan):
+{{inner_take_2d_axis0}}
+
+
+# Python-visible row take:
+#     out[i, :] = values[indexer[i], :], or fill_value where
+#     indexer[i] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same rendered body (inner_take_2d_axis0) runs here with `values`
+# accessed as an ndarray buffer.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    {{c_type_out}}[:, :] out,
+                                    fill_value=np.nan):
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis0_{{name}}_{{dest}}_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+{{inner_take_2d_axis0}}
+
+
+# Column take over typed memoryviews:
+#     out[i, j] = values[i, indexer[j]], or fill_value where
+#     indexer[j] == -1.
+# The body is the rendered inner_take_2d_axis1 text from get_dispatch; it
+# returns immediately when either dimension is empty.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cdef inline take_2d_axis1_{{name}}_{{dest}}_memview({{c_type_in}}[:, :] values,
+                                                    int64_t[:] indexer,
+                                                    {{c_type_out}}[:, :] out,
+                                                    fill_value=np.nan):
+{{inner_take_2d_axis1}}
+
+
+# Python-visible column take:
+#     out[i, j] = values[i, indexer[j]], or fill_value where
+#     indexer[j] == -1.
+# Writeable `values` are forwarded to the memoryview helper; otherwise the
+# same rendered body (inner_take_2d_axis1) runs here with `values`
+# accessed as an ndarray buffer.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_axis1_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+                                    ndarray[int64_t] indexer,
+                                    {{c_type_out}}[:, :] out,
+                                    fill_value=np.nan):
+
+    if values.flags.writeable:
+        # We can call the memoryview version of the code
+        take_2d_axis1_{{name}}_{{dest}}_memview(values, indexer, out,
+                                                fill_value=fill_value)
+        return
+
+    # We cannot use the memoryview version on readonly-buffers due to
+    # a limitation of Cython's typed memoryviews. Instead we can use
+    # the slightly slower Cython ndarray type directly.
+{{inner_take_2d_axis1}}
+
+
+# Two-axis take:
+#     out[i, j] = values[indexer[0][i], indexer[1][j]]
+# `indexer` is indexed with 0 and 1 to obtain the row and column indexers.
+# A -1 in either indexer marks a missing position and gets `fill_value`.
+# There is no memoryview variant of this function; both arrays use the
+# ndarray buffer type.  Bounds checking and wraparound are disabled.
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_2d_multi_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values,
+                                    indexer,
+                                    ndarray[{{c_type_out}}, ndim=2] out,
+                                    fill_value=np.nan):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[int64_t] idx0 = indexer[0]
+        ndarray[int64_t] idx1 = indexer[1]
+        {{c_type_out}} fv
+
+    n = len(idx0)
+    k = len(idx1)
+
+    # Convert the Python-level fill value to the output C type once.
+    fv = fill_value
+    for i from 0 <= i < n:
+        idx = idx0[i]
+        if idx == -1:
+            # Missing row: the whole output row is filled.
+            for j from 0 <= j < k:
+                out[i, j] = fv
+        else:
+            for j from 0 <= j < k:
+                if idx1[j] == -1:
+                    out[i, j] = fv
+                else:
+                    out[i, j] = {{preval}}values[idx, idx1[j]]{{postval}}
+
+{{endfor}}
\ No newline at end of file
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
deleted file mode 100644
index 309a81b38f4e1..0000000000000
--- a/pandas/src/generate_code.py
+++ /dev/null
@@ -1,2182 +0,0 @@
-"""
-This file generates `generated.pyx` which is then included in `../algos.pyx`
-during building.  To regenerate `generated.pyx`, just run:
-
-    `python generate_code.py`.
-
-"""
-
-# flake8: noqa
-
-from __future__ import print_function
-import os
-from pandas.compat import StringIO
-import numpy as np
-
-_int64_max = np.iinfo(np.int64).max
-
-warning_to_new_contributors = """
-# DO NOT EDIT THIS FILE: This file was autogenerated from generate_code.py, so
-# please edit that file and then run `python2 generate_code.py` to re-generate
-# this file.
-"""
-
-header = """
-cimport numpy as np
-cimport cython
-
-from libc.string cimport memmove
-
-from numpy cimport *
-
-from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
-                      PyDict_Contains, PyDict_Keys,
-                      Py_INCREF, PyTuple_SET_ITEM,
-                      PyTuple_SetItem,
-                      PyTuple_New)
-from cpython cimport PyFloat_Check
-cimport cpython
-
-cdef extern from "numpy/npy_math.h":
-    double NAN "NPY_NAN"
-
-import numpy as np
-isnan = np.isnan
-
-from datetime import datetime as pydatetime
-
-# this is our datetime.pxd
-from datetime cimport *
-
-from khash cimport *
-
-ctypedef unsigned char UChar
-
-cimport util
-from util cimport is_array, _checknull, _checknan, get_nat
-cimport lib
-from lib cimport is_null_datetimelike
-
-cdef int64_t iNaT = get_nat()
-
-# import datetime C API
-PyDateTime_IMPORT
-
-# initialize numpy
-import_array()
-import_ufunc()
-
-cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
-
-cpdef ensure_platform_int(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == PLATFORM_INT:
-            return arr
-        else:
-            return arr.astype(np.int_)
-    else:
-        return np.array(arr, dtype=np.int_)
-
-cpdef ensure_object(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_OBJECT:
-            return arr
-        else:
-            return arr.astype(np.object_)
-    elif hasattr(arr,'asobject'):
-        return arr.asobject
-    else:
-        return np.array(arr, dtype=np.object_)
-"""
-
-
-inner_take_1d_template = """\
-    cdef:
-        Py_ssize_t i, n, idx
-        %(c_type_out)s fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    %(nogil)s
-    %(tab)sfor i from 0 <= i < n:
-    %(tab)s    idx = indexer[i]
-    %(tab)s    if idx == -1:
-    %(tab)s        out[i] = fv
-    %(tab)s    else:
-    %(tab)s        out[i] = %(preval)svalues[idx]%(postval)s
-"""
-
-take_1d_template = """\
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_%(name)s_%(dest)s_memview(%(c_type_in)s[:] values,
-                      int64_t[:] indexer,
-                      %(c_type_out)s[:] out,
-                      fill_value=np.nan):
-""" + inner_take_1d_template + """
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=1] values,
-                              int64_t[:] indexer,
-                              %(c_type_out)s[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_%(name)s_%(dest)s_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-""" + inner_take_1d_template
-
-inner_take_2d_axis0_template = """\
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        %(c_type_out)s fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF %(can_copy)s:
-        cdef:
-            %(c_type_out)s *v
-            %(c_type_out)s *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(%(c_type_out)s) and
-            sizeof(%(c_type_out)s) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(%(c_type_out)s) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = %(preval)svalues[idx, j]%(postval)s
-"""
-
-take_2d_axis0_template = """\
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_%(name)s_%(dest)s_memview(%(c_type_in)s[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    %(c_type_out)s[:, :] out,
-                                                    fill_value=np.nan):
-""" + inner_take_2d_axis0_template + """
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    %(c_type_out)s[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_%(name)s_%(dest)s_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-""" + inner_take_2d_axis0_template
-
-
-inner_take_2d_axis1_template = """\
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        %(c_type_out)s fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = %(preval)svalues[i, idx]%(postval)s
-"""
-
-take_2d_axis1_template = """\
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_%(name)s_%(dest)s_memview(%(c_type_in)s[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    %(c_type_out)s[:, :] out,
-                                                    fill_value=np.nan):
-""" + inner_take_2d_axis1_template + """
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    %(c_type_out)s[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_%(name)s_%(dest)s_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-""" + inner_take_2d_axis1_template
-
-
-take_2d_multi_template = """@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_%(name)s_%(dest)s(ndarray[%(c_type_in)s, ndim=2] values,
-                                    indexer,
-                                    ndarray[%(c_type_out)s, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        %(c_type_out)s fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = %(preval)svalues[idx, idx1[j]]%(postval)s
-"""
-
-
-
-"""
-Backfilling logic for generating fill vector
-
-Diagram of what's going on
-
-Old      New    Fill vector    Mask
-         .        0               1
-         .        0               1
-         .        0               1
-A        A        0               1
-         .        1               1
-         .        1               1
-         .        1               1
-         .        1               1
-         .        1               1
-B        B        1               1
-         .        2               1
-         .        2               1
-         .        2               1
-C        C        2               1
-         .                        0
-         .                        0
-D
-"""
-
-backfill_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new,
-                      limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef %(c_type)s cur, prev
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
-        return indexer
-
-    i = nleft - 1
-    j = nright - 1
-
-    cur = old[nleft - 1]
-
-    while j >= 0 and new[j] > cur:
-        j -= 1
-
-    while True:
-        if j < 0:
-            break
-
-        if i == 0:
-            while j >= 0:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j -= 1
-            break
-
-        prev = old[i - 1]
-
-        while j >= 0 and prev < new[j] <= cur:
-            if new[j] == cur:
-                indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j -= 1
-
-        fill_count = 0
-        i -= 1
-        cur = prev
-
-    return indexer
-"""
-
-
-pad_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new,
-                   limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef %(c_type)s cur, next
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
-        return indexer
-
-    i = j = 0
-
-    cur = old[0]
-
-    while j <= nright - 1 and new[j] < cur:
-        j += 1
-
-    while True:
-        if j == nright:
-            break
-
-        if i == nleft - 1:
-            while j < nright:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] > cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j += 1
-            break
-
-        next = old[i + 1]
-
-        while j < nright and cur <= new[j] < next:
-            if new[j] == cur:
-                indexer[j] = i
-            elif fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j += 1
-
-        fill_count = 0
-        i += 1
-        cur = next
-
-    return indexer
-"""
-
-pad_1d_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_%(name)s(ndarray[%(c_type)s] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef %(c_type)s val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-"""
-
-pad_2d_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef %(c_type)s val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-"""
-
-backfill_2d_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef %(c_type)s val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-"""
-
-backfill_1d_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_inplace_%(name)s(ndarray[%(c_type)s] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef %(c_type)s val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-"""
-
-
-diff_2d_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def diff_2d_%(name)s(ndarray[%(c_type)s, ndim=2] arr,
-                     ndarray[%(dest_type2)s, ndim=2] out,
-                    Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-"""
-
-is_monotonic_template = '''@cython.boundscheck(False)
-@cython.wraparound(False)
-def is_monotonic_%(name)s(ndarray[%(c_type)s] arr, bint timelike):
-    """
-    Returns
-    -------
-    is_monotonic_inc, is_monotonic_dec
-    """
-    cdef:
-        Py_ssize_t i, n
-        %(c_type)s prev, cur
-        bint is_monotonic_inc = 1
-        bint is_monotonic_dec = 1
-
-    n = len(arr)
-
-    if n == 1:
-        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
-            # single value is NaN
-            return False, False
-        else:
-            return True, True
-    elif n < 2:
-        return True, True
-
-    if timelike and arr[0] == iNaT:
-        return False, False
-
-    %(nogil)s
-    %(tab)sprev = arr[0]
-    %(tab)sfor i in range(1, n):
-    %(tab)s    cur = arr[i]
-    %(tab)s    if timelike and cur == iNaT:
-    %(tab)s        is_monotonic_inc = 0
-    %(tab)s        is_monotonic_dec = 0
-    %(tab)s        break
-    %(tab)s    if cur < prev:
-    %(tab)s        is_monotonic_inc = 0
-    %(tab)s    elif cur > prev:
-    %(tab)s        is_monotonic_dec = 0
-    %(tab)s    elif cur == prev:
-    %(tab)s        pass # is_unique = 0
-    %(tab)s    else:
-    %(tab)s        # cur or prev is NaN
-    %(tab)s        is_monotonic_inc = 0
-    %(tab)s        is_monotonic_dec = 0
-    %(tab)s        break
-    %(tab)s    if not is_monotonic_inc and not is_monotonic_dec:
-    %(tab)s        is_monotonic_inc = 0
-    %(tab)s        is_monotonic_dec = 0
-    %(tab)s        break
-    %(tab)s    prev = cur
-    return is_monotonic_inc, is_monotonic_dec
-'''
-
-map_indices_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef map_indices_%(name)s(ndarray[%(c_type)s] index):
-    """
-    Produce a dict mapping the values of the input array to their respective
-    locations.
-
-    Example:
-        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
-
-    Better to do this with Cython because of the enormous speed boost.
-    """
-    cdef Py_ssize_t i, length
-    cdef dict result = {}
-
-    length = len(index)
-
-    for i in range(length):
-        result[index[i]] = i
-
-    return result
-'''
-
-groupby_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-'''
-
-group_last_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_last_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[%(c_type)s, ndim=2] values,
-               ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, count
-        ndarray[%(dest_type2)s, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val and val != %(nan_val)s:
-                    nobs[lab, j] += 1
-                    resx[lab, j] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = %(nan_val)s
-                else:
-                    out[i, j] = resx[i, j]
-'''
-
-group_nth_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_nth_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[%(c_type)s, ndim=2] values,
-              ndarray[int64_t] labels, int64_t rank):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, count
-        ndarray[%(dest_type2)s, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val and val != %(nan_val)s:
-                    nobs[lab, j] += 1
-                    if nobs[lab, j] == rank:
-                        resx[lab, j] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = %(nan_val)s
-                else:
-                    out[i, j] = resx[i, j]
-'''
-
-group_add_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_add_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[%(c_type)s, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, count
-        ndarray[%(dest_type2)s, ndim=2] sumx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-
-    with nogil:
-
-        if K > 1:
-
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-
-        else:
-
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = sumx[i, j]
-'''
-
-group_prod_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_prod_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[%(c_type)s, ndim=2] values,
-               ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, count
-        ndarray[%(dest_type2)s, ndim=2] prodx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    prodx = np.ones_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        prodx[lab, j] *= val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    prodx[lab, 0] *= val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = prodx[i, j]
-'''
-
-group_var_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-@cython.cdivision(True)
-def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[%(dest_type2)s, ndim=2] values,
-              ndarray[int64_t] labels):
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, ct, oldmean
-        ndarray[%(dest_type2)s, ndim=2] nobs, mean
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    mean = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    out[:, :] = 0.0
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    oldmean = mean[lab, j]
-                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
-                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
-
-        for i in range(ncounts):
-            for j in range(K):
-                ct = nobs[i, j]
-                if ct < 2:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] /= (ct - 1)
-
-'''
-
-# add passing bin edges, instead of labels
-
-
-#----------------------------------------------------------------------
-# group_min, group_max
-
-group_max_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_max_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[%(dest_type2)s, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, count
-        ndarray[%(dest_type2)s, ndim=2] maxx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-
-    maxx = np.empty_like(out)
-    maxx.fill(-%(inf_val)s)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val and val != %(nan_val)s:
-                        nobs[lab, j] += 1
-                        if val > maxx[lab, j]:
-                            maxx[lab, j] = val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val and val != %(nan_val)s:
-                    nobs[lab, 0] += 1
-                    if val > maxx[lab, 0]:
-                        maxx[lab, 0] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = %(nan_val)s
-                else:
-                    out[i, j] = maxx[i, j]
-'''
-
-group_min_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_min_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[%(dest_type2)s, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, count
-        ndarray[%(dest_type2)s, ndim=2] minx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-
-    minx = np.empty_like(out)
-    minx.fill(%(inf_val)s)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val and val != %(nan_val)s:
-
-                        nobs[lab, j] += 1
-                        if val < minx[lab, j]:
-                            minx[lab, j] = val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val and val != %(nan_val)s:
-                    nobs[lab, 0] += 1
-                    if val < minx[lab, 0]:
-                        minx[lab, 0] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = %(nan_val)s
-                else:
-                    out[i, j] = minx[i, j]
-'''
-
-
-group_mean_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_mean_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[%(dest_type2)s, ndim=2] values,
-               ndarray[int64_t] labels):
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        %(dest_type2)s val, count
-        ndarray[%(dest_type2)s, ndim=2] sumx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-
-        for i in range(ncounts):
-            for j in range(K):
-                count = nobs[i, j]
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = sumx[i, j] / count
-'''
-
-group_ohlc_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_ohlc_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[%(dest_type2)s, ndim=2] values,
-                  ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        %(dest_type2)s val, count
-        Py_ssize_t ngroups = len(counts)
-
-    if len(labels) == 0:
-        return
-
-    N, K = (<object> values).shape
-
-    if out.shape[1] != 4:
-        raise ValueError('Output array must have 4 columns')
-
-    if K > 1:
-        raise NotImplementedError("Argument 'values' must have only "
-                                  "one dimension")
-    out.fill(np.nan)
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab == -1:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-            if val != val:
-                continue
-
-            if out[lab, 0] != out[lab, 0]:
-                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
-            else:
-                out[lab, 1] = max(out[lab, 1], val)
-                out[lab, 2] = min(out[lab, 2], val)
-                out[lab, 3] = val
-'''
-
-arrmap_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_%(name)s(ndarray[%(c_type)s] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-'''
-
-#----------------------------------------------------------------------
-# Joins on ordered, unique indices
-
-# right might contain non-unique values
-
-left_join_unique_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_%(name)s(ndarray[%(c_type)s] left,
-                                      ndarray[%(c_type)s] right):
-    cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        %(c_type)s lval, rval
-
-    i = 0
-    j = 0
-    nleft = len(left)
-    nright = len(right)
-
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
-
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
-
-        rval = right[j]
-
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
-
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
-'''
-
-# @cython.wraparound(False)
-# @cython.boundscheck(False)
-
-left_join_template = '''def left_join_indexer_%(name)s(ndarray[%(c_type)s] left,
-                               ndarray[%(c_type)s] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        %(c_type)s lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[%(c_type)s] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=%(dtype)s)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    i += 1
-                    count += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = left[i]
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-'''
-
-
-inner_join_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def inner_join_indexer_%(name)s(ndarray[%(c_type)s] left,
-                              ndarray[%(c_type)s] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        %(c_type)s lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[%(c_type)s] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=%(dtype)s)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-'''
-
-
-outer_join_template2 = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left,
-                                ndarray[%(c_type)s] right):
-    cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        %(c_type)s lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[%(c_type)s] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        count = nright
-    elif nright == 0:
-        count = nleft
-    else:
-        while True:
-            if i == nleft:
-                count += nright - j
-                break
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                count += 1
-                j += 1
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=%(dtype)s)
-
-    # do it again, but populate the indexers / result
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        for j in range(nright):
-            lindexer[j] = -1
-            rindexer[j] = j
-            result[j] = right[j]
-    elif nright == 0:
-        for i in range(nleft):
-            lindexer[i] = i
-            rindexer[i] = -1
-            result[i] = left[i]
-    else:
-        while True:
-            if i == nleft:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    count += 1
-                    j += 1
-                break
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    count += 1
-                    i += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
-                count += 1
-                i += 1
-            else:
-                lindexer[count] = -1
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                j += 1
-
-    return result, lindexer, rindexer
-'''
-
-outer_join_template = '''@cython.wraparound(False)
-@cython.boundscheck(False)
-def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left,
-                                ndarray[%(c_type)s] right):
-    cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        %(c_type)s lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[%(c_type)s] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    while True:
-        if i == nleft:
-            if j == nright:
-                # we are done
-                break
-            else:
-                while j < nright:
-                    j += 1
-                    count += 1
-                break
-        elif j == nright:
-            while i < nleft:
-                i += 1
-                count += 1
-            break
-        else:
-            if left[i] == right[j]:
-                i += 1
-                j += 1
-            elif left[i] < right[j]:
-                i += 1
-            else:
-                j += 1
-
-            count += 1
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=%(dtype)s)
-
-    # do it again, but populate the indexers / result
-
-    i = 0
-    j = 0
-    count = 0
-    while True:
-        if i == nleft:
-            if j == nright:
-                # we are done
-                break
-            else:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    j += 1
-                    count += 1
-                break
-        elif j == nright:
-            while i < nleft:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = left[i]
-                i += 1
-                count += 1
-            break
-        else:
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                i += 1
-                j += 1
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
-                i += 1
-            else:
-                lindexer[count] = -1
-                rindexer[count] = j
-                result[count] = rval
-                j += 1
-
-            count += 1
-
-    return result, lindexer, rindexer
-'''
-
-# ensure_dtype functions
-
-ensure_dtype_template = """
-cpdef ensure_%(name)s(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_%(ctype)s:
-            return arr
-        else:
-            return arr.astype(np.%(dtype)s)
-    else:
-        return np.array(arr, dtype=np.%(dtype)s)
-"""
-
-ensure_functions = [
-    ('float64', 'FLOAT64', 'float64'),
-    ('float32', 'FLOAT32', 'float32'),
-    ('int8', 'INT8', 'int8'),
-    ('int16', 'INT16', 'int16'),
-    ('int32', 'INT32', 'int32'),
-    ('int64', 'INT64', 'int64'),
-    # ('platform_int', 'INT', 'int_'),
-    #('object', 'OBJECT', 'object_'),
-]
-
-def generate_ensure_dtypes():
-    output = StringIO()
-    for name, ctype, dtype in ensure_functions:
-        filled = ensure_dtype_template % locals()
-        output.write(filled)
-    return output.getvalue()
-
-#----------------------------------------------------------------------
-# Fast "put" logic for speeding up interleaving logic
-
-put2d_template = """
-def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values,
-                              ndarray[int64_t] indexer, Py_ssize_t loc,
-                              ndarray[%(dest_type2)s] out):
-    cdef:
-        Py_ssize_t i, j, k
-
-    k = len(values)
-    for j from 0 <= j < k:
-        i = indexer[j]
-        out[i] = values[j, loc]
-"""
-
-#----------------------------------------------------------------------
-# other grouping functions not needing a template
-grouping_no_template = '''def group_median_float64(ndarray[float64_t, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[float64_t, ndim=2] values,
-                         ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, size
-        ndarray[int64_t] _counts
-        ndarray data
-        float64_t* ptr
-    ngroups = len(counts)
-    N, K = (<object> values).shape
-
-    indexer, _counts = groupsort_indexer(labels, ngroups)
-    counts[:] = _counts[1:]
-
-    data = np.empty((K, N), dtype=np.float64)
-    ptr = <float64_t*> data.data
-
-    take_2d_axis1_float64_float64(values.T, indexer, out=data)
-
-    for i in range(K):
-        # exclude NA group
-        ptr += _counts[0]
-        for j in range(ngroups):
-            size = _counts[j + 1]
-            out[j, i] = _median_linear(ptr, size)
-            ptr += size
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_cumprod_float64(float64_t[:,:] out,
-                          float64_t[:,:] values,
-                          int64_t[:] labels,
-                          float64_t[:,:] accum):
-    """
-    Only transforms on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, size
-        float64_t val
-        int64_t lab
-
-    N, K = (<object> values).shape
-    accum = np.ones_like(accum)
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-
-            if lab < 0:
-                continue
-            for j in range(K):
-                val = values[i, j]
-                if val == val:
-                    accum[lab, j] *= val
-                    out[i, j] = accum[lab, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_cumsum(numeric[:,:] out,
-                 numeric[:,:] values,
-                 int64_t[:] labels,
-                 numeric[:,:] accum):
-    """
-    Only transforms on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, size
-        numeric val
-        int64_t lab
-
-    N, K = (<object> values).shape
-    accum = np.zeros_like(accum)
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-
-            if lab < 0:
-                continue
-            for j in range(K):
-                val = values[i,j]
-                if val == val:
-                    accum[lab,j] += val
-                    out[i,j] = accum[lab,j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
-                        int ngroups, int periods):
-    cdef:
-        Py_ssize_t N, i, j, ii
-        int offset, sign
-        int64_t lab, idxer, idxer_slot
-        int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64)
-        int64_t[:,:] label_indexer
-
-    N, = (<object> labels).shape
-
-    if periods < 0:
-        periods = -periods
-        offset = N - 1
-        sign = -1
-    elif periods > 0:
-        offset = 0
-        sign = 1
-
-    if periods == 0:
-        with nogil:
-            for i in range(N):
-                out[i] = i
-    else:
-        # array of each previous indexer seen
-        label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
-        with nogil:
-            for i in range(N):
-                ## reverse iterator if shifting backwards
-                ii = offset + sign * i
-                lab = labels[ii]
-                label_seen[lab] += 1
-
-                idxer_slot = label_seen[lab] % periods
-                idxer = label_indexer[lab, idxer_slot]
-
-                if label_seen[lab] > periods:
-                    out[ii] = idxer
-                else:
-                    out[ii] = -1
-
-                label_indexer[lab, idxer_slot] = ii
-'''
-
-
-#-------------------------------------------------------------------------
-# Generators
-
-def generate_put_template(template, use_ints=True, use_floats=True,
-                          use_objects=False, use_datelikes=False):
-    floats_list = [
-        ('float64', 'float64_t', 'float64_t', 'np.float64', True),
-        ('float32', 'float32_t', 'float32_t', 'np.float32', True),
-    ]
-    ints_list = [
-        ('int8',  'int8_t',  'float32_t', 'np.float32', True),
-        ('int16', 'int16_t', 'float32_t', 'np.float32', True),
-        ('int32', 'int32_t', 'float64_t', 'np.float64', True),
-        ('int64', 'int64_t', 'float64_t', 'np.float64', True),
-    ]
-    date_like_list = [
-        ('int64', 'int64_t', 'float64_t', 'np.float64', True),
-    ]
-    object_list = [('object', 'object', 'object', 'np.object_', False)]
-    function_list = []
-    if use_floats:
-        function_list.extend(floats_list)
-    if use_ints:
-        function_list.extend(ints_list)
-    if use_objects:
-        function_list.extend(object_list)
-    if use_datelikes:
-        function_list.extend(date_like_list)
-
-    output = StringIO()
-    for name, c_type, dest_type, dest_dtype, nogil in function_list:
-        func = template % {'name': name,
-                           'c_type': c_type,
-                           'dest_type': dest_type.replace('_t', ''),
-                           'dest_type2': dest_type,
-                           'dest_dtype': dest_dtype,
-                           'nogil' : 'with nogil:' if nogil else '',
-                           'tab' : '    ' if nogil else '' }
-        output.write(func)
-        output.write("\n")
-    return output.getvalue()
-
-def generate_put_min_max_template(template, use_ints=True, use_floats=True,
-                                  use_objects=False, use_datelikes=False):
-    floats_list = [
-        ('float64', 'float64_t', 'NAN', 'np.inf', True),
-        ('float32', 'float32_t', 'NAN', 'np.inf', True),
-    ]
-    ints_list = [
-        ('int64', 'int64_t', 'iNaT', _int64_max, True),
-    ]
-    date_like_list = [
-        ('int64', 'int64_t', 'iNaT', _int64_max, True),
-    ]
-    object_list = [('object', 'object', 'np.nan', 'np.inf', False)]
-    function_list = []
-    if use_floats:
-        function_list.extend(floats_list)
-    if use_ints:
-        function_list.extend(ints_list)
-    if use_objects:
-        function_list.extend(object_list)
-    if use_datelikes:
-        function_list.extend(date_like_list)
-
-    output = StringIO()
-    for name, dest_type, nan_val, inf_val, nogil in function_list:
-        func = template % {'name': name,
-                           'dest_type2': dest_type,
-                           'nan_val': nan_val,
-                           'inf_val': inf_val,
-                           'nogil' : "with nogil:" if nogil else '',
-                           'tab' : '    ' if nogil else '' }
-        output.write(func)
-        output.write("\n")
-    return output.getvalue()
-
-def generate_put_selection_template(template, use_ints=True, use_floats=True,
-                                    use_objects=False, use_datelikes=False):
-    floats_list = [
-        ('float64', 'float64_t', 'float64_t', 'NAN', True),
-        ('float32', 'float32_t', 'float32_t', 'NAN', True),
-    ]
-    ints_list = [
-        ('int64', 'int64_t', 'int64_t', 'iNaT', True),
-    ]
-    date_like_list = [
-        ('int64', 'int64_t', 'int64_t', 'iNaT', True),
-    ]
-    object_list = [('object', 'object', 'object', 'np.nan', False)]
-    function_list = []
-    if use_floats:
-        function_list.extend(floats_list)
-    if use_ints:
-        function_list.extend(ints_list)
-    if use_objects:
-        function_list.extend(object_list)
-    if use_datelikes:
-        function_list.extend(date_like_list)
-
-    output = StringIO()
-    for name, c_type, dest_type, nan_val, nogil in function_list:
-
-        if nogil:
-            nogil = "with nogil:"
-            tab = '    '
-        else:
-            nogil = ''
-            tab = ''
-
-        func = template % {'name': name,
-                           'c_type': c_type,
-                           'dest_type2': dest_type,
-                           'nan_val': nan_val,
-                           'nogil' : nogil,
-                           'tab' : tab }
-        output.write(func)
-        output.write("\n")
-    return output.getvalue()
-
-def generate_take_template(template, exclude=None):
-    # name, dest, ctypein, ctypeout, preval, postval, cancopy, nogil
-    function_list = [
-        ('bool', 'bool', 'uint8_t', 'uint8_t', '', '', True, True),
-        ('bool', 'object', 'uint8_t', 'object',
-         'True if ', ' > 0 else False', False, False),
-        ('int8', 'int8', 'int8_t', 'int8_t', '', '', True, False),
-        ('int8', 'int32', 'int8_t', 'int32_t', '', '', False, True),
-        ('int8', 'int64', 'int8_t', 'int64_t', '', '', False, True),
-        ('int8', 'float64', 'int8_t', 'float64_t', '', '', False, True),
-        ('int16', 'int16', 'int16_t', 'int16_t', '', '', True, True),
-        ('int16', 'int32', 'int16_t', 'int32_t', '', '', False, True),
-        ('int16', 'int64', 'int16_t', 'int64_t', '', '', False, True),
-        ('int16', 'float64', 'int16_t', 'float64_t', '', '', False, True),
-        ('int32', 'int32', 'int32_t', 'int32_t', '', '', True, True),
-        ('int32', 'int64', 'int32_t', 'int64_t', '', '', False, True),
-        ('int32', 'float64', 'int32_t', 'float64_t', '', '', False, True),
-        ('int64', 'int64', 'int64_t', 'int64_t', '', '', True, True),
-        ('int64', 'float64', 'int64_t', 'float64_t', '', '', False, True),
-        ('float32', 'float32', 'float32_t', 'float32_t', '', '', True, True),
-        ('float32', 'float64', 'float32_t', 'float64_t', '', '', False, True),
-        ('float64', 'float64', 'float64_t', 'float64_t', '', '', True, True),
-        ('object', 'object', 'object', 'object', '', '', False, False),
-    ]
-
-    output = StringIO()
-    for (name, dest, c_type_in, c_type_out,
-         preval, postval, can_copy, nogil) in function_list:
-
-        if exclude is not None and name in exclude:
-            continue
-
-        if nogil:
-            nogil = "with nogil:"
-            tab = '    '
-        else:
-            nogil = ''
-            tab = ''
-
-        func = template % {'name': name, 'dest': dest,
-                           'c_type_in': c_type_in, 'c_type_out': c_type_out,
-                           'preval': preval, 'postval': postval,
-                           'can_copy': 'True' if can_copy else 'False',
-                           'nogil' : nogil,
-                           'tab' : tab }
-        output.write(func)
-        output.write("\n")
-    return output.getvalue()
-
-def generate_from_template(template, exclude=None):
-    # name, ctype, capable of holding NA
-    function_list = [
-        ('float64', 'float64_t', 'np.float64', True, True),
-        ('float32', 'float32_t', 'np.float32', True, True),
-        ('object', 'object', 'object', True, False),
-        ('int32', 'int32_t', 'np.int32', False, True),
-        ('int64', 'int64_t', 'np.int64', False, True),
-        ('bool', 'uint8_t', 'np.bool', False, True)
-    ]
-
-    output = StringIO()
-    for name, c_type, dtype, can_hold_na, nogil in function_list:
-        if exclude is not None and name in exclude:
-            continue
-
-        func = template % {'name': name, 'c_type': c_type,
-                           'dtype': dtype,
-                           'raise_on_na': 'False' if can_hold_na else 'True',
-                           'nogil' : 'with nogil:' if nogil else '',
-                           'tab' : '    ' if nogil else '' }
-        output.write(func)
-        output.write("\n")
-    return output.getvalue()
-
-put_2d = [diff_2d_template]
-
-groupbys = [group_add_template,
-            group_prod_template,
-            group_var_template,
-            group_mean_template,
-            group_ohlc_template]
-
-groupby_selection = [group_last_template,
-                     group_nth_template]
-
-groupby_min_max = [group_min_template,
-                   group_max_template]
-
-templates_1d = [map_indices_template,
-                pad_template,
-                backfill_template,
-                pad_1d_template,
-                backfill_1d_template,
-                pad_2d_template,
-                backfill_2d_template,
-                is_monotonic_template,
-                groupby_template,
-                arrmap_template]
-
-nobool_1d_templates = [left_join_unique_template,
-                       left_join_template,
-                       outer_join_template2,
-                       inner_join_template]
-
-take_templates = [take_1d_template,
-                  take_2d_axis0_template,
-                  take_2d_axis1_template,
-                  take_2d_multi_template]
-
-
-def generate_take_cython_file():
-    # Put `generated.pyx` in the same directory as this file
-    directory = os.path.dirname(os.path.realpath(__file__))
-    filename = 'generated.pyx'
-    path = os.path.join(directory, filename)
-
-    with open(path, 'w') as f:
-        print(warning_to_new_contributors, file=f)
-        print(header, file=f)
-
-        print(generate_ensure_dtypes(), file=f)
-
-        for template in templates_1d:
-            print(generate_from_template(template), file=f)
-
-        for template in take_templates:
-            print(generate_take_template(template), file=f)
-
-        for template in put_2d:
-            print(generate_put_template(template), file=f)
-
-        for template in groupbys:
-            print(generate_put_template(template, use_ints=False), file=f)
-
-        for template in groupby_selection:
-            print(generate_put_selection_template(template, use_ints=True),
-                  file=f)
-
-        for template in groupby_min_max:
-            print(generate_put_min_max_template(template, use_ints=True),
-                  file=f)
-
-        print(grouping_no_template, file=f)
-
-        for template in nobool_1d_templates:
-            print(generate_from_template(template, exclude=['bool']), file=f)
-
-
-if __name__ == '__main__':
-    generate_take_cython_file()
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
deleted file mode 100644
index c6dcd609a2c6e..0000000000000
--- a/pandas/src/generated.pyx
+++ /dev/null
@@ -1,10522 +0,0 @@
-
-# DO NOT EDIT THIS FILE: This file was autogenerated from generate_code.py, so
-# please edit that file and then run `python2 generate_code.py` to re-generate
-# this file.
-
-
-cimport numpy as np
-cimport cython
-
-from libc.string cimport memmove
-
-from numpy cimport *
-
-from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
-                      PyDict_Contains, PyDict_Keys,
-                      Py_INCREF, PyTuple_SET_ITEM,
-                      PyTuple_SetItem,
-                      PyTuple_New)
-from cpython cimport PyFloat_Check
-cimport cpython
-
-cdef extern from "numpy/npy_math.h":
-    double NAN "NPY_NAN"
-
-import numpy as np
-isnan = np.isnan
-
-from datetime import datetime as pydatetime
-
-# this is our datetime.pxd
-from datetime cimport *
-
-from khash cimport *
-
-ctypedef unsigned char UChar
-
-cimport util
-from util cimport is_array, _checknull, _checknan, get_nat
-cimport lib
-from lib cimport is_null_datetimelike
-
-cdef int64_t iNaT = get_nat()
-
-# import datetime C API
-PyDateTime_IMPORT
-
-# initialize numpy
-import_array()
-import_ufunc()
-
-cdef int PLATFORM_INT = (<ndarray> np.arange(0, dtype=np.int_)).descr.type_num
-
-cpdef ensure_platform_int(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == PLATFORM_INT:
-            return arr
-        else:
-            return arr.astype(np.int_)
-    else:
-        return np.array(arr, dtype=np.int_)
-
-cpdef ensure_object(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_OBJECT:
-            return arr
-        else:
-            return arr.astype(np.object_)
-    elif hasattr(arr,'asobject'):
-        return arr.asobject
-    else:
-        return np.array(arr, dtype=np.object_)
-
-
-cpdef ensure_float64(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_FLOAT64:
-            return arr
-        else:
-            return arr.astype(np.float64)
-    else:
-        return np.array(arr, dtype=np.float64)
-
-cpdef ensure_float32(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_FLOAT32:
-            return arr
-        else:
-            return arr.astype(np.float32)
-    else:
-        return np.array(arr, dtype=np.float32)
-
-cpdef ensure_int8(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_INT8:
-            return arr
-        else:
-            return arr.astype(np.int8)
-    else:
-        return np.array(arr, dtype=np.int8)
-
-cpdef ensure_int16(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_INT16:
-            return arr
-        else:
-            return arr.astype(np.int16)
-    else:
-        return np.array(arr, dtype=np.int16)
-
-cpdef ensure_int32(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_INT32:
-            return arr
-        else:
-            return arr.astype(np.int32)
-    else:
-        return np.array(arr, dtype=np.int32)
-
-cpdef ensure_int64(object arr):
-    if util.is_array(arr):
-        if (<ndarray> arr).descr.type_num == NPY_INT64:
-            return arr
-        else:
-            return arr.astype(np.int64)
-    else:
-        return np.array(arr, dtype=np.int64)
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef map_indices_float64(ndarray[float64_t] index):
-    """
-    Produce a dict mapping the values of the input array to their respective
-    locations.
-
-    Example:
-        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
-
-    Better to do this with Cython because of the enormous speed boost.
-    """
-    cdef Py_ssize_t i, length
-    cdef dict result = {}
-
-    length = len(index)
-
-    for i in range(length):
-        result[index[i]] = i
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef map_indices_float32(ndarray[float32_t] index):
-    """
-    Produce a dict mapping the values of the input array to their respective
-    locations.
-
-    Example:
-        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
-
-    Better to do this with Cython because of the enormous speed boost.
-    """
-    cdef Py_ssize_t i, length
-    cdef dict result = {}
-
-    length = len(index)
-
-    for i in range(length):
-        result[index[i]] = i
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef map_indices_object(ndarray[object] index):
-    """
-    Produce a dict mapping the values of the input array to their respective
-    locations.
-
-    Example:
-        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
-
-    Better to do this with Cython because of the enormous speed boost.
-    """
-    cdef Py_ssize_t i, length
-    cdef dict result = {}
-
-    length = len(index)
-
-    for i in range(length):
-        result[index[i]] = i
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef map_indices_int32(ndarray[int32_t] index):
-    """
-    Produce a dict mapping the values of the input array to their respective
-    locations.
-
-    Example:
-        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
-
-    Better to do this with Cython because of the enormous speed boost.
-    """
-    cdef Py_ssize_t i, length
-    cdef dict result = {}
-
-    length = len(index)
-
-    for i in range(length):
-        result[index[i]] = i
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef map_indices_int64(ndarray[int64_t] index):
-    """
-    Produce a dict mapping the values of the input array to their respective
-    locations.
-
-    Example:
-        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
-
-    Better to do this with Cython because of the enormous speed boost.
-    """
-    cdef Py_ssize_t i, length
-    cdef dict result = {}
-
-    length = len(index)
-
-    for i in range(length):
-        result[index[i]] = i
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cpdef map_indices_bool(ndarray[uint8_t] index):
-    """
-    Produce a dict mapping the values of the input array to their respective
-    locations.
-
-    Example:
-        array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
-
-    Better to do this with Cython because of the enormous speed boost.
-    """
-    cdef Py_ssize_t i, length
-    cdef dict result = {}
-
-    length = len(index)
-
-    for i in range(length):
-        result[index[i]] = i
-
-    return result
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new,
-                   limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef float64_t cur, next
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
-        return indexer
-
-    i = j = 0
-
-    cur = old[0]
-
-    while j <= nright - 1 and new[j] < cur:
-        j += 1
-
-    while True:
-        if j == nright:
-            break
-
-        if i == nleft - 1:
-            while j < nright:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] > cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j += 1
-            break
-
-        next = old[i + 1]
-
-        while j < nright and cur <= new[j] < next:
-            if new[j] == cur:
-                indexer[j] = i
-            elif fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j += 1
-
-        fill_count = 0
-        i += 1
-        cur = next
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_float32(ndarray[float32_t] old, ndarray[float32_t] new,
-                   limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef float32_t cur, next
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
-        return indexer
-
-    i = j = 0
-
-    cur = old[0]
-
-    while j <= nright - 1 and new[j] < cur:
-        j += 1
-
-    while True:
-        if j == nright:
-            break
-
-        if i == nleft - 1:
-            while j < nright:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] > cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j += 1
-            break
-
-        next = old[i + 1]
-
-        while j < nright and cur <= new[j] < next:
-            if new[j] == cur:
-                indexer[j] = i
-            elif fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j += 1
-
-        fill_count = 0
-        i += 1
-        cur = next
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_object(ndarray[object] old, ndarray[object] new,
-                   limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef object cur, next
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
-        return indexer
-
-    i = j = 0
-
-    cur = old[0]
-
-    while j <= nright - 1 and new[j] < cur:
-        j += 1
-
-    while True:
-        if j == nright:
-            break
-
-        if i == nleft - 1:
-            while j < nright:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] > cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j += 1
-            break
-
-        next = old[i + 1]
-
-        while j < nright and cur <= new[j] < next:
-            if new[j] == cur:
-                indexer[j] = i
-            elif fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j += 1
-
-        fill_count = 0
-        i += 1
-        cur = next
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new,
-                   limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef int32_t cur, next
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
-        return indexer
-
-    i = j = 0
-
-    cur = old[0]
-
-    while j <= nright - 1 and new[j] < cur:
-        j += 1
-
-    while True:
-        if j == nright:
-            break
-
-        if i == nleft - 1:
-            while j < nright:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] > cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j += 1
-            break
-
-        next = old[i + 1]
-
-        while j < nright and cur <= new[j] < next:
-            if new[j] == cur:
-                indexer[j] = i
-            elif fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j += 1
-
-        fill_count = 0
-        i += 1
-        cur = next
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_int64(ndarray[int64_t] old, ndarray[int64_t] new,
-                   limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef int64_t cur, next
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
-        return indexer
-
-    i = j = 0
-
-    cur = old[0]
-
-    while j <= nright - 1 and new[j] < cur:
-        j += 1
-
-    while True:
-        if j == nright:
-            break
-
-        if i == nleft - 1:
-            while j < nright:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] > cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j += 1
-            break
-
-        next = old[i + 1]
-
-        while j < nright and cur <= new[j] < next:
-            if new[j] == cur:
-                indexer[j] = i
-            elif fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j += 1
-
-        fill_count = 0
-        i += 1
-        cur = next
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
-                   limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef uint8_t cur, next
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
-        return indexer
-
-    i = j = 0
-
-    cur = old[0]
-
-    while j <= nright - 1 and new[j] < cur:
-        j += 1
-
-    while True:
-        if j == nright:
-            break
-
-        if i == nleft - 1:
-            while j < nright:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] > cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j += 1
-            break
-
-        next = old[i + 1]
-
-        while j < nright and cur <= new[j] < next:
-            if new[j] == cur:
-                indexer[j] = i
-            elif fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j += 1
-
-        fill_count = 0
-        i += 1
-        cur = next
-
-    return indexer
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new,
-                      limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef float64_t cur, prev
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
-        return indexer
-
-    i = nleft - 1
-    j = nright - 1
-
-    cur = old[nleft - 1]
-
-    while j >= 0 and new[j] > cur:
-        j -= 1
-
-    while True:
-        if j < 0:
-            break
-
-        if i == 0:
-            while j >= 0:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j -= 1
-            break
-
-        prev = old[i - 1]
-
-        while j >= 0 and prev < new[j] <= cur:
-            if new[j] == cur:
-                indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j -= 1
-
-        fill_count = 0
-        i -= 1
-        cur = prev
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_float32(ndarray[float32_t] old, ndarray[float32_t] new,
-                      limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef float32_t cur, prev
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
-        return indexer
-
-    i = nleft - 1
-    j = nright - 1
-
-    cur = old[nleft - 1]
-
-    while j >= 0 and new[j] > cur:
-        j -= 1
-
-    while True:
-        if j < 0:
-            break
-
-        if i == 0:
-            while j >= 0:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j -= 1
-            break
-
-        prev = old[i - 1]
-
-        while j >= 0 and prev < new[j] <= cur:
-            if new[j] == cur:
-                indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j -= 1
-
-        fill_count = 0
-        i -= 1
-        cur = prev
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_object(ndarray[object] old, ndarray[object] new,
-                      limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef object cur, prev
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
-        return indexer
-
-    i = nleft - 1
-    j = nright - 1
-
-    cur = old[nleft - 1]
-
-    while j >= 0 and new[j] > cur:
-        j -= 1
-
-    while True:
-        if j < 0:
-            break
-
-        if i == 0:
-            while j >= 0:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j -= 1
-            break
-
-        prev = old[i - 1]
-
-        while j >= 0 and prev < new[j] <= cur:
-            if new[j] == cur:
-                indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j -= 1
-
-        fill_count = 0
-        i -= 1
-        cur = prev
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new,
-                      limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef int32_t cur, prev
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
-        return indexer
-
-    i = nleft - 1
-    j = nright - 1
-
-    cur = old[nleft - 1]
-
-    while j >= 0 and new[j] > cur:
-        j -= 1
-
-    while True:
-        if j < 0:
-            break
-
-        if i == 0:
-            while j >= 0:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j -= 1
-            break
-
-        prev = old[i - 1]
-
-        while j >= 0 and prev < new[j] <= cur:
-            if new[j] == cur:
-                indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j -= 1
-
-        fill_count = 0
-        i -= 1
-        cur = prev
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new,
-                      limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef int64_t cur, prev
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
-        return indexer
-
-    i = nleft - 1
-    j = nright - 1
-
-    cur = old[nleft - 1]
-
-    while j >= 0 and new[j] > cur:
-        j -= 1
-
-    while True:
-        if j < 0:
-            break
-
-        if i == 0:
-            while j >= 0:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j -= 1
-            break
-
-        prev = old[i - 1]
-
-        while j >= 0 and prev < new[j] <= cur:
-            if new[j] == cur:
-                indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j -= 1
-
-        fill_count = 0
-        i -= 1
-        cur = prev
-
-    return indexer
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new,
-                      limit=None):
-    cdef Py_ssize_t i, j, nleft, nright
-    cdef ndarray[int64_t, ndim=1] indexer
-    cdef uint8_t cur, prev
-    cdef int lim, fill_count = 0
-
-    nleft = len(old)
-    nright = len(new)
-    indexer = np.empty(nright, dtype=np.int64)
-    indexer.fill(-1)
-
-    if limit is None:
-        lim = nright
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
-        return indexer
-
-    i = nleft - 1
-    j = nright - 1
-
-    cur = old[nleft - 1]
-
-    while j >= 0 and new[j] > cur:
-        j -= 1
-
-    while True:
-        if j < 0:
-            break
-
-        if i == 0:
-            while j >= 0:
-                if new[j] == cur:
-                    indexer[j] = i
-                elif new[j] < cur and fill_count < lim:
-                    indexer[j] = i
-                    fill_count += 1
-                j -= 1
-            break
-
-        prev = old[i - 1]
-
-        while j >= 0 and prev < new[j] <= cur:
-            if new[j] == cur:
-                indexer[j] = i
-            elif new[j] < cur and fill_count < lim:
-                indexer[j] = i
-                fill_count += 1
-            j -= 1
-
-        fill_count = 0
-        i -= 1
-        cur = prev
-
-    return indexer
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_float64(ndarray[float64_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef float64_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_float32(ndarray[float32_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef float32_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_object(ndarray[object] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef object val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_int32(ndarray[int32_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef int32_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_int64(ndarray[int64_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef int64_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_inplace_bool(ndarray[uint8_t] values,
-                         ndarray[uint8_t, cast=True] mask,
-                         limit=None):
-    cdef Py_ssize_t i, N
-    cdef uint8_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[0]
-    for i in range(N):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_inplace_float64(ndarray[float64_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef float64_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_inplace_float32(ndarray[float32_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef float32_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_inplace_object(ndarray[object] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef object val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_inplace_int32(ndarray[int32_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef int32_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_inplace_int64(ndarray[int64_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef int64_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_inplace_bool(ndarray[uint8_t] values,
-                              ndarray[uint8_t, cast=True] mask,
-                              limit=None):
-    cdef Py_ssize_t i, N
-    cdef uint8_t val
-    cdef int lim, fill_count = 0
-
-    N = len(values)
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    val = values[N - 1]
-    for i in range(N - 1, -1 , -1):
-        if mask[i]:
-            if fill_count >= lim:
-                continue
-            fill_count += 1
-            values[i] = val
-        else:
-            fill_count = 0
-            val = values[i]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef float64_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace_float32(ndarray[float32_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef float32_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace_object(ndarray[object, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef object val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef int32_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef int64_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,
-                            ndarray[uint8_t, ndim=2] mask,
-                            limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef uint8_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, 0]
-        for i in range(N):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef float64_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_2d_inplace_float32(ndarray[float32_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef float32_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_2d_inplace_object(ndarray[object, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef object val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef int32_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef int64_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values,
-                                 ndarray[uint8_t, ndim=2] mask,
-                                 limit=None):
-    cdef Py_ssize_t i, j, N, K
-    cdef uint8_t val
-    cdef int lim, fill_count = 0
-
-    K, N = (<object> values).shape
-
-    # GH 2778
-    if N == 0:
-        return
-
-    if limit is None:
-        lim = N
-    else:
-        if limit < 0:
-            raise ValueError('Limit must be non-negative')
-        lim = limit
-
-    for j in range(K):
-        fill_count = 0
-        val = values[j, N - 1]
-        for i in range(N - 1, -1 , -1):
-            if mask[j, i]:
-                if fill_count >= lim:
-                    continue
-                fill_count += 1
-                values[j, i] = val
-            else:
-                fill_count = 0
-                val = values[j, i]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def is_monotonic_float64(ndarray[float64_t] arr, bint timelike):
-    """
-    Returns
-    -------
-    is_monotonic_inc, is_monotonic_dec
-    """
-    cdef:
-        Py_ssize_t i, n
-        float64_t prev, cur
-        bint is_monotonic_inc = 1
-        bint is_monotonic_dec = 1
-
-    n = len(arr)
-
-    if n == 1:
-        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
-            # single value is NaN
-            return False, False
-        else:
-            return True, True
-    elif n < 2:
-        return True, True
-
-    if timelike and arr[0] == iNaT:
-        return False, False
-
-    with nogil:
-        prev = arr[0]
-        for i in range(1, n):
-            cur = arr[i]
-            if timelike and cur == iNaT:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if cur < prev:
-                is_monotonic_inc = 0
-            elif cur > prev:
-                is_monotonic_dec = 0
-            elif cur == prev:
-                pass # is_unique = 0
-            else:
-                # cur or prev is NaN
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if not is_monotonic_inc and not is_monotonic_dec:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            prev = cur
-    return is_monotonic_inc, is_monotonic_dec
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def is_monotonic_float32(ndarray[float32_t] arr, bint timelike):
-    """
-    Returns
-    -------
-    is_monotonic_inc, is_monotonic_dec
-    """
-    cdef:
-        Py_ssize_t i, n
-        float32_t prev, cur
-        bint is_monotonic_inc = 1
-        bint is_monotonic_dec = 1
-
-    n = len(arr)
-
-    if n == 1:
-        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
-            # single value is NaN
-            return False, False
-        else:
-            return True, True
-    elif n < 2:
-        return True, True
-
-    if timelike and arr[0] == iNaT:
-        return False, False
-
-    with nogil:
-        prev = arr[0]
-        for i in range(1, n):
-            cur = arr[i]
-            if timelike and cur == iNaT:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if cur < prev:
-                is_monotonic_inc = 0
-            elif cur > prev:
-                is_monotonic_dec = 0
-            elif cur == prev:
-                pass # is_unique = 0
-            else:
-                # cur or prev is NaN
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if not is_monotonic_inc and not is_monotonic_dec:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            prev = cur
-    return is_monotonic_inc, is_monotonic_dec
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def is_monotonic_object(ndarray[object] arr, bint timelike):
-    """
-    Returns
-    -------
-    is_monotonic_inc, is_monotonic_dec
-    """
-    cdef:
-        Py_ssize_t i, n
-        object prev, cur
-        bint is_monotonic_inc = 1
-        bint is_monotonic_dec = 1
-
-    n = len(arr)
-
-    if n == 1:
-        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
-            # single value is NaN
-            return False, False
-        else:
-            return True, True
-    elif n < 2:
-        return True, True
-
-    if timelike and arr[0] == iNaT:
-        return False, False
-
-    
-    prev = arr[0]
-    for i in range(1, n):
-        cur = arr[i]
-        if timelike and cur == iNaT:
-            is_monotonic_inc = 0
-            is_monotonic_dec = 0
-            break
-        if cur < prev:
-            is_monotonic_inc = 0
-        elif cur > prev:
-            is_monotonic_dec = 0
-        elif cur == prev:
-            pass # is_unique = 0
-        else:
-            # cur or prev is NaN
-            is_monotonic_inc = 0
-            is_monotonic_dec = 0
-            break
-        if not is_monotonic_inc and not is_monotonic_dec:
-            is_monotonic_inc = 0
-            is_monotonic_dec = 0
-            break
-        prev = cur
-    return is_monotonic_inc, is_monotonic_dec
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def is_monotonic_int32(ndarray[int32_t] arr, bint timelike):
-    """
-    Returns
-    -------
-    is_monotonic_inc, is_monotonic_dec
-    """
-    cdef:
-        Py_ssize_t i, n
-        int32_t prev, cur
-        bint is_monotonic_inc = 1
-        bint is_monotonic_dec = 1
-
-    n = len(arr)
-
-    if n == 1:
-        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
-            # single value is NaN
-            return False, False
-        else:
-            return True, True
-    elif n < 2:
-        return True, True
-
-    if timelike and arr[0] == iNaT:
-        return False, False
-
-    with nogil:
-        prev = arr[0]
-        for i in range(1, n):
-            cur = arr[i]
-            if timelike and cur == iNaT:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if cur < prev:
-                is_monotonic_inc = 0
-            elif cur > prev:
-                is_monotonic_dec = 0
-            elif cur == prev:
-                pass # is_unique = 0
-            else:
-                # cur or prev is NaN
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if not is_monotonic_inc and not is_monotonic_dec:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            prev = cur
-    return is_monotonic_inc, is_monotonic_dec
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def is_monotonic_int64(ndarray[int64_t] arr, bint timelike):
-    """
-    Returns
-    -------
-    is_monotonic_inc, is_monotonic_dec
-    """
-    cdef:
-        Py_ssize_t i, n
-        int64_t prev, cur
-        bint is_monotonic_inc = 1
-        bint is_monotonic_dec = 1
-
-    n = len(arr)
-
-    if n == 1:
-        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
-            # single value is NaN
-            return False, False
-        else:
-            return True, True
-    elif n < 2:
-        return True, True
-
-    if timelike and arr[0] == iNaT:
-        return False, False
-
-    with nogil:
-        prev = arr[0]
-        for i in range(1, n):
-            cur = arr[i]
-            if timelike and cur == iNaT:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if cur < prev:
-                is_monotonic_inc = 0
-            elif cur > prev:
-                is_monotonic_dec = 0
-            elif cur == prev:
-                pass # is_unique = 0
-            else:
-                # cur or prev is NaN
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if not is_monotonic_inc and not is_monotonic_dec:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            prev = cur
-    return is_monotonic_inc, is_monotonic_dec
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def is_monotonic_bool(ndarray[uint8_t] arr, bint timelike):
-    """
-    Returns
-    -------
-    is_monotonic_inc, is_monotonic_dec
-    """
-    cdef:
-        Py_ssize_t i, n
-        uint8_t prev, cur
-        bint is_monotonic_inc = 1
-        bint is_monotonic_dec = 1
-
-    n = len(arr)
-
-    if n == 1:
-        if arr[0] != arr[0] or (timelike and arr[0] == iNaT):
-            # single value is NaN
-            return False, False
-        else:
-            return True, True
-    elif n < 2:
-        return True, True
-
-    if timelike and arr[0] == iNaT:
-        return False, False
-
-    with nogil:
-        prev = arr[0]
-        for i in range(1, n):
-            cur = arr[i]
-            if timelike and cur == iNaT:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if cur < prev:
-                is_monotonic_inc = 0
-            elif cur > prev:
-                is_monotonic_dec = 0
-            elif cur == prev:
-                pass # is_unique = 0
-            else:
-                # cur or prev is NaN
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            if not is_monotonic_inc and not is_monotonic_dec:
-                is_monotonic_inc = 0
-                is_monotonic_dec = 0
-                break
-            prev = cur
-    return is_monotonic_inc, is_monotonic_dec
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_float64(ndarray[float64_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_float32(ndarray[float32_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_object(ndarray[object] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_int32(ndarray[int32_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_int64(ndarray[int64_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def groupby_bool(ndarray[uint8_t] index, ndarray labels):
-    cdef dict result = {}
-    cdef Py_ssize_t i, length
-    cdef list members
-    cdef object idx, key
-
-    length = len(index)
-
-    if not length == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    for i in range(length):
-        key = util.get_value_1d(labels, i)
-
-        if is_null_datetimelike(key):
-            continue
-
-        idx = index[i]
-        if key in result:
-            members = result[key]
-            members.append(idx)
-        else:
-            result[key] = [idx]
-
-    return result
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_float64(ndarray[float64_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_float32(ndarray[float32_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_object(ndarray[object] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_int32(ndarray[int32_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_int64(ndarray[int64_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def arrmap_bool(ndarray[uint8_t] index, object func):
-    cdef Py_ssize_t length = index.shape[0]
-    cdef Py_ssize_t i = 0
-
-    cdef ndarray[object] result = np.empty(length, dtype=np.object_)
-
-    from pandas.lib import maybe_convert_objects
-
-    for i in range(length):
-        result[i] = func(index[i])
-
-    return maybe_convert_objects(result)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_bool_bool_memview(uint8_t[:] values,
-                      int64_t[:] indexer,
-                      uint8_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        uint8_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_bool_bool(ndarray[uint8_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              uint8_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_bool_bool_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        uint8_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_bool_object_memview(uint8_t[:] values,
-                      int64_t[:] indexer,
-                      object[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        object fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            out[i] = fv
-        else:
-            out[i] = True if values[idx] > 0 else False
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_bool_object(ndarray[uint8_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              object[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_bool_object_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        object fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            out[i] = fv
-        else:
-            out[i] = True if values[idx] > 0 else False
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int8_int8_memview(int8_t[:] values,
-                      int64_t[:] indexer,
-                      int8_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int8_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            out[i] = fv
-        else:
-            out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int8_int8(ndarray[int8_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int8_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int8_int8_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int8_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            out[i] = fv
-        else:
-            out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int8_int32_memview(int8_t[:] values,
-                      int64_t[:] indexer,
-                      int32_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int32_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int8_int32(ndarray[int8_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int32_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int8_int32_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int32_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int8_int64_memview(int8_t[:] values,
-                      int64_t[:] indexer,
-                      int64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int8_int64(ndarray[int8_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int8_int64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int8_float64_memview(int8_t[:] values,
-                      int64_t[:] indexer,
-                      float64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int8_float64(ndarray[int8_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              float64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int8_float64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int16_int16_memview(int16_t[:] values,
-                      int64_t[:] indexer,
-                      int16_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int16_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int16_int16(ndarray[int16_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int16_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int16_int16_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int16_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int16_int32_memview(int16_t[:] values,
-                      int64_t[:] indexer,
-                      int32_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int32_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int16_int32(ndarray[int16_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int32_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int16_int32_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int32_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int16_int64_memview(int16_t[:] values,
-                      int64_t[:] indexer,
-                      int64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int16_int64(ndarray[int16_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int16_int64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int16_float64_memview(int16_t[:] values,
-                      int64_t[:] indexer,
-                      float64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int16_float64(ndarray[int16_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              float64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int16_float64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int32_int32_memview(int32_t[:] values,
-                      int64_t[:] indexer,
-                      int32_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int32_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int32_int32(ndarray[int32_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int32_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int32_int32_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int32_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int32_int64_memview(int32_t[:] values,
-                      int64_t[:] indexer,
-                      int64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int32_int64(ndarray[int32_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int32_int64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int32_float64_memview(int32_t[:] values,
-                      int64_t[:] indexer,
-                      float64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int32_float64(ndarray[int32_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              float64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int32_float64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int64_int64_memview(int64_t[:] values,
-                      int64_t[:] indexer,
-                      int64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        int64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int64_int64(ndarray[int64_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              int64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int64_int64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        int64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_int64_float64_memview(int64_t[:] values,
-                      int64_t[:] indexer,
-                      float64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_int64_float64(ndarray[int64_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              float64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_int64_float64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_float32_float32_memview(float32_t[:] values,
-                      int64_t[:] indexer,
-                      float32_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        float32_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_float32_float32(ndarray[float32_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              float32_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_float32_float32_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        float32_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_float32_float64_memview(float32_t[:] values,
-                      int64_t[:] indexer,
-                      float64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_float32_float64(ndarray[float32_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              float64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_float32_float64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_float64_float64_memview(float64_t[:] values,
-                      int64_t[:] indexer,
-                      float64_t[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_float64_float64(ndarray[float64_t, ndim=1] values,
-                              int64_t[:] indexer,
-                              float64_t[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_float64_float64_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        float64_t fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    with nogil:
-        for i from 0 <= i < n:
-            idx = indexer[i]
-            if idx == -1:
-                out[i] = fv
-            else:
-                out[i] = values[idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_1d_object_object_memview(object[:] values,
-                      int64_t[:] indexer,
-                      object[:] out,
-                      fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, n, idx
-        object fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            out[i] = fv
-        else:
-            out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_1d_object_object(ndarray[object, ndim=1] values,
-                              int64_t[:] indexer,
-                              object[:] out,
-                              fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_1d_object_object_memview(values, indexer, out,
-                                          fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, n, idx
-        object fv
-
-    n = indexer.shape[0]
-
-    fv = fill_value
-
-    
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            out[i] = fv
-        else:
-            out[i] = values[idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_bool_bool_memview(uint8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    uint8_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        uint8_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            uint8_t *v
-            uint8_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(uint8_t) and
-            sizeof(uint8_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(uint8_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_bool_bool(ndarray[uint8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    uint8_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_bool_bool_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        uint8_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            uint8_t *v
-            uint8_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(uint8_t) and
-            sizeof(uint8_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(uint8_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_bool_object_memview(uint8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    object[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        object fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            object *v
-            object *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(object) and
-            sizeof(object) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(object) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = True if values[idx, j] > 0 else False
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_bool_object(ndarray[uint8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    object[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_bool_object_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        object fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            object *v
-            object *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(object) and
-            sizeof(object) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(object) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = True if values[idx, j] > 0 else False
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int8_int8_memview(int8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int8_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int8_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            int8_t *v
-            int8_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int8_t) and
-            sizeof(int8_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int8_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int8_int8(ndarray[int8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int8_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int8_int8_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int8_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            int8_t *v
-            int8_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int8_t) and
-            sizeof(int8_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int8_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int8_int32_memview(int8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int32_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int32_t *v
-            int32_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int32_t) and
-            sizeof(int32_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int8_int32(ndarray[int8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int32_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int8_int32_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int32_t *v
-            int32_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int32_t) and
-            sizeof(int32_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int8_int64_memview(int8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int64_t *v
-            int64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int64_t) and
-            sizeof(int64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int8_int64(ndarray[int8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int8_int64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int64_t *v
-            int64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int64_t) and
-            sizeof(int64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int8_float64_memview(int8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int8_float64(ndarray[int8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int8_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int16_int16_memview(int16_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int16_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int16_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            int16_t *v
-            int16_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int16_t) and
-            sizeof(int16_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int16_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int16_int16(ndarray[int16_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int16_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int16_int16_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int16_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            int16_t *v
-            int16_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int16_t) and
-            sizeof(int16_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int16_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int16_int32_memview(int16_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int32_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int32_t *v
-            int32_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int32_t) and
-            sizeof(int32_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int16_int32(ndarray[int16_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int32_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int16_int32_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int32_t *v
-            int32_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int32_t) and
-            sizeof(int32_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int16_int64_memview(int16_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int64_t *v
-            int64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int64_t) and
-            sizeof(int64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int16_int64(ndarray[int16_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int16_int64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int64_t *v
-            int64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int64_t) and
-            sizeof(int64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int16_float64_memview(int16_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int16_float64(ndarray[int16_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int16_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int32_int32_memview(int32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int32_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            int32_t *v
-            int32_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int32_t) and
-            sizeof(int32_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int32_int32(ndarray[int32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int32_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int32_int32_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            int32_t *v
-            int32_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int32_t) and
-            sizeof(int32_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int32_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int32_int64_memview(int32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int64_t *v
-            int64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int64_t) and
-            sizeof(int64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int32_int64(ndarray[int32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int32_int64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            int64_t *v
-            int64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int64_t) and
-            sizeof(int64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int32_float64_memview(int32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int32_float64(ndarray[int32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int32_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int64_int64_memview(int64_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            int64_t *v
-            int64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int64_t) and
-            sizeof(int64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int64_int64(ndarray[int64_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int64_int64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            int64_t *v
-            int64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(int64_t) and
-            sizeof(int64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(int64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_int64_float64_memview(int64_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_int64_float64(ndarray[int64_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_int64_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_float32_float32_memview(float32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float32_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float32_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            float32_t *v
-            float32_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float32_t) and
-            sizeof(float32_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float32_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_float32_float32(ndarray[float32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float32_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_float32_float32_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float32_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            float32_t *v
-            float32_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float32_t) and
-            sizeof(float32_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float32_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_float32_float64_memview(float32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_float32_float64(ndarray[float32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_float32_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_float64_float64_memview(float64_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_float64_float64(ndarray[float64_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_float64_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF True:
-        cdef:
-            float64_t *v
-            float64_t *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(float64_t) and
-            sizeof(float64_t) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(float64_t) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis0_object_object_memview(object[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    object[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        object fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            object *v
-            object *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(object) and
-            sizeof(object) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(object) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis0_object_object(ndarray[object, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    object[:, :] out,
-                                    fill_value=np.nan):
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis0_object_object_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        object fv
-
-    n = len(indexer)
-    k = values.shape[1]
-
-    fv = fill_value
-
-    IF False:
-        cdef:
-            object *v
-            object *o
-
-        #GH3130
-        if (values.strides[1] == out.strides[1] and
-            values.strides[1] == sizeof(object) and
-            sizeof(object) * n >= 256):
-
-            for i from 0 <= i < n:
-                idx = indexer[i]
-                if idx == -1:
-                    for j from 0 <= j < k:
-                        out[i, j] = fv
-                else:
-                    v = &values[idx, 0]
-                    o = &out[i, 0]
-                    memmove(o, v, <size_t>(sizeof(object) * k))
-            return
-
-    for i from 0 <= i < n:
-        idx = indexer[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                out[i, j] = values[idx, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_bool_bool_memview(uint8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    uint8_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        uint8_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_bool_bool(ndarray[uint8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    uint8_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_bool_bool_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        uint8_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_bool_object_memview(uint8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    object[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        object fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = True if values[i, idx] > 0 else False
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_bool_object(ndarray[uint8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    object[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_bool_object_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        object fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = True if values[i, idx] > 0 else False
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int8_int8_memview(int8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int8_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int8_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int8_int8(ndarray[int8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int8_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int8_int8_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int8_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int8_int32_memview(int8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int32_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int8_int32(ndarray[int8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int32_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int8_int32_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int8_int64_memview(int8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int8_int64(ndarray[int8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int8_int64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int8_float64_memview(int8_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int8_float64(ndarray[int8_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int8_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int16_int16_memview(int16_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int16_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int16_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int16_int16(ndarray[int16_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int16_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int16_int16_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int16_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int16_int32_memview(int16_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int32_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int16_int32(ndarray[int16_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int32_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int16_int32_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int16_int64_memview(int16_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int16_int64(ndarray[int16_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int16_int64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int16_float64_memview(int16_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int16_float64(ndarray[int16_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int16_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int32_int32_memview(int32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int32_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int32_int32(ndarray[int32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int32_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int32_int32_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int32_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int32_int64_memview(int32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int32_int64(ndarray[int32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int32_int64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int32_float64_memview(int32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int32_float64(ndarray[int32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int32_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int64_int64_memview(int64_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    int64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int64_int64(ndarray[int64_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    int64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int64_int64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        int64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_int64_float64_memview(int64_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_int64_float64(ndarray[int64_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_int64_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_float32_float32_memview(float32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float32_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float32_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_float32_float32(ndarray[float32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float32_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_float32_float32_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float32_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_float32_float64_memview(float32_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_float32_float64(ndarray[float32_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_float32_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_float64_float64_memview(float64_t[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    float64_t[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_float64_float64(ndarray[float64_t, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    float64_t[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_float64_float64_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        float64_t fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-cdef inline take_2d_axis1_object_object_memview(object[:, :] values,
-                                                    int64_t[:] indexer,
-                                                    object[:, :] out,
-                                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        object fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_axis1_object_object(ndarray[object, ndim=2] values,
-                                    ndarray[int64_t] indexer,
-                                    object[:, :] out,
-                                    fill_value=np.nan):
-
-    if values.flags.writeable:
-        # We can call the memoryview version of the code
-        take_2d_axis1_object_object_memview(values, indexer, out,
-                                                fill_value=fill_value)
-        return
-
-    # We cannot use the memoryview version on readonly-buffers due to
-    # a limitation of Cython's typed memoryviews. Instead we can use
-    # the slightly slower Cython ndarray type directly.
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        object fv
-
-    n = len(values)
-    k = len(indexer)
-
-    if n == 0 or k == 0:
-        return
-
-    fv = fill_value
-
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            idx = indexer[j]
-            if idx == -1:
-                out[i, j] = fv
-            else:
-                out[i, j] = values[i, idx]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_bool_bool(ndarray[uint8_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[uint8_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        uint8_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_bool_object(ndarray[uint8_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[object, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        object fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = True if values[idx, idx1[j]] > 0 else False
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int8_int8(ndarray[int8_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int8_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int8_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int8_int32(ndarray[int8_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int32_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int32_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int8_int64(ndarray[int8_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int8_float64(ndarray[int8_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[float64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        float64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int16_int16(ndarray[int16_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int16_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int16_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int16_int32(ndarray[int16_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int32_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int32_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int16_int64(ndarray[int16_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int16_float64(ndarray[int16_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[float64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        float64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int32_int32(ndarray[int32_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int32_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int32_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int32_int64(ndarray[int32_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int32_float64(ndarray[int32_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[float64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        float64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int64_int64(ndarray[int64_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[int64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        int64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_int64_float64(ndarray[int64_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[float64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        float64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_float32_float32(ndarray[float32_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[float32_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        float32_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_float32_float64(ndarray[float32_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[float64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        float64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_float64_float64(ndarray[float64_t, ndim=2] values,
-                                    indexer,
-                                    ndarray[float64_t, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        float64_t fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def take_2d_multi_object_object(ndarray[object, ndim=2] values,
-                                    indexer,
-                                    ndarray[object, ndim=2] out,
-                                    fill_value=np.nan):
-    cdef:
-        Py_ssize_t i, j, k, n, idx
-        ndarray[int64_t] idx0 = indexer[0]
-        ndarray[int64_t] idx1 = indexer[1]
-        object fv
-
-    n = len(idx0)
-    k = len(idx1)
-
-    fv = fill_value
-    for i from 0 <= i < n:
-        idx = idx0[i]
-        if idx == -1:
-            for j from 0 <= j < k:
-                out[i, j] = fv
-        else:
-            for j from 0 <= j < k:
-                if idx1[j] == -1:
-                    out[i, j] = fv
-                else:
-                    out[i, j] = values[idx, idx1[j]]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def diff_2d_float64(ndarray[float64_t, ndim=2] arr,
-                     ndarray[float64_t, ndim=2] out,
-                    Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def diff_2d_float32(ndarray[float32_t, ndim=2] arr,
-                     ndarray[float32_t, ndim=2] out,
-                    Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def diff_2d_int8(ndarray[int8_t, ndim=2] arr,
-                     ndarray[float32_t, ndim=2] out,
-                    Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def diff_2d_int16(ndarray[int16_t, ndim=2] arr,
-                     ndarray[float32_t, ndim=2] out,
-                    Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def diff_2d_int32(ndarray[int32_t, ndim=2] arr,
-                     ndarray[float64_t, ndim=2] out,
-                    Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def diff_2d_int64(ndarray[int64_t, ndim=2] arr,
-                     ndarray[float64_t, ndim=2] out,
-                    Py_ssize_t periods, int axis):
-    cdef:
-        Py_ssize_t i, j, sx, sy
-
-    sx, sy = (<object> arr).shape
-    if arr.flags.f_contiguous:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for j in range(sy):
-                for i in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for j in range(start, stop):
-                for i in range(sx):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-    else:
-        if axis == 0:
-            if periods >= 0:
-                start, stop = periods, sx
-            else:
-                start, stop = 0, sx + periods
-            for i in range(start, stop):
-                for j in range(sy):
-                    out[i, j] = arr[i, j] - arr[i - periods, j]
-        else:
-            if periods >= 0:
-                start, stop = periods, sy
-            else:
-                start, stop = 0, sy + periods
-            for i in range(sx):
-                for j in range(start, stop):
-                    out[i, j] = arr[i, j] - arr[i, j - periods]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_add_float64(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, count
-        ndarray[float64_t, ndim=2] sumx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-
-    with nogil:
-
-        if K > 1:
-
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-
-        else:
-
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = sumx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_add_float32(ndarray[float32_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float32_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, count
-        ndarray[float32_t, ndim=2] sumx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-
-    with nogil:
-
-        if K > 1:
-
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-
-        else:
-
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = sumx[i, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_prod_float64(ndarray[float64_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float64_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, count
-        ndarray[float64_t, ndim=2] prodx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    prodx = np.ones_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        prodx[lab, j] *= val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    prodx[lab, 0] *= val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = prodx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_prod_float32(ndarray[float32_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float32_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, count
-        ndarray[float32_t, ndim=2] prodx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    prodx = np.ones_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        prodx[lab, j] *= val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    prodx[lab, 0] *= val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = prodx[i, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-@cython.cdivision(True)
-def group_var_float64(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, ct, oldmean
-        ndarray[float64_t, ndim=2] nobs, mean
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    mean = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    out[:, :] = 0.0
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    oldmean = mean[lab, j]
-                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
-                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
-
-        for i in range(ncounts):
-            for j in range(K):
-                ct = nobs[i, j]
-                if ct < 2:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] /= (ct - 1)
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-@cython.cdivision(True)
-def group_var_float32(ndarray[float32_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float32_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, ct, oldmean
-        ndarray[float32_t, ndim=2] nobs, mean
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    mean = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    out[:, :] = 0.0
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val:
-                    nobs[lab, j] += 1
-                    oldmean = mean[lab, j]
-                    mean[lab, j] += (val - oldmean) / nobs[lab, j]
-                    out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
-
-        for i in range(ncounts):
-            for j in range(K):
-                ct = nobs[i, j]
-                if ct < 2:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] /= (ct - 1)
-
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_mean_float64(ndarray[float64_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float64_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, count
-        ndarray[float64_t, ndim=2] sumx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-
-        for i in range(ncounts):
-            for j in range(K):
-                count = nobs[i, j]
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = sumx[i, j] / count
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_mean_float32(ndarray[float32_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float32_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, count
-        ndarray[float32_t, ndim=2] sumx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-    sumx = np.zeros_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-                    # not nan
-                    if val == val:
-                        nobs[lab, j] += 1
-                        sumx[lab, j] += val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-                # not nan
-                if val == val:
-                    nobs[lab, 0] += 1
-                    sumx[lab, 0] += val
-
-        for i in range(ncounts):
-            for j in range(K):
-                count = nobs[i, j]
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = sumx[i, j] / count
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_ohlc_float64(ndarray[float64_t, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[float64_t, ndim=2] values,
-                  ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float64_t val, count
-        Py_ssize_t ngroups = len(counts)
-
-    if len(labels) == 0:
-        return
-
-    N, K = (<object> values).shape
-
-    if out.shape[1] != 4:
-        raise ValueError('Output array must have 4 columns')
-
-    if K > 1:
-        raise NotImplementedError("Argument 'values' must have only "
-                                  "one dimension")
-    out.fill(np.nan)
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab == -1:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-            if val != val:
-                continue
-
-            if out[lab, 0] != out[lab, 0]:
-                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
-            else:
-                out[lab, 1] = max(out[lab, 1], val)
-                out[lab, 2] = min(out[lab, 2], val)
-                out[lab, 3] = val
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_ohlc_float32(ndarray[float32_t, ndim=2] out,
-                  ndarray[int64_t] counts,
-                  ndarray[float32_t, ndim=2] values,
-                  ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        float32_t val, count
-        Py_ssize_t ngroups = len(counts)
-
-    if len(labels) == 0:
-        return
-
-    N, K = (<object> values).shape
-
-    if out.shape[1] != 4:
-        raise ValueError('Output array must have 4 columns')
-
-    if K > 1:
-        raise NotImplementedError("Argument 'values' must have only "
-                                  "one dimension")
-    out.fill(np.nan)
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab == -1:
-                continue
-
-            counts[lab] += 1
-            val = values[i, 0]
-            if val != val:
-                continue
-
-            if out[lab, 0] != out[lab, 0]:
-                out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
-            else:
-                out[lab, 1] = max(out[lab, 1], val)
-                out[lab, 2] = min(out[lab, 2], val)
-                out[lab, 3] = val
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_last_float64(ndarray[float64_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float64_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, count
-        ndarray[float64_t, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val and val != NAN:
-                    nobs[lab, j] += 1
-                    resx[lab, j] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = resx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_last_float32(ndarray[float32_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[float32_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, count
-        ndarray[float32_t, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val and val != NAN:
-                    nobs[lab, j] += 1
-                    resx[lab, j] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = resx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_last_int64(ndarray[int64_t, ndim=2] out,
-               ndarray[int64_t] counts,
-               ndarray[int64_t, ndim=2] values,
-               ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        int64_t val, count
-        ndarray[int64_t, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val and val != iNaT:
-                    nobs[lab, j] += 1
-                    resx[lab, j] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = iNaT
-                else:
-                    out[i, j] = resx[i, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_nth_float64(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels, int64_t rank):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, count
-        ndarray[float64_t, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val and val != NAN:
-                    nobs[lab, j] += 1
-                    if nobs[lab, j] == rank:
-                        resx[lab, j] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = resx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_nth_float32(ndarray[float32_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float32_t, ndim=2] values,
-              ndarray[int64_t] labels, int64_t rank):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, count
-        ndarray[float32_t, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val and val != NAN:
-                    nobs[lab, j] += 1
-                    if nobs[lab, j] == rank:
-                        resx[lab, j] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = resx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_nth_int64(ndarray[int64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[int64_t, ndim=2] values,
-              ndarray[int64_t] labels, int64_t rank):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        int64_t val, count
-        ndarray[int64_t, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty_like(out)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                if val == val and val != iNaT:
-                    nobs[lab, j] += 1
-                    if nobs[lab, j] == rank:
-                        resx[lab, j] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = iNaT
-                else:
-                    out[i, j] = resx[i, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_min_float64(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, count
-        ndarray[float64_t, ndim=2] minx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-
-    minx = np.empty_like(out)
-    minx.fill(np.inf)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val and val != NAN:
-
-                        nobs[lab, j] += 1
-                        if val < minx[lab, j]:
-                            minx[lab, j] = val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val and val != NAN:
-                    nobs[lab, 0] += 1
-                    if val < minx[lab, 0]:
-                        minx[lab, 0] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = minx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_min_float32(ndarray[float32_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float32_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, count
-        ndarray[float32_t, ndim=2] minx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-
-    minx = np.empty_like(out)
-    minx.fill(np.inf)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val and val != NAN:
-
-                        nobs[lab, j] += 1
-                        if val < minx[lab, j]:
-                            minx[lab, j] = val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val and val != NAN:
-                    nobs[lab, 0] += 1
-                    if val < minx[lab, 0]:
-                        minx[lab, 0] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = minx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_min_int64(ndarray[int64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[int64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        int64_t val, count
-        ndarray[int64_t, ndim=2] minx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-
-    minx = np.empty_like(out)
-    minx.fill(9223372036854775807)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val and val != iNaT:
-
-                        nobs[lab, j] += 1
-                        if val < minx[lab, j]:
-                            minx[lab, j] = val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val and val != iNaT:
-                    nobs[lab, 0] += 1
-                    if val < minx[lab, 0]:
-                        minx[lab, 0] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = iNaT
-                else:
-                    out[i, j] = minx[i, j]
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_max_float64(ndarray[float64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float64_t val, count
-        ndarray[float64_t, ndim=2] maxx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-
-    maxx = np.empty_like(out)
-    maxx.fill(-np.inf)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val and val != NAN:
-                        nobs[lab, j] += 1
-                        if val > maxx[lab, j]:
-                            maxx[lab, j] = val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val and val != NAN:
-                    nobs[lab, 0] += 1
-                    if val > maxx[lab, 0]:
-                        maxx[lab, 0] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = maxx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_max_float32(ndarray[float32_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[float32_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        float32_t val, count
-        ndarray[float32_t, ndim=2] maxx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-
-    maxx = np.empty_like(out)
-    maxx.fill(-np.inf)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val and val != NAN:
-                        nobs[lab, j] += 1
-                        if val > maxx[lab, j]:
-                            maxx[lab, j] = val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val and val != NAN:
-                    nobs[lab, 0] += 1
-                    if val > maxx[lab, 0]:
-                        maxx[lab, 0] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = NAN
-                else:
-                    out[i, j] = maxx[i, j]
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def group_max_int64(ndarray[int64_t, ndim=2] out,
-              ndarray[int64_t] counts,
-              ndarray[int64_t, ndim=2] values,
-              ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        int64_t val, count
-        ndarray[int64_t, ndim=2] maxx, nobs
-
-    if not len(values) == len(labels):
-       raise AssertionError("len(index) != len(labels)")
-
-    nobs = np.zeros_like(out)
-
-    maxx = np.empty_like(out)
-    maxx.fill(-9223372036854775807)
-
-    N, K = (<object> values).shape
-
-    with nogil:
-        if K > 1:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                for j in range(K):
-                    val = values[i, j]
-
-                    # not nan
-                    if val == val and val != iNaT:
-                        nobs[lab, j] += 1
-                        if val > maxx[lab, j]:
-                            maxx[lab, j] = val
-        else:
-            for i in range(N):
-                lab = labels[i]
-                if lab < 0:
-                    continue
-
-                counts[lab] += 1
-                val = values[i, 0]
-
-                # not nan
-                if val == val and val != iNaT:
-                    nobs[lab, 0] += 1
-                    if val > maxx[lab, 0]:
-                        maxx[lab, 0] = val
-
-        for i in range(ncounts):
-            for j in range(K):
-                if nobs[i, j] == 0:
-                    out[i, j] = iNaT
-                else:
-                    out[i, j] = maxx[i, j]
-
-
-def group_median_float64(ndarray[float64_t, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[float64_t, ndim=2] values,
-                         ndarray[int64_t] labels):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, ngroups, size
-        ndarray[int64_t] _counts
-        ndarray data
-        float64_t* ptr
-    ngroups = len(counts)
-    N, K = (<object> values).shape
-
-    indexer, _counts = groupsort_indexer(labels, ngroups)
-    counts[:] = _counts[1:]
-
-    data = np.empty((K, N), dtype=np.float64)
-    ptr = <float64_t*> data.data
-
-    take_2d_axis1_float64_float64(values.T, indexer, out=data)
-
-    for i in range(K):
-        # exclude NA group
-        ptr += _counts[0]
-        for j in range(ngroups):
-            size = _counts[j + 1]
-            out[j, i] = _median_linear(ptr, size)
-            ptr += size
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_cumprod_float64(float64_t[:,:] out,
-                          float64_t[:,:] values,
-                          int64_t[:] labels,
-                          float64_t[:,:] accum):
-    """
-    Only transforms on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, size
-        float64_t val
-        int64_t lab
-
-    N, K = (<object> values).shape
-    accum = np.ones_like(accum)
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-
-            if lab < 0:
-                continue
-            for j in range(K):
-                val = values[i, j]
-                if val == val:
-                    accum[lab, j] *= val
-                    out[i, j] = accum[lab, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_cumsum(numeric[:,:] out,
-                 numeric[:,:] values,
-                 int64_t[:] labels,
-                 numeric[:,:] accum):
-    """
-    Only transforms on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, size
-        numeric val
-        int64_t lab
-
-    N, K = (<object> values).shape
-    accum = np.zeros_like(accum)
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-
-            if lab < 0:
-                continue
-            for j in range(K):
-                val = values[i,j]
-                if val == val:
-                    accum[lab,j] += val
-                    out[i,j] = accum[lab,j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
-                        int ngroups, int periods):
-    cdef:
-        Py_ssize_t N, i, j, ii
-        int offset, sign
-        int64_t lab, idxer, idxer_slot
-        int64_t[:] label_seen = np.zeros(ngroups, dtype=np.int64)
-        int64_t[:,:] label_indexer
-
-    N, = (<object> labels).shape
-
-    if periods < 0:
-        periods = -periods
-        offset = N - 1
-        sign = -1
-    elif periods > 0:
-        offset = 0
-        sign = 1
-
-    if periods == 0:
-        with nogil:
-            for i in range(N):
-                out[i] = i
-    else:
-        # array of each previous indexer seen
-        label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
-        with nogil:
-            for i in range(N):
-                ## reverse iterator if shifting backwards
-                ii = offset + sign * i
-                lab = labels[ii]
-                label_seen[lab] += 1
-
-                idxer_slot = label_seen[lab] % periods
-                idxer = label_indexer[lab, idxer_slot]
-
-                if label_seen[lab] > periods:
-                    out[ii] = idxer
-                else:
-                    out[ii] = -1
-
-                label_indexer[lab, idxer_slot] = ii
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_float64(ndarray[float64_t] left,
-                                      ndarray[float64_t] right):
-    cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        float64_t lval, rval
-
-    i = 0
-    j = 0
-    nleft = len(left)
-    nright = len(right)
-
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
-
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
-
-        rval = right[j]
-
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
-
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_float32(ndarray[float32_t] left,
-                                      ndarray[float32_t] right):
-    cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        float32_t lval, rval
-
-    i = 0
-    j = 0
-    nleft = len(left)
-    nright = len(right)
-
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
-
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
-
-        rval = right[j]
-
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
-
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_object(ndarray[object] left,
-                                      ndarray[object] right):
-    cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        object lval, rval
-
-    i = 0
-    j = 0
-    nleft = len(left)
-    nright = len(right)
-
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
-
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
-
-        rval = right[j]
-
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
-
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_int32(ndarray[int32_t] left,
-                                      ndarray[int32_t] right):
-    cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        int32_t lval, rval
-
-    i = 0
-    j = 0
-    nleft = len(left)
-    nright = len(right)
-
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
-
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
-
-        rval = right[j]
-
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
-
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def left_join_indexer_unique_int64(ndarray[int64_t] left,
-                                      ndarray[int64_t] right):
-    cdef:
-        Py_ssize_t i, j, nleft, nright
-        ndarray[int64_t] indexer
-        int64_t lval, rval
-
-    i = 0
-    j = 0
-    nleft = len(left)
-    nright = len(right)
-
-    indexer = np.empty(nleft, dtype=np.int64)
-    while True:
-        if i == nleft:
-            break
-
-        if j == nright:
-            indexer[i] = -1
-            i += 1
-            continue
-
-        rval = right[j]
-
-        while i < nleft - 1 and left[i] == rval:
-            indexer[i] = j
-            i += 1
-
-        if left[i] == right[j]:
-            indexer[i] = j
-            i += 1
-            while i < nleft - 1 and left[i] == rval:
-                indexer[i] = j
-                i += 1
-            j += 1
-        elif left[i] > rval:
-            indexer[i] = -1
-            j += 1
-        else:
-            indexer[i] = -1
-            i += 1
-    return indexer
-
-
-def left_join_indexer_float64(ndarray[float64_t] left,
-                               ndarray[float64_t] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        float64_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[float64_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float64)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    i += 1
-                    count += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = left[i]
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-def left_join_indexer_float32(ndarray[float32_t] left,
-                               ndarray[float32_t] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        float32_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[float32_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float32)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    i += 1
-                    count += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = left[i]
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-def left_join_indexer_object(ndarray[object] left,
-                               ndarray[object] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        object lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[object] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=object)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    i += 1
-                    count += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = left[i]
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-def left_join_indexer_int32(ndarray[int32_t] left,
-                               ndarray[int32_t] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        int32_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[int32_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int32)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    i += 1
-                    count += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = left[i]
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-def left_join_indexer_int64(ndarray[int64_t] left,
-                               ndarray[int64_t] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        int64_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[int64_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int64)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0:
-        while i < nleft:
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    i += 1
-                    count += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = left[i]
-                count += 1
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def outer_join_indexer_float64(ndarray[float64_t] left,
-                                ndarray[float64_t] right):
-    cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        float64_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[float64_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        count = nright
-    elif nright == 0:
-        count = nleft
-    else:
-        while True:
-            if i == nleft:
-                count += nright - j
-                break
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                count += 1
-                j += 1
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float64)
-
-    # do it again, but populate the indexers / result
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        for j in range(nright):
-            lindexer[j] = -1
-            rindexer[j] = j
-            result[j] = right[j]
-    elif nright == 0:
-        for i in range(nleft):
-            lindexer[i] = i
-            rindexer[i] = -1
-            result[i] = left[i]
-    else:
-        while True:
-            if i == nleft:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    count += 1
-                    j += 1
-                break
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    count += 1
-                    i += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
-                count += 1
-                i += 1
-            else:
-                lindexer[count] = -1
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                j += 1
-
-    return result, lindexer, rindexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def outer_join_indexer_float32(ndarray[float32_t] left,
-                                ndarray[float32_t] right):
-    cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        float32_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[float32_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        count = nright
-    elif nright == 0:
-        count = nleft
-    else:
-        while True:
-            if i == nleft:
-                count += nright - j
-                break
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                count += 1
-                j += 1
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float32)
-
-    # do it again, but populate the indexers / result
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        for j in range(nright):
-            lindexer[j] = -1
-            rindexer[j] = j
-            result[j] = right[j]
-    elif nright == 0:
-        for i in range(nleft):
-            lindexer[i] = i
-            rindexer[i] = -1
-            result[i] = left[i]
-    else:
-        while True:
-            if i == nleft:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    count += 1
-                    j += 1
-                break
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    count += 1
-                    i += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
-                count += 1
-                i += 1
-            else:
-                lindexer[count] = -1
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                j += 1
-
-    return result, lindexer, rindexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def outer_join_indexer_object(ndarray[object] left,
-                                ndarray[object] right):
-    cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        object lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[object] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        count = nright
-    elif nright == 0:
-        count = nleft
-    else:
-        while True:
-            if i == nleft:
-                count += nright - j
-                break
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                count += 1
-                j += 1
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=object)
-
-    # do it again, but populate the indexers / result
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        for j in range(nright):
-            lindexer[j] = -1
-            rindexer[j] = j
-            result[j] = right[j]
-    elif nright == 0:
-        for i in range(nleft):
-            lindexer[i] = i
-            rindexer[i] = -1
-            result[i] = left[i]
-    else:
-        while True:
-            if i == nleft:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    count += 1
-                    j += 1
-                break
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    count += 1
-                    i += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
-                count += 1
-                i += 1
-            else:
-                lindexer[count] = -1
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                j += 1
-
-    return result, lindexer, rindexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def outer_join_indexer_int32(ndarray[int32_t] left,
-                                ndarray[int32_t] right):
-    cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        int32_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[int32_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        count = nright
-    elif nright == 0:
-        count = nleft
-    else:
-        while True:
-            if i == nleft:
-                count += nright - j
-                break
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                count += 1
-                j += 1
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int32)
-
-    # do it again, but populate the indexers / result
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        for j in range(nright):
-            lindexer[j] = -1
-            rindexer[j] = j
-            result[j] = right[j]
-    elif nright == 0:
-        for i in range(nleft):
-            lindexer[i] = i
-            rindexer[i] = -1
-            result[i] = left[i]
-    else:
-        while True:
-            if i == nleft:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    count += 1
-                    j += 1
-                break
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    count += 1
-                    i += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
-                count += 1
-                i += 1
-            else:
-                lindexer[count] = -1
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                j += 1
-
-    return result, lindexer, rindexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def outer_join_indexer_int64(ndarray[int64_t] left,
-                                ndarray[int64_t] right):
-    cdef:
-        Py_ssize_t i, j, nright, nleft, count
-        int64_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[int64_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        count = nright
-    elif nright == 0:
-        count = nleft
-    else:
-        while True:
-            if i == nleft:
-                count += nright - j
-                break
-            if j == nright:
-                count += nleft - i
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                count += 1
-                i += 1
-            else:
-                count += 1
-                j += 1
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int64)
-
-    # do it again, but populate the indexers / result
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft == 0:
-        for j in range(nright):
-            lindexer[j] = -1
-            rindexer[j] = j
-            result[j] = right[j]
-    elif nright == 0:
-        for i in range(nleft):
-            lindexer[i] = i
-            rindexer[i] = -1
-            result[i] = left[i]
-    else:
-        while True:
-            if i == nleft:
-                while j < nright:
-                    lindexer[count] = -1
-                    rindexer[count] = j
-                    result[count] = right[j]
-                    count += 1
-                    j += 1
-                break
-            if j == nright:
-                while i < nleft:
-                    lindexer[count] = i
-                    rindexer[count] = -1
-                    result[count] = left[i]
-                    count += 1
-                    i += 1
-                break
-
-            lval = left[i]
-            rval = right[j]
-
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = lval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                lindexer[count] = i
-                rindexer[count] = -1
-                result[count] = lval
-                count += 1
-                i += 1
-            else:
-                lindexer[count] = -1
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                j += 1
-
-    return result, lindexer, rindexer
-
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def inner_join_indexer_float64(ndarray[float64_t] left,
-                              ndarray[float64_t] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        float64_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[float64_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float64)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def inner_join_indexer_float32(ndarray[float32_t] left,
-                              ndarray[float32_t] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        float32_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[float32_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.float32)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def inner_join_indexer_object(ndarray[object] left,
-                              ndarray[object] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        object lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[object] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=object)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def inner_join_indexer_int32(ndarray[int32_t] left,
-                              ndarray[int32_t] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        int32_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[int32_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int32)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-@cython.wraparound(False)
-@cython.boundscheck(False)
-def inner_join_indexer_int64(ndarray[int64_t] left,
-                              ndarray[int64_t] right):
-    """
-    Two-pass algorithm for monotonic indexes. Handles many-to-one merges
-    """
-    cdef:
-        Py_ssize_t i, j, k, nright, nleft, count
-        int64_t lval, rval
-        ndarray[int64_t] lindexer, rindexer
-        ndarray[int64_t] result
-
-    nleft = len(left)
-    nright = len(right)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    # do it again now that result size is known
-
-    lindexer = np.empty(count, dtype=np.int64)
-    rindexer = np.empty(count, dtype=np.int64)
-    result = np.empty(count, dtype=np.int64)
-
-    i = 0
-    j = 0
-    count = 0
-    if nleft > 0 and nright > 0:
-        while True:
-            if i == nleft:
-                break
-            if j == nright:
-                break
-
-            lval = left[i]
-            rval = right[j]
-            if lval == rval:
-                lindexer[count] = i
-                rindexer[count] = j
-                result[count] = rval
-                count += 1
-                if i < nleft - 1:
-                    if j < nright - 1 and right[j + 1] == rval:
-                        j += 1
-                    else:
-                        i += 1
-                        if left[i] != rval:
-                            j += 1
-                elif j < nright - 1:
-                    j += 1
-                    if lval != right[j]:
-                        i += 1
-                else:
-                    # end of the road
-                    break
-            elif lval < rval:
-                i += 1
-            else:
-                j += 1
-
-    return result, lindexer, rindexer
-
-
diff --git a/setup.py b/setup.py
index 937b3509cf493..86777f5579a09 100755
--- a/setup.py
+++ b/setup.py
@@ -90,11 +90,47 @@ def is_platform_mac():
 except ImportError:
     cython = False
 
+
+if cython:  # set to False above when the Cython import failed
+    try:
+        try:
+            from Cython import Tempita as tempita  # copy inside Cython
+        except ImportError:
+            import tempita  # otherwise the standalone package
+    except ImportError:
+        raise ImportError('Building pandas requires Tempita: '
+                          'pip install Tempita')
+
+
 from os.path import join as pjoin
 
 
+_pxipath = pjoin('pandas', 'src')  # directory holding the .pxi.in templates
+_pxifiles = ['algos_common_helper.pxi.in', 'algos_groupby_helper.pxi.in',
+             'algos_join_helper.pxi.in', 'algos_take_helper.pxi.in']
+
+
 class build_ext(_build_ext):
     def build_extensions(self):
+
+        for _pxifile in _pxifiles:
+            # build pxifiles first, template extension must be .pxi.in
+            assert _pxifile.endswith('.pxi.in')
+            pxifile = pjoin(_pxipath, _pxifile)
+            outfile = pxifile[:-3]
+
+            if (os.path.exists(outfile) and
+               os.stat(pxifile).st_mtime < os.stat(outfile).st_mtime):
+                # if .pxi.in is not updated, no need to output .pxi
+                continue
+
+            with open(pxifile, "r") as f:
+                tmpl = f.read()
+            pyxcontent = tempita.sub(tmpl)
+
+            with open(outfile, "w") as f:
+                f.write(pyxcontent)
+
         numpy_incl = pkg_resources.resource_filename('numpy', 'core/include')
 
         for ext in self.extensions: