Skip to content

REF: rename 'labels' to 'codes' in algorithms.safe_sort and ._factorize #29552

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 12, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 38 additions & 37 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,9 +448,11 @@ def isin(comps, values) -> np.ndarray:
return f(comps, values)


def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=None):
def _factorize_array(
values, na_sentinel: int = -1, size_hint=None, na_value=None
) -> Tuple[np.ndarray, np.ndarray]:
"""
Factorize an array-like to labels and uniques.
Factorize an array-like to codes and uniques.

This doesn't do any coercion of types or unboxing before factorization.

Expand All @@ -468,18 +470,16 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non

Returns
-------
labels : ndarray
codes : ndarray
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we necessarily know the dtype? e.g. could this be ndarray[int64]?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As it's implemented, codes will always be of type ndarray[int64]. I've got a suspicion that there are cases where we would like it to return arrays of fewer bits, but yes, as it is now, it always is an int64 array.

uniques : ndarray
"""
hash_klass, values = _get_data_algo(values)

table = hash_klass(size_hint or len(values))
uniques, labels = table.factorize(
values, na_sentinel=na_sentinel, na_value=na_value
)
uniques, codes = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value)

labels = ensure_platform_int(labels)
return labels, uniques
codes = ensure_platform_int(codes)
return codes, uniques


_shared_docs[
Expand Down Expand Up @@ -1924,51 +1924,52 @@ def diff(arr, n: int, axis: int = 0):
# this module.
def safe_sort(
values,
labels=None,
codes=None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A follow-up to add type annotations for this parameter would be great.

na_sentinel: int = -1,
assume_unique: bool = False,
verify: bool = True,
):
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
"""
Sort ``values`` and reorder corresponding ``labels``.
``values`` should be unique if ``labels`` is not None.
Sort ``values`` and reorder corresponding ``codes``.

``values`` should be unique if ``codes`` is not None.
Safe for use with mixed types (int, str), orders ints before strs.

Parameters
----------
values : list-like
Sequence; must be unique if ``labels`` is not None.
labels : list_like
Sequence; must be unique if ``codes`` is not None.
codes : list_like, optional
Indices to ``values``. All out of bound indices are treated as
"not found" and will be masked with ``na_sentinel``.
na_sentinel : int, default -1
Value in ``labels`` to mark "not found".
Ignored when ``labels`` is None.
Value in ``codes`` to mark "not found".
Ignored when ``codes`` is None.
assume_unique : bool, default False
When True, ``values`` are assumed to be unique, which can speed up
the calculation. Ignored when ``labels`` is None.
the calculation. Ignored when ``codes`` is None.
verify : bool, default True
Check if labels are out of bound for the values and put out of bound
labels equal to na_sentinel. If ``verify=False``, it is assumed there
are no out of bound labels. Ignored when ``labels`` is None.
Check if codes are out of bound for the values and put out of bound
codes equal to na_sentinel. If ``verify=False``, it is assumed there
are no out of bound codes. Ignored when ``codes`` is None.

.. versionadded:: 0.25.0

Returns
-------
ordered : ndarray
Sorted ``values``
new_labels : ndarray
Reordered ``labels``; returned when ``labels`` is not None.
new_codes : ndarray
Reordered ``codes``; returned when ``codes`` is not None.

Raises
------
TypeError
* If ``values`` is not list-like or if ``labels`` is neither None
* If ``values`` is not list-like or if ``codes`` is neither None
nor list-like
* If ``values`` cannot be sorted
ValueError
* If ``labels`` is not None and ``values`` contain duplicates.
* If ``codes`` is not None and ``values`` contain duplicates.
"""
if not is_list_like(values):
raise TypeError(
Expand Down Expand Up @@ -2002,22 +2003,22 @@ def sort_mixed(values):
# try this anyway
ordered = sort_mixed(values)

# labels:
# codes:

if labels is None:
if codes is None:
return ordered

if not is_list_like(labels):
if not is_list_like(codes):
raise TypeError(
"Only list-like objects or None are allowed to be"
"passed to safe_sort as labels"
"passed to safe_sort as codes"
)
labels = ensure_platform_int(np.asarray(labels))
codes = ensure_platform_int(np.asarray(codes))

from pandas import Index

if not assume_unique and not Index(values).is_unique:
raise ValueError("values should be unique if labels is not None")
raise ValueError("values should be unique if codes is not None")

if sorter is None:
# mixed types
Expand All @@ -2029,23 +2030,23 @@ def sort_mixed(values):
if na_sentinel == -1:
# take_1d is faster, but only works for na_sentinels of -1
order2 = sorter.argsort()
new_labels = take_1d(order2, labels, fill_value=-1)
new_codes = take_1d(order2, codes, fill_value=-1)
if verify:
mask = (labels < -len(values)) | (labels >= len(values))
mask = (codes < -len(values)) | (codes >= len(values))
else:
mask = None
else:
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
reverse_indexer.put(sorter, np.arange(len(sorter)))
# Out of bound indices will be masked with `na_sentinel` next, so we
# may deal with them here without performance loss using `mode='wrap'`
new_labels = reverse_indexer.take(labels, mode="wrap")
new_codes = reverse_indexer.take(codes, mode="wrap")

mask = labels == na_sentinel
mask = codes == na_sentinel
if verify:
mask = mask | (labels < -len(values)) | (labels >= len(values))
mask = mask | (codes < -len(values)) | (codes >= len(values))

if mask is not None:
np.putmask(new_labels, mask, na_sentinel)
np.putmask(new_codes, mask, na_sentinel)

return ordered, ensure_platform_int(new_labels)
return ordered, ensure_platform_int(new_codes)
72 changes: 36 additions & 36 deletions pandas/tests/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,27 +314,27 @@ def verify_order(df):


def test_decons():
def testit(label_list, shape):
group_index = get_group_index(label_list, shape, sort=True, xnull=True)
label_list2 = decons_group_index(group_index, shape)
def testit(codes_list, shape):
group_index = get_group_index(codes_list, shape, sort=True, xnull=True)
codes_list2 = decons_group_index(group_index, shape)

for a, b in zip(label_list, label_list2):
for a, b in zip(codes_list, codes_list2):
tm.assert_numpy_array_equal(a, b)

shape = (4, 5, 6)
label_list = [
codes_list = [
np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64),
np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64),
np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64),
]
testit(label_list, shape)
testit(codes_list, shape)

shape = (10000, 10000)
label_list = [
codes_list = [
np.tile(np.arange(10000, dtype=np.int64), 5),
np.tile(np.arange(10000, dtype=np.int64), 5),
]
testit(label_list, shape)
testit(codes_list, shape)


class TestSafeSort:
Expand All @@ -355,42 +355,42 @@ def test_basic_sort(self):
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize("verify", [True, False])
def test_labels(self, verify):
def test_codes(self, verify):
values = [3, 1, 2, 0, 4]
expected = np.array([0, 1, 2, 3, 4])

labels = [0, 1, 1, 2, 3, 0, -1, 4]
result, result_labels = safe_sort(values, labels, verify=verify)
expected_labels = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp)
codes = [0, 1, 1, 2, 3, 0, -1, 4]
result, result_codes = safe_sort(values, codes, verify=verify)
expected_codes = np.array([3, 1, 1, 2, 0, 3, -1, 4], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result_labels, expected_labels)
tm.assert_numpy_array_equal(result_codes, expected_codes)

# na_sentinel
labels = [0, 1, 1, 2, 3, 0, 99, 4]
result, result_labels = safe_sort(values, labels, na_sentinel=99, verify=verify)
expected_labels = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp)
codes = [0, 1, 1, 2, 3, 0, 99, 4]
result, result_codes = safe_sort(values, codes, na_sentinel=99, verify=verify)
expected_codes = np.array([3, 1, 1, 2, 0, 3, 99, 4], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result_labels, expected_labels)
tm.assert_numpy_array_equal(result_codes, expected_codes)

labels = []
result, result_labels = safe_sort(values, labels, verify=verify)
expected_labels = np.array([], dtype=np.intp)
codes = []
result, result_codes = safe_sort(values, codes, verify=verify)
expected_codes = np.array([], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result_labels, expected_labels)
tm.assert_numpy_array_equal(result_codes, expected_codes)

@pytest.mark.parametrize("na_sentinel", [-1, 99])
def test_labels_out_of_bound(self, na_sentinel):
def test_codes_out_of_bound(self, na_sentinel):
values = [3, 1, 2, 0, 4]
expected = np.array([0, 1, 2, 3, 4])

# out of bound indices
labels = [0, 101, 102, 2, 3, 0, 99, 4]
result, result_labels = safe_sort(values, labels, na_sentinel=na_sentinel)
expected_labels = np.array(
codes = [0, 101, 102, 2, 3, 0, 99, 4]
result, result_codes = safe_sort(values, codes, na_sentinel=na_sentinel)
expected_codes = np.array(
[3, na_sentinel, na_sentinel, 2, 0, 3, na_sentinel, 4], dtype=np.intp
)
tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result_labels, expected_labels)
tm.assert_numpy_array_equal(result_codes, expected_codes)

def test_mixed_integer(self):
values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object)
Expand All @@ -399,12 +399,12 @@ def test_mixed_integer(self):
tm.assert_numpy_array_equal(result, expected)

values = np.array(["b", 1, 0, "a"], dtype=object)
labels = [0, 1, 2, 3, 0, -1, 1]
result, result_labels = safe_sort(values, labels)
codes = [0, 1, 2, 3, 0, -1, 1]
result, result_codes = safe_sort(values, codes)
expected = np.array([0, 1, "a", "b"], dtype=object)
expected_labels = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
tm.assert_numpy_array_equal(result_labels, expected_labels)
tm.assert_numpy_array_equal(result_codes, expected_codes)

def test_mixed_integer_from_list(self):
values = ["b", 1, 0, "a", 0, "b"]
Expand All @@ -428,10 +428,10 @@ def test_exceptions(self):
safe_sort(values=1)

with pytest.raises(TypeError, match="Only list-like objects or None"):
safe_sort(values=[0, 1, 2], labels=1)
safe_sort(values=[0, 1, 2], codes=1)

with pytest.raises(ValueError, match="values should be unique"):
safe_sort(values=[0, 1, 2, 1], labels=[0, 1])
safe_sort(values=[0, 1, 2, 1], codes=[0, 1])

def test_extension_array(self):
# a = array([1, 3, np.nan, 2], dtype='Int64')
Expand All @@ -443,12 +443,12 @@ def test_extension_array(self):

@pytest.mark.parametrize("verify", [True, False])
@pytest.mark.parametrize("na_sentinel", [-1, 99])
def test_extension_array_labels(self, verify, na_sentinel):
def test_extension_array_codes(self, verify, na_sentinel):
a = array([1, 3, 2], dtype="Int64")
result, labels = safe_sort(
result, codes = safe_sort(
a, [0, 1, na_sentinel, 2], na_sentinel=na_sentinel, verify=verify
)
expected_values = array([1, 2, 3], dtype="Int64")
expected_labels = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
expected_codes = np.array([0, 2, na_sentinel, 1], dtype=np.intp)
tm.assert_extension_array_equal(result, expected_values)
tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)