|
30 | 30 | from pandas.core.dtypes.missing import isnull
|
31 | 31 |
|
32 | 32 | from pandas.core import common as com
|
33 |
| -from pandas.compat import string_types |
34 | 33 | from pandas._libs import algos, lib, hashtable as htable
|
35 | 34 | from pandas._libs.tslib import iNaT
|
36 | 35 |
|
@@ -431,104 +430,6 @@ def isin(comps, values):
|
431 | 430 | return f(comps, values)
|
432 | 431 |
|
433 | 432 |
|
434 |
| -def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False): |
435 |
| - """ |
436 |
| - Sort ``values`` and reorder corresponding ``labels``. |
437 |
| - ``values`` should be unique if ``labels`` is not None. |
438 |
| - Safe for use with mixed types (int, str), orders ints before strs. |
439 |
| -
|
440 |
| - .. versionadded:: 0.19.0 |
441 |
| -
|
442 |
| - Parameters |
443 |
| - ---------- |
444 |
| - values : list-like |
445 |
| - Sequence; must be unique if ``labels`` is not None. |
446 |
| - labels : list_like |
447 |
| - Indices to ``values``. All out of bound indices are treated as |
448 |
| - "not found" and will be masked with ``na_sentinel``. |
449 |
| - na_sentinel : int, default -1 |
450 |
| - Value in ``labels`` to mark "not found". |
451 |
| - Ignored when ``labels`` is None. |
452 |
| - assume_unique : bool, default False |
453 |
| - When True, ``values`` are assumed to be unique, which can speed up |
454 |
| - the calculation. Ignored when ``labels`` is None. |
455 |
| -
|
456 |
| - Returns |
457 |
| - ------- |
458 |
| - ordered : ndarray |
459 |
| - Sorted ``values`` |
460 |
| - new_labels : ndarray |
461 |
| - Reordered ``labels``; returned when ``labels`` is not None. |
462 |
| -
|
463 |
| - Raises |
464 |
| - ------ |
465 |
| - TypeError |
466 |
| - * If ``values`` is not list-like or if ``labels`` is neither None |
467 |
| - nor list-like |
468 |
| - * If ``values`` cannot be sorted |
469 |
| - ValueError |
470 |
| - * If ``labels`` is not None and ``values`` contain duplicates. |
471 |
| - """ |
472 |
| - if not is_list_like(values): |
473 |
| - raise TypeError("Only list-like objects are allowed to be passed to" |
474 |
| - "safe_sort as values") |
475 |
| - values = np.asarray(values) |
476 |
| - |
477 |
| - def sort_mixed(values): |
478 |
| - # order ints before strings, safe in py3 |
479 |
| - str_pos = np.array([isinstance(x, string_types) for x in values], |
480 |
| - dtype=bool) |
481 |
| - nums = np.sort(values[~str_pos]) |
482 |
| - strs = np.sort(values[str_pos]) |
483 |
| - return _ensure_object(np.concatenate([nums, strs])) |
484 |
| - |
485 |
| - sorter = None |
486 |
| - if compat.PY3 and lib.infer_dtype(values) == 'mixed-integer': |
487 |
| - # unorderable in py3 if mixed str/int |
488 |
| - ordered = sort_mixed(values) |
489 |
| - else: |
490 |
| - try: |
491 |
| - sorter = values.argsort() |
492 |
| - ordered = values.take(sorter) |
493 |
| - except TypeError: |
494 |
| - # try this anyway |
495 |
| - ordered = sort_mixed(values) |
496 |
| - |
497 |
| - # labels: |
498 |
| - |
499 |
| - if labels is None: |
500 |
| - return ordered |
501 |
| - |
502 |
| - if not is_list_like(labels): |
503 |
| - raise TypeError("Only list-like objects or None are allowed to be" |
504 |
| - "passed to safe_sort as labels") |
505 |
| - labels = _ensure_platform_int(np.asarray(labels)) |
506 |
| - |
507 |
| - from pandas import Index |
508 |
| - if not assume_unique and not Index(values).is_unique: |
509 |
| - raise ValueError("values should be unique if labels is not None") |
510 |
| - |
511 |
| - if sorter is None: |
512 |
| - # mixed types |
513 |
| - (hash_klass, _), values = _get_data_algo(values, _hashtables) |
514 |
| - t = hash_klass(len(values)) |
515 |
| - t.map_locations(values) |
516 |
| - sorter = _ensure_platform_int(t.lookup(ordered)) |
517 |
| - |
518 |
| - reverse_indexer = np.empty(len(sorter), dtype=np.int_) |
519 |
| - reverse_indexer.put(sorter, np.arange(len(sorter))) |
520 |
| - |
521 |
| - mask = (labels < -len(values)) | (labels >= len(values)) | \ |
522 |
| - (labels == na_sentinel) |
523 |
| - |
524 |
| - # (Out of bound indices will be masked with `na_sentinel` next, so we may |
525 |
| - # deal with them here without performance loss using `mode='wrap'`.) |
526 |
| - new_labels = reverse_indexer.take(labels, mode='wrap') |
527 |
| - np.putmask(new_labels, mask, na_sentinel) |
528 |
| - |
529 |
| - return ordered, _ensure_platform_int(new_labels) |
530 |
| - |
531 |
| - |
532 | 433 | def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
|
533 | 434 | """
|
534 | 435 | Encode input values as an enumerated type or categorical variable
|
@@ -568,6 +469,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
|
568 | 469 | uniques = uniques.to_array()
|
569 | 470 |
|
570 | 471 | if sort and len(uniques) > 0:
|
| 472 | + from pandas.core.sorting import safe_sort |
571 | 473 | uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
|
572 | 474 | assume_unique=True)
|
573 | 475 |
|
|
0 commit comments