Skip to content
Closed
38 changes: 9 additions & 29 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import decimal
import operator
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Literal,
Expand Down Expand Up @@ -35,10 +34,7 @@
TakeIndexer,
npt,
)
from pandas.util._decorators import (
doc,
set_module,
)
from pandas.util._decorators import set_module
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -654,28 +650,6 @@ def factorize_array(


@set_module("pandas")
@doc(
values=dedent(
"""\
values : sequence
A 1-D sequence. Sequences that aren't pandas objects are
coerced to ndarrays before factorization.
"""
),
sort=dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
size_hint=dedent(
"""\
size_hint : int, optional
Hint to the hashtable sizer.
"""
),
)
def factorize(
values,
sort: bool = False,
Expand All @@ -692,12 +666,18 @@ def factorize(

Parameters
----------
{values}{sort}
values : sequence
A 1-D sequence. Sequences that aren't pandas objects are
coerced to ndarrays before factorization.
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not drop the
NaN from the uniques of the values.
{size_hint}\
size_hint : int, optional
Hint to the hashtable sizer.

Returns
-------
Expand Down
129 changes: 116 additions & 13 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,24 +1277,127 @@ def _memory_usage(self, deep: bool = False) -> int:
v += lib.memory_usage_of_objects(values)
return v

@doc(
algorithms.factorize,
values="",
order="",
size_hint="",
sort=textwrap.dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
)
def factorize(
self,
sort: bool = False,
use_na_sentinel: bool = True,
) -> tuple[npt.NDArray[np.intp], Index]:
"""
Encode the object as an enumerated type or categorical variable.

This method is useful for obtaining a numeric representation of an
array when all that matters is identifying distinct values. `factorize`
is available as both a top-level function :func:`pandas.factorize`,
and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

Parameters
----------
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and will not drop the
NaN from the uniques of the values.

Returns
-------
codes : ndarray
An integer ndarray that's an indexer into `uniques`.
``uniques.take(codes)`` will have the same values as `values`.
uniques : ndarray, Index, or Categorical
The unique valid values. When `values` is Categorical, `uniques`
is a Categorical. When `values` is some other pandas object, an
`Index` is returned. Otherwise, a 1-D ndarray is returned.

.. note::

Even if there's a missing value in `values`, `uniques` will
*not* contain an entry for it.

See Also
--------
cut : Discretize continuous-valued array.
unique : Find the unique value in an array.

Notes
-----
Reference :ref:`the user guide <reshaping.factorize>` for more examples.

Examples
--------
These examples all show factorize as a top-level method like
``pd.factorize(values)``. The results are identical for methods like
:meth:`Series.factorize`.

>>> codes, uniques = pd.factorize(
... np.array(['b', 'b', 'a', 'c', 'b'], dtype="O")
... )
>>> codes
array([0, 0, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)

With ``sort=True``, the `uniques` will be sorted, and `codes` will be
shuffled so that the relationship is the maintained.

>>> codes, uniques = pd.factorize(
... np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"), sort=True
... )
>>> codes
array([1, 1, 0, 2, 1])
>>> uniques
array(['a', 'b', 'c'], dtype=object)

When ``use_na_sentinel=True`` (the default), missing values are indicated in
the `codes` with the sentinel value ``-1`` and missing values are not
included in `uniques`.

>>> codes, uniques = pd.factorize(
... np.array(['b', None, 'a', 'c', 'b'], dtype="O")
... )
>>> codes
array([ 0, -1, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)

Thus far, we've only factorized lists (which are internally coerced to
NumPy arrays). When factorizing pandas objects, the type of `uniques`
will differ. For Categoricals, a `Categorical` is returned.

>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
['a', 'c']
Categories (3, object): ['a', 'b', 'c']

Notice that ``'b'`` is in ``uniques.categories``, despite not being
present in ``cat.values``.

For all other pandas objects, an Index of the appropriate type is
returned.

>>> cat = pd.Series(['a', 'a', 'c'])
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
Index(['a', 'c'], dtype='object')

If NaN is in the values, and we want to include NaN in the uniques of the
values, it can be achieved by setting ``use_na_sentinel=False``.

>>> codes, uniques = pd.factorize(
... np.array(['b', None, 'a', 'c', 'b'], dtype="O"),
... use_na_sentinel=False,
... )
>>> codes
array([0, 1, 2, 3, 0])
>>> uniques
array(['b', None, 'a', 'c'], dtype=object)
"""
codes, uniques = algorithms.factorize(
self._values, sort=sort, use_na_sentinel=use_na_sentinel
)
Expand Down
Loading