Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 18 additions & 33 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import decimal
import operator
from textwrap import dedent
from typing import (
TYPE_CHECKING,
Literal,
Expand Down Expand Up @@ -35,10 +34,7 @@
TakeIndexer,
npt,
)
from pandas.util._decorators import (
doc,
set_module,
)
from pandas.util._decorators import set_module
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -654,28 +650,6 @@ def factorize_array(


@set_module("pandas")
@doc(
values=dedent(
"""\
values : sequence
A 1-D sequence. Sequences that aren't pandas objects are
coerced to ndarrays before factorization.
"""
),
sort=dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
size_hint=dedent(
"""\
size_hint : int, optional
Hint to the hashtable sizer.
"""
),
)
def factorize(
values,
sort: bool = False,
Expand All @@ -692,12 +666,18 @@ def factorize(

Parameters
----------
{values}{sort}
values : sequence
A 1-D sequence. Sequences that aren't pandas objects are
coerced to ndarrays before factorization.
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and NaN will be
kept in the uniques of the values.
{size_hint}\
size_hint : int, optional
Hint to the hashtable sizer.

Returns
-------
Expand Down Expand Up @@ -729,7 +709,9 @@ def factorize(
``pd.factorize(values)``. The results are identical for methods like
:meth:`Series.factorize`.

>>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"))
>>> codes, uniques = pd.factorize(
... np.array(['b', 'b', 'a', 'c', 'b'], dtype="O")
... )
>>> codes
array([0, 0, 1, 2, 0])
>>> uniques
Expand All @@ -738,8 +720,9 @@ def factorize(
With ``sort=True``, the `uniques` will be sorted, and `codes` will be
shuffled so that the relationship is maintained.

>>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
... sort=True)
>>> codes, uniques = pd.factorize(
... np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"), sort=True
... )
>>> codes
array([1, 1, 0, 2, 1])
>>> uniques
Expand All @@ -749,7 +732,9 @@ def factorize(
the `codes` with the sentinel value ``-1`` and missing values are not
included in `uniques`.

>>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
>>> codes, uniques = pd.factorize(
... np.array(['b', None, 'a', 'c', 'b'], dtype="O")
... )
>>> codes
array([ 0, -1, 1, 2, 0])
>>> uniques
Expand Down
129 changes: 116 additions & 13 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,24 +1277,127 @@ def _memory_usage(self, deep: bool = False) -> int:
v += lib.memory_usage_of_objects(values)
return v

@doc(
algorithms.factorize,
values="",
order="",
size_hint="",
sort=textwrap.dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
)
def factorize(
self,
sort: bool = False,
use_na_sentinel: bool = True,
) -> tuple[npt.NDArray[np.intp], Index]:
"""
Encode the object as an enumerated type or categorical variable.

This method is useful for obtaining a numeric representation of an
array when all that matters is identifying distinct values. `factorize`
is available as both a top-level function :func:`pandas.factorize`,
and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

Parameters
----------
sort : bool, default False
Sort `uniques` and shuffle `codes` to maintain the
relationship.
use_na_sentinel : bool, default True
If True, the sentinel -1 will be used for NaN values. If False,
NaN values will be encoded as non-negative integers and NaN will be
kept in the uniques of the values.

Returns
-------
codes : ndarray
An integer ndarray that's an indexer into `uniques`.
``uniques.take(codes)`` will have the same values as `values`.
uniques : ndarray, Index, or Categorical
The unique valid values. When `values` is Categorical, `uniques`
is a Categorical. When `values` is some other pandas object, an
`Index` is returned. Otherwise, a 1-D ndarray is returned.

.. note::

Even if there's a missing value in `values`, `uniques` will
*not* contain an entry for it.

See Also
--------
cut : Discretize continuous-valued array.
unique : Find the unique values in an array.

Notes
-----
Reference :ref:`the user guide <reshaping.factorize>` for more examples.

Examples
--------
These examples all show factorize as a top-level function like
``pd.factorize(values)``. The results are identical for methods like
:meth:`Series.factorize`.

>>> codes, uniques = pd.factorize(
... np.array(['b', 'b', 'a', 'c', 'b'], dtype="O")
... )
>>> codes
array([0, 0, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)

With ``sort=True``, the `uniques` will be sorted, and `codes` will be
shuffled so that the relationship is maintained.

>>> codes, uniques = pd.factorize(
... np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"), sort=True
... )
>>> codes
array([1, 1, 0, 2, 1])
>>> uniques
array(['a', 'b', 'c'], dtype=object)

When ``use_na_sentinel=True`` (the default), missing values are indicated in
the `codes` with the sentinel value ``-1`` and missing values are not
included in `uniques`.

>>> codes, uniques = pd.factorize(
... np.array(['b', None, 'a', 'c', 'b'], dtype="O")
... )
>>> codes
array([ 0, -1, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)

Thus far, we've only factorized NumPy arrays (other non-pandas sequences,
such as lists, are internally coerced to NumPy arrays). When factorizing
pandas objects, the type of `uniques` will differ. For Categoricals, a
`Categorical` is returned.

>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
['a', 'c']
Categories (3, object): ['a', 'b', 'c']

Notice that ``'b'`` is in ``uniques.categories``, despite not being
present in ``cat.values``.

For all other pandas objects, an Index of the appropriate type is
returned.

>>> cat = pd.Series(['a', 'a', 'c'])
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
Index(['a', 'c'], dtype='object')

If NaN is in the values, and we want to include NaN in the uniques of the
values, it can be achieved by setting ``use_na_sentinel=False``.

>>> codes, uniques = pd.factorize(
... np.array(['b', None, 'a', 'c', 'b'], dtype="O"),
... use_na_sentinel=False,
... )
>>> codes
array([0, 1, 2, 3, 0])
>>> uniques
array(['b', None, 'a', 'c'], dtype=object)
"""
codes, uniques = algorithms.factorize(
self._values, sort=sort, use_na_sentinel=use_na_sentinel
)
Expand Down
Loading