@@ -1277,22 +1277,120 @@ def _memory_usage(self, deep: bool = False) -> int:
12771277 v += lib .memory_usage_of_objects (values )
12781278 return v
12791279
1280- @doc (
1281- algorithms .factorize ,
1282- order = "" ,
1283- sort = textwrap .dedent (
1284- """\
1285- sort : bool, default False
1286- Sort `uniques` and shuffle `codes` to maintain the
1287- relationship.
1288- """
1289- ),
1290- )
12911280 def factorize (
12921281 self ,
12931282 sort : bool = False ,
12941283 use_na_sentinel : bool = True ,
12951284 ) -> tuple [npt .NDArray [np .intp ], Index ]:
1285+ """
1286+ Encode the object as an enumerated type or categorical variable.
1287+
1288+ This method is useful for obtaining a numeric representation of an
1289+ array when all that matters is identifying distinct values. `factorize`
1290+ is available as both a top-level function :func:`pandas.factorize`,
1291+ and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
1292+
1293+ Parameters
1294+ ----------
1295+ sort : bool, default False
1296+ Sort `uniques` and shuffle `codes` to maintain the
1297+ relationship.
1298+ use_na_sentinel : bool, default True
1299+ If True, the sentinel -1 will be used for NaN values. If False,
1300+ NaN values will be encoded as non-negative integers and NaN will be
1301+ kept in the uniques of the values.
1302+
1303+ Returns
1304+ -------
1305+ codes : ndarray
1306+ An integer ndarray that's an indexer into `uniques`.
1307+ ``uniques.take(codes)`` will have the same values as `values`.
1308+ uniques : ndarray, Index, or Categorical
1309+ The unique valid values. When `values` is Categorical, `uniques`
1310+ is a Categorical. When `values` is some other pandas object, an
1311+ `Index` is returned. Otherwise, a 1-D ndarray is returned.
1312+
1313+ .. note::
1314+
1315+ Even if there's a missing value in `values`, `uniques` will
1316+ *not* contain an entry for it unless ``use_na_sentinel=False``.
1317+
1318+ See Also
1319+ --------
1320+ cut : Discretize continuous-valued array.
1321+ unique : Find the unique values in an array.
1322+
1323+ Notes
1324+ -----
1325+ Reference :ref:`the user guide <reshaping.factorize>` for more examples.
1326+
1327+ Examples
1328+ --------
1329+ These examples all show factorize as a top-level method like
1330+ ``pd.factorize(values)``. The results are identical for methods like
1331+ :meth:`Series.factorize`.
1332+
1333+ >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"))
1334+ >>> codes
1335+ array([0, 0, 1, 2, 0])
1336+ >>> uniques
1337+ array(['b', 'a', 'c'], dtype=object)
1338+
1339+ With ``sort=True``, the `uniques` will be sorted, and `codes` will be
1340+ shuffled so that the relationship is maintained.
1341+
1342+ >>> codes, uniques = pd.factorize(np.array(['b', 'b', 'a', 'c', 'b'], dtype="O"),
1343+ ... sort=True)
1344+ >>> codes
1345+ array([1, 1, 0, 2, 1])
1346+ >>> uniques
1347+ array(['a', 'b', 'c'], dtype=object)
1348+
1349+ When ``use_na_sentinel=True`` (the default), missing values are indicated in
1350+ the `codes` with the sentinel value ``-1`` and missing values are not
1351+ included in `uniques`.
1352+
1353+ >>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"))
1354+ >>> codes
1355+ array([ 0, -1, 1, 2, 0])
1356+ >>> uniques
1357+ array(['b', 'a', 'c'], dtype=object)
1358+
1359+ Thus far, we've only factorized lists (which are internally coerced to
1360+ NumPy arrays). When factorizing pandas objects, the type of `uniques`
1361+ will differ. For Categoricals, a `Categorical` is returned.
1362+
1363+ >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
1364+ >>> codes, uniques = pd.factorize(cat)
1365+ >>> codes
1366+ array([0, 0, 1])
1367+ >>> uniques
1368+ ['a', 'c']
1369+ Categories (3, object): ['a', 'b', 'c']
1370+
1371+ Notice that ``'b'`` is in ``uniques.categories``, despite not being
1372+ present in ``cat.values``.
1373+
1374+ For all other pandas objects, an Index of the appropriate type is
1375+ returned.
1376+
1377+ >>> cat = pd.Series(['a', 'a', 'c'])
1378+ >>> codes, uniques = pd.factorize(cat)
1379+ >>> codes
1380+ array([0, 0, 1])
1381+ >>> uniques
1382+ Index(['a', 'c'], dtype='object')
1383+
1384+ If NaN is in the values, and we want to include NaN in the uniques of the
1385+ values, it can be achieved by setting ``use_na_sentinel=False``.
1386+
1387+ >>> codes, uniques = pd.factorize(np.array(['b', None, 'a', 'c', 'b'], dtype="O"),
1388+ ... use_na_sentinel=False)
1389+ >>> codes
1390+ array([0, 1, 2, 3, 0])
1391+ >>> uniques
1392+ array(['b', None, 'a', 'c'], dtype=object)
1393+ """
12961394 codes , uniques = algorithms .factorize (
12971395 self ._values , sort = sort , use_na_sentinel = use_na_sentinel
12981396 )
0 commit comments