{ "cells": [ { "cell_type": "markdown", "id": "81ad838d", "metadata": {}, "source": [ "# Indexing\n", "\n", "## Index objects\n", "\n", "The index objects of pandas are responsible for the axis labels and other metadata, such as the axis name. Any array or other sequence of labels you use when constructing a series or DataFrame is internally converted into an index:" ] }, { "cell_type": "code", "execution_count": 1, "id": "cb4b1c29", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "\n", "obj = pd.Series(range(7), index=pd.date_range(\"2022-02-02\", periods=7))" ] }, { "cell_type": "code", "execution_count": 2, "id": "e274cd7d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2022-02-02', '2022-02-03', '2022-02-04', '2022-02-05',\n", " '2022-02-06', '2022-02-07', '2022-02-08'],\n", " dtype='datetime64[ns]', freq='D')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.index" ] }, { "cell_type": "code", "execution_count": 3, "id": "e676416a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatetimeIndex(['2022-02-05', '2022-02-06', '2022-02-07', '2022-02-08'], dtype='datetime64[ns]', freq='D')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.index[3:]" ] }, { "cell_type": "markdown", "id": "dcea073a", "metadata": {}, "source": [ "Index objects are immutable and therefore cannot be changed by the user:" ] }, { "cell_type": "code", "execution_count": 4, "id": "8aae53cd", "metadata": { "scrolled": true }, "outputs": [ { "ename": "TypeError", "evalue": "Index does not support mutable operations", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m]\u001b[49m \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2022-02-03\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", "File \u001b[0;32m~/sandbox/py313/.venv/lib/python3.13/site-packages/pandas/core/indexes/base.py:5371\u001b[0m, in \u001b[0;36mIndex.__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 5369\u001b[0m \u001b[38;5;129m@final\u001b[39m\n\u001b[1;32m 5370\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__setitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key, value) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 5371\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIndex does not support mutable operations\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", "\u001b[0;31mTypeError\u001b[0m: Index does not support mutable operations" ] } ], "source": [ "obj.index[1] = \"2022-02-03\"" ] }, { "cell_type": "markdown", "id": "a4c5edd0", "metadata": {}, "source": [ "Immutability makes the sharing of index objects in data structures more secure:" ] }, { "cell_type": "code", "execution_count": 5, "id": "94fffc30", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index([0, 1, 2], dtype='int64')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "\n", "\n", "labels = pd.Index(np.arange(3))\n", "\n", "labels" ] }, { "cell_type": "code", "execution_count": 6, "id": "6ac851bc", "metadata": {}, "outputs": [], "source": [ "rng = np.random.default_rng()\n", "obj2 = pd.Series(rng.normal(size=3),index=labels)" ] }, { "cell_type": "code", "execution_count": 7, "id": "754cf115", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.911343\n", "1 -0.175944\n", "2 -1.311947\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2" ] }, { "cell_type": "code", "execution_count": 8, "id": "f4944006", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj2.index is labels" ] }, { "cell_type": "markdown", "id": "63e9fc6c", "metadata": {}, "source": [ "To be similar to an array, an index also behaves like a fixed-size set:" ] }, { "cell_type": "code", "execution_count": 9, "id": "15e953bd", "metadata": {}, "outputs": [], "source": [ "data1 = {\n", " \"Code\": [\"U+0000\", \"U+0001\", \"U+0002\", \"U+0003\", \"U+0004\", \"U+0005\"],\n", " \"Decimal\": [0, 1, 2, 3, 4, 5],\n", " \"Octal\": [\"001\", \"002\", \"003\", \"004\", \"004\", \"005\"],\n", "}\n", "df1 = pd.DataFrame(data1)" ] }, { "cell_type": "code", "execution_count": 10, "id": "9ba9e87b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDecimalOctal
0U+00000001
1U+00011002
2U+00022003
3U+00033004
4U+00044004
5U+00055005
\n", "
" ], "text/plain": [ " Code Decimal Octal\n", "0 U+0000 0 001\n", "1 U+0001 1 002\n", "2 U+0002 2 003\n", "3 U+0003 3 004\n", "4 U+0004 4 004\n", "5 U+0005 5 005" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1" ] }, { "cell_type": "code", "execution_count": 11, "id": "96cb827a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['Code', 'Decimal', 'Octal'], dtype='object')" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.columns" ] }, { "cell_type": "code", "execution_count": 12, "id": "90b857aa", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"Code\" in df1.columns" ] }, { "cell_type": "code", "execution_count": 13, "id": "44d02a76", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"Key\" in df1.columns" ] }, { "cell_type": "markdown", "id": "56087759", "metadata": {}, "source": [ "## Axis indices with double labels\n", "\n", "Unlike Python sets, a pandas index can contain duplicate labels:" ] }, { "cell_type": "code", "execution_count": 14, "id": "45210e3e", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDecimalOctal
0U+00000001
1U+00011002
2U+00022003
3U+00033004
4U+00044004
5U+00055005
0U+00066006
1U+00077007
\n", "
" ], "text/plain": [ " Code Decimal Octal\n", "0 U+0000 0 001\n", "1 U+0001 1 002\n", "2 U+0002 2 003\n", "3 U+0003 3 004\n", "4 U+0004 4 004\n", "5 U+0005 5 005\n", "0 U+0006 6 006\n", "1 U+0007 7 007" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data2 = {\n", " \"Code\": [\"U+0006\", \"U+0007\"],\n", " \"Decimal\": [6, 7],\n", " \"Octal\": [\"006\", \"007\"],\n", "}\n", "df2 = pd.DataFrame(data2)\n", "df12 = pd.concat([df1, df2])\n", "\n", "df12" ] }, { "cell_type": "markdown", "id": "a1c04dc3", "metadata": {}, "source": [ "When [selecting](select-filter.ipynb) duplicate labels, all occurrences of the label in question are selected:" ] }, { "cell_type": "code", "execution_count": 15, "id": "95dc51bb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDecimalOctal
1U+00011002
1U+00077007
\n", "
" ], "text/plain": [ " Code Decimal Octal\n", "1 U+0001 1 002\n", "1 U+0007 7 007" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df12.loc[1]" ] }, { "cell_type": "code", "execution_count": 16, "id": "5c4125d4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Code U+0002\n", "Decimal 2\n", "Octal 003\n", "Name: 2, dtype: object" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df12.loc[2]" ] }, { "cell_type": "markdown", "id": "4718c4a2", "metadata": {}, "source": [ "Data selection is one of the main points that behaves differently with duplicates. Indexing a label with multiple entries results in a series, while single entries result in a scalar value. This can complicate your code because the output type of indexing can vary depending on whether a label is repeated or not. In addition, many pandas functions, such as `reindex`, require labels to be unique. You can use the `is_unique` property of the index to determine whether its labels are unique or not:" ] }, { "cell_type": "code", "execution_count": 17, "id": "0aa3ac06", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df12.index.is_unique" ] }, { "cell_type": "markdown", "id": "fa4ee45a", "metadata": {}, "source": [ "To avoid duplicate labels, you can use `ignore_index=True`, for example:" ] }, { "cell_type": "code", "execution_count": 18, "id": "a22f9f81", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDecimalOctal
0U+00000001
1U+00011002
2U+00022003
3U+00033004
4U+00044004
5U+00055005
6U+00066006
7U+00077007
\n", "
" ], "text/plain": [ " Code Decimal Octal\n", "0 U+0000 0 001\n", "1 U+0001 1 002\n", "2 U+0002 2 003\n", "3 U+0003 3 004\n", "4 U+0004 4 004\n", "5 U+0005 5 005\n", "6 U+0006 6 006\n", "7 U+0007 7 007" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df12 = pd.concat([df1, df2], ignore_index=True)\n", "\n", "df12" ] }, { "cell_type": "markdown", "id": "794d7f69", "metadata": {}, "source": [ "## Some index methods and properties\n", "\n", "Each index has a number of set logic methods and properties that answer other general questions about the data it contains. The following are some useful methods and properties:\n", "\n", "Method | Description\n", ":----- | :----------\n", "[concat](https://pandas.pydata.org/docs/reference/api/pandas.concat.html) | concatenates additional index objects, creating a new index\n", "[Index.difference](https://pandas.pydata.org/docs/reference/api/pandas.Index.difference.html) | calculates the difference of two sets as an index\n", "[Index.intersection](https://pandas.pydata.org/docs/reference/api/pandas.Index.intersection.html) | calculates the intersection\n", "[Index.union](https://pandas.pydata.org/docs/reference/api/pandas.Index.union.html) | calculates the union set\n", "[Index.isin](https://pandas.pydata.org/docs/reference/api/pandas.Index.isin.html) | computes a boolean array indicating whether each value is contained in the passed collection\n", "[Index.delete](https://pandas.pydata.org/docs/reference/api/pandas.Index.delete.html) | computes a new index by deleting the element in index `i`\n", "[Index.drop](https://pandas.pydata.org/docs/reference/api/pandas.Index.drop.html) | computes a new index by deleting the passed values\n", "[Index.insert](https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html) | insert computes new index by inserting the element in index `i`\n", "[Index.is_monotonic_increasing](https://pandas.pydata.org/docs/reference/api/pandas.Index.is_monotonic_increasing.html#pandas.Index.is_monotonic_increasing) | is_monotonic returns `True` if each element is greater than or equal to the previous element\n", "[Index.is_unique](https://pandas.pydata.org/docs/reference/api/pandas.Index.is_unique.html) | is_unique returns `True` if the index does not contain duplicate values\n", "[Index.unique](https://pandas.pydata.org/docs/reference/api/pandas.Index.unique.html) | calculates the array of unique values in the index" ] }, { "cell_type": "markdown", "id": "333b972a", "metadata": {}, "source": [ "## Re-indexing with `Index.reindex`\n", "\n", "An important method for Pandas objects is [Index.reindex](https://pandas.pydata.org/docs/reference/api/pandas.Index.reindex.html), which can be used to create a new object with rearranged values that match the new index. Consider, for example:" ] }, { "cell_type": "code", "execution_count": 19, "id": "cc22d262", "metadata": {}, "outputs": [], "source": [ "obj = pd.Series(range(7), index=pd.date_range(\"2022-02-02\", periods=7))" ] }, { "cell_type": "code", "execution_count": 20, "id": "0c97988b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2022-02-02 0\n", "2022-02-03 1\n", "2022-02-04 2\n", "2022-02-05 3\n", "2022-02-06 4\n", "2022-02-07 5\n", "2022-02-08 6\n", "Freq: D, dtype: int64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj" ] }, { "cell_type": "code", "execution_count": 21, "id": "042544df", "metadata": {}, "outputs": [], "source": [ "new_index = pd.date_range(\"2022-02-03\", periods=7)" ] }, { "cell_type": "code", "execution_count": 22, "id": "238e492d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2022-02-03 1.0\n", "2022-02-04 2.0\n", "2022-02-05 3.0\n", "2022-02-06 4.0\n", "2022-02-07 5.0\n", "2022-02-08 6.0\n", "2022-02-09 NaN\n", "Freq: D, dtype: float64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.reindex(new_index)" ] }, { "cell_type": "markdown", "id": "5d5d00c3", "metadata": {}, "source": [ "`Index.reindex` creates a new index and re-indexes the DataFrame. By default, values in the new index for which there are no corresponding records in the DataFrame become `NaN`." ] }, { "cell_type": "markdown", "id": "d8ebdd36", "metadata": {}, "source": [ "For ordered data such as time series, it may be desirable to interpolate or fill values during reindexing. The `method` option allows this with a method like `ffill` that fills the values forward:" ] }, { "cell_type": "code", "execution_count": 23, "id": "b574c19c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2022-02-03 1\n", "2022-02-04 2\n", "2022-02-05 3\n", "2022-02-06 4\n", "2022-02-07 5\n", "2022-02-08 6\n", "2022-02-09 6\n", "Freq: D, dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "obj.reindex(new_index, method=\"ffill\")" ] }, { "cell_type": "markdown", "id": "5de617e6", "metadata": {}, "source": [ "For a DataFrame, `reindex` can change either the (row) index, the columns or both. If only a sequence is passed, the rows in the result are re-indexed:" ] }, { "cell_type": "code", "execution_count": 24, "id": "5e260215", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CodeDecimalOctal
0U+00000.0001
1U+00011.0002
2U+00022.0003
3U+00033.0004
4U+00044.0004
5U+00055.0005
6NaNNaNNaN
\n", "
" ], "text/plain": [ " Code Decimal Octal\n", "0 U+0000 0.0 001\n", "1 U+0001 1.0 002\n", "2 U+0002 2.0 003\n", "3 U+0003 3.0 004\n", "4 U+0004 4.0 004\n", "5 U+0005 5.0 005\n", "6 NaN NaN NaN" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.reindex(range(7))" ] }, { "cell_type": "markdown", "id": "bbb1f8c2", "metadata": {}, "source": [ "The columns can be re-indexed with the keyword `columns`:" ] }, { "cell_type": "code", "execution_count": 25, "id": "be0234f4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OctalCodeDescription
0001U+0000NaN
1002U+0001NaN
2003U+0002NaN
3004U+0003NaN
4004U+0004NaN
5005U+0005NaN
\n", "
" ], "text/plain": [ " Octal Code Description\n", "0 001 U+0000 NaN\n", "1 002 U+0001 NaN\n", "2 003 U+0002 NaN\n", "3 004 U+0003 NaN\n", "4 004 U+0004 NaN\n", "5 005 U+0005 NaN" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "encoding = [\"Octal\", \"Code\", \"Description\"]\n", "\n", "df1.reindex(columns=encoding)" ] }, { "cell_type": "markdown", "id": "8c4214a7", "metadata": {}, "source": [ "### Arguments of the function `Index.reindex`\n", "\n", "Argument | Description\n", ":------- | :----------\n", "`labels` | New sequence to be used as index. Can be an index instance or another sequence-like Python data structure. An index is used exactly as it is, without being copied.\n", "`axis` | The new axis to index, either `index` (rows) or `columns`. The default is `index`. You can alternatively use `reindex(index=new_labels)` or `reindex(columns=new_labels)`.\n", "`method` | Interpolation method; `ffill` fills forwards, while `bfill` fills backwards.\n", "`fill_value` | Substitute value to be used when missing data is inserted by re-indexing. Uses `fill_value='missing'` (the default behaviour) if the missing labels in the result are to have zero values.\n", "`limit` | When filling forward or backward, the maximum number of elements to fill.\n", "`tolerance` | When filling forward or backward, the maximum size of the gap to be filled for inexact matches.\n", "`level` | Match single index at `MultiIndex` level; otherwise select subset.\n", "`copy` | If `True`, the underlying data is always copied, even if the new index matches the old index; if `False`, the data is not copied if the indices are equivalent." ] }, { "cell_type": "markdown", "id": "267ce240", "metadata": {}, "source": [ "## Rename axis indices\n", "\n", "The axis labels can be converted by a function or mapping to create new, differently labelled objects. You can also change the axes in place without creating a new data structure. Here is a simple example:" ] }, { "cell_type": "code", "execution_count": 26, "id": "14deb23b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1234
Deutsch0123
English4567
Français891011
\n", "
" ], "text/plain": [ " 1 2 3 4\n", "Deutsch 0 1 2 3\n", "English 4 5 6 7\n", "Français 8 9 10 11" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3 = pd.DataFrame(\n", " np.arange(12).reshape((3, 4)),\n", " index=[\"Deutsch\", \"English\", \"Français\"],\n", " columns=[1, 2, 3, 4],\n", ")\n", "\n", "df3" ] }, { "cell_type": "markdown", "id": "22644047", "metadata": {}, "source": [ "### Rename axis indices with `Index.map`\n", "\n", "The axis labels can be converted by a function or [Index.map](https://pandas.pydata.org/docs/reference/api/pandas.Index.map.html) to create new, differently labeled objects. You can also change the axes in place without creating a new data structure. Here is a simple example:" ] }, { "cell_type": "code", "execution_count": 27, "id": "6be509e2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['DE', 'EN', 'FR'], dtype='object')" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "transform = lambda x: x[:2].upper()\n", "\n", "df3.index.map(transform)" ] }, { "cell_type": "markdown", "id": "424d8a86", "metadata": {}, "source": [ "You can assign the index and change the DataFrame on the spot:" ] }, { "cell_type": "code", "execution_count": 28, "id": "863ee099", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1234
DE0123
EN4567
FR891011
\n", "
" ], "text/plain": [ " 1 2 3 4\n", "DE 0 1 2 3\n", "EN 4 5 6 7\n", "FR 8 9 10 11" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3.index = df3.index.map(transform)\n", "\n", "df3" ] }, { "cell_type": "markdown", "id": "600dd8e4", "metadata": {}, "source": [ "### Rename axis indices with `Index.rename`\n", "\n", "If you want to create a converted version of your dataset without changing the original, you can use [Index.rename](https://pandas.pydata.org/docs/reference/api/pandas.Index.rename.html):" ] }, { "cell_type": "code", "execution_count": 29, "id": "8368c7a5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
1234
de0123
en4567
fr891011
\n", "
" ], "text/plain": [ " 1 2 3 4\n", "de 0 1 2 3\n", "en 4 5 6 7\n", "fr 8 9 10 11" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3.rename(index=str.lower)" ] }, { "cell_type": "markdown", "id": "835e1a78", "metadata": {}, "source": [ "In particular, `Index.rename` can be used in conjunction with a `dict`-like object that provides new values for a subset of the axis labels:" ] }, { "cell_type": "code", "execution_count": 30, "id": "837bc54b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
BE0123
DE4567
EN891011
\n", "
" ], "text/plain": [ " 0 1 2 3\n", "BE 0 1 2 3\n", "DE 4 5 6 7\n", "EN 8 9 10 11" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3.rename(\n", " index={\"DE\": \"BE\", \"EN\": \"DE\", \"FR\": \"EN\"},\n", " columns={1: 0, 2: 1, 3: 2, 4: 3},\n", " inplace=True,\n", ")\n", "\n", "df3" ] }, { "cell_type": "markdown", "id": "40b1fba3", "metadata": {}, "source": [ "`Index.rename` saves you from manually copying the DataFrame and assigning its index and column attributes. If you want to change a data set on the spot, also pass `inplace=True`:" ] }, { "cell_type": "code", "execution_count": 31, "id": "e4424690", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0012
BE0123
BE4567
DE891011
\n", "
" ], "text/plain": [ " 0 0 1 2\n", "BE 0 1 2 3\n", "BE 4 5 6 7\n", "DE 8 9 10 11" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df3.rename(\n", " index={\"DE\": \"BE\", \"EN\": \"DE\", \"FR\": \"EN\"},\n", " columns={1: 0, 2: 1, 3: 2, 4: 3},\n", " inplace=True,\n", ")\n", "\n", "df3" ] }, { "cell_type": "markdown", "id": "da72ecee", "metadata": {}, "source": [ "## Hierarchical Indexing\n", "\n", "Hierarchical indexing is an important feature of pandas that allows you to have multiple index levels on one axis. This gives you the opportunity to work with higher dimensional data in a lower dimensional form.\n", "\n", "Let's start with a simple example: Let's create a series of lists as an index:" ] }, { "cell_type": "code", "execution_count": 32, "id": "97c0fb4b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Jupyter Tutorial de 83080\n", " en 20336\n", "PyViz Tutorial de 11376\n", "Python Basics de 1228\n", " en 468\n", "dtype: int64" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hits = pd.Series(\n", " [83080, 20336, 11376, 1228, 468],\n", " index=[\n", " [\n", " \"Jupyter Tutorial\",\n", " \"Jupyter Tutorial\",\n", " \"PyViz Tutorial\",\n", " \"Python Basics\",\n", " \"Python Basics\",\n", " ],\n", " [\"de\", \"en\", \"de\", \"de\", \"en\"],\n", " ],\n", ")\n", "\n", "hits" ] }, { "cell_type": "markdown", "id": "22d1d110", "metadata": {}, "source": [ "What you see is a graphical view of a series with a [pandas.MultiIndex](https://pandas.pydata.org/docs/reference/api/pandas.MultiIndex.html). The *gaps* in the index display mean that the label above it is to be used." ] }, { "cell_type": "code", "execution_count": 33, "id": "536bfbd5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('Jupyter Tutorial', 'de'),\n", " ('Jupyter Tutorial', 'en'),\n", " ( 'PyViz Tutorial', 'de'),\n", " ( 'Python Basics', 'de'),\n", " ( 'Python Basics', 'en')],\n", " )" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hits.index" ] }, { "cell_type": "markdown", "id": "6b0834b8", "metadata": {}, "source": [ "With a hierarchically indexed object, so-called partial indexing is possible, with which you can select subsets of the data:" ] }, { "cell_type": "code", "execution_count": 34, "id": "eb52654f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "de 83080\n", "en 20336\n", "dtype: int64" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hits[\"Jupyter Tutorial\"]" ] }, { "cell_type": "code", "execution_count": 35, "id": "1048a2fe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Jupyter Tutorial de 83080\n", " en 20336\n", "PyViz Tutorial de 11376\n", "Python Basics de 1228\n", " en 468\n", "dtype: int64" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hits[\"Jupyter Tutorial\":\"Python Basics\"]" ] }, { "cell_type": "code", "execution_count": 36, "id": "4dbc5800", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Jupyter Tutorial de 83080\n", " en 20336\n", "Python Basics de 1228\n", " en 468\n", "dtype: int64" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hits.loc[[\"Jupyter Tutorial\", \"Python Basics\"]]" ] }, { "cell_type": "markdown", "id": "3db2201a", "metadata": {}, "source": [ "The selection is even possible from an *inner* level. In the following I select all values with the value `1` from the second index level:" ] }, { "cell_type": "code", "execution_count": 37, "id": "67929194", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Jupyter Tutorial 83080\n", "PyViz Tutorial 11376\n", "Python Basics 1228\n", "dtype: int64" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hits.loc[:, \"de\"]" ] }, { "cell_type": "markdown", "id": "6e6d830c", "metadata": {}, "source": [ "## View vs. copy\n", "\n", "In Pandas, whether you get a view or not depends on the structure and data types of the original DataFrame – and whether changes made to a view are propagated back to the original DataFrame." ] }, { "cell_type": "markdown", "id": "bdc51706", "metadata": {}, "source": [ "
\n", "\n", "**See also:**\n", "\n", "* [Returning a view versus a copy](https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy)\n", "* [Views and Copies in pandas](https://www.practicaldatascience.org/html/views_and_copies_in_pandas.html)\n", "\n", "
" ] }, { "cell_type": "markdown", "id": "ed31816c", "metadata": {}, "source": [ "### `stack` and `unstack`\n", "\n", "Hierarchical indexing plays an important role in data reshaping and group-based operations such as forming a [pivot table](pivoting-crosstab.ipynb). For example, you can reorder this data into a DataFrame using the pandas.[Series.unstack](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.unstack.html) method:" ] }, { "cell_type": "code", "execution_count": 38, "id": "f9cc8c87", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
deen
Jupyter Tutorial83080.020336.0
PyViz Tutorial11376.0NaN
Python Basics1228.0468.0
\n", "
" ], "text/plain": [ " de en\n", "Jupyter Tutorial 83080.0 20336.0\n", "PyViz Tutorial 11376.0 NaN\n", "Python Basics 1228.0 468.0" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hits.unstack()" ] }, { "cell_type": "markdown", "id": "00540c2c", "metadata": {}, "source": [ "The reverse operation of unstack is [stack](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.stack.html):" ] }, { "cell_type": "code", "execution_count": 39, "id": "1375ab71", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Jupyter Tutorial de 83080.0\n", " en 20336.0\n", "PyViz Tutorial de 11376.0\n", "Python Basics de 1228.0\n", " en 468.0\n", "dtype: float64" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "hits.unstack().stack()" ] }, { "cell_type": "markdown", "id": "1a349f03", "metadata": {}, "source": [ "In a DataFrame, each axis can have a hierarchical index:" ] }, { "cell_type": "code", "execution_count": 40, "id": "3677acbc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
12/202101/202202/2022
lateststablelateststablelateststable
Jupyter Tutorialde196510301340332950
en472218253497257640093707
PyViz Tutorialde257304873039300
Python Basicsde525042702760
en15708502260
\n", "
" ], "text/plain": [ " 12/2021 01/2022 02/2022 \n", " latest stable latest stable latest stable\n", "Jupyter Tutorial de 19651 0 30134 0 33295 0\n", " en 4722 1825 3497 2576 4009 3707\n", "PyViz Tutorial de 2573 0 4873 0 3930 0\n", "Python Basics de 525 0 427 0 276 0\n", " en 157 0 85 0 226 0" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "version_hits = [\n", " [19651, 0, 30134, 0, 33295, 0],\n", " [4722, 1825, 3497, 2576, 4009, 3707],\n", " [2573, 0, 4873, 0, 3930, 0],\n", " [525, 0, 427, 0, 276, 0],\n", " [157, 0, 85, 0, 226, 0],\n", "]\n", "\n", "df = pd.DataFrame(\n", " version_hits,\n", " index=[\n", " [\n", " \"Jupyter Tutorial\",\n", " \"Jupyter Tutorial\",\n", " \"PyViz Tutorial\",\n", " \"Python Basics\",\n", " \"Python Basics\",\n", " ],\n", " [\"de\", \"en\", \"de\", \"de\", \"en\"],\n", " ],\n", " columns=[\n", " [\"12/2021\", \"12/2021\", \"01/2022\", \"01/2022\", \"02/2022\", \"02/2022\"],\n", " [\"latest\", \"stable\", \"latest\", \"stable\", \"latest\", \"stable\"],\n", " ],\n", ")\n", "\n", "df" ] }, { "cell_type": "markdown", "id": "a7185ef5", "metadata": {}, "source": [ "The hierarchy levels can have names (as strings or any Python objects). If this is the case, they are displayed in the console output:" ] }, { "cell_type": "code", "execution_count": 41, "id": "78ef1662", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Month12/202101/202202/2022
Versionlateststablelateststablelateststable
TitleLanguage
Jupyter Tutorialde196510301340332950
en472218253497257640093707
PyViz Tutorialde257304873039300
Python Basicsde525042702760
en15708502260
\n", "
" ], "text/plain": [ "Month 12/2021 01/2022 02/2022 \n", "Version latest stable latest stable latest stable\n", "Title Language \n", "Jupyter Tutorial de 19651 0 30134 0 33295 0\n", " en 4722 1825 3497 2576 4009 3707\n", "PyViz Tutorial de 2573 0 4873 0 3930 0\n", "Python Basics de 525 0 427 0 276 0\n", " en 157 0 85 0 226 0" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.index.names = [\"Title\", \"Language\"]\n", "df.columns.names = [\"Month\", \"Version\"]\n", "\n", "df" ] }, { "cell_type": "markdown", "id": "a6b6545b", "metadata": {}, "source": [ "
\n", "\n", "**Warning:**\n", "\n", "Make sure that the index names `Month` and `Version` are not part of the row names (of the `df.index` values).\n", "
" ] }, { "cell_type": "markdown", "id": "67f86654", "metadata": {}, "source": [ "With the partial column indexing you can select column groups in a similar way:" ] }, { "cell_type": "code", "execution_count": 42, "id": "e6c619fb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Versionlateststable
TitleLanguage
Jupyter Tutorialde196510
en47221825
PyViz Tutorialde25730
Python Basicsde5250
en1570
\n", "
" ], "text/plain": [ "Version latest stable\n", "Title Language \n", "Jupyter Tutorial de 19651 0\n", " en 4722 1825\n", "PyViz Tutorial de 2573 0\n", "Python Basics de 525 0\n", " en 157 0" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"12/2021\"]" ] }, { "cell_type": "markdown", "id": "84c29efe", "metadata": {}, "source": [ "With [MultiIndex.from_arrays](https://pandas.pydata.org/docs/reference/api/pandas.MultiIndex.from_arrays.html), a `MultiIndex` can be created itself and then reused; the columns in the preceding DataFrame with level names could be created in this way:" ] }, { "cell_type": "code", "execution_count": 43, "id": "6da0da1d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MultiIndex([('Jupyter Tutorial', 'de'),\n", " ('Jupyter Tutorial', 'en'),\n", " ( 'PyViz Tutorial', 'de'),\n", " ( 'Python Basics', 'de'),\n", " ( 'Python Basics', 'en')],\n", " names=['Title', 'Language'])" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.MultiIndex.from_arrays(\n", " [\n", " [\n", " \"Jupyter Tutorial\",\n", " \"Jupyter Tutorial\",\n", " \"PyViz Tutorial\",\n", " \"Python Basics\",\n", " \"Python Basics\",\n", " ],\n", " [\"de\", \"en\", \"de\", \"de\", \"en\"],\n", " ],\n", " names=[\"Title\", \"Language\"],\n", ")" ] }, { "cell_type": "markdown", "id": "679bd347", "metadata": {}, "source": [ "### Rearranging and Sorting Levels\n", "\n", "There may be times when you want to rearrange the order of the levels on an axis or sort the data by the values in a particular level. The function [DataFrame.swaplevel](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.swaplevel.html) takes two level numbers or names and returns a new object in which the levels are swapped (but the data remains unchanged):" ] }, { "cell_type": "code", "execution_count": 44, "id": "b5bdee79", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Month12/202101/202202/2022
Versionlateststablelateststablelateststable
LanguageTitle
deJupyter Tutorial196510301340332950
enJupyter Tutorial472218253497257640093707
dePyViz Tutorial257304873039300
Python Basics525042702760
enPython Basics15708502260
\n", "
" ], "text/plain": [ "Month 12/2021 01/2022 02/2022 \n", "Version latest stable latest stable latest stable\n", "Language Title \n", "de Jupyter Tutorial 19651 0 30134 0 33295 0\n", "en Jupyter Tutorial 4722 1825 3497 2576 4009 3707\n", "de PyViz Tutorial 2573 0 4873 0 3930 0\n", " Python Basics 525 0 427 0 276 0\n", "en Python Basics 157 0 85 0 226 0" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.swaplevel(\"Language\", \"Title\")" ] }, { "cell_type": "markdown", "id": "2912b089", "metadata": {}, "source": [ "[DataFrame.sort_index](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_index.html), on the other hand, sorts the data only by the values in a single level. When swapping levels, it is not uncommon to also use `sort_index` so that the result is lexicographically sorted by the specified level:" ] }, { "cell_type": "code", "execution_count": 45, "id": "b99c1f20", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Month12/202101/202202/2022
Versionlateststablelateststablelateststable
TitleLanguage
Jupyter Tutorialde196510301340332950
en472218253497257640093707
PyViz Tutorialde257304873039300
Python Basicsde525042702760
en15708502260
\n", "
" ], "text/plain": [ "Month 12/2021 01/2022 02/2022 \n", "Version latest stable latest stable latest stable\n", "Title Language \n", "Jupyter Tutorial de 19651 0 30134 0 33295 0\n", " en 4722 1825 3497 2576 4009 3707\n", "PyViz Tutorial de 2573 0 4873 0 3930 0\n", "Python Basics de 525 0 427 0 276 0\n", " en 157 0 85 0 226 0" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_index(level=0)" ] }, { "cell_type": "markdown", "id": "94beaacd", "metadata": {}, "source": [ "However, the *PyViz Tutorial* will now be sorted before the *Python Basics*, as all upper case letters appear before lower case letters in this sorting. To avoid this, you can use the following lambda function:" ] }, { "cell_type": "code", "execution_count": 46, "id": "7ed7ecf2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Month12/202101/202202/2022
Versionlateststablelateststablelateststable
TitleLanguage
Jupyter Tutorialde196510301340332950
en472218253497257640093707
Python Basicsde525042702760
en15708502260
PyViz Tutorialde257304873039300
\n", "
" ], "text/plain": [ "Month 12/2021 01/2022 02/2022 \n", "Version latest stable latest stable latest stable\n", "Title Language \n", "Jupyter Tutorial de 19651 0 30134 0 33295 0\n", " en 4722 1825 3497 2576 4009 3707\n", "Python Basics de 525 0 427 0 276 0\n", " en 157 0 85 0 226 0\n", "PyViz Tutorial de 2573 0 4873 0 3930 0" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sort_index(level=0, key=lambda x: x.str.lower())" ] }, { "cell_type": "code", "execution_count": 47, "id": "1ff17c74", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Month12/202101/202202/2022
Versionlateststablelateststablelateststable
LanguageTitle
deJupyter Tutorial196510301340332950
PyViz Tutorial257304873039300
Python Basics525042702760
enJupyter Tutorial472218253497257640093707
Python Basics15708502260
\n", "
" ], "text/plain": [ "Month 12/2021 01/2022 02/2022 \n", "Version latest stable latest stable latest stable\n", "Language Title \n", "de Jupyter Tutorial 19651 0 30134 0 33295 0\n", " PyViz Tutorial 2573 0 4873 0 3930 0\n", " Python Basics 525 0 427 0 276 0\n", "en Jupyter Tutorial 4722 1825 3497 2576 4009 3707\n", " Python Basics 157 0 85 0 226 0" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.swaplevel(0, 1).sort_index(level=0)" ] }, { "cell_type": "markdown", "id": "7aa8319d", "metadata": {}, "source": [ "
\n", "\n", "**Note:**\n", "\n", "Data selection performance is much better for hierarchically indexed objects if the index is sorted lexicographically, starting with the outermost level, i.e. the result of calling `sort_index(level=0)` or `sort_index()`.\n", "
" ] }, { "cell_type": "markdown", "id": "43f194e8", "metadata": {}, "source": [ "### Summary statistics by level\n", "\n", "Many descriptive and summary statistics for `DataFrame` and `Series` have a level option that allows you to specify the level by which you can aggregate on a particular axis. Consider the `DataFrame` above; we can aggregate either the rows or the columns by level as follows:" ] }, { "cell_type": "code", "execution_count": 48, "id": "18e87e48", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Month12/202101/202202/2022
Versionlateststablelateststablelateststable
Language
de227490354340375010
en487918253582257642353707
\n", "
" ], "text/plain": [ "Month 12/2021 01/2022 02/2022 \n", "Version latest stable latest stable latest stable\n", "Language \n", "de 22749 0 35434 0 37501 0\n", "en 4879 1825 3582 2576 4235 3707" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby(level=\"Language\").sum()" ] }, { "cell_type": "code", "execution_count": 49, "id": "f8217b6f-53a4-4bd3-bb95-40555d525090", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Month01/202202/202212/2021
TitleLanguage
Jupyter Tutorialde301343329519651
en607377166547
PyViz Tutorialde487339302573
Python Basicsde427276525
en85226157
\n", "
" ], "text/plain": [ "Month 01/2022 02/2022 12/2021\n", "Title Language \n", "Jupyter Tutorial de 30134 33295 19651\n", " en 6073 7716 6547\n", "PyViz Tutorial de 4873 3930 2573\n", "Python Basics de 427 276 525\n", " en 85 226 157" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dft = df.T.groupby(level=\"Month\").sum()\n", "\n", "dft.T" ] }, { "cell_type": "markdown", "id": "82b717b4", "metadata": {}, "source": [ "Internally, pandas’ [DataFrame.groupby](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html) machinery is used for this purpose, which is explained in more detail in [Group Operations](group-operations.ipynb)." ] }, { "cell_type": "markdown", "id": "686ebd97", "metadata": {}, "source": [ "## Indexing with the columns of a DataFrame\n", "\n", "It is not uncommon to use one or more columns of a DataFrame as a row index; alternatively, you can move the row index into the columns of the DataFrame. Here is an example DataFrame:" ] }, { "cell_type": "code", "execution_count": 50, "id": "93739c99", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
0Jupyter Tutorialde196510301340332950
1Jupyter Tutorialen472218253497257640093707
2PyViz Tutorialde257304873039300
3Python Basicsde525042702760
4Python Basicsen15708502260
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7\n", "0 Jupyter Tutorial de 19651 0 30134 0 33295 0\n", "1 Jupyter Tutorial en 4722 1825 3497 2576 4009 3707\n", "2 PyViz Tutorial de 2573 0 4873 0 3930 0\n", "3 Python Basics de 525 0 427 0 276 0\n", "4 Python Basics en 157 0 85 0 226 0" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = [\n", " [\"Jupyter Tutorial\", \"de\", 19651, 0, 30134, 0, 33295, 0],\n", " [\"Jupyter Tutorial\", \"en\", 4722, 1825, 3497, 2576, 4009, 3707],\n", " [\"PyViz Tutorial\", \"de\", 2573, 0, 4873, 0, 3930, 0],\n", " [\"Python Basics\", \"de\", 525, 0, 427, 0, 276, 0],\n", " [\"Python Basics\", \"en\", 157, 0, 85, 0, 226, 0],\n", "]\n", "\n", "df = pd.DataFrame(data)\n", "\n", "df" ] }, { "cell_type": "markdown", "id": "97bc63d3", "metadata": {}, "source": [ "The function [pandas.DataFrame.set_index](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.set_index.html) creates a new DataFrame that uses one or more of its columns as an index:" ] }, { "cell_type": "code", "execution_count": 51, "id": "9c4781cb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
234567
01
Jupyter Tutorialde196510301340332950
en472218253497257640093707
PyViz Tutorialde257304873039300
Python Basicsde525042702760
en15708502260
\n", "
" ], "text/plain": [ " 2 3 4 5 6 7\n", "0 1 \n", "Jupyter Tutorial de 19651 0 30134 0 33295 0\n", " en 4722 1825 3497 2576 4009 3707\n", "PyViz Tutorial de 2573 0 4873 0 3930 0\n", "Python Basics de 525 0 427 0 276 0\n", " en 157 0 85 0 226 0" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df.set_index([0, 1])\n", "\n", "df2" ] }, { "cell_type": "markdown", "id": "4a8b8e46", "metadata": {}, "source": [ "By default, the columns are removed from the DataFrame, but you can also leave them in by passing `drop=False` to `set_index`:" ] }, { "cell_type": "code", "execution_count": 52, "id": "cfd85a06", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
01
Jupyter TutorialdeJupyter Tutorialde196510301340332950
enJupyter Tutorialen472218253497257640093707
PyViz TutorialdePyViz Tutorialde257304873039300
Python BasicsdePython Basicsde525042702760
enPython Basicsen15708502260
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 1 \n", "Jupyter Tutorial de Jupyter Tutorial de 19651 0 30134 0 33295 \n", " en Jupyter Tutorial en 4722 1825 3497 2576 4009 \n", "PyViz Tutorial de PyViz Tutorial de 2573 0 4873 0 3930 \n", "Python Basics de Python Basics de 525 0 427 0 276 \n", " en Python Basics en 157 0 85 0 226 \n", "\n", " 7 \n", "0 1 \n", "Jupyter Tutorial de 0 \n", " en 3707 \n", "PyViz Tutorial de 0 \n", "Python Basics de 0 \n", " en 0 " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.set_index([0, 1], drop=False)" ] }, { "cell_type": "markdown", "id": "b0e86d42", "metadata": {}, "source": [ "[DataFrame.reset_index](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html), on the other hand, does the opposite of set_index; the hierarchical index levels are moved into the columns:" ] }, { "cell_type": "code", "execution_count": 53, "id": "38db67e5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567
0Jupyter Tutorialde196510301340332950
1Jupyter Tutorialen472218253497257640093707
2PyViz Tutorialde257304873039300
3Python Basicsde525042702760
4Python Basicsen15708502260
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7\n", "0 Jupyter Tutorial de 19651 0 30134 0 33295 0\n", "1 Jupyter Tutorial en 4722 1825 3497 2576 4009 3707\n", "2 PyViz Tutorial de 2573 0 4873 0 3930 0\n", "3 Python Basics de 525 0 427 0 276 0\n", "4 Python Basics en 157 0 85 0 226 0" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.reset_index()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.13 Kernel", "language": "python", "name": "python313" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.0" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }