{ "cells": [ { "cell_type": "markdown", "metadata": { "cell_id": "af6850ba-bf23-4bbd-b58c-fea6b29e9500", "deepnote_cell_type": "markdown", "tags": [] }, "source": [ "# Review\n", "\n", "Today's class was review.\n", "\n", "[Recording of lecture from 1/28/2022](https://uci.zoom.us/rec/share/OxfhTaj6tVZ4ysRJC_U2Nj3vGo0Hiqn-vm8WgGK-ANcfV5hahZLYPRCQFj60xI2a.MTVnvxxTkIXzkvLS?startTime=1643385481000)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "cell_id": "bf0ed2ee-ca49-4236-976e-ddd5a4c7978d", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 300, "execution_start": 1643385441255, "source_hash": "7baafd37", "tags": [] }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import altair as alt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Dictionaries, and their relationship to pandas Series and pandas DataFrames." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "cell_id": "e17de02e-bfb1-44b7-8007-98ecad8f04c7", "deepnote_cell_type": "code", "deepnote_output_heights": [ 21.1875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 2, "execution_start": 1643385790842, "source_hash": "6a294ecd", "tags": [] }, "outputs": [], "source": [ "d = {\"a\":5,\"b\":10,\"chris\":30}" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "d9c35a6f-e750-48e9-9ad6-b181ca907ae7", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 345, "execution_start": 1643385807114, "source_hash": "15f90960", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "10" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "d[\"b\"]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "cell_id": "75287e85-6ff8-4d16-9a24-178df9b8875c", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_start": 1643385840275, "source_hash": "62db6b48", "tags": [] }, "outputs": [], "source": [ "e = {\"a\":[5,6,7],\"b\":10,\"chris\":[30,2.4,-20]}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Making a DataFrame from a dictionary." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "cell_id": "ccb3054b-9dce-4cc3-9093-64fcac225051", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 2, "execution_start": 1643385925252, "source_hash": "e08c1f7f", "tags": [] }, "outputs": [], "source": [ "df = pd.DataFrame(e)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "cell_id": "cdc110ed-cbdd-4afc-ba91-0bc9b5420537", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 304, "execution_start": 1643385956633, "source_hash": "f804c160", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchris
051030.0
16102.4
2710-20.0
\n", "
" ], "text/plain": [ " a b chris\n", "0 5 10 30.0\n", "1 6 10 2.4\n", "2 7 10 -20.0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "cell_id": "b7347bc9-62cb-49f2-abc7-f176f0f8ff96", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643385932131, "source_hash": "b165b385", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 5\n", "1 6\n", "2 7\n", "Name: a, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.a" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "cell_id": "540229e4-e89b-448f-8305-dd73476a3e2d", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 4, "execution_start": 1643385940313, "source_hash": "38ded7f3", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 5\n", "1 6\n", "2 7\n", "Name: a, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"a\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Making a Series from a dictionary." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "cell_id": "48e7e248-31bd-46f2-92b7-49aad02a2d0e", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 18, "execution_start": 1643385881115, "source_hash": "ef1cac7a", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "a 5\n", "b 10\n", "chris 30\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.Series(d)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "3d472dc7-3ab9-4753-b973-bb384589c577", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 21, "execution_start": 1643385993312, "source_hash": "f804c160", "tags": [] }, "outputs": [ { "data": { "application/vnd.deepnote.dataframe.v3+json": { "column_count": 3, "columns": [ { "dtype": "int64", "name": "a", "stats": { "histogram": [ { "bin_end": 5.2, "bin_start": 5, "count": 1 }, { "bin_end": 5.4, "bin_start": 5.2, "count": 0 }, { "bin_end": 5.6, "bin_start": 5.4, "count": 0 }, { "bin_end": 5.8, "bin_start": 5.6, "count": 0 }, { "bin_end": 6, "bin_start": 5.8, "count": 0 }, { "bin_end": 6.2, "bin_start": 6, "count": 1 }, { "bin_end": 6.4, "bin_start": 6.2, "count": 0 }, { "bin_end": 6.6, "bin_start": 6.4, "count": 0 }, { "bin_end": 6.8, "bin_start": 6.6, "count": 0 }, { "bin_end": 7, "bin_start": 6.8, "count": 1 } ], "max": "7", "min": "5", "nan_count": 0, "unique_count": 3 } }, { "dtype": "int64", "name": "b", "stats": { "histogram": [ { "bin_end": 9.6, "bin_start": 9.5, "count": 0 }, { "bin_end": 9.7, "bin_start": 9.6, "count": 0 }, { "bin_end": 9.8, "bin_start": 9.7, "count": 0 }, { "bin_end": 9.9, "bin_start": 9.8, "count": 0 }, { "bin_end": 10, "bin_start": 9.9, "count": 0 }, { "bin_end": 10.1, "bin_start": 10, "count": 3 }, { "bin_end": 10.2, "bin_start": 10.1, "count": 0 }, { "bin_end": 10.3, "bin_start": 10.2, "count": 0 }, { "bin_end": 10.4, "bin_start": 10.3, "count": 0 }, { "bin_end": 10.5, "bin_start": 10.4, "count": 0 } ], "max": "10", "min": "10", "nan_count": 0, "unique_count": 1 } }, { "dtype": "float64", "name": "chris", "stats": { "histogram": [ { "bin_end": -15, "bin_start": -20, "count": 1 }, { "bin_end": -10, "bin_start": -15, "count": 0 }, { "bin_end": -5, "bin_start": -10, "count": 0 }, { "bin_end": 0, "bin_start": -5, "count": 0 }, { "bin_end": 5, "bin_start": 0, "count": 1 }, { "bin_end": 10, "bin_start": 5, "count": 0 }, { "bin_end": 15, "bin_start": 10, "count": 0 }, { "bin_end": 20, "bin_start": 15, "count": 0 }, { "bin_end": 25, "bin_start": 20, "count": 0 }, { "bin_end": 30, "bin_start": 25, "count": 1 } ], "max": "30.0", "min": "-20.0", "nan_count": 0, "unique_count": 3 } }, { "dtype": "int64", "name": "_deepnote_index_column" } ], "row_count": 3, "rows": [ { "_deepnote_index_column": 0, "a": 5, "b": 10, "chris": 30 }, { "_deepnote_index_column": 1, "a": 6, "b": 10, "chris": 2.4 }, { "_deepnote_index_column": 2, "a": 7, "b": 10, "chris": -20 } ] }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchris
051030.0
16102.4
2710-20.0
\n", "
" ], "text/plain": [ " a b chris\n", "0 5 10 30.0\n", "1 6 10 2.4\n", "2 7 10 -20.0" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "6c425055-3982-4910-99f4-f64724a14206", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 4, "execution_start": 1643385993338, "source_hash": "25d43fa0", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['a', 'b', 'chris'], dtype='object')" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## List comprehension" ] }, { "cell_type": "markdown", "metadata": { "cell_id": "b2c78b4e-4d99-4824-99e5-6a2d2a83acc5", "deepnote_cell_type": "markdown", "tags": [] }, "source": [ "Practice exercise:\n", "* Make a list of `True`s (bool type) using list comprehension, where the length of the list is the number of rows in `df`." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "cell_id": "07ce0717-e02a-4a20-8561-3e4a974f9234", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 6, "execution_start": 1643386278671, "source_hash": "5d8ad24d", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['true', 'true', 'true']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# These are strings, not bools\n", "['true' for x in range(3)]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "cell_id": "0cfaacfb-6711-491b-91d4-5f66b8481c34", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643386256002, "source_hash": "7cf516b", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "['true', 'true', 'true']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[\"true\" for x in range(len(df))]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "cell_id": "ae2ab7a4-28b6-4f71-b4e1-be49e864a051", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 17, "execution_start": 1643386406029, "source_hash": "ac64be90", "tags": [] }, "outputs": [ { "ename": "NameError", "evalue": "name 'true' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_37915/3159580370.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m[\u001b[0m\u001b[0mtrue\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_37915/3159580370.py\u001b[0m in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m[\u001b[0m\u001b[0mtrue\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mNameError\u001b[0m: name 'true' is not defined" ] } ], "source": [ "[true for x in range(len(df))]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is the correct answer." ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "cell_id": "72829fa8-e4cb-4638-af02-a19a67ca9bd2", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643386417846, "source_hash": "4a58405d", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "[True, True, True]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[True for x in range(len(df))]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Putting a new column in a DataFrame\n", "\n", "One way to create a new column." ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "cell_id": "9d4cf038-cd0e-4c04-a5d6-f9fe714697e9", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 1, "execution_start": 1643386477651, "source_hash": "96acba45", "tags": [] }, "outputs": [], "source": [ "df[\"new column\"] = [True for x in range(len(df))]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "cell_id": "8a603460-c5fd-4fee-a76b-020128bbcf98", "deepnote_cell_type": "code", "deepnote_output_heights": [ 118.953125 ], "deepnote_to_be_reexecuted": false, "execution_millis": 21, "execution_start": 1643386499453, "source_hash": "f804c160", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchrisnew column
051030.0True
16102.4True
2710-20.0True
\n", "
" ], "text/plain": [ " a b chris new column\n", "0 5 10 30.0 True\n", "1 6 10 2.4 True\n", "2 7 10 -20.0 True" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If the new column is filled with a single value, then you can make the new column faster:" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "162ef89e-ed72-477c-94cf-5ceb6fc8af3e", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 4, "execution_start": 1643386548048, "source_hash": "8e76f58a", "tags": [] }, "outputs": [], "source": [ "df[\"new column 2\"] = False" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "7858a04b-ade8-436f-909b-70260f367ba4", "deepnote_cell_type": "code", "deepnote_output_heights": [ 118.953125 ], "deepnote_to_be_reexecuted": false, "execution_millis": 31, "execution_start": 1643386552003, "source_hash": "f804c160", "tags": [] }, "outputs": [ { "data": { "application/vnd.deepnote.dataframe.v3+json": { "column_count": 5, "columns": [ { "dtype": "int64", "name": "a", "stats": { "histogram": [ { "bin_end": 5.2, "bin_start": 5, "count": 1 }, { "bin_end": 5.4, "bin_start": 5.2, "count": 0 }, { "bin_end": 5.6, "bin_start": 5.4, "count": 0 }, { "bin_end": 5.8, "bin_start": 5.6, "count": 0 }, { "bin_end": 6, "bin_start": 5.8, "count": 0 }, { "bin_end": 6.2, "bin_start": 6, "count": 1 }, { "bin_end": 6.4, "bin_start": 6.2, "count": 0 }, { "bin_end": 6.6, "bin_start": 6.4, "count": 0 }, { "bin_end": 6.8, "bin_start": 6.6, "count": 0 }, { "bin_end": 7, "bin_start": 6.8, "count": 1 } ], "max": "7", "min": "5", "nan_count": 0, "unique_count": 3 } }, { "dtype": "int64", "name": "b", "stats": { "histogram": [ { "bin_end": 9.6, "bin_start": 9.5, "count": 0 }, { "bin_end": 9.7, "bin_start": 9.6, "count": 0 }, { "bin_end": 9.8, "bin_start": 9.7, "count": 0 }, { "bin_end": 9.9, "bin_start": 9.8, "count": 0 }, { "bin_end": 10, "bin_start": 9.9, "count": 0 }, { "bin_end": 10.1, "bin_start": 10, "count": 3 }, { "bin_end": 10.2, "bin_start": 10.1, "count": 0 }, { "bin_end": 10.3, "bin_start": 10.2, "count": 0 }, { "bin_end": 10.4, "bin_start": 10.3, "count": 0 }, { "bin_end": 10.5, "bin_start": 10.4, "count": 0 } ], "max": "10", "min": "10", "nan_count": 0, "unique_count": 1 } }, { "dtype": "float64", "name": "chris", "stats": { "histogram": [ { "bin_end": -15, "bin_start": -20, "count": 1 }, { "bin_end": -10, "bin_start": -15, "count": 0 }, { "bin_end": -5, "bin_start": -10, "count": 0 }, { "bin_end": 0, "bin_start": -5, "count": 0 }, { "bin_end": 5, "bin_start": 0, "count": 1 }, { "bin_end": 10, "bin_start": 5, "count": 0 }, { "bin_end": 15, "bin_start": 10, "count": 0 }, { "bin_end": 20, "bin_start": 15, "count": 0 }, { "bin_end": 25, "bin_start": 20, "count": 0 }, { "bin_end": 30, "bin_start": 25, "count": 1 } ], "max": "30.0", "min": "-20.0", "nan_count": 0, "unique_count": 3 } }, { "dtype": "bool", "name": "new column", "stats": { "categories": [ { "count": 3, "name": "True" } ], "nan_count": 0, "unique_count": 1 } }, { "dtype": "bool", "name": "new column 2", "stats": { "categories": [ { "count": 3, "name": "False" } ], "nan_count": 0, "unique_count": 1 } }, { "dtype": "int64", "name": "_deepnote_index_column" } ], "row_count": 3, "rows": [ { "_deepnote_index_column": 0, "a": 5, "b": 10, "chris": 30, "new column": true, "new column 2": false }, { "_deepnote_index_column": 1, "a": 6, "b": 10, "chris": 2.4, "new column": true, "new column 2": false }, { "_deepnote_index_column": 2, "a": 7, "b": 10, "chris": -20, "new column": true, "new column 2": false } ] }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchrisnew columnnew column 2
051030.0TrueFalse
16102.4TrueFalse
2710-20.0TrueFalse
\n", "
" ], "text/plain": [ " a b chris new column new column 2\n", "0 5 10 30.0 True False\n", "1 6 10 2.4 True False\n", "2 7 10 -20.0 True False" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Indexing\n", "\n", "Indexing using `iloc`." ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "cell_id": "ba6032d3-f47b-411d-a47f-e3501c9cd4a3", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 8, "execution_start": 1643386757914, "source_hash": "549c0088", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchrisnew column
051030.0True
120102.4True
2710-20.0True
\n", "
" ], "text/plain": [ " a b chris new column\n", "0 5 10 30.0 True\n", "1 20 10 2.4 True\n", "2 7 10 -20.0 True" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.iloc[1,0] = 20\n", "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Indexing using `loc`." ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "cell_id": "64d8f5af-84aa-4706-9e76-2858824728f8", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 50, "execution_start": 1643386786790, "source_hash": "82352452", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchrisnew column
051030.0True
120102.4True
2710-20.0True
\n", "
" ], "text/plain": [ " a b chris new column\n", "0 5 10 30.0 True\n", "1 20 10 2.4 True\n", "2 7 10 -20.0 True" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[1,'a'] = 20\n", "df" ] }, { "cell_type": "markdown", "metadata": { "cell_id": "a8a1ef54-7fe9-4dd3-8cb9-bee16851af20", "deepnote_cell_type": "markdown", "tags": [] }, "source": [ "## Second largest value in a column.\n", "\n", "Find the second largest value in the \"a\" column of df." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "f331642a-cea2-457a-96de-76dad08ce281", "deepnote_cell_type": "code", "deepnote_output_heights": [ 78.8125 ], "deepnote_to_be_reexecuted": false, "execution_millis": 10, "execution_start": 1643386855790, "source_hash": "38ded7f3", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 5\n", "1 20\n", "2 7\n", "Name: a, dtype: int64" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"a\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "c5022c3e-d91b-4de6-ac34-06553827fe2b", "deepnote_cell_type": "code", "deepnote_output_heights": [ 78.8125 ], "deepnote_to_be_reexecuted": false, "execution_millis": 2, "execution_start": 1643386948119, "source_hash": "e21b8e18", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 5\n", "2 7\n", "1 20\n", "Name: a, dtype: int64" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"a\"].sort_values()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "cfcca3cc-ba57-4fc8-a39e-50dedf4e53de", "deepnote_cell_type": "code", "deepnote_output_heights": [ 78.8125 ], "deepnote_to_be_reexecuted": false, "execution_millis": 6, "execution_start": 1643386992997, "source_hash": "765ab8e8", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "1 20\n", "2 7\n", "0 5\n", "Name: a, dtype: int64" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"a\"].sort_values(ascending=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I kept accidentally trying to use `[1]` instead of `iloc[1]`. Here is the correct way to find the element at index 1 in a pandas Series." ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "cell_id": "2887e8ce-6d9c-4744-9d6c-2f3f073ff020", "deepnote_cell_type": "code", "deepnote_output_heights": [ 21.1875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 4, "execution_start": 1643387027488, "source_hash": "f57a6f13", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"a\"].sort_values(ascending=False).iloc[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Without the `ascending` keyword argument, we have to take the second-to-last entry." ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "cell_id": "180f3cb3-29ba-425e-82d5-aa080e61d658", "deepnote_cell_type": "code", "deepnote_output_heights": [ 21.1875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 8, "execution_start": 1643387069710, "source_hash": "e9017366", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "7" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"a\"].sort_values().iloc[-2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Practice with axis\n", "\n", "We haven't used `median` before, I don't think, but the following should make sense. The most important part is recognizing what `axis=0` means." ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchrisnew column
051030.0True
120102.4True
2710-20.0True
\n", "
" ], "text/plain": [ " a b chris new column\n", "0 5 10 30.0 True\n", "1 20 10 2.4 True\n", "2 7 10 -20.0 True" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "cell_id": "320bf021-7f54-4bd6-b9f3-1d079c085c21", "deepnote_cell_type": "code", "deepnote_output_heights": [ 1 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643387140975, "source_hash": "8a04b452", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "a 7.0\n", "b 10.0\n", "chris 2.4\n", "new column 1.0\n", "dtype: float64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.median(axis=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Flattening a NumPy array" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "cell_id": "f2660896-1b10-4465-a69d-a696a3233338", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 2, "execution_start": 1643387221160, "source_hash": "801acc99", "tags": [] }, "outputs": [], "source": [ "A = np.array([[2,5,1],[3,1,10]])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "cell_id": "5fa02e49-4464-484f-9746-6f836b345372", "deepnote_cell_type": "code", "deepnote_output_heights": [ 21.1875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 12, "execution_start": 1643387292722, "source_hash": "aea5aa59", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "array([ 2, 5, 1, 3, 1, 10])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "A.reshape((-1))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Slicing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "b4c14ac2-fc37-4a58-922a-cd79ce4b5870", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 258, "execution_start": 1643387327197, "source_hash": "f804c160", "tags": [] }, "outputs": [ { "data": { "application/vnd.deepnote.dataframe.v3+json": { "column_count": 5, "columns": [ { "dtype": "int64", "name": "a", "stats": { "histogram": [ { "bin_end": 6.5, "bin_start": 5, "count": 1 }, { "bin_end": 8, "bin_start": 6.5, "count": 1 }, { "bin_end": 9.5, "bin_start": 8, "count": 0 }, { "bin_end": 11, "bin_start": 9.5, "count": 0 }, { "bin_end": 12.5, "bin_start": 11, "count": 0 }, { "bin_end": 14, "bin_start": 12.5, "count": 0 }, { "bin_end": 15.5, "bin_start": 14, "count": 0 }, { "bin_end": 17, "bin_start": 15.5, "count": 0 }, { "bin_end": 18.5, "bin_start": 17, "count": 0 }, { "bin_end": 20, "bin_start": 18.5, "count": 1 } ], "max": "20", "min": "5", "nan_count": 0, "unique_count": 3 } }, { "dtype": "int64", "name": "b", "stats": { "histogram": [ { "bin_end": 9.6, "bin_start": 9.5, "count": 0 }, { "bin_end": 9.7, "bin_start": 9.6, "count": 0 }, { "bin_end": 9.8, "bin_start": 9.7, "count": 0 }, { "bin_end": 9.9, "bin_start": 9.8, "count": 0 }, { "bin_end": 10, "bin_start": 9.9, "count": 0 }, { "bin_end": 10.1, "bin_start": 10, "count": 3 }, { "bin_end": 10.2, "bin_start": 10.1, "count": 0 }, { "bin_end": 10.3, "bin_start": 10.2, "count": 0 }, { "bin_end": 10.4, "bin_start": 10.3, "count": 0 }, { "bin_end": 10.5, "bin_start": 10.4, "count": 0 } ], "max": "10", "min": "10", "nan_count": 0, "unique_count": 1 } }, { "dtype": "float64", "name": "chris", "stats": { "histogram": [ { "bin_end": -15, "bin_start": -20, "count": 1 }, { "bin_end": -10, "bin_start": -15, "count": 0 }, { "bin_end": -5, "bin_start": -10, "count": 0 }, { "bin_end": 0, "bin_start": -5, "count": 0 }, { "bin_end": 5, "bin_start": 0, "count": 1 }, { "bin_end": 10, "bin_start": 5, "count": 0 }, { "bin_end": 15, "bin_start": 10, "count": 0 }, { "bin_end": 20, "bin_start": 15, "count": 0 }, { "bin_end": 25, "bin_start": 20, "count": 0 }, { "bin_end": 30, "bin_start": 25, "count": 1 } ], "max": "30.0", "min": "-20.0", "nan_count": 0, "unique_count": 3 } }, { "dtype": "bool", "name": "new column", "stats": { "categories": [ { "count": 3, "name": "True" } ], "nan_count": 0, "unique_count": 1 } }, { "dtype": "bool", "name": "new column 2", "stats": { "categories": [ { "count": 3, "name": "False" } ], "nan_count": 0, "unique_count": 1 } }, { "dtype": "int64", "name": "_deepnote_index_column" } ], "row_count": 3, "rows": [ { "_deepnote_index_column": 0, "a": 5, "b": 10, "chris": 30, "new column": true, "new column 2": false }, { "_deepnote_index_column": 1, "a": 20, "b": 10, "chris": 2.4, "new column": true, "new column 2": false }, { "_deepnote_index_column": 2, "a": 7, "b": 10, "chris": -20, "new column": true, "new column 2": false } ] }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchrisnew columnnew column 2
051030.0TrueFalse
120102.4TrueFalse
2710-20.0TrueFalse
\n", "
" ], "text/plain": [ " a b chris new column new column 2\n", "0 5 10 30.0 True False\n", "1 20 10 2.4 True False\n", "2 7 10 -20.0 True False" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Access every other element in the row with label 1 using slicing" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "cell_id": "aeffc566-d5a5-461d-b4c5-90c360aa683c", "deepnote_cell_type": "code", "deepnote_output_heights": [ 78.8125 ], "deepnote_to_be_reexecuted": false, "execution_millis": 2, "execution_start": 1643387403078, "source_hash": "fff7ff58", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "a 20\n", "chris 2.4\n", "Name: 1, dtype: object" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[1,::2]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Change every other element in the row with label 1 using slicing" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "cell_id": "28b7a6a2-6530-4abc-9ae1-ecb06349cb89", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 4, "execution_start": 1643387479303, "source_hash": "b0e70448", "tags": [] }, "outputs": [ { "ename": "ValueError", "evalue": "Must have equal len keys and value when setting with an iterable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_37915/1568438968.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloc\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/miniconda3/envs/math11/lib/python3.9/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 721\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 722\u001b[0m \u001b[0miloc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"iloc\"\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 723\u001b[0;31m \u001b[0miloc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setitem_with_indexer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 724\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_validate_key\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/math11/lib/python3.9/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_setitem_with_indexer\u001b[0;34m(self, indexer, value, name)\u001b[0m\n\u001b[1;32m 1728\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtake_split_path\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1729\u001b[0m \u001b[0;31m# We have to operate column-wise\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1730\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setitem_with_indexer_split_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1731\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1732\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_setitem_single_block\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindexer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/math11/lib/python3.9/site-packages/pandas/core/indexing.py\u001b[0m in \u001b[0;36m_setitem_with_indexer_split_path\u001b[0;34m(self, indexer, value, name)\u001b[0m\n\u001b[1;32m 1806\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1807\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1808\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1809\u001b[0m \u001b[0;34m\"Must have equal len keys and value \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1810\u001b[0m \u001b[0;34m\"when setting with an iterable\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: Must have equal len keys and value when setting with an iterable" ] } ], "source": [ "df.loc[1,::2] = [i**2 for i in range(3)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Change every other element in the row with label 1 using slicing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "bfea788c-addb-45e5-bfc8-cf2a45a3d99c", "deepnote_cell_type": "code", "tags": [] }, "outputs": [], "source": [ "df.loc[1,::2] = [0,1,4]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "c353bf34-0a49-4655-962b-e650fb70e997", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 37, "execution_start": 1643387479310, "source_hash": "f804c160", "tags": [] }, "outputs": [ { "data": { "application/vnd.deepnote.dataframe.v3+json": { "column_count": 5, "columns": [ { "dtype": "int64", "name": "a", "stats": { "histogram": [ { "bin_end": 0.7, "bin_start": 0, "count": 1 }, { "bin_end": 1.4, "bin_start": 0.7, "count": 0 }, { "bin_end": 2.0999999999999996, "bin_start": 1.4, "count": 0 }, { "bin_end": 2.8, "bin_start": 2.0999999999999996, "count": 0 }, { "bin_end": 3.5, "bin_start": 2.8, "count": 0 }, { "bin_end": 4.199999999999999, "bin_start": 3.5, "count": 0 }, { "bin_end": 4.8999999999999995, "bin_start": 4.199999999999999, "count": 0 }, { "bin_end": 5.6, "bin_start": 4.8999999999999995, "count": 1 }, { "bin_end": 6.3, "bin_start": 5.6, "count": 0 }, { "bin_end": 7, "bin_start": 6.3, "count": 1 } ], "max": "7", "min": "0", "nan_count": 0, "unique_count": 3 } }, { "dtype": "int64", "name": "b", "stats": { "histogram": [ { "bin_end": 9.6, "bin_start": 9.5, "count": 0 }, { "bin_end": 9.7, "bin_start": 9.6, "count": 0 }, { "bin_end": 9.8, "bin_start": 9.7, "count": 0 }, { "bin_end": 9.9, "bin_start": 9.8, "count": 0 }, { "bin_end": 10, "bin_start": 9.9, "count": 0 }, { "bin_end": 10.1, "bin_start": 10, "count": 3 }, { "bin_end": 10.2, "bin_start": 10.1, "count": 0 }, { "bin_end": 10.3, "bin_start": 10.2, "count": 0 }, { "bin_end": 10.4, "bin_start": 10.3, "count": 0 }, { "bin_end": 10.5, "bin_start": 10.4, "count": 0 } ], "max": "10", "min": "10", "nan_count": 0, "unique_count": 1 } }, { "dtype": "float64", "name": "chris", "stats": { "histogram": [ { "bin_end": -15, "bin_start": -20, "count": 1 }, { "bin_end": -10, "bin_start": -15, "count": 0 }, { "bin_end": -5, "bin_start": -10, "count": 0 }, { "bin_end": 0, "bin_start": -5, "count": 0 }, { "bin_end": 5, "bin_start": 0, "count": 1 }, { "bin_end": 10, "bin_start": 5, "count": 0 }, { "bin_end": 15, "bin_start": 10, "count": 0 }, { "bin_end": 20, "bin_start": 15, "count": 0 }, { "bin_end": 25, "bin_start": 20, "count": 0 }, { "bin_end": 30, "bin_start": 25, "count": 1 } ], "max": "30.0", "min": "-20.0", "nan_count": 0, "unique_count": 3 } }, { "dtype": "bool", "name": "new column", "stats": { "categories": [ { "count": 3, "name": "True" } ], "nan_count": 0, "unique_count": 1 } }, { "dtype": "object", "name": "new column 2", "stats": { "categories": [ { "count": 2, "name": "False" }, { "count": 1, "name": "4" } ], "nan_count": 0, "unique_count": 2 } }, { "dtype": "int64", "name": "_deepnote_index_column" } ], "row_count": 3, "rows": [ { "_deepnote_index_column": 0, "a": 5, "b": 10, "chris": 30, "new column": true, "new column 2": "False" }, { "_deepnote_index_column": 1, "a": 0, "b": 10, "chris": 1, "new column": true, "new column 2": "4" }, { "_deepnote_index_column": 2, "a": 7, "b": 10, "chris": -20, "new column": true, "new column 2": "False" } ] }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchrisnew columnnew column 2
051030.0TrueFalse
10101.0True4
2710-20.0TrueFalse
\n", "
" ], "text/plain": [ " a b chris new column new column 2\n", "0 5 10 30.0 True False\n", "1 0 10 1.0 True 4\n", "2 7 10 -20.0 True False" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Square every element in a DataFrame" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "cell_id": "e5476dc2-8793-4f66-af07-ece0ed9708df", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 15, "execution_start": 1643387718414, "source_hash": "8e461c83", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchris
051030.0
120102.4
2710-20.0
\n", "
" ], "text/plain": [ " a b chris\n", "0 5 10 30.0\n", "1 20 10 2.4\n", "2 7 10 -20.0" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2 = df.iloc[:,:3]\n", "df2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "06a97bc0-7695-4538-ac7a-3ebf0a8684c9", "deepnote_cell_type": "code", "deepnote_output_heights": [ 78.8125 ], "deepnote_to_be_reexecuted": false, "execution_millis": 2, "execution_start": 1643387734047, "source_hash": "62b9b55b", "tags": [] }, "outputs": [], "source": [ "df2.loc[1] = df2.loc[1]**2" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "ac03692d-e58d-4fe4-8340-9cc125eaf5a3", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 16, "execution_start": 1643387736160, "source_hash": "caa55e2e", "tags": [] }, "outputs": [ { "data": { "application/vnd.deepnote.dataframe.v3+json": { "column_count": 3, "columns": [ { "dtype": "float64", "name": "a", "stats": { "histogram": [ { "bin_end": 0.7, "bin_start": 0, "count": 1 }, { "bin_end": 1.4, "bin_start": 0.7, "count": 0 }, { "bin_end": 2.0999999999999996, "bin_start": 1.4, "count": 0 }, { "bin_end": 2.8, "bin_start": 2.0999999999999996, "count": 0 }, { "bin_end": 3.5, "bin_start": 2.8, "count": 0 }, { "bin_end": 4.199999999999999, "bin_start": 3.5, "count": 0 }, { "bin_end": 4.8999999999999995, "bin_start": 4.199999999999999, "count": 0 }, { "bin_end": 5.6, "bin_start": 4.8999999999999995, "count": 1 }, { "bin_end": 6.3, "bin_start": 5.6, "count": 0 }, { "bin_end": 7, "bin_start": 6.3, "count": 1 } ], "max": "7.0", "min": "0.0", "nan_count": 0, "unique_count": 3 } }, { "dtype": "float64", "name": "b", "stats": { "histogram": [ { "bin_end": 19, "bin_start": 10, "count": 2 }, { "bin_end": 28, "bin_start": 19, "count": 0 }, { "bin_end": 37, "bin_start": 28, "count": 0 }, { "bin_end": 46, "bin_start": 37, "count": 0 }, { "bin_end": 55, "bin_start": 46, "count": 0 }, { "bin_end": 64, "bin_start": 55, "count": 0 }, { "bin_end": 73, "bin_start": 64, "count": 0 }, { "bin_end": 82, "bin_start": 73, "count": 0 }, { "bin_end": 91, "bin_start": 82, "count": 0 }, { "bin_end": 100, "bin_start": 91, "count": 1 } ], "max": "100.0", "min": "10.0", "nan_count": 0, "unique_count": 2 } }, { "dtype": "float64", "name": "chris", "stats": { "histogram": [ { "bin_end": -15, "bin_start": -20, "count": 1 }, { "bin_end": -10, "bin_start": -15, "count": 0 }, { "bin_end": -5, "bin_start": -10, "count": 0 }, { "bin_end": 0, "bin_start": -5, "count": 0 }, { "bin_end": 5, "bin_start": 0, "count": 1 }, { "bin_end": 10, "bin_start": 5, "count": 0 }, { "bin_end": 15, "bin_start": 10, "count": 0 }, { "bin_end": 20, "bin_start": 15, "count": 0 }, { "bin_end": 25, "bin_start": 20, "count": 0 }, { "bin_end": 30, "bin_start": 25, "count": 1 } ], "max": "30.0", "min": "-20.0", "nan_count": 0, "unique_count": 3 } }, { "dtype": "int64", "name": "_deepnote_index_column" } ], "row_count": 3, "rows": [ { "_deepnote_index_column": 0, "a": 5, "b": 10, "chris": 30 }, { "_deepnote_index_column": 1, "a": 0, "b": 100, "chris": 1 }, { "_deepnote_index_column": 2, "a": 7, "b": 10, "chris": -20 } ] }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchris
05.010.030.0
10.0100.01.0
27.010.0-20.0
\n", "
" ], "text/plain": [ " a b chris\n", "0 5.0 10.0 30.0\n", "1 0.0 100.0 1.0\n", "2 7.0 10.0 -20.0" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "cell_id": "d1872d4c-3889-466f-804c-3286e6adbb86", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 29, "execution_start": 1643387760375, "source_hash": "b5f73824", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchris
025100900.00
14001005.76
249100400.00
\n", "
" ], "text/plain": [ " a b chris\n", "0 25 100 900.00\n", "1 400 100 5.76\n", "2 49 100 400.00" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2**2" ] }, { "cell_type": "markdown", "metadata": { "cell_id": "fd98728f-abe6-4572-bc6c-d3672b497207", "deepnote_cell_type": "markdown", "tags": [] }, "source": [ "Try doing that same thing (squaring every entry in df2) using `map`, `apply`, or `applymap`." ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "cell_id": "355adbf3-c513-439d-8365-45b34a7809ad", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 2, "execution_start": 1643387920495, "source_hash": "8061dcae", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchris
025100900.00
14001005.76
249100400.00
\n", "
" ], "text/plain": [ " a b chris\n", "0 25 100 900.00\n", "1 400 100 5.76\n", "2 49 100 400.00" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.applymap(lambda x: x**2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Warning**: notice that `applymap` doesn't change the original DataFrame." ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "cell_id": "7f22f075-8067-4ee7-b007-ba24616f19d5", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643387979938, "source_hash": "caa55e2e", "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abchris
051030.0
120102.4
2710-20.0
\n", "
" ], "text/plain": [ " a b chris\n", "0 5 10 30.0\n", "1 20 10 2.4\n", "2 7 10 -20.0" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2" ] }, { "cell_type": "markdown", "metadata": { "cell_id": "fe74e8ba-8cec-4220-bfd5-921f9a3c8cfb", "deepnote_cell_type": "code", "deepnote_output_heights": [ 117.21875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 17, "execution_start": 1643388184581, "source_hash": "5cc90411", "tags": [] }, "source": [ "## Example using apply" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "cell_id": "6874da94-7aeb-41cf-926c-c2ae40280861", "deepnote_cell_type": "code", "deepnote_output_heights": [ 78.8125 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643388015610, "source_hash": "339b2fc5", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "a 32.0\n", "b 30.0\n", "chris 12.4\n", "dtype: float64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.apply(lambda c: c.sum(), axis = 0)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 45.0\n", "1 32.4\n", "2 -3.0\n", "dtype: float64" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df2.apply(lambda r: r.sum(), axis = 1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What if we tried to use `applymap` instead?" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "d0801cea-4262-4830-b507-29b01d36c6cd", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 26, "execution_start": 1643388141729, "source_hash": "adda5cc5", "tags": [] }, "outputs": [ { "ename": "AttributeError", "evalue": "'float' object has no attribute 'sum'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapplymap\u001b[0;34m(self, func, na_action)\u001b[0m\n\u001b[1;32m 7842\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap_infer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobject\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_na\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_na\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7843\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 7844\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minfer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"applymap\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7845\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7846\u001b[0m \u001b[0;31m# ----------------------------------------------------------------------\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, raw, result_type, args, **kwds)\u001b[0m\n\u001b[1;32m 7766\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7767\u001b[0m )\n\u001b[0;32m-> 7768\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mop\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7769\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7770\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_action\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mget_result\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_raw\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 186\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_empty_result\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 275\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 276\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mres_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply_series_generator\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 277\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[0;31m# wrap results\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/apply.py\u001b[0m in \u001b[0;36mapply_series_generator\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0;31m# ignore SettingWithCopy here in case the user mutates\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 290\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 291\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mABCSeries\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;31m# If we have a view on v, we need to make a copy because\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36minfer\u001b[0;34m(x)\u001b[0m\n\u001b[1;32m 7840\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mempty\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7841\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap_infer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_na\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_na\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 7842\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap_infer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobject\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_na\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_na\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7843\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7844\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minfer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"applymap\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32mpandas/_libs/lib.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.map_infer\u001b[0;34m()\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m(a)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapplymap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m: 'float' object has no attribute 'sum'" ] } ], "source": [ "df2.applymap(lambda a: a.sum())" ] }, { "cell_type": "markdown", "metadata": { "cell_id": "8a2f01ac-e621-4f78-a062-397e31c77383", "deepnote_cell_type": "markdown", "tags": [] }, "source": [ "Sample exercise: What causes the above error?\n", "\n", "Sample answer: `a` will be a number in the dataframe, and `number.sum()` does not make sense. Should use `apply` and `axis` instead." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A smaller piece of code that raises the same error:" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "ename": "AttributeError", "evalue": "'float' object has no attribute 'sum'", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_37915/223743580.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0ma\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m5.1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAttributeError\u001b[0m: 'float' object has no attribute 'sum'" ] } ], "source": [ "a = 5.1\n", "a.sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Of the three methods, `map`, `applymap`, and `apply`, definitely `apply` is the trickiest to understand. The most natural example with `apply` we have seen was using `pd.to_numeric` on every column. Notice how the input to `pd.to_numeric` should be an entire Series, not an individual entry." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working with datetime entries" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "cell_id": "25702ca0-1d3f-44ab-9232-1307f50f61ea", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 23, "execution_start": 1643388402781, "source_hash": "6b2531b2", "tags": [] }, "outputs": [], "source": [ "df = pd.read_csv(\"../data/spotify_dataset.csv\", na_values=\" \")" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "cell_id": "5bbb132a-192d-4928-9db1-392a78349626", "deepnote_cell_type": "code", "deepnote_output_heights": [ 136.421875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 16, "execution_start": 1643388407679, "source_hash": "25d43fa0", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['Index', 'Highest Charting Position', 'Number of Times Charted',\n", " 'Week of Highest Charting', 'Song Name', 'Streams', 'Artist',\n", " 'Artist Followers', 'Song ID', 'Genre', 'Release Date', 'Weeks Charted',\n", " 'Popularity', 'Danceability', 'Energy', 'Loudness', 'Speechiness',\n", " 'Acousticness', 'Liveness', 'Tempo', 'Duration (ms)', 'Valence',\n", " 'Chord'],\n", " dtype='object')" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.columns" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This way of getting a column from a DataFrame is called *attribute* access:" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "cell_id": "6c00faa6-0850-4acc-b4e5-a508ef83467b", "deepnote_cell_type": "code", "deepnote_output_heights": [ 232.4375 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643388429000, "source_hash": "9634eaf3", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 ['indie rock italiano', 'italian pop']\n", "1 ['australian hip hop']\n", "2 ['pop']\n", "3 ['pop', 'uk pop']\n", "4 ['lgbtq+ hip hop', 'pop rap']\n", " ... \n", "1551 ['dance pop', 'pop', 'uk pop']\n", "1552 ['sertanejo', 'sertanejo universitario']\n", "1553 ['dance pop', 'electropop', 'pop', 'post-teen ...\n", "1554 ['brega funk', 'funk carioca']\n", "1555 ['pop', 'post-teen pop']\n", "Name: Genre, Length: 1556, dtype: object" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.Genre" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It doesn't always work. For example, in the following, we can't use attribute access because *Release Date* has a space in it." ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "cell_id": "12da1ee5-195e-42ad-9bbb-3aecc608e1b5", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643388445735, "source_hash": "c2d0c103", "tags": [] }, "outputs": [ { "ename": "SyntaxError", "evalue": "invalid syntax (3791632194.py, line 1)", "output_type": "error", "traceback": [ "\u001b[0;36m File \u001b[0;32m\"/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_37915/3791632194.py\"\u001b[0;36m, line \u001b[0;32m1\u001b[0m\n\u001b[0;31m df.Release Date\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" ] } ], "source": [ "df.Release Date" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "cell_id": "ed0f2a84-b0c9-47db-bf5d-0a9e9c5c09ca", "deepnote_cell_type": "code", "deepnote_output_heights": [ 232.4375 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643388460752, "source_hash": "9e0b7036", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 2017-12-08\n", "1 2021-07-09\n", "2 2021-05-21\n", "3 2021-06-25\n", "4 2021-07-23\n", " ... \n", "1551 2017-06-02\n", "1552 2019-10-11\n", "1553 2018-01-12\n", "1554 2019-09-25\n", "1555 2019-11-13\n", "Name: Release Date, Length: 1556, dtype: object" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"Release Date\"]" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "cell_id": "aa45def3-1eec-4144-8913-ee496e4e4bdb", "deepnote_cell_type": "code", "deepnote_output_heights": [ 232.4375 ], "deepnote_to_be_reexecuted": false, "execution_millis": 9, "execution_start": 1643388521349, "source_hash": "a05f272f", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0 8.0\n", "1 9.0\n", "2 21.0\n", "3 25.0\n", "4 23.0\n", " ... \n", "1551 2.0\n", "1552 11.0\n", "1553 12.0\n", "1554 25.0\n", "1555 13.0\n", "Name: Release Date, Length: 1556, dtype: float64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_datetime(df[\"Release Date\"]).dt.day" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "cell_id": "8dda9b06-7f4d-444d-a9a9-9261982253ba", "deepnote_cell_type": "code", "deepnote_output_heights": [ 232.4375 ], "deepnote_to_be_reexecuted": false, "execution_millis": 0, "execution_start": 1643388602047, "source_hash": "993d219d", "tags": [] }, "outputs": [], "source": [ "year_series = pd.to_datetime(df[\"Release Date\"]).dt.year" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Sample exercise: How many `2019`s are there in `year_series`?" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "cell_id": "35d0392f-2ff8-4fc7-9a39-f808d5927712", "deepnote_cell_type": "code", "deepnote_output_heights": [ 21.1875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 3, "execution_start": 1643388715520, "source_hash": "6ae1b884", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "181" ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(year_series == 2019).sum()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we're trying a different method, but it doesn't work at first because of null values. I realized after class that we could have used a keyword argument to `map` called `na_action`, but during class we removed the null values by hand." ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "cell_id": "333509a2-a623-4627-a978-f3dced88082d", "deepnote_cell_type": "code", "deepnote_output_heights": [ 233 ], "deepnote_to_be_reexecuted": false, "execution_millis": 30, "execution_start": 1643388788276, "source_hash": "ddf125b8", "tags": [] }, "outputs": [ { "ename": "TypeError", "evalue": "'float' object is not subscriptable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_37915/337003375.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Release Date\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2019\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/miniconda3/envs/math11/lib/python3.9/site-packages/pandas/core/series.py\u001b[0m in \u001b[0;36mmap\u001b[0;34m(self, arg, na_action)\u001b[0m\n\u001b[1;32m 4159\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4160\u001b[0m \"\"\"\n\u001b[0;32m-> 4161\u001b[0;31m \u001b[0mnew_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_map_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mna_action\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mna_action\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4162\u001b[0m return self._constructor(new_values, index=self.index).__finalize__(\n\u001b[1;32m 4163\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"map\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/math11/lib/python3.9/site-packages/pandas/core/base.py\u001b[0m in \u001b[0;36m_map_values\u001b[0;34m(self, mapper, na_action)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 869\u001b[0m \u001b[0;31m# mapper is a function\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 870\u001b[0;31m \u001b[0mnew_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmap_f\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 871\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 872\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mnew_values\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/math11/lib/python3.9/site-packages/pandas/_libs/lib.pyx\u001b[0m in \u001b[0;36mpandas._libs.lib.map_infer\u001b[0;34m()\u001b[0m\n", "\u001b[0;32m/var/folders/8j/gshrlmtn7dg4qtztj4d4t_w40000gn/T/ipykernel_37915/337003375.py\u001b[0m in \u001b[0;36m\u001b[0;34m(s)\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Release Date\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m2019\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mTypeError\u001b[0m: 'float' object is not subscriptable" ] } ], "source": [ "df[\"Release Date\"].map(lambda s: s[:4] == 2019)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "cell_id": "752d9dc3-4693-430b-a8f1-fcc167dd82e9", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 11, "execution_start": 1643388856109, "source_hash": "47061172", "tags": [] }, "outputs": [ { "ename": "TypeError", "evalue": "'float' object is not subscriptable", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mTypeError\u001b[0m: 'float' object is not subscriptable" ] } ], "source": [ "np.nan[:4]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "During class I made another mistake before getting to the next cell, but I deleted it from this notebook because it's more confusing than helpful.\n", "\n", "Here is an alternate approach." ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "cell_id": "327bc4a8-199a-4fe6-a45a-5bef11ad74de", "deepnote_cell_type": "code", "deepnote_to_be_reexecuted": false, "execution_millis": 2, "execution_start": 1643389005077, "source_hash": "d9d052d5", "tags": [] }, "outputs": [], "source": [ "clean = df[\"Release Date\"][~df[\"Release Date\"].isna()]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "cell_id": "f31583c0-a0a2-4463-8cc6-c5d1dbd196f5", "deepnote_cell_type": "code", "deepnote_output_heights": [ 21.1875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643389110528, "source_hash": "9eb0b2f4", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "pandas.core.series.Series" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(clean)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2017-12-08\n", "1 2021-07-09\n", "2 2021-05-21\n", "3 2021-06-25\n", "4 2021-07-23\n", " ... \n", "1551 2017-06-02\n", "1552 2019-10-11\n", "1553 2018-01-12\n", "1554 2019-09-25\n", "1555 2019-11-13\n", "Name: Release Date, Length: 1545, dtype: object" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Two different ways to count `2019`s in this pandas Series of strings. Notice that they give the same answer as the `.dt.year` method from above." ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "cell_id": "b3f67f74-26d5-41c0-a9dc-0fc321e62429", "deepnote_cell_type": "code", "deepnote_output_heights": [ 21.1875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 5, "execution_start": 1643389020671, "source_hash": "87192a9c", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "181" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean.map(lambda s: s[:4] == \"2019\").sum()" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "cell_id": "821ba94b-c0ac-4122-bdd3-5f1231879659", "deepnote_cell_type": "code", "deepnote_output_heights": [ 21.1875 ], "deepnote_to_be_reexecuted": false, "execution_millis": 10, "execution_start": 1643389057991, "source_hash": "107a5169", "tags": [] }, "outputs": [ { "data": { "text/plain": [ "181" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "clean.map(lambda s: int(s[:4]) == 2019).sum()" ] } ], "metadata": { "deepnote": { "is_reactive": false }, "deepnote_execution_queue": [], "deepnote_notebook_id": "2cfe386f-b03b-4bf9-8aba-f1d8192309bc", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.9" } }, "nbformat": 4, "nbformat_minor": 4 }