diff --git a/02_pandas_tips&tricks/Excercises/07_grouping.ipynb b/02_pandas_tips&tricks/Excercises/07_grouping.ipynb index d49a518..ad32fbc 100644 --- a/02_pandas_tips&tricks/Excercises/07_grouping.ipynb +++ b/02_pandas_tips&tricks/Excercises/07_grouping.ipynb @@ -123,9 +123,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3.9.7 ('base')", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -137,7 +137,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.16" + "version": "3.9.7" + }, + "vscode": { + "interpreter": { + "hash": "9b9ced3d0af0cb92224640680f81aa3cf99730ecb34e6382d788e77830a0b6a6" + } } }, "nbformat": 4, diff --git a/02_pandas_tips&tricks/Excercises/08_grouping.ipynb b/02_pandas_tips&tricks/Excercises/08_grouping.ipynb index 16fbcdd..c2d580c 100644 --- a/02_pandas_tips&tricks/Excercises/08_grouping.ipynb +++ b/02_pandas_tips&tricks/Excercises/08_grouping.ipynb @@ -133,9 +133,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3.9.7 ('base')", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -147,7 +147,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.11" + "version": "3.9.7" + }, + "vscode": { + "interpreter": { + "hash": "9b9ced3d0af0cb92224640680f81aa3cf99730ecb34e6382d788e77830a0b6a6" + } } }, "nbformat": 4, diff --git a/02_pandas_tips&tricks/Solutions/06_filtering_and_sorting.ipynb b/02_pandas_tips&tricks/Solutions/06_filtering_and_sorting.ipynb index abbfcb5..165f214 100644 --- a/02_pandas_tips&tricks/Solutions/06_filtering_and_sorting.ipynb +++ b/02_pandas_tips&tricks/Solutions/06_filtering_and_sorting.ipynb @@ -1918,7 +1918,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.7 ('base')", "language": 
"python", "name": "python3" }, @@ -1932,7 +1932,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.9.7" }, "toc": { "base_numbering": 1, @@ -1946,6 +1946,11 @@ "toc_position": {}, "toc_section_display": true, "toc_window_display": false + }, + "vscode": { + "interpreter": { + "hash": "9b9ced3d0af0cb92224640680f81aa3cf99730ecb34e6382d788e77830a0b6a6" + } } }, "nbformat": 4, diff --git a/02_pandas_tips&tricks/Solutions/07_grouping.ipynb b/02_pandas_tips&tricks/Solutions/07_grouping.ipynb new file mode 100644 index 0000000..27f511a --- /dev/null +++ b/02_pandas_tips&tricks/Solutions/07_grouping.ipynb @@ -0,0 +1,557 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ex - GroupBy" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3. Assign it to a variable called drinks." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countrybeer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcoholcontinent
0Afghanistan0000.0AS
1Albania89132544.9EU
2Algeria250140.7AF
3Andorra24513831212.4EU
4Angola21757455.9AF
\n", + "
" + ], + "text/plain": [ + " country beer_servings spirit_servings wine_servings \\\n", + "0 Afghanistan 0 0 0 \n", + "1 Albania 89 132 54 \n", + "2 Algeria 25 0 14 \n", + "3 Andorra 245 138 312 \n", + "4 Angola 217 57 45 \n", + "\n", + " total_litres_of_pure_alcohol continent \n", + "0 0.0 AS \n", + "1 4.9 EU \n", + "2 0.7 AF \n", + "3 12.4 EU \n", + "4 5.9 AF " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drinks = pd.read_csv('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/drinks.csv')\n", + "drinks.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4. Which continent drinks more beer on average?" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "continent\n", + "AF 61.471698\n", + "AS 37.045455\n", + "EU 193.777778\n", + "OC 89.687500\n", + "SA 175.083333\n", + "Name: beer_servings, dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drinks.groupby('continent').beer_servings.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5. For each continent print the statistics for wine consumption." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "continent \n", + "AF count 53.000000\n", + " mean 16.264151\n", + " std 38.846419\n", + " min 0.000000\n", + " 25% 1.000000\n", + " 50% 2.000000\n", + " 75% 13.000000\n", + " max 233.000000\n", + "AS count 44.000000\n", + " mean 9.068182\n", + " std 21.667034\n", + " min 0.000000\n", + " 25% 0.000000\n", + " 50% 1.000000\n", + " 75% 8.000000\n", + " max 123.000000\n", + "EU count 45.000000\n", + " mean 142.222222\n", + " std 97.421738\n", + " min 0.000000\n", + " 25% 59.000000\n", + " 50% 128.000000\n", + " 75% 195.000000\n", + " max 370.000000\n", + "OC count 16.000000\n", + " mean 35.625000\n", + " std 64.555790\n", + " min 0.000000\n", + " 25% 1.000000\n", + " 50% 8.500000\n", + " 75% 23.250000\n", + " max 212.000000\n", + "SA count 12.000000\n", + " mean 62.416667\n", + " std 88.620189\n", + " min 1.000000\n", + " 25% 3.000000\n", + " 50% 12.000000\n", + " 75% 98.500000\n", + " max 221.000000\n", + "dtype: float64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drinks.groupby('continent').wine_servings.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6. Print the mean alcohol consumption per continent for every column" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
beer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcohol
continent
AF61.47169816.33962316.2641513.007547
AS37.04545560.8409099.0681822.170455
EU193.777778132.555556142.2222228.617778
OC89.68750058.43750035.6250003.381250
SA175.083333114.75000062.4166676.308333
\n", + "
" + ], + "text/plain": [ + " beer_servings spirit_servings wine_servings \\\n", + "continent \n", + "AF 61.471698 16.339623 16.264151 \n", + "AS 37.045455 60.840909 9.068182 \n", + "EU 193.777778 132.555556 142.222222 \n", + "OC 89.687500 58.437500 35.625000 \n", + "SA 175.083333 114.750000 62.416667 \n", + "\n", + " total_litres_of_pure_alcohol \n", + "continent \n", + "AF 3.007547 \n", + "AS 2.170455 \n", + "EU 8.617778 \n", + "OC 3.381250 \n", + "SA 6.308333 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drinks.groupby('continent').mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7. Print the median alcohol consumption per continent for every column" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
beer_servingsspirit_servingswine_servingstotal_litres_of_pure_alcohol
continent
AF32.03.02.02.30
AS17.516.01.01.20
EU219.0122.0128.010.00
OC52.537.08.51.75
SA162.5108.512.06.85
\n", + "
" + ], + "text/plain": [ + " beer_servings spirit_servings wine_servings \\\n", + "continent \n", + "AF 32.0 3.0 2.0 \n", + "AS 17.5 16.0 1.0 \n", + "EU 219.0 122.0 128.0 \n", + "OC 52.5 37.0 8.5 \n", + "SA 162.5 108.5 12.0 \n", + "\n", + " total_litres_of_pure_alcohol \n", + "continent \n", + "AF 2.30 \n", + "AS 1.20 \n", + "EU 10.00 \n", + "OC 1.75 \n", + "SA 6.85 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drinks.groupby('continent').median()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 8. Print the mean, min and max values for spirit consumption.\n", + "#### This time output a DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
meanminmax
continent
AF16.3396230152
AS60.8409090326
EU132.5555560373
OC58.4375000254
SA114.75000025302
\n", + "
" + ], + "text/plain": [ + " mean min max\n", + "continent \n", + "AF 16.339623 0 152\n", + "AS 60.840909 0 326\n", + "EU 132.555556 0 373\n", + "OC 58.437500 0 254\n", + "SA 114.750000 25 302" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "drinks.groupby('continent').spirit_servings.agg(['mean', 'min', 'max'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.7 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "vscode": { + "interpreter": { + "hash": "9b9ced3d0af0cb92224640680f81aa3cf99730ecb34e6382d788e77830a0b6a6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/02_pandas_tips&tricks/Solutions/08_grouping.ipynb b/02_pandas_tips&tricks/Solutions/08_grouping.ipynb new file mode 100644 index 0000000..1283e66 --- /dev/null +++ b/02_pandas_tips&tricks/Solutions/08_grouping.ipynb @@ -0,0 +1,593 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Occupation" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3. Assign it to a variable called users." + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agegenderoccupationzip_code
user_id
124Mtechnician85711
253Fother94043
323Mwriter32067
424Mtechnician43537
533Fother15213
\n", + "
" + ], + "text/plain": [ + " age gender occupation zip_code\n", + "user_id \n", + "1 24 M technician 85711\n", + "2 53 F other 94043\n", + "3 23 M writer 32067\n", + "4 24 M technician 43537\n", + "5 33 F other 15213" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users = pd.read_table('https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user', \n", + " sep='|', index_col='user_id')\n", + "users.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4. Discover what is the mean age per occupation" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "occupation\n", + "administrator 38.746835\n", + "artist 31.392857\n", + "doctor 43.571429\n", + "educator 42.010526\n", + "engineer 36.388060\n", + "entertainment 29.222222\n", + "executive 38.718750\n", + "healthcare 41.562500\n", + "homemaker 32.571429\n", + "lawyer 36.750000\n", + "librarian 40.000000\n", + "marketing 37.615385\n", + "none 26.555556\n", + "other 34.523810\n", + "programmer 33.121212\n", + "retired 63.071429\n", + "salesman 35.666667\n", + "scientist 35.548387\n", + "student 22.081633\n", + "technician 33.148148\n", + "writer 36.311111\n", + "Name: age, dtype: float64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.groupby('occupation').age.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5. 
Discover the Male ratio per occupation and sort it from the most to the least" + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "doctor 100.000000\n", + "engineer 97.014925\n", + "technician 96.296296\n", + "retired 92.857143\n", + "programmer 90.909091\n", + "executive 90.625000\n", + "scientist 90.322581\n", + "entertainment 88.888889\n", + "lawyer 83.333333\n", + "salesman 75.000000\n", + "educator 72.631579\n", + "student 69.387755\n", + "other 65.714286\n", + "marketing 61.538462\n", + "writer 57.777778\n", + "none 55.555556\n", + "administrator 54.430380\n", + "artist 53.571429\n", + "librarian 43.137255\n", + "healthcare 31.250000\n", + "homemaker 14.285714\n", + "dtype: float64" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create a function\n", + "def gender_to_numeric(x):\n", + " if x == 'M':\n", + " return 1\n", + " if x == 'F':\n", + " return 0\n", + "\n", + "# apply the function to the gender column and create a new column\n", + "users['gender_n'] = users['gender'].apply(gender_to_numeric)\n", + "\n", + "\n", + "a = users.groupby('occupation').gender_n.sum() / users.occupation.value_counts() * 100 \n", + "\n", + "# sort to the most male \n", + "a.sort_values(ascending = False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6. For each occupation, calculate the minimum and maximum ages" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
minmax
occupation
administrator2170
artist1948
doctor2864
educator2363
engineer2270
entertainment1550
executive2269
healthcare2262
homemaker2050
lawyer2153
librarian2369
marketing2455
none1155
other1364
programmer2063
retired5173
salesman1866
scientist2355
student742
technician2155
writer1860
\n", + "
" + ], + "text/plain": [ + " min max\n", + "occupation \n", + "administrator 21 70\n", + "artist 19 48\n", + "doctor 28 64\n", + "educator 23 63\n", + "engineer 22 70\n", + "entertainment 15 50\n", + "executive 22 69\n", + "healthcare 22 62\n", + "homemaker 20 50\n", + "lawyer 21 53\n", + "librarian 23 69\n", + "marketing 24 55\n", + "none 11 55\n", + "other 13 64\n", + "programmer 20 63\n", + "retired 51 73\n", + "salesman 18 66\n", + "scientist 23 55\n", + "student 7 42\n", + "technician 21 55\n", + "writer 18 60" + ] + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.groupby('occupation').age.agg(['min', 'max'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7. For each combination of occupation and gender, calculate the mean age" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "occupation gender\n", + "administrator F 40.638889\n", + " M 37.162791\n", + "artist F 30.307692\n", + " M 32.333333\n", + "doctor M 43.571429\n", + "educator F 39.115385\n", + " M 43.101449\n", + "engineer F 29.500000\n", + " M 36.600000\n", + "entertainment F 31.000000\n", + " M 29.000000\n", + "executive F 44.000000\n", + " M 38.172414\n", + "healthcare F 39.818182\n", + " M 45.400000\n", + "homemaker F 34.166667\n", + " M 23.000000\n", + "lawyer F 39.500000\n", + " M 36.200000\n", + "librarian F 40.000000\n", + " M 40.000000\n", + "marketing F 37.200000\n", + " M 37.875000\n", + "none F 36.500000\n", + " M 18.600000\n", + "other F 35.472222\n", + " M 34.028986\n", + "programmer F 32.166667\n", + " M 33.216667\n", + "retired F 70.000000\n", + " M 62.538462\n", + "salesman F 27.000000\n", + " M 38.555556\n", + "scientist F 28.333333\n", + " M 36.321429\n", + "student F 20.750000\n", + " M 22.669118\n", + "technician F 38.000000\n", + " M 32.961538\n", + "writer F 37.631579\n", + " M 35.346154\n", + 
"Name: age, dtype: float64" + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users.groupby(['occupation', 'gender']).age.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 8. For each occupation present the percentage of women and men" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "occupation gender\n", + "administrator F 45.569620\n", + " M 54.430380\n", + "artist F 46.428571\n", + " M 53.571429\n", + "doctor M 100.000000\n", + "educator F 27.368421\n", + " M 72.631579\n", + "engineer F 2.985075\n", + " M 97.014925\n", + "entertainment F 11.111111\n", + " M 88.888889\n", + "executive F 9.375000\n", + " M 90.625000\n", + "healthcare F 68.750000\n", + " M 31.250000\n", + "homemaker F 85.714286\n", + " M 14.285714\n", + "lawyer F 16.666667\n", + " M 83.333333\n", + "librarian F 56.862745\n", + " M 43.137255\n", + "marketing F 38.461538\n", + " M 61.538462\n", + "none F 44.444444\n", + " M 55.555556\n", + "other F 34.285714\n", + " M 65.714286\n", + "programmer F 9.090909\n", + " M 90.909091\n", + "retired F 7.142857\n", + " M 92.857143\n", + "salesman F 25.000000\n", + " M 75.000000\n", + "scientist F 9.677419\n", + " M 90.322581\n", + "student F 30.612245\n", + " M 69.387755\n", + "technician F 3.703704\n", + " M 96.296296\n", + "writer F 42.222222\n", + " M 57.777778\n", + "Name: gender, dtype: float64" + ] + }, + "execution_count": 154, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create a data frame and apply count to gender\n", + "gender_ocup = users.groupby(['occupation', 'gender']).agg({'gender': 'count'})\n", + "\n", + "# create a DataFrame and apply count for each occupation\n", + "occup_count = users.groupby(['occupation']).agg('count')\n", + "\n", + "# divide the gender_ocup per the occup_count and multiply per 100\n", + "occup_gender 
= gender_ocup.div(occup_count, level = \"occupation\") * 100\n", + "\n", + "# present all rows from the 'gender column'\n", + "occup_gender.loc[: , 'gender']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.7 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "vscode": { + "interpreter": { + "hash": "9b9ced3d0af0cb92224640680f81aa3cf99730ecb34e6382d788e77830a0b6a6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/02_pandas_tips&tricks/Solutions/09_grouping.ipynb b/02_pandas_tips&tricks/Solutions/09_grouping.ipynb new file mode 100644 index 0000000..a825886 --- /dev/null +++ b/02_pandas_tips&tricks/Solutions/09_grouping.ipynb @@ -0,0 +1,749 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regiment" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2. 
Create the DataFrame with the following values:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "raw_data = {'regiment': ['Nighthawks', 'Nighthawks', 'Nighthawks', 'Nighthawks', 'Dragoons', 'Dragoons', 'Dragoons', 'Dragoons', 'Scouts', 'Scouts', 'Scouts', 'Scouts'], \n", + " 'company': ['1st', '1st', '2nd', '2nd', '1st', '1st', '2nd', '2nd','1st', '1st', '2nd', '2nd'], \n", + " 'name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze', 'Jacon', 'Ryaner', 'Sone', 'Sloan', 'Piger', 'Riani', 'Ali'], \n", + " 'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],\n", + " 'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70]}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3. Assign it to a variable called regiment.\n", + "#### Don't forget to name each column" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
regimentcompanynamepreTestScorepostTestScore
0Nighthawks1stMiller425
1Nighthawks1stJacobson2494
2Nighthawks2ndAli3157
3Nighthawks2ndMilner262
4Dragoons1stCooze370
5Dragoons1stJacon425
6Dragoons2ndRyaner2494
7Dragoons2ndSone3157
8Scouts1stSloan262
9Scouts1stPiger370
10Scouts2ndRiani262
11Scouts2ndAli370
\n", + "
" + ], + "text/plain": [ + " regiment company name preTestScore postTestScore\n", + "0 Nighthawks 1st Miller 4 25\n", + "1 Nighthawks 1st Jacobson 24 94\n", + "2 Nighthawks 2nd Ali 31 57\n", + "3 Nighthawks 2nd Milner 2 62\n", + "4 Dragoons 1st Cooze 3 70\n", + "5 Dragoons 1st Jacon 4 25\n", + "6 Dragoons 2nd Ryaner 24 94\n", + "7 Dragoons 2nd Sone 31 57\n", + "8 Scouts 1st Sloan 2 62\n", + "9 Scouts 1st Piger 3 70\n", + "10 Scouts 2nd Riani 2 62\n", + "11 Scouts 2nd Ali 3 70" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regiment = pd.DataFrame(raw_data, columns = raw_data.keys())\n", + "regiment" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4. What is the mean preTestScore from the regiment Nighthawks? " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
preTestScorepostTestScore
regiment
Dragoons15.5061.5
Nighthawks15.2559.5
Scouts2.5066.0
\n", + "
" + ], + "text/plain": [ + " preTestScore postTestScore\n", + "regiment \n", + "Dragoons 15.50 61.5\n", + "Nighthawks 15.25 59.5\n", + "Scouts 2.50 66.0" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regiment[regiment['regiment'] == 'Nighthawks'].groupby('regiment').mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5. Present general statistics by company" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postTestScorepreTestScore
company
1stcount6.0000006.000000
mean57.6666676.666667
std27.4857548.524475
min25.0000002.000000
25%34.2500003.000000
50%66.0000003.500000
75%70.0000004.000000
max94.00000024.000000
2ndcount6.0000006.000000
mean67.00000015.500000
std14.05702714.652645
min57.0000002.000000
25%58.2500002.250000
50%62.00000013.500000
75%68.00000029.250000
max94.00000031.000000
\n", + "
" + ], + "text/plain": [ + " postTestScore preTestScore\n", + "company \n", + "1st count 6.000000 6.000000\n", + " mean 57.666667 6.666667\n", + " std 27.485754 8.524475\n", + " min 25.000000 2.000000\n", + " 25% 34.250000 3.000000\n", + " 50% 66.000000 3.500000\n", + " 75% 70.000000 4.000000\n", + " max 94.000000 24.000000\n", + "2nd count 6.000000 6.000000\n", + " mean 67.000000 15.500000\n", + " std 14.057027 14.652645\n", + " min 57.000000 2.000000\n", + " 25% 58.250000 2.250000\n", + " 50% 62.000000 13.500000\n", + " 75% 68.000000 29.250000\n", + " max 94.000000 31.000000" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regiment.groupby('company').describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6. What is the mean of each company's preTestScore?" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "company\n", + "1st 6.666667\n", + "2nd 15.500000\n", + "Name: preTestScore, dtype: float64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regiment.groupby('company').preTestScore.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7. Present the mean preTestScores grouped by regiment and company" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "regiment company\n", + "Dragoons 1st 3.5\n", + " 2nd 27.5\n", + "Nighthawks 1st 14.0\n", + " 2nd 16.5\n", + "Scouts 1st 2.5\n", + " 2nd 2.5\n", + "Name: preTestScore, dtype: float64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regiment.groupby(['regiment', 'company']).preTestScore.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 8. 
Present the mean preTestScores grouped by regiment and company without hierarchical indexing" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
company1st2nd
regiment
Dragoons3.527.5
Nighthawks14.016.5
Scouts2.52.5
\n", + "
" + ], + "text/plain": [ + "company 1st 2nd\n", + "regiment \n", + "Dragoons 3.5 27.5\n", + "Nighthawks 14.0 16.5\n", + "Scouts 2.5 2.5" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regiment.groupby(['regiment', 'company']).preTestScore.mean().unstack()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 9. Group the entire dataframe by regiment and company" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
preTestScorepostTestScore
regimentcompany
Dragoons1st3.547.5
2nd27.575.5
Nighthawks1st14.059.5
2nd16.559.5
Scouts1st2.566.0
2nd2.566.0
\n", + "
" + ], + "text/plain": [ + " preTestScore postTestScore\n", + "regiment company \n", + "Dragoons 1st 3.5 47.5\n", + " 2nd 27.5 75.5\n", + "Nighthawks 1st 14.0 59.5\n", + " 2nd 16.5 59.5\n", + "Scouts 1st 2.5 66.0\n", + " 2nd 2.5 66.0" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regiment.groupby(['regiment', 'company']).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 10. What is the number of observations in each regiment and company" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "company regiment \n", + "1st Dragoons 2\n", + " Nighthawks 2\n", + " Scouts 2\n", + "2nd Dragoons 2\n", + " Nighthawks 2\n", + " Scouts 2\n", + "dtype: int64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "regiment.groupby(['company', 'regiment']).size()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 11. 
Iterate over a group and print the name and the whole data from the regiment" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dragoons\n", + " regiment company name preTestScore postTestScore\n", + "4 Dragoons 1st Cooze 3 70\n", + "5 Dragoons 1st Jacon 4 25\n", + "6 Dragoons 2nd Ryaner 24 94\n", + "7 Dragoons 2nd Sone 31 57\n", + "Nighthawks\n", + " regiment company name preTestScore postTestScore\n", + "0 Nighthawks 1st Miller 4 25\n", + "1 Nighthawks 1st Jacobson 24 94\n", + "2 Nighthawks 2nd Ali 31 57\n", + "3 Nighthawks 2nd Milner 2 62\n", + "Scouts\n", + " regiment company name preTestScore postTestScore\n", + "8 Scouts 1st Sloan 2 62\n", + "9 Scouts 1st Piger 3 70\n", + "10 Scouts 2nd Riani 2 62\n", + "11 Scouts 2nd Ali 3 70\n" + ] + } + ], + "source": [ + "# Group the dataframe by regiment, and for each regiment,\n", + "for name, group in regiment.groupby('regiment'):\n", + " # print the name of the regiment\n", + " print(name)\n", + " # print the data of that regiment\n", + " print(group)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.7 ('base')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "vscode": { + "interpreter": { + "hash": "9b9ced3d0af0cb92224640680f81aa3cf99730ecb34e6382d788e77830a0b6a6" + } + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}