Add files via upload

prasertcbs · web-flow · commit ae8f4bdf4b84 · 2021-02-12T13:03:08.000+07:00
diff --git a/pandas_clean_multilevel_table.ipynb b/pandas_clean_multilevel_table.ipynb
@@ -0,0 +1,250 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "collect-story",
+   "metadata": {},
+   "source": [
+    "# Clean multilevel table\n",
+    "* starbucks nutrition: https://www.starbucks.ca/menu/nutrition-info\n",
+    "* starbucks bakery nutrition: https://globalassets.starbucks.com/assets/c4874ecf0a8b418f9436b1f1900cc2fa.pdf"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "whole-toolbox",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "* author:  [Prasert Kanawattanachai](mailto:prasert.k@chula.ac.th)\n",
+    "* YouTube: https://www.youtube.com/prasertcbs\n",
+    "* github: https://github.com/prasertcbs/\n",
+    "* [Chulalongkorn Business School](https://www.cbs.chula.ac.th/en/)\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "willing-hardwood",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "pd.set_option('display.max_rows', None)\n",
+    "\n",
+    "%config InlineBackend.figure_format='retina'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "virgin-miller",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 64
+    },
+    "colab_type": "code",
+    "id": "fdr0pYIf7P-_",
+    "outputId": "c919deae-c99b-44b0-8924-4d2355ca0b63"
+   },
+   "outputs": [],
+   "source": [
+    "print(f'Python version:  {sys.version}')\n",
+    "print(f'pandas version:  {pd.__version__}')\n",
+    "\n",
+    "pd.Timestamp.now()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cooperative-template",
+   "metadata": {},
+   "source": [
+    "## read data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "convinced-insertion",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the starbucks_bakery.xlsx workbook from the basic-dataset GitHub repository.\n",
+    "DATA_URL='https://github.com/prasertcbs/basic-dataset/raw/master/starbucks_bakery.xlsx'\n",
+    "df=pd.read_excel(DATA_URL)\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "postal-ontario",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df=df.dropna(subset=['Product Name']).reset_index(drop=True) # blank rows\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "living-force",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "terminal-philippines",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.loc[0, :]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "smooth-greece",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "type(df.loc[0, 'Calories'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "numeric-hungarian",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.isnan(df.loc[0, 'Calories']) # check numpy.float64 isnan"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cardiovascular-hydrogen",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Rows with a missing 'Calories' value are treated as category headers:\n",
+    "# keep their 'Product Name' as the 'Category' label and leave NaN on every other row.\n",
+    "# Series.where + isna is vectorised and, unlike np.isnan in a row-wise apply,\n",
+    "# does not raise if 'Calories' was read as an object column.\n",
+    "df['Category']=df['Product Name'].where(df['Calories'].isna())\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "final-disabled",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['Category']=df['Category'].ffill()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fresh-harvey",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sustained-injection",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df=df.dropna(subset=['Calories']).reset_index(drop=True) # blank rows\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "serious-lighting",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "corresponding-counter",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the table to CSV with 'Category' as the leading column and no index column.\n",
+    "csv_columns=[\n",
+    "    'Category', 'Product Name', 'Label Wt (g)', 'Calories', 'Total fat (g)',\n",
+    "    'Saturated  Fat (g)', 'Trans Fat (g)', 'Cholesterol (mg)',\n",
+    "    'Sodium (mg)', 'Carbohydrates (g)', 'Fiber (g)', 'Sugar (g)',\n",
+    "    'Protein (g)', 'Vitamin A (%DV)', 'Vitamin C (%DV)', 'Calcium (%DV)',\n",
+    "    'Iron (%DV)',\n",
+    "]\n",
+    "df[csv_columns].to_csv('starbucks_bakery_nutrition_fact.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "pointed-arlington",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the table to an Excel workbook with 'Category' as the leading column and no index column.\n",
+    "excel_columns=[\n",
+    "    'Category', 'Product Name', 'Label Wt (g)', 'Calories', 'Total fat (g)',\n",
+    "    'Saturated  Fat (g)', 'Trans Fat (g)', 'Cholesterol (mg)',\n",
+    "    'Sodium (mg)', 'Carbohydrates (g)', 'Fiber (g)', 'Sugar (g)',\n",
+    "    'Protein (g)', 'Vitamin A (%DV)', 'Vitamin C (%DV)', 'Calcium (%DV)',\n",
+    "    'Iron (%DV)',\n",
+    "]\n",
+    "df[excel_columns].to_excel('starbucks_bakery_nutrition_fact.xlsx', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dress-quantum",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.9"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}