Skip to content

Commit ae8f4bd

Browse files
authored
Add files via upload
1 parent 453da66 commit ae8f4bd

File tree

1 file changed

+250
-0
lines changed

1 file changed

+250
-0
lines changed

pandas_clean_multilevel_table.ipynb

+250
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "collect-story",
6+
"metadata": {},
7+
"source": [
8+
"# Clean multilevel table\n",
9+
"* Starbucks nutrition: https://www.starbucks.ca/menu/nutrition-info\n",
10+
"* Starbucks bakery nutrition: https://globalassets.starbucks.com/assets/c4874ecf0a8b418f9436b1f1900cc2fa.pdf"
11+
]
12+
},
13+
{
14+
"cell_type": "markdown",
15+
"id": "whole-toolbox",
16+
"metadata": {},
17+
"source": [
18+
"---\n",
19+
"* author: [Prasert Kanawattanachai]([email protected])\n",
20+
"* YouTube: https://www.youtube.com/prasertcbs\n",
21+
"* GitHub: https://github.com/prasertcbs/\n",
22+
"* [Chulalongkorn Business School](https://www.cbs.chula.ac.th/en/)\n",
23+
"---"
24+
]
25+
},
26+
{
27+
"cell_type": "code",
28+
"execution_count": null,
29+
"id": "willing-hardwood",
30+
"metadata": {},
31+
"outputs": [],
32+
"source": [
33+
"import sys\n",
34+
"import pandas as pd\n",
35+
"import numpy as np\n",
36+
"\n",
37+
"pd.set_option('display.max_rows', None)\n",
38+
"\n",
39+
"%config InlineBackend.figure_format='retina'"
40+
]
41+
},
42+
{
43+
"cell_type": "code",
44+
"execution_count": null,
45+
"id": "virgin-miller",
46+
"metadata": {
47+
"colab": {
48+
"base_uri": "https://localhost:8080/",
49+
"height": 64
50+
},
51+
"colab_type": "code",
52+
"id": "fdr0pYIf7P-_",
53+
"outputId": "c919deae-c99b-44b0-8924-4d2355ca0b63"
54+
},
55+
"outputs": [],
56+
"source": [
57+
"print(f'Python version: {sys.version}')\n",
58+
"print(f'pandas version: {pd.__version__}')\n",
59+
"\n",
60+
"pd.Timestamp.now()"
61+
]
62+
},
63+
{
64+
"cell_type": "markdown",
65+
"id": "cooperative-template",
66+
"metadata": {},
67+
"source": [
68+
"## read data"
69+
]
70+
},
71+
{
72+
"cell_type": "code",
73+
"execution_count": null,
74+
"id": "convinced-insertion",
75+
"metadata": {},
76+
"outputs": [],
77+
"source": [
78+
"df=pd.read_excel('https://github.com/prasertcbs/basic-dataset/raw/master/starbucks_bakery.xlsx')\n",
79+
"df"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": null,
85+
"id": "postal-ontario",
86+
"metadata": {},
87+
"outputs": [],
88+
"source": [
89+
"df=df.dropna(subset=['Product Name']).reset_index(drop=True) # blank rows\n",
90+
"df"
91+
]
92+
},
93+
{
94+
"cell_type": "code",
95+
"execution_count": null,
96+
"id": "living-force",
97+
"metadata": {},
98+
"outputs": [],
99+
"source": [
100+
"df.info()"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": null,
106+
"id": "terminal-philippines",
107+
"metadata": {},
108+
"outputs": [],
109+
"source": [
110+
"df.loc[0, :]"
111+
]
112+
},
113+
{
114+
"cell_type": "code",
115+
"execution_count": null,
116+
"id": "smooth-greece",
117+
"metadata": {},
118+
"outputs": [],
119+
"source": [
120+
"type(df.loc[0, 'Calories'])"
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": null,
126+
"id": "numeric-hungarian",
127+
"metadata": {},
128+
"outputs": [],
129+
"source": [
130+
"np.isnan(df.loc[0, 'Calories']) # check numpy.float64 isnan"
131+
]
132+
},
133+
{
134+
"cell_type": "code",
135+
"execution_count": null,
136+
"id": "cardiovascular-hydrogen",
137+
"metadata": {},
138+
"outputs": [],
139+
"source": [
140+
"df['Category']=df['Product Name'].where(df['Calories'].isna()) # rows with no Calories are category headers: keep their name, NaN elsewhere (vectorized, NaN-safe for any dtype)\n",
141+
"df"
142+
]
143+
},
144+
{
145+
"cell_type": "code",
146+
"execution_count": null,
147+
"id": "final-disabled",
148+
"metadata": {},
149+
"outputs": [],
150+
"source": [
151+
"df['Category']=df['Category'].ffill()"
152+
]
153+
},
154+
{
155+
"cell_type": "code",
156+
"execution_count": null,
157+
"id": "fresh-harvey",
158+
"metadata": {},
159+
"outputs": [],
160+
"source": [
161+
"df"
162+
]
163+
},
164+
{
165+
"cell_type": "code",
166+
"execution_count": null,
167+
"id": "sustained-injection",
168+
"metadata": {},
169+
"outputs": [],
170+
"source": [
171+
"df=df.dropna(subset=['Calories']).reset_index(drop=True) # drop category header rows (they have no Calories value)\n",
172+
"df"
173+
]
174+
},
175+
{
176+
"cell_type": "code",
177+
"execution_count": null,
178+
"id": "serious-lighting",
179+
"metadata": {},
180+
"outputs": [],
181+
"source": [
182+
"df.columns"
183+
]
184+
},
185+
{
186+
"cell_type": "code",
187+
"execution_count": null,
188+
"id": "corresponding-counter",
189+
"metadata": {},
190+
"outputs": [],
191+
"source": [
192+
"df[['Category', 'Product Name', 'Label Wt (g)', 'Calories', 'Total fat (g)',\n",
193+
" 'Saturated Fat (g)', 'Trans Fat (g)', 'Cholesterol (mg)',\n",
194+
" 'Sodium (mg)', 'Carbohydrates (g)', 'Fiber (g)', 'Sugar (g)',\n",
195+
" 'Protein (g)', 'Vitamin A (%DV)', 'Vitamin C (%DV)', 'Calcium (%DV)',\n",
196+
" 'Iron (%DV)']].to_csv('starbucks_bakery_nutrition_fact.csv', index=False)"
197+
]
198+
},
199+
{
200+
"cell_type": "code",
201+
"execution_count": null,
202+
"id": "pointed-arlington",
203+
"metadata": {},
204+
"outputs": [],
205+
"source": [
206+
"df[['Category', 'Product Name', 'Label Wt (g)', 'Calories', 'Total fat (g)',\n",
207+
" 'Saturated Fat (g)', 'Trans Fat (g)', 'Cholesterol (mg)',\n",
208+
" 'Sodium (mg)', 'Carbohydrates (g)', 'Fiber (g)', 'Sugar (g)',\n",
209+
" 'Protein (g)', 'Vitamin A (%DV)', 'Vitamin C (%DV)', 'Calcium (%DV)',\n",
210+
" 'Iron (%DV)']].to_excel('starbucks_bakery_nutrition_fact.xlsx', index=False)"
211+
]
212+
},
213+
{
214+
"cell_type": "code",
215+
"execution_count": null,
216+
"id": "dress-quantum",
217+
"metadata": {},
218+
"outputs": [],
219+
"source": []
220+
}
221+
],
222+
"metadata": {
223+
"kernelspec": {
224+
"display_name": "Python 3",
225+
"language": "python",
226+
"name": "python3"
227+
},
228+
"language_info": {
229+
"codemirror_mode": {
230+
"name": "ipython",
231+
"version": 3
232+
},
233+
"file_extension": ".py",
234+
"mimetype": "text/x-python",
235+
"name": "python",
236+
"nbconvert_exporter": "python",
237+
"pygments_lexer": "ipython3",
238+
"version": "3.7.9"
239+
},
240+
"widgets": {
241+
"application/vnd.jupyter.widget-state+json": {
242+
"state": {},
243+
"version_major": 2,
244+
"version_minor": 0
245+
}
246+
}
247+
},
248+
"nbformat": 4,
249+
"nbformat_minor": 5
250+
}

0 commit comments

Comments
 (0)