Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit c4371bb

Browse files
committed
Update pandas optimization notebook: further improvements
1 parent 53de264 commit c4371bb

1 file changed

Lines changed: 176 additions & 45 deletions

File tree

notebooks/03_pandas_optimization.ipynb

Lines changed: 176 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -5,28 +5,36 @@
55
"execution_count": 1,
66
"metadata": {
77
"execution": {
8-
"iopub.execute_input": "2020-08-05T08:10:32.897313Z",
9-
"iopub.status.busy": "2020-08-05T08:10:32.897021Z",
10-
"iopub.status.idle": "2020-08-05T08:10:33.617170Z",
11-
"shell.execute_reply": "2020-08-05T08:10:33.616383Z",
12-
"shell.execute_reply.started": "2020-08-05T08:10:32.897271Z"
8+
"iopub.execute_input": "2020-08-11T15:51:56.088764Z",
9+
"iopub.status.busy": "2020-08-11T15:51:56.088469Z",
10+
"iopub.status.idle": "2020-08-11T15:51:56.837524Z",
11+
"shell.execute_reply": "2020-08-11T15:51:56.836477Z",
12+
"shell.execute_reply.started": "2020-08-11T15:51:56.088716Z"
1313
}
1414
},
1515
"outputs": [],
1616
"source": [
17+
"import pandas as pd\n",
1718
"import seaborn as sns"
1819
]
1920
},
21+
{
22+
"cell_type": "markdown",
23+
"metadata": {},
24+
"source": [
25+
"### Data & column memory statistics"
26+
]
27+
},
2028
{
2129
"cell_type": "code",
2230
"execution_count": 2,
2331
"metadata": {
2432
"execution": {
25-
"iopub.execute_input": "2020-08-05T08:10:33.618834Z",
26-
"iopub.status.busy": "2020-08-05T08:10:33.618634Z",
27-
"iopub.status.idle": "2020-08-05T08:10:33.665354Z",
28-
"shell.execute_reply": "2020-08-05T08:10:33.664646Z",
29-
"shell.execute_reply.started": "2020-08-05T08:10:33.618802Z"
33+
"iopub.execute_input": "2020-08-11T15:51:56.839417Z",
34+
"iopub.status.busy": "2020-08-11T15:51:56.839203Z",
35+
"iopub.status.idle": "2020-08-11T15:51:56.888382Z",
36+
"shell.execute_reply": "2020-08-11T15:51:56.887185Z",
37+
"shell.execute_reply.started": "2020-08-11T15:51:56.839382Z"
3038
}
3139
},
3240
"outputs": [
@@ -119,6 +127,7 @@
119127
],
120128
"source": [
121129
"iris = sns.load_dataset('iris')\n",
130+
"iris_init = iris.copy() # will be used to get the reduction performance\n",
122131
"iris.head()"
123132
]
124133
},
@@ -127,11 +136,11 @@
127136
"execution_count": 3,
128137
"metadata": {
129138
"execution": {
130-
"iopub.execute_input": "2020-08-05T08:10:33.666916Z",
131-
"iopub.status.busy": "2020-08-05T08:10:33.666612Z",
132-
"iopub.status.idle": "2020-08-05T08:10:33.673004Z",
133-
"shell.execute_reply": "2020-08-05T08:10:33.672154Z",
134-
"shell.execute_reply.started": "2020-08-05T08:10:33.666878Z"
139+
"iopub.execute_input": "2020-08-11T15:51:56.889851Z",
140+
"iopub.status.busy": "2020-08-11T15:51:56.889567Z",
141+
"iopub.status.idle": "2020-08-11T15:51:56.896621Z",
142+
"shell.execute_reply": "2020-08-11T15:51:56.895716Z",
143+
"shell.execute_reply.started": "2020-08-11T15:51:56.889803Z"
135144
}
136145
},
137146
"outputs": [
@@ -160,11 +169,11 @@
160169
"execution_count": 4,
161170
"metadata": {
162171
"execution": {
163-
"iopub.execute_input": "2020-08-05T08:10:33.674289Z",
164-
"iopub.status.busy": "2020-08-05T08:10:33.674054Z",
165-
"iopub.status.idle": "2020-08-05T08:10:33.682060Z",
166-
"shell.execute_reply": "2020-08-05T08:10:33.681195Z",
167-
"shell.execute_reply.started": "2020-08-05T08:10:33.674250Z"
172+
"iopub.execute_input": "2020-08-11T15:51:56.898063Z",
173+
"iopub.status.busy": "2020-08-11T15:51:56.897839Z",
174+
"iopub.status.idle": "2020-08-11T15:51:56.906122Z",
175+
"shell.execute_reply": "2020-08-11T15:51:56.905172Z",
176+
"shell.execute_reply.started": "2020-08-11T15:51:56.898026Z"
168177
}
169178
},
170179
"outputs": [
@@ -189,51 +198,174 @@
189198
"iris.memory_usage(deep=True)"
190199
]
191200
},
201+
{
202+
"cell_type": "markdown",
203+
"metadata": {
204+
"execution": {
205+
"iopub.execute_input": "2020-08-11T15:22:46.192393Z",
206+
"iopub.status.busy": "2020-08-11T15:22:46.192141Z",
207+
"iopub.status.idle": "2020-08-11T15:22:46.196169Z",
208+
"shell.execute_reply": "2020-08-11T15:22:46.195015Z",
209+
"shell.execute_reply.started": "2020-08-11T15:22:46.192354Z"
210+
}
211+
},
212+
"source": [
213+
"### Applying optimizations"
214+
]
215+
},
216+
{
217+
"cell_type": "markdown",
218+
"metadata": {
219+
"execution": {
220+
"iopub.execute_input": "2020-08-11T15:22:57.876536Z",
221+
"iopub.status.busy": "2020-08-11T15:22:57.876286Z",
222+
"iopub.status.idle": "2020-08-11T15:22:57.880191Z",
223+
"shell.execute_reply": "2020-08-11T15:22:57.879111Z",
224+
"shell.execute_reply.started": "2020-08-11T15:22:57.876496Z"
225+
}
226+
},
227+
"source": [
228+
"#### Casting categorical columns as `category`"
229+
]
230+
},
231+
{
232+
"cell_type": "markdown",
233+
"metadata": {},
234+
"source": [
235+
"This usually gives the biggest memory savings. Instead of storing whole strings/objects, `pandas` tokenizes them and stores only the indices, which significantly reduces memory usage."
236+
]
237+
},
192238
{
193239
"cell_type": "code",
194240
"execution_count": 5,
195241
"metadata": {
196242
"execution": {
197-
"iopub.execute_input": "2020-08-05T08:10:33.684114Z",
198-
"iopub.status.busy": "2020-08-05T08:10:33.683891Z",
199-
"iopub.status.idle": "2020-08-05T08:10:33.690066Z",
200-
"shell.execute_reply": "2020-08-05T08:10:33.689310Z",
201-
"shell.execute_reply.started": "2020-08-05T08:10:33.684077Z"
243+
"iopub.execute_input": "2020-08-11T15:51:56.908888Z",
244+
"iopub.status.busy": "2020-08-11T15:51:56.908622Z",
245+
"iopub.status.idle": "2020-08-11T15:51:56.914680Z",
246+
"shell.execute_reply": "2020-08-11T15:51:56.913797Z",
247+
"shell.execute_reply.started": "2020-08-11T15:51:56.908847Z"
248+
}
249+
},
250+
"outputs": [],
251+
"source": [
252+
"iris.species = iris.species.astype(\"category\")"
253+
]
254+
},
255+
{
256+
"cell_type": "code",
257+
"execution_count": 6,
258+
"metadata": {
259+
"execution": {
260+
"iopub.execute_input": "2020-08-11T15:51:56.916242Z",
261+
"iopub.status.busy": "2020-08-11T15:51:56.916011Z",
262+
"iopub.status.idle": "2020-08-11T15:51:56.923110Z",
263+
"shell.execute_reply": "2020-08-11T15:51:56.921794Z",
264+
"shell.execute_reply.started": "2020-08-11T15:51:56.916205Z"
202265
}
203266
},
204267
"outputs": [
205268
{
206269
"data": {
207270
"text/plain": [
208-
"9928"
271+
"'Column memory usage is reduced by 94.4%'"
209272
]
210273
},
211-
"execution_count": 5,
274+
"execution_count": 6,
212275
"metadata": {},
213276
"output_type": "execute_result"
214277
}
215278
],
216279
"source": [
217-
"iris.species.memory_usage(deep=True)"
280+
"\"Column memory usage is reduced by {:.1%}\".format(1 - iris.species.memory_usage(deep=True) / iris_init.species.memory_usage(deep=True))"
281+
]
282+
},
283+
{
284+
"cell_type": "markdown",
285+
"metadata": {
286+
"execution": {
287+
"iopub.execute_input": "2020-08-11T15:31:05.555896Z",
288+
"iopub.status.busy": "2020-08-11T15:31:05.555631Z",
289+
"iopub.status.idle": "2020-08-11T15:31:05.561770Z",
290+
"shell.execute_reply": "2020-08-11T15:31:05.560011Z",
291+
"shell.execute_reply.started": "2020-08-11T15:31:05.555855Z"
292+
}
293+
},
294+
"source": [
295+
"#### Optimizing the numerical columns with `pd.to_numeric` and its `downcast` argument"
296+
]
297+
},
298+
{
299+
"cell_type": "markdown",
300+
"metadata": {
301+
"execution": {
302+
"iopub.execute_input": "2020-08-11T15:47:12.928593Z",
303+
"iopub.status.busy": "2020-08-11T15:47:12.928363Z",
304+
"iopub.status.idle": "2020-08-11T15:47:12.934125Z",
305+
"shell.execute_reply": "2020-08-11T15:47:12.932791Z",
306+
"shell.execute_reply.started": "2020-08-11T15:47:12.928556Z"
307+
}
308+
},
309+
"source": [
310+
"By default, `pandas` uses the `float64` numeric type, which is one of the heaviest. Some optimization can be done here, given that we do not need such high precision in this case (in fact, the values in this dataset have only one digit after the decimal point)."
311+
]
312+
},
313+
{
314+
"cell_type": "code",
315+
"execution_count": 7,
316+
"metadata": {
317+
"execution": {
318+
"iopub.execute_input": "2020-08-11T15:51:56.924884Z",
319+
"iopub.status.busy": "2020-08-11T15:51:56.924425Z",
320+
"iopub.status.idle": "2020-08-11T15:51:56.936372Z",
321+
"shell.execute_reply": "2020-08-11T15:51:56.935309Z",
322+
"shell.execute_reply.started": "2020-08-11T15:51:56.924827Z"
323+
}
324+
},
325+
"outputs": [],
326+
"source": [
327+
"columns = iris.columns.drop(\"species\")\n",
328+
"iris[columns] = iris[columns].apply(pd.to_numeric, downcast=\"float\")"
329+
]
330+
},
331+
{
332+
"cell_type": "markdown",
333+
"metadata": {
334+
"execution": {
335+
"iopub.execute_input": "2020-08-11T15:29:23.859559Z",
336+
"iopub.status.busy": "2020-08-11T15:29:23.859306Z",
337+
"iopub.status.idle": "2020-08-11T15:29:23.862930Z",
338+
"shell.execute_reply": "2020-08-11T15:29:23.861985Z",
339+
"shell.execute_reply.started": "2020-08-11T15:29:23.859519Z"
340+
}
341+
},
342+
"source": [
343+
"### Final memory usage"
218344
]
219345
},
220346
{
221347
"cell_type": "code",
222348
"execution_count": 8,
223349
"metadata": {
224350
"execution": {
225-
"iopub.execute_input": "2020-08-05T08:10:47.941329Z",
226-
"iopub.status.busy": "2020-08-05T08:10:47.941060Z",
227-
"iopub.status.idle": "2020-08-05T08:10:47.948498Z",
228-
"shell.execute_reply": "2020-08-05T08:10:47.947516Z",
229-
"shell.execute_reply.started": "2020-08-05T08:10:47.941286Z"
351+
"iopub.execute_input": "2020-08-11T15:51:56.938099Z",
352+
"iopub.status.busy": "2020-08-11T15:51:56.937861Z",
353+
"iopub.status.idle": "2020-08-11T15:51:56.944942Z",
354+
"shell.execute_reply": "2020-08-11T15:51:56.944129Z",
355+
"shell.execute_reply.started": "2020-08-11T15:51:56.938062Z"
230356
}
231357
},
232358
"outputs": [
233359
{
234360
"data": {
235361
"text/plain": [
236-
"554"
362+
"Index 128\n",
363+
"sepal_length 600\n",
364+
"sepal_width 600\n",
365+
"petal_length 600\n",
366+
"petal_width 600\n",
367+
"species 426\n",
368+
"dtype: int64"
237369
]
238370
},
239371
"execution_count": 8,
@@ -242,36 +374,35 @@
242374
}
243375
],
244376
"source": [
245-
"iris.species_categorical = iris.species.astype(\"category\")\n",
246-
"iris.species_categorical.memory_usage(deep=True)"
377+
"iris.memory_usage(deep=True)"
247378
]
248379
},
249380
{
250381
"cell_type": "code",
251-
"execution_count": 14,
382+
"execution_count": 9,
252383
"metadata": {
253384
"execution": {
254-
"iopub.execute_input": "2020-08-05T08:12:27.622248Z",
255-
"iopub.status.busy": "2020-08-05T08:12:27.622005Z",
256-
"iopub.status.idle": "2020-08-05T08:12:27.627567Z",
257-
"shell.execute_reply": "2020-08-05T08:12:27.626678Z",
258-
"shell.execute_reply.started": "2020-08-05T08:12:27.622210Z"
385+
"iopub.execute_input": "2020-08-11T15:51:56.946379Z",
386+
"iopub.status.busy": "2020-08-11T15:51:56.946137Z",
387+
"iopub.status.idle": "2020-08-11T15:51:56.954586Z",
388+
"shell.execute_reply": "2020-08-11T15:51:56.953734Z",
389+
"shell.execute_reply.started": "2020-08-11T15:51:56.946341Z"
259390
}
260391
},
261392
"outputs": [
262393
{
263394
"data": {
264395
"text/plain": [
265-
"'Reduced by 94.4%'"
396+
"'In total memory usage is reduced by 79.9%'"
266397
]
267398
},
268-
"execution_count": 14,
399+
"execution_count": 9,
269400
"metadata": {},
270401
"output_type": "execute_result"
271402
}
272403
],
273404
"source": [
274-
"\"Reduced by {:.1%}\".format(1 - iris.species_categorical.memory_usage(deep=True) / iris.species.memory_usage(deep=True))"
405+
"\"In total memory usage is reduced by {:.1%}\".format(1 - iris.memory_usage(deep=True).sum() / iris_init.memory_usage(deep=True).sum())"
275406
]
276407
}
277408
],

0 commit comments

Comments
 (0)