|
5 | 5 | "execution_count": 1, |
6 | 6 | "metadata": { |
7 | 7 | "execution": { |
8 | | - "iopub.execute_input": "2020-08-05T08:10:32.897313Z", |
9 | | - "iopub.status.busy": "2020-08-05T08:10:32.897021Z", |
10 | | - "iopub.status.idle": "2020-08-05T08:10:33.617170Z", |
11 | | - "shell.execute_reply": "2020-08-05T08:10:33.616383Z", |
12 | | - "shell.execute_reply.started": "2020-08-05T08:10:32.897271Z" |
| 8 | + "iopub.execute_input": "2020-08-11T15:51:56.088764Z", |
| 9 | + "iopub.status.busy": "2020-08-11T15:51:56.088469Z", |
| 10 | + "iopub.status.idle": "2020-08-11T15:51:56.837524Z", |
| 11 | + "shell.execute_reply": "2020-08-11T15:51:56.836477Z", |
| 12 | + "shell.execute_reply.started": "2020-08-11T15:51:56.088716Z" |
13 | 13 | } |
14 | 14 | }, |
15 | 15 | "outputs": [], |
16 | 16 | "source": [ |
| 17 | + "import pandas as pd\n", |
17 | 18 | "import seaborn as sns" |
18 | 19 | ] |
19 | 20 | }, |
| 21 | + { |
| 22 | + "cell_type": "markdown", |
| 23 | + "metadata": {}, |
| 24 | + "source": [ |
| 25 | + "### Data & column memory statistics" |
| 26 | + ] |
| 27 | + }, |
20 | 28 | { |
21 | 29 | "cell_type": "code", |
22 | 30 | "execution_count": 2, |
23 | 31 | "metadata": { |
24 | 32 | "execution": { |
25 | | - "iopub.execute_input": "2020-08-05T08:10:33.618834Z", |
26 | | - "iopub.status.busy": "2020-08-05T08:10:33.618634Z", |
27 | | - "iopub.status.idle": "2020-08-05T08:10:33.665354Z", |
28 | | - "shell.execute_reply": "2020-08-05T08:10:33.664646Z", |
29 | | - "shell.execute_reply.started": "2020-08-05T08:10:33.618802Z" |
| 33 | + "iopub.execute_input": "2020-08-11T15:51:56.839417Z", |
| 34 | + "iopub.status.busy": "2020-08-11T15:51:56.839203Z", |
| 35 | + "iopub.status.idle": "2020-08-11T15:51:56.888382Z", |
| 36 | + "shell.execute_reply": "2020-08-11T15:51:56.887185Z", |
| 37 | + "shell.execute_reply.started": "2020-08-11T15:51:56.839382Z" |
30 | 38 | } |
31 | 39 | }, |
32 | 40 | "outputs": [ |
|
119 | 127 | ], |
120 | 128 | "source": [ |
121 | 129 | "iris = sns.load_dataset('iris')\n", |
| 130 | + "iris_init = iris.copy() # will be used to get the reduction performance\n", |
122 | 131 | "iris.head()" |
123 | 132 | ] |
124 | 133 | }, |
|
127 | 136 | "execution_count": 3, |
128 | 137 | "metadata": { |
129 | 138 | "execution": { |
130 | | - "iopub.execute_input": "2020-08-05T08:10:33.666916Z", |
131 | | - "iopub.status.busy": "2020-08-05T08:10:33.666612Z", |
132 | | - "iopub.status.idle": "2020-08-05T08:10:33.673004Z", |
133 | | - "shell.execute_reply": "2020-08-05T08:10:33.672154Z", |
134 | | - "shell.execute_reply.started": "2020-08-05T08:10:33.666878Z" |
| 139 | + "iopub.execute_input": "2020-08-11T15:51:56.889851Z", |
| 140 | + "iopub.status.busy": "2020-08-11T15:51:56.889567Z", |
| 141 | + "iopub.status.idle": "2020-08-11T15:51:56.896621Z", |
| 142 | + "shell.execute_reply": "2020-08-11T15:51:56.895716Z", |
| 143 | + "shell.execute_reply.started": "2020-08-11T15:51:56.889803Z" |
135 | 144 | } |
136 | 145 | }, |
137 | 146 | "outputs": [ |
|
160 | 169 | "execution_count": 4, |
161 | 170 | "metadata": { |
162 | 171 | "execution": { |
163 | | - "iopub.execute_input": "2020-08-05T08:10:33.674289Z", |
164 | | - "iopub.status.busy": "2020-08-05T08:10:33.674054Z", |
165 | | - "iopub.status.idle": "2020-08-05T08:10:33.682060Z", |
166 | | - "shell.execute_reply": "2020-08-05T08:10:33.681195Z", |
167 | | - "shell.execute_reply.started": "2020-08-05T08:10:33.674250Z" |
| 172 | + "iopub.execute_input": "2020-08-11T15:51:56.898063Z", |
| 173 | + "iopub.status.busy": "2020-08-11T15:51:56.897839Z", |
| 174 | + "iopub.status.idle": "2020-08-11T15:51:56.906122Z", |
| 175 | + "shell.execute_reply": "2020-08-11T15:51:56.905172Z", |
| 176 | + "shell.execute_reply.started": "2020-08-11T15:51:56.898026Z" |
168 | 177 | } |
169 | 178 | }, |
170 | 179 | "outputs": [ |
|
189 | 198 | "iris.memory_usage(deep=True)" |
190 | 199 | ] |
191 | 200 | }, |
| 201 | + { |
| 202 | + "cell_type": "markdown", |
| 203 | + "metadata": { |
| 204 | + "execution": { |
| 205 | + "iopub.execute_input": "2020-08-11T15:22:46.192393Z", |
| 206 | + "iopub.status.busy": "2020-08-11T15:22:46.192141Z", |
| 207 | + "iopub.status.idle": "2020-08-11T15:22:46.196169Z", |
| 208 | + "shell.execute_reply": "2020-08-11T15:22:46.195015Z", |
| 209 | + "shell.execute_reply.started": "2020-08-11T15:22:46.192354Z" |
| 210 | + } |
| 211 | + }, |
| 212 | + "source": [ |
| 213 | + "### Applying optimizations" |
| 214 | + ] |
| 215 | + }, |
| 216 | + { |
| 217 | + "cell_type": "markdown", |
| 218 | + "metadata": { |
| 219 | + "execution": { |
| 220 | + "iopub.execute_input": "2020-08-11T15:22:57.876536Z", |
| 221 | + "iopub.status.busy": "2020-08-11T15:22:57.876286Z", |
| 222 | + "iopub.status.idle": "2020-08-11T15:22:57.880191Z", |
| 223 | + "shell.execute_reply": "2020-08-11T15:22:57.879111Z", |
| 224 | + "shell.execute_reply.started": "2020-08-11T15:22:57.876496Z" |
| 225 | + } |
| 226 | + }, |
| 227 | + "source": [ |
| 228 | + "#### Casting categorical columns as `category`" |
| 229 | + ] |
| 230 | + }, |
| 231 | + { |
| 232 | + "cell_type": "markdown", |
| 233 | + "metadata": {}, |
| 234 | + "source": [ |
| 235 | + "Usually gives the biggest memory savings. Instead of storing whole strings/objects for every row, `pandas` stores each distinct value once and keeps only integer codes (indices) per row, which significantly reduces memory usage." |
| 236 | + ] |
| 237 | + }, |
192 | 238 | { |
193 | 239 | "cell_type": "code", |
194 | 240 | "execution_count": 5, |
195 | 241 | "metadata": { |
196 | 242 | "execution": { |
197 | | - "iopub.execute_input": "2020-08-05T08:10:33.684114Z", |
198 | | - "iopub.status.busy": "2020-08-05T08:10:33.683891Z", |
199 | | - "iopub.status.idle": "2020-08-05T08:10:33.690066Z", |
200 | | - "shell.execute_reply": "2020-08-05T08:10:33.689310Z", |
201 | | - "shell.execute_reply.started": "2020-08-05T08:10:33.684077Z" |
| 243 | + "iopub.execute_input": "2020-08-11T15:51:56.908888Z", |
| 244 | + "iopub.status.busy": "2020-08-11T15:51:56.908622Z", |
| 245 | + "iopub.status.idle": "2020-08-11T15:51:56.914680Z", |
| 246 | + "shell.execute_reply": "2020-08-11T15:51:56.913797Z", |
| 247 | + "shell.execute_reply.started": "2020-08-11T15:51:56.908847Z" |
| 248 | + } |
| 249 | + }, |
| 250 | + "outputs": [], |
| 251 | + "source": [ |
| 252 | + "iris.species = iris.species.astype(\"category\")" |
| 253 | + ] |
| 254 | + }, |
| 255 | + { |
| 256 | + "cell_type": "code", |
| 257 | + "execution_count": 6, |
| 258 | + "metadata": { |
| 259 | + "execution": { |
| 260 | + "iopub.execute_input": "2020-08-11T15:51:56.916242Z", |
| 261 | + "iopub.status.busy": "2020-08-11T15:51:56.916011Z", |
| 262 | + "iopub.status.idle": "2020-08-11T15:51:56.923110Z", |
| 263 | + "shell.execute_reply": "2020-08-11T15:51:56.921794Z", |
| 264 | + "shell.execute_reply.started": "2020-08-11T15:51:56.916205Z" |
202 | 265 | } |
203 | 266 | }, |
204 | 267 | "outputs": [ |
205 | 268 | { |
206 | 269 | "data": { |
207 | 270 | "text/plain": [ |
208 | | - "9928" |
| 271 | + "'Column memory usage is reduced by 94.4%'" |
209 | 272 | ] |
210 | 273 | }, |
211 | | - "execution_count": 5, |
| 274 | + "execution_count": 6, |
212 | 275 | "metadata": {}, |
213 | 276 | "output_type": "execute_result" |
214 | 277 | } |
215 | 278 | ], |
216 | 279 | "source": [ |
217 | | - "iris.species.memory_usage(deep=True)" |
| 280 | + "\"Column memory usage is reduced by {:.1%}\".format(1 - iris.species.memory_usage(deep=True) / iris_init.species.memory_usage(deep=True))" |
| 281 | + ] |
| 282 | + }, |
| 283 | + { |
| 284 | + "cell_type": "markdown", |
| 285 | + "metadata": { |
| 286 | + "execution": { |
| 287 | + "iopub.execute_input": "2020-08-11T15:31:05.555896Z", |
| 288 | + "iopub.status.busy": "2020-08-11T15:31:05.555631Z", |
| 289 | + "iopub.status.idle": "2020-08-11T15:31:05.561770Z", |
| 290 | + "shell.execute_reply": "2020-08-11T15:31:05.560011Z", |
| 291 | + "shell.execute_reply.started": "2020-08-11T15:31:05.555855Z" |
| 292 | + } |
| 293 | + }, |
| 294 | + "source": [ |
| 295 | + "#### Optimizing the numerical columns with `pd.to_numeric` and its `downcast` argument" |
| 296 | + ] |
| 297 | + }, |
| 298 | + { |
| 299 | + "cell_type": "markdown", |
| 300 | + "metadata": { |
| 301 | + "execution": { |
| 302 | + "iopub.execute_input": "2020-08-11T15:47:12.928593Z", |
| 303 | + "iopub.status.busy": "2020-08-11T15:47:12.928363Z", |
| 304 | + "iopub.status.idle": "2020-08-11T15:47:12.934125Z", |
| 305 | + "shell.execute_reply": "2020-08-11T15:47:12.932791Z", |
| 306 | + "shell.execute_reply.started": "2020-08-11T15:47:12.928556Z" |
| 307 | + } |
| 308 | + }, |
| 309 | + "source": [ |
| 310 | + "By default, pandas uses the `float64` numeric type, which is one of the heaviest ones. Some optimization can be done here, given that we do not need such high precision in this case (in fact, the values in this dataset have only 1 digit after the decimal point)." |
| 311 | + ] |
| 312 | + }, |
| 313 | + { |
| 314 | + "cell_type": "code", |
| 315 | + "execution_count": 7, |
| 316 | + "metadata": { |
| 317 | + "execution": { |
| 318 | + "iopub.execute_input": "2020-08-11T15:51:56.924884Z", |
| 319 | + "iopub.status.busy": "2020-08-11T15:51:56.924425Z", |
| 320 | + "iopub.status.idle": "2020-08-11T15:51:56.936372Z", |
| 321 | + "shell.execute_reply": "2020-08-11T15:51:56.935309Z", |
| 322 | + "shell.execute_reply.started": "2020-08-11T15:51:56.924827Z" |
| 323 | + } |
| 324 | + }, |
| 325 | + "outputs": [], |
| 326 | + "source": [ |
| 327 | + "columns = iris.columns.drop(\"species\")\n", |
| 328 | + "iris[columns] = iris[columns].apply(pd.to_numeric, downcast=\"float\")" |
| 329 | + ] |
| 330 | + }, |
| 331 | + { |
| 332 | + "cell_type": "markdown", |
| 333 | + "metadata": { |
| 334 | + "execution": { |
| 335 | + "iopub.execute_input": "2020-08-11T15:29:23.859559Z", |
| 336 | + "iopub.status.busy": "2020-08-11T15:29:23.859306Z", |
| 337 | + "iopub.status.idle": "2020-08-11T15:29:23.862930Z", |
| 338 | + "shell.execute_reply": "2020-08-11T15:29:23.861985Z", |
| 339 | + "shell.execute_reply.started": "2020-08-11T15:29:23.859519Z" |
| 340 | + } |
| 341 | + }, |
| 342 | + "source": [ |
| 343 | + "### Final memory usage" |
218 | 344 | ] |
219 | 345 | }, |
220 | 346 | { |
221 | 347 | "cell_type": "code", |
222 | 348 | "execution_count": 8, |
223 | 349 | "metadata": { |
224 | 350 | "execution": { |
225 | | - "iopub.execute_input": "2020-08-05T08:10:47.941329Z", |
226 | | - "iopub.status.busy": "2020-08-05T08:10:47.941060Z", |
227 | | - "iopub.status.idle": "2020-08-05T08:10:47.948498Z", |
228 | | - "shell.execute_reply": "2020-08-05T08:10:47.947516Z", |
229 | | - "shell.execute_reply.started": "2020-08-05T08:10:47.941286Z" |
| 351 | + "iopub.execute_input": "2020-08-11T15:51:56.938099Z", |
| 352 | + "iopub.status.busy": "2020-08-11T15:51:56.937861Z", |
| 353 | + "iopub.status.idle": "2020-08-11T15:51:56.944942Z", |
| 354 | + "shell.execute_reply": "2020-08-11T15:51:56.944129Z", |
| 355 | + "shell.execute_reply.started": "2020-08-11T15:51:56.938062Z" |
230 | 356 | } |
231 | 357 | }, |
232 | 358 | "outputs": [ |
233 | 359 | { |
234 | 360 | "data": { |
235 | 361 | "text/plain": [ |
236 | | - "554" |
| 362 | + "Index 128\n", |
| 363 | + "sepal_length 600\n", |
| 364 | + "sepal_width 600\n", |
| 365 | + "petal_length 600\n", |
| 366 | + "petal_width 600\n", |
| 367 | + "species 426\n", |
| 368 | + "dtype: int64" |
237 | 369 | ] |
238 | 370 | }, |
239 | 371 | "execution_count": 8, |
|
242 | 374 | } |
243 | 375 | ], |
244 | 376 | "source": [ |
245 | | - "iris.species_categorical = iris.species.astype(\"category\")\n", |
246 | | - "iris.species_categorical.memory_usage(deep=True)" |
| 377 | + "iris.memory_usage(deep=True)" |
247 | 378 | ] |
248 | 379 | }, |
249 | 380 | { |
250 | 381 | "cell_type": "code", |
251 | | - "execution_count": 14, |
| 382 | + "execution_count": 9, |
252 | 383 | "metadata": { |
253 | 384 | "execution": { |
254 | | - "iopub.execute_input": "2020-08-05T08:12:27.622248Z", |
255 | | - "iopub.status.busy": "2020-08-05T08:12:27.622005Z", |
256 | | - "iopub.status.idle": "2020-08-05T08:12:27.627567Z", |
257 | | - "shell.execute_reply": "2020-08-05T08:12:27.626678Z", |
258 | | - "shell.execute_reply.started": "2020-08-05T08:12:27.622210Z" |
| 385 | + "iopub.execute_input": "2020-08-11T15:51:56.946379Z", |
| 386 | + "iopub.status.busy": "2020-08-11T15:51:56.946137Z", |
| 387 | + "iopub.status.idle": "2020-08-11T15:51:56.954586Z", |
| 388 | + "shell.execute_reply": "2020-08-11T15:51:56.953734Z", |
| 389 | + "shell.execute_reply.started": "2020-08-11T15:51:56.946341Z" |
259 | 390 | } |
260 | 391 | }, |
261 | 392 | "outputs": [ |
262 | 393 | { |
263 | 394 | "data": { |
264 | 395 | "text/plain": [ |
265 | | - "'Reduced by 94.4%'" |
| 396 | + "'In total memory usage is reduced by 79.9%'" |
266 | 397 | ] |
267 | 398 | }, |
268 | | - "execution_count": 14, |
| 399 | + "execution_count": 9, |
269 | 400 | "metadata": {}, |
270 | 401 | "output_type": "execute_result" |
271 | 402 | } |
272 | 403 | ], |
273 | 404 | "source": [ |
274 | | - "\"Reduced by {:.1%}\".format(1 - iris.species_categorical.memory_usage(deep=True) / iris.species.memory_usage(deep=True))" |
| 405 | + "\"In total memory usage is reduced by {:.1%}\".format(1 - iris.memory_usage(deep=True).sum() / iris_init.memory_usage(deep=True).sum())" |
275 | 406 | ] |
276 | 407 | } |
277 | 408 | ], |
|
0 commit comments