Update pandas optimization notebook: further improvements

tomcis · tomcis · commit c4371bb6e21a · 2020-08-11T18:55:05.000+03:00
diff --git a/notebooks/03_pandas_optimization.ipynb b/notebooks/03_pandas_optimization.ipynb
@@ -5,28 +5,36 @@
    "execution_count": 1,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2020-08-05T08:10:32.897313Z",
-     "iopub.status.busy": "2020-08-05T08:10:32.897021Z",
-     "iopub.status.idle": "2020-08-05T08:10:33.617170Z",
-     "shell.execute_reply": "2020-08-05T08:10:33.616383Z",
-     "shell.execute_reply.started": "2020-08-05T08:10:32.897271Z"
+     "iopub.execute_input": "2020-08-11T15:51:56.088764Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.088469Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.837524Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.836477Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.088716Z"
     }
    },
    "outputs": [],
    "source": [
+    "import pandas as pd\n",
     "import seaborn as sns"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Data & column memory statistics"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2020-08-05T08:10:33.618834Z",
-     "iopub.status.busy": "2020-08-05T08:10:33.618634Z",
-     "iopub.status.idle": "2020-08-05T08:10:33.665354Z",
-     "shell.execute_reply": "2020-08-05T08:10:33.664646Z",
-     "shell.execute_reply.started": "2020-08-05T08:10:33.618802Z"
+     "iopub.execute_input": "2020-08-11T15:51:56.839417Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.839203Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.888382Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.887185Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.839382Z"
     }
    },
    "outputs": [
@@ -119,6 +127,7 @@
    ],
    "source": [
     "iris = sns.load_dataset('iris')\n",
+    "iris_init = iris.copy() # will be used to get the reduction performance\n",
     "iris.head()"
    ]
   },
@@ -127,11 +136,11 @@
    "execution_count": 3,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2020-08-05T08:10:33.666916Z",
-     "iopub.status.busy": "2020-08-05T08:10:33.666612Z",
-     "iopub.status.idle": "2020-08-05T08:10:33.673004Z",
-     "shell.execute_reply": "2020-08-05T08:10:33.672154Z",
-     "shell.execute_reply.started": "2020-08-05T08:10:33.666878Z"
+     "iopub.execute_input": "2020-08-11T15:51:56.889851Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.889567Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.896621Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.895716Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.889803Z"
     }
    },
    "outputs": [
@@ -160,11 +169,11 @@
    "execution_count": 4,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2020-08-05T08:10:33.674289Z",
-     "iopub.status.busy": "2020-08-05T08:10:33.674054Z",
-     "iopub.status.idle": "2020-08-05T08:10:33.682060Z",
-     "shell.execute_reply": "2020-08-05T08:10:33.681195Z",
-     "shell.execute_reply.started": "2020-08-05T08:10:33.674250Z"
+     "iopub.execute_input": "2020-08-11T15:51:56.898063Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.897839Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.906122Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.905172Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.898026Z"
     }
    },
    "outputs": [
@@ -189,51 +198,174 @@
     "iris.memory_usage(deep=True)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-08-11T15:22:46.192393Z",
+     "iopub.status.busy": "2020-08-11T15:22:46.192141Z",
+     "iopub.status.idle": "2020-08-11T15:22:46.196169Z",
+     "shell.execute_reply": "2020-08-11T15:22:46.195015Z",
+     "shell.execute_reply.started": "2020-08-11T15:22:46.192354Z"
+    }
+   },
+   "source": [
+    "### Applying optimizations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-08-11T15:22:57.876536Z",
+     "iopub.status.busy": "2020-08-11T15:22:57.876286Z",
+     "iopub.status.idle": "2020-08-11T15:22:57.880191Z",
+     "shell.execute_reply": "2020-08-11T15:22:57.879111Z",
+     "shell.execute_reply.started": "2020-08-11T15:22:57.876496Z"
+    }
+   },
+   "source": [
+    "#### Casting categorical columns as `category`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This usually gives the biggest memory savings. Instead of storing every string/object in full, `pandas` stores each distinct value once and keeps only a small integer code per row, which significantly reduces memory usage."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 5,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2020-08-05T08:10:33.684114Z",
-     "iopub.status.busy": "2020-08-05T08:10:33.683891Z",
-     "iopub.status.idle": "2020-08-05T08:10:33.690066Z",
-     "shell.execute_reply": "2020-08-05T08:10:33.689310Z",
-     "shell.execute_reply.started": "2020-08-05T08:10:33.684077Z"
+     "iopub.execute_input": "2020-08-11T15:51:56.908888Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.908622Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.914680Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.913797Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.908847Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "iris.species = iris.species.astype(\"category\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-08-11T15:51:56.916242Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.916011Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.923110Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.921794Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.916205Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "9928"
+       "'Column memory usage is reduced by 94.4%'"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "iris.species.memory_usage(deep=True)"
+    "\"Column memory usage is reduced by {:.1%}\".format(1 - iris.species.memory_usage(deep=True) / iris_init.species.memory_usage(deep=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-08-11T15:31:05.555896Z",
+     "iopub.status.busy": "2020-08-11T15:31:05.555631Z",
+     "iopub.status.idle": "2020-08-11T15:31:05.561770Z",
+     "shell.execute_reply": "2020-08-11T15:31:05.560011Z",
+     "shell.execute_reply.started": "2020-08-11T15:31:05.555855Z"
+    }
+   },
+   "source": [
+    "#### Optimizing the numerical columns with `pd.to_numeric` and the `downcast` argument"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-08-11T15:47:12.928593Z",
+     "iopub.status.busy": "2020-08-11T15:47:12.928363Z",
+     "iopub.status.idle": "2020-08-11T15:47:12.934125Z",
+     "shell.execute_reply": "2020-08-11T15:47:12.932791Z",
+     "shell.execute_reply.started": "2020-08-11T15:47:12.928556Z"
+    }
+   },
+   "source": [
+    "By default, `pandas` stores floating-point columns as `float64`, which is one of the heaviest numeric types. Some optimization is possible here, given that we do not need such high precision in this case (in fact, the values in this dataset have only one digit after the decimal point)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-08-11T15:51:56.924884Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.924425Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.936372Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.935309Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.924827Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "columns = iris.columns.drop(\"species\")\n",
+    "iris[columns] = iris[columns].apply(pd.to_numeric, downcast=\"float\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2020-08-11T15:29:23.859559Z",
+     "iopub.status.busy": "2020-08-11T15:29:23.859306Z",
+     "iopub.status.idle": "2020-08-11T15:29:23.862930Z",
+     "shell.execute_reply": "2020-08-11T15:29:23.861985Z",
+     "shell.execute_reply.started": "2020-08-11T15:29:23.859519Z"
+    }
+   },
+   "source": [
+    "### Final memory usage"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 8,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2020-08-05T08:10:47.941329Z",
-     "iopub.status.busy": "2020-08-05T08:10:47.941060Z",
-     "iopub.status.idle": "2020-08-05T08:10:47.948498Z",
-     "shell.execute_reply": "2020-08-05T08:10:47.947516Z",
-     "shell.execute_reply.started": "2020-08-05T08:10:47.941286Z"
+     "iopub.execute_input": "2020-08-11T15:51:56.938099Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.937861Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.944942Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.944129Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.938062Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "554"
+       "Index           128\n",
+       "sepal_length    600\n",
+       "sepal_width     600\n",
+       "petal_length    600\n",
+       "petal_width     600\n",
+       "species         426\n",
+       "dtype: int64"
       ]
      },
      "execution_count": 8,
@@ -242,36 +374,35 @@
     }
    ],
    "source": [
-    "iris.species_categorical = iris.species.astype(\"category\")\n",
-    "iris.species_categorical.memory_usage(deep=True)"
+    "iris.memory_usage(deep=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 9,
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2020-08-05T08:12:27.622248Z",
-     "iopub.status.busy": "2020-08-05T08:12:27.622005Z",
-     "iopub.status.idle": "2020-08-05T08:12:27.627567Z",
-     "shell.execute_reply": "2020-08-05T08:12:27.626678Z",
-     "shell.execute_reply.started": "2020-08-05T08:12:27.622210Z"
+     "iopub.execute_input": "2020-08-11T15:51:56.946379Z",
+     "iopub.status.busy": "2020-08-11T15:51:56.946137Z",
+     "iopub.status.idle": "2020-08-11T15:51:56.954586Z",
+     "shell.execute_reply": "2020-08-11T15:51:56.953734Z",
+     "shell.execute_reply.started": "2020-08-11T15:51:56.946341Z"
     }
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'Reduced by 94.4%'"
+       "'In total memory usage is reduced by 79.9%'"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "\"Reduced by {:.1%}\".format(1 - iris.species_categorical.memory_usage(deep=True) / iris.species.memory_usage(deep=True))"
+    "\"In total memory usage is reduced by {:.1%}\".format(1 - iris.memory_usage(deep=True).sum() / iris_init.memory_usage(deep=True).sum())"
    ]
   }
  ],