html tag regex

rasbt · rasbt · commit 76da7ee6e041 · 2014-07-06T12:36:55.000-04:00
diff --git a/tutorials/useful_regex.ipynb b/tutorials/useful_regex.ipynb
@@ -1,7 +1,7 @@
 {
  "metadata": {
   "name": "",
-  "signature": "sha256:9fd7d5201ce5b97fadad65f2c30cfec993fc83907e04418b032bd1bbdac05ff4"
+  "signature": "sha256:8f1ee7a7bfaeca0ee3e12b2387445faa10632d57277d59af6dbfdca9732e5910"
  },
  "nbformat": 3,
  "nbformat_minor": 0,
@@ -41,7 +41,7 @@
        "output_type": "stream",
        "stream": "stdout",
        "text": [
-        "Last updated: 06/07/2014 10:07:02 EDT\n",
+        "Last updated: 06/07/2014 12:24:58 EDT\n",
         "\n",
         "CPython 3.4.1\n",
         "IPython 2.1.0\n"
@@ -264,7 +264,9 @@
      "cell_type": "markdown",
      "metadata": {},
      "source": [
-      "A regular expression to check for file extensions."
+      "A regular expression to check for file extensions.  \n",
+      "\n",
+      "Note: This approach is not recommended for strictly validating file types (parse the file header instead). However, this regex is still a useful alternative to, e.g., Python's `endswith` approach for quickly pre-filtering files of interest."
      ]
     },
     {
@@ -746,14 +748,6 @@
       "<br>"
      ]
     },
-    {
-     "cell_type": "heading",
-     "level": 2,
-     "metadata": {},
-     "source": [
-      "Time"
-     ]
-    },
     {
      "cell_type": "markdown",
      "metadata": {},
@@ -810,14 +804,64 @@
       "\n",
       "for t in str_true:\n",
       "    assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
+      "for f in str_false:\n",
+      "    assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 18
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "<br>\n",
+      "<br>"
+     ]
+    },
+    {
+     "cell_type": "heading",
+     "level": 2,
+     "metadata": {},
+     "source": [
+      "Checking for HTML tags"
+     ]
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "[[back to top](#Sections)]"
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "pattern = r\"\"\"</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>\"\"\"\n",
+      "\n",
+      "str_true = ('<a>', '<a href=\"something\">', '</a>', '<img src>')\n",
+      "            \n",
+      "str_false = ('a>', '<a ', '< a >')\n",
+      "\n",
+      "for t in str_true:\n",
+      "    assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n",
       "\n",
       "for f in str_false:\n",
       "    assert(bool(re.match(pattern, f)) == False), '%s is not False' %f"
      ],
      "language": "python",
      "metadata": {},
      "outputs": [],
-     "prompt_number": 33
+     "prompt_number": 16
+    },
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "<font size=\"1px\">source: [http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/](http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/)</font>"
+     ]
     }
    ],
    "metadata": {}