|
1 | 1 | { |
2 | 2 | "metadata": { |
3 | 3 | "name": "", |
4 | | - "signature": "sha256:9fd7d5201ce5b97fadad65f2c30cfec993fc83907e04418b032bd1bbdac05ff4" |
| 4 | + "signature": "sha256:8f1ee7a7bfaeca0ee3e12b2387445faa10632d57277d59af6dbfdca9732e5910" |
5 | 5 | }, |
6 | 6 | "nbformat": 3, |
7 | 7 | "nbformat_minor": 0, |
|
41 | 41 | "output_type": "stream", |
42 | 42 | "stream": "stdout", |
43 | 43 | "text": [ |
44 | | - "Last updated: 06/07/2014 10:07:02 EDT\n", |
| 44 | + "Last updated: 06/07/2014 12:24:58 EDT\n", |
45 | 45 | "\n", |
46 | 46 | "CPython 3.4.1\n", |
47 | 47 | "IPython 2.1.0\n" |
|
264 | 264 | "cell_type": "markdown", |
265 | 265 | "metadata": {}, |
266 | 266 | "source": [ |
267 | | - "A regular expression to check for file extensions." |
| 267 | + "A regular expression to check for file extensions. \n", |
| 268 | + "\n", |
| 269 | + "Note: This approach is not recommended for strictly restricting file types (parse the file header instead). However, this regex is still a useful alternative to, e.g., Python's `endswith` method for quickly pre-filtering certain files of interest." |
268 | 270 | ] |
269 | 271 | }, |
270 | 272 | { |
|
746 | 748 | "<br>" |
747 | 749 | ] |
748 | 750 | }, |
749 | | - { |
750 | | - "cell_type": "heading", |
751 | | - "level": 2, |
752 | | - "metadata": {}, |
753 | | - "source": [ |
754 | | - "Time" |
755 | | - ] |
756 | | - }, |
757 | 751 | { |
758 | 752 | "cell_type": "markdown", |
759 | 753 | "metadata": {}, |
|
810 | 804 | "\n", |
811 | 805 | "for t in str_true:\n", |
812 | 806 | " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n", |
| 807 | + "for f in str_false:\n", |
| 808 | + " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f" |
| 809 | + ], |
| 810 | + "language": "python", |
| 811 | + "metadata": {}, |
| 812 | + "outputs": [], |
| 813 | + "prompt_number": 18 |
| 814 | + }, |
| 815 | + { |
| 816 | + "cell_type": "markdown", |
| 817 | + "metadata": {}, |
| 818 | + "source": [ |
| 819 | + "<br>\n", |
| 820 | + "<br>" |
| 821 | + ] |
| 822 | + }, |
| 823 | + { |
| 824 | + "cell_type": "heading", |
| 825 | + "level": 2, |
| 826 | + "metadata": {}, |
| 827 | + "source": [ |
| 828 | + "Checking for HTML tags" |
| 829 | + ] |
| 830 | + }, |
| 831 | + { |
| 832 | + "cell_type": "markdown", |
| 833 | + "metadata": {}, |
| 834 | + "source": [ |
| 835 | + "[[back to top](#Sections)]" |
| 836 | + ] |
| 837 | + }, |
| 838 | + { |
| 839 | + "cell_type": "code", |
| 840 | + "collapsed": false, |
| 841 | + "input": [ |
| 842 | + "pattern = r\"\"\"</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|[^'\">\\s]+))?)+\\s*|\\s*)/?>\"\"\"\n", |
| 843 | + "\n", |
| 844 | + "str_true = ('<a>', '<a href=\"something\">', '</a>', '<img src>')\n", |
| 845 | + " \n", |
| 846 | + "str_false = ('a>', '<a ', '< a >')\n", |
| 847 | + "\n", |
| 848 | + "for t in str_true:\n", |
| 849 | + " assert(bool(re.match(pattern, t)) == True), '%s is not True' %t\n", |
813 | 850 | "\n", |
814 | 851 | "for f in str_false:\n", |
815 | 852 | " assert(bool(re.match(pattern, f)) == False), '%s is not False' %f" |
816 | 853 | ], |
817 | 854 | "language": "python", |
818 | 855 | "metadata": {}, |
819 | 856 | "outputs": [], |
820 | | - "prompt_number": 33 |
| 857 | + "prompt_number": 16 |
| 858 | + }, |
| 859 | + { |
| 860 | + "cell_type": "markdown", |
| 861 | + "metadata": {}, |
| 862 | + "source": [ |
| 863 | + "<font size=\"1px\">source: [http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/](http://haacked.com/archive/2004/10/25/usingregularexpressionstomatchhtml.aspx/)</font>" |
| 864 | + ] |
821 | 865 | } |
822 | 866 | ], |
823 | 867 | "metadata": {} |
|
0 commit comments