#!/usr/bin/env python
# -*- indent-with-tabs: 0 -*-
# Copyright The IETF Trust 2013-2019, All Rights Reserved


import os, sys, re, datetime, argparse, traceback, json, subprocess
import html5lib
import random

# Put the parent of this script's directory at the front of the import path,
# so that our own Django project is the one that gets imported.
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))
if basedir not in sys.path:
    sys.path.insert(0, basedir)

# Parse args now, so we can use custom settings when importing django.
# NOTE: parse_args() at the end of this section runs at import time, so the
# command line is consumed as soon as this file is loaded.
parser = argparse.ArgumentParser(
    description="""Perform a test crawl of the project. For each found URL, the HTTP
    response status is printed. If it's not OK/redirect, FAIL is
    printed - in case of errors, a stacktrace is also included.""")
# Start URLs given directly on the command line end up in args.urls ...
parser.add_argument('urls', metavar='URL', nargs='*',
                    help='One or more URLs to start the crawl from')
# ... while --urls/-u names a *file* of start URLs and is stored as
# args.url_file (not args.urls).
parser.add_argument('--urls', '-u', dest='url_file',
                    help='File with URLs to start the crawl from')
parser.add_argument('--slow', dest='slow_threshold', type=float, default=1.0,
                    help='Responses taking longer than this (in seconds) results in SLOW being printed')
# The value is used as DJANGO_SETTINGS_MODULE further down, i.e. it is handed
# to Django as a settings module name.
parser.add_argument('--settings', help='Custom settings file')
parser.add_argument('--logfile', help='Write to logfile')
parser.add_argument('--user', help='Crawl logged in as this user', default=None)
# store_false: args.follow is True unless --no-follow is given.
parser.add_argument('--no-follow', dest='follow', action='store_false', default=True,
                    help='Do not follow URLs found in fetched pages, just check the given URLs')
parser.add_argument('--validator-nu', dest='validator_nu', action='store_true',
                    help='Use validator.nu instead of html5lib for HTML validation')
parser.add_argument('--pedantic', action='store_true',
                    help='Stop the crawl on the first HTML validation issue')
parser.add_argument('--random', action='store_true',
                    help='Crawl URLs randomly')
parser.add_argument('--validate-all', dest='validate_all', action='store_true', default=False,
                    help='Run html 5 validation on all pages, without skipping similar urls. '
                    '(The default is to only run validation on one of /foo/1/, /foo/2/, /foo/3/, etc.)')
parser.add_argument('-v', '--verbose', action='store_true', default=False,
                    help='Be more verbose')

args = parser.parse_args()

# Import Django, call setup()
# setdefault() leaves an already exported DJANGO_SETTINGS_MODULE untouched, so
# the environment wins over --settings, which in turn wins over the built-in
# "ietf.settings_testcrawl" default.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", args.settings or "ietf.settings_testcrawl")

import django
import django.test
import django.core.checks
from django.conf import settings

# Initialise Django before the ietf.* imports further down, which include the
# project's models.
django.setup()

# This needs to come after we set up sys path to include the local django
import debug    # pyflakes:ignore

# prevent memory from leaking when settings.DEBUG=True
# from django.db import connection
# class DontSaveQueries(list):
#     def append(self, x):
#         pass
# connection.queries = DontSaveQueries()

from ietf.name.models import DocTypeName
from ietf.utils.html import unescape
from ietf.utils.test_utils import unicontent

# --- Constants ---

# Longest URL (in characters, after strip_url()) that extract_html_urls() will
# still report; anything longer is dropped to avoid chasing URLs that keep
# growing by appended GET parameters.
MAX_URL_LENGTH = 500

# --- Functions ---

def note(s):
    """Write *s* followed by a newline to stderr, but only in --verbose mode."""
    if not args.verbose:
        return
    for piece in (s, '\n'):
        sys.stderr.write(piece)

def strip_url(url):
    """Return *url* reduced to the form the crawler keeps track of.

    Removed, in this order:
      * a leading "http://testserver" prefix,
      * a trailing "#fragment", provided the fragment consists only of the
        characters a-z, "_", "." and "-",
      * a trailing "?next=..." part.

    The two trailing parts are only removed when at least one character
    precedes them.
    """
    test_host = "http://testserver"
    if url.startswith(test_host):
        url = url[len(test_host):]
    # Order matters: the fragment has to go before "?next=" is looked for.
    for pattern in ("^(.+)#[a-z_.-]+$", r"^(.+)\?next=.+$"):
        found = re.search(pattern, url)
        if found:
            url = found.group(1)
    return url

# Matches a whole <a>/<link> tag carrying href=, or <img>/<script> tag carrying
# src= (group 1).  The attribute value is captured in group 2 when it is
# double-quoted and in group 3 when it is single-quoted, so that each value
# ends at its own kind of quote.
_HTML_URL_RE = re.compile(
    r'(<(?:(?:a|link) [^>]*href|(?:img|script) [^>]*src)='
    r'(?:"([^"]+)"|\'([^\']+)\')[^>]*>)'
)


def extract_html_urls(content):
    """Yield the site-local URLs referenced from the HTML text *content*.

    The href of every <a> and <link> tag and the src of every <img> and
    <script> tag is considered.  A URL is skipped when

      * its tag carries rel="nofollow" (or rel='nofollow'),
      * it is longer than MAX_URL_LENGTH after strip_url(),
      * it does not start with "/" (absolute URLs to other hosts, relative
        paths, mailto: links, ...), or
      * it starts with "//" (scheme-relative URL).

    Every remaining URL is passed through strip_url() and then through the
    unescape() helper imported from ietf.utils.html before being yielded.
    """
    for m in _HTML_URL_RE.finditer(content):
        if re.search(r'rel=["\']?nofollow["\']', m.group(1)):
            continue
        # Exactly one of the two value groups took part in the match.
        url = strip_url(m.group(2) or m.group(3))
        if len(url) > MAX_URL_LENGTH:
            continue # avoid infinite GET parameter appendages
        if not url.startswith("/") or url.startswith("//"):
            continue
        yield unescape(url)

def extract_tastypie_urls(content):
    """Yield follow-up URLs found in the JSON text *content* of an API response.

    For every top-level entry whose value is an object with a
    "list_endpoint" member, that endpoint is yielded.  Following the
    "meta"/"next" link and the "resource_uri" of each listed object is
    implemented as well, but both are switched off by the two flags below.
    """
    follow_objects = False
    follow_next = False
    payload = json.loads(content)
    for name in payload:
        entry = payload[name]
        if isinstance(entry, dict) and "list_endpoint" in entry:
            yield entry["list_endpoint"]
    if follow_next and "meta" in payload and "next" in payload["meta"]:
        next_uri = payload["meta"]["next"]
        if next_uri is not None:
            yield next_uri
    if follow_objects and "objects" in payload:
        for obj in payload["objects"]:
            if "resource_uri" in obj:
                yield obj["resource_uri"]

# Rewrite rules applied, in this order, to collapse URLs that differ only in
# an identifier (primary key, document name, group acronym, ...) into one
# shared validation key.  Each entry is (regex, replacement).
_VALIDATION_KEY_RULES = (
    (r"#.*$", ""),
    (r"/.+@.+/", "/x@x.org/"),
    (r"/[0-9.]+/", "/mmmm/"),
    # Matches found in one pass cannot share a "/", so a numeric segment that
    # directly follows a replaced one is only picked up by this second pass.
    (r"/[0-9.]+/", "/nnnn/"),
    (r"/ag/[a-z0-9-]+/", "/ag/foo/"),
    (r"/area/[a-z0-9-]+/", "/area/foo/"),
    (r"/bcp[0-9]+/", "/bcpnnn/"),
    (r"/conflict-review-[a-z0-9-]+/", "/conflrev-foo/"),
    (r"/dir/[a-z0-9-]+/", "/dir/foo/"),
    (r"/draft-[a-z0-9-]+/", "/draft-foo/"),
    (r"/group/[a-z0-9-]+/", "/group/foo/"),
    (r"/html/[a-z0-9-]+", "/html/foo/"),
    (r"/ipr/search/.*", "/ipr/search/"),
    (r"/meeting/[-0-9a-z]+/agenda/[0-9a-z]+/", "/meeting/nn/agenda/foo/"),
    (r"/release/[0-9dev.]+/", "/release/n.n.n/"),
    (r"/rfc[0-9]+/", "/rfcnnnn/"),
    (r"/rg/[a-z0-9-]+/", "/rg/foo/"),
    # NOTE(review): this rule and the /submit/status/ one below look for a
    # literal "nnnn" right after a non-numeric segment, but by this point such
    # a number has become "mmmm" (and this one spells "srec", not "sreq").
    # They appear never to match; kept unchanged, please confirm the intent.
    (r"/secr/srec/nnnn/[0-9a-z-]+/", "/secr/sreq/nn/bar/"),
    (r"/state/[a-z0-9-]+/", "/state/foo/"),
    (r"/state/[a-z0-9-]+/[a-z0-9-]+/", "/state/foo/bar/"),
    (r"/status-change-[a-z0-9-]+/", "/statchg-foo/"),
    (r"/std[0-9]+/", "/stdnnn/"),
    (r"/submit/status/nnnn/[0-9a-f]+/", "/submit/status/nnnn/bar/"),
    (r"/team/[a-z0-9-]+/", "/team/foo/"),
    (r"/wg/[a-z0-9-]+/", "/wg/foo/"),
    (r"\?.*$", ""),
)

# These URLs have known issues, skip them until those are fixed
_VALIDATION_SKIP_PATTERNS = (
    '/secr',
    'admin/',
    '/doc/.*/edit/info/',
    'rfc542$',
    'rfc776$',
    'draft-leroux-pce-pcecp-interarea-reqs',
    'draft-fujiwara-dnsop-resolver-update',
)


def _validation_key(url):
    """Return the key under which *url* and its look-alike URLs are validated once."""
    key = url
    for pattern, replacement in _VALIDATION_KEY_RULES:
        key = re.sub(pattern, replacement, key)
    # Documents of every known type are collapsed as well, e.g. /<slug>-foo/.
    for slug in doc_types:
        key = re.sub("/%s-.*/" % slug, "/%s-nnnn/" % slug, key)
    return key


def _response_body(response):
    """Return the complete body of *response* as one bytes object."""
    if hasattr(response, "content"):
        return response.content
    # A response without .content only offers an iterator of byte chunks;
    # neither validator can consume that, so join it first.
    return b"".join(response.streaming_content)


def check_html_valid(url, response, args):
    """Validate the HTML body of *response*, once per group of similar URLs.

    Parameters:
      url      -- the URL that produced *response*.
      response -- object with a ``content`` attribute or, failing that, a
                  ``streaming_content`` iterable of byte chunks.
      args     -- parsed command line; ``validate_all`` and ``validator_nu``
                  are read.

    Unless ``args.validate_all`` is set, *url* is first reduced to a key in
    which identifiers are replaced by placeholders; a key that is already in
    the module-level ``validated_urls`` dict makes this call a no-op.  URLs
    matching one of the known-problem patterns are marked as validated, logged
    and skipped.

    Otherwise the body is checked either by running bin/vnu.jar through
    ``java`` (``args.validator_nu``) or with the module-level html5lib
    ``parser``.  Findings are appended to the module-level ``tags`` list (not
    created here; it has to exist when this is called) and counted in the
    module-level ``warnings`` counter.  Nothing is returned.
    """
    global warnings

    key = url if args.validate_all else _validation_key(url)
    if key in validated_urls:
        return

    note('Validate: %-32s: %s' % (url[:32], key))
    if any(re.search(pattern, url) for pattern in _VALIDATION_SKIP_PATTERNS):
        validated_urls[key] = True
        log("%s blacklisted; skipping HTML validation" % url)
        return

    content = _response_body(response)
    validated_urls[key] = True
    if args.validator_nu:
        v = subprocess.Popen(["java", "-jar", basedir + "/bin/vnu.jar",
                              "--format", "json", "-"],
                             stdin=subprocess.PIPE, stderr=subprocess.PIPE)
        # The document goes in on stdin, the JSON report is read from stderr.
        report = json.loads(v.communicate(content)[1])
        for m in report["messages"]:
            # "info" messages are labelled by their subType (e.g. warning);
            # an info message without one keeps the plain type.
            t = m.get("subType", m["type"]) if m["type"] == "info" else m["type"]
            tags.append("\n%s\tLine %d: %s" %
                        (t.upper(), m["lastLine"], m["message"]))
            tags.append("\n\t%s" % m["extract"].replace('\n', ' '))
            tags.append("\n\t%s%s" %
                        (" " * m["hiliteStart"], "^" * m["hiliteLength"]))
            # disregard some HTML issues that are (usually) due to invalid
            # database content
            if not re.search('Forbidden code point|Bad value|seamless|The first child|Duplicate ID|The first occurrence of ID', m["message"]):
                warnings += 1
    else:
        try:
            parser.parse(content)
        except Exception as e:
            # One tag per error recorded by the parser, one warning per page.
            for pos, _code, _data in parser.errors:
                tags.append(u"WARN invalid html at line, pos %s: %s" % (pos, e))
            warnings += 1

def skip_extract_from(url):
    """Return True when *url* matches one of the patterns listed below.

    The patterns cover /doc/html/ document renderings, per-session meeting
    agenda pages and the static coverage report.
    """
    no_extract = (
        r'^/doc/html/[a-z0-9-]+',
        r'^/meeting/[a-z0-9-]+/agenda/[a-z0-9-]+',
        r'^/static/coverage/',
    )
    return any(re.search(rx, url) for rx in no_extract)

# Regular expressions (used with re.search) for URLs the crawler must not
# fetch at all.
_SKIP_URL_PATTERNS = (
    r"^/community/[0-9]+/remove_document/",
    r"^/community/personal/",
    # Skip most of the slow pdf composite generation urls and svg urls
    r"^/meeting/[0-9]+/agenda/[0-9b-z].*-drafts\.pdf",
    r"^/wg/[a-z0-9-]+/deps/svg/",
    # This bad url occurs in an uploaded html agenda:
    r"/site/ietfdhcwg/_/rsrc/1311005436000/system/app/css/overlay\.css\?cb=simple100%250150goog-ws-left",
    r"/dir/tsvdir/reviews/",
    r"draft-touch-msword-template-v2\.0",

    # Skip most html conversions, not worth the time.  Only names starting
    # with "draft-ba" get past the four draft patterns.
    r"^/doc/html/draft-[0-9ac-z]",
    r"^/doc/html/draft-b[0-9b-z]",
    r"^/doc/pdf/draft-[0-9ac-z]",
    r"^/doc/pdf/draft-b[0-9b-z]",
    r"^/doc/html/charter-.*",
    r"^/doc/html/status-.*",
    r"^/doc/html/rfc.*",
    # These will always 404.  Only the documents not already excluded by the
    # patterns above are listed.
    r"^/doc/html/draft-balakrishnan-cm-03",
    r"^/doc/html/draft-ballardie-cbt-02",

    r"^/static/coverage/",
    r"^/meeting/6[0-4]/agenda",
    r"^https?://www\.ietf\.org/",
)


def skip_url(url):
    """Return True when *url* matches one of _SKIP_URL_PATTERNS, else False."""
    return any(re.search(pattern, url) for pattern in _SKIP_URL_PATTERNS)

def log(s):
    """Print *s* to stdout and, if a logfile is open, append it there as one line.

    The module-level ``logfile`` is opened in text mode, so whatever is
    written to it must be the native ``str`` type.  Anything else is
    converted first:

    * on Python 2 (where ``str is bytes``) a unicode string is encoded as
      UTF-8, as before;
    * on Python 3 a ``bytes`` value is decoded as UTF-8 (undecodable bytes
      are replaced rather than aborting the crawl);
    * any other object is rendered with ``str()``.
    """
    print(s)
    if logfile:
        if not isinstance(s, str):
            if str is bytes and hasattr(s, 'encode'):
                # Python 2: a non-str string is unicode; the file wants bytes.
                s = s.encode('utf-8')
            elif isinstance(s, bytes):
                # Python 3: a text-mode file rejects bytes, so decode instead.
                s = s.decode('utf-8', 'replace')
            else:
                s = str(s)
        logfile.write(s)
        logfile.write('\n')

def get_referrers(url):
    """Return the chain of referrers leading to *url*, nearest referrer first.

    The chain is built by repeatedly looking the current URL up in the
    module-level ``referrers`` map.  *url* itself is not part of the result.
    If a referrer shows up a second time the loop is reported through
    ``log()`` and the chain collected so far is returned.
    """
    chain = []
    current = url
    while True:
        if current not in referrers:
            # Reached a URL nobody referred to; the chain is complete.
            break
        current = referrers[current]
        if current in chain:
            log("Circular referral list, discovered at %s" % current)
            break
        chain.append(current)
    return chain

# --- Globals ---

# Responses taking longer than this many seconds get tagged SLOW.
slow_threshold = args.slow_threshold

visited = set()         # URLs that have been taken off the queue
urls = {}               # crawl queue: url -> referrer
referrers = {}          # url -> referrer, walked by get_referrers()

# Start URLs: those given on the command line, then those from the URL file.
initial_urls = list(args.urls)

if args.url_file:
    with open(args.url_file) as f:
        for line in f:
            # Everything from '#' onwards is a comment; skip lines that end up empty.
            line = line.split("#", 1)[0].strip()
            if not line:
                continue
            initial_urls.append(line)

# Nothing given at all: fall back to the default entry points.
if not initial_urls:
    initial_urls.extend(["/", "/api/v1"])

for url in initial_urls:
    urls[url] = "[initial]"

# Note: from here on 'parser' is the strict html5lib parser, no longer the
# argparse parser created at the top of the file.
parser = html5lib.HTMLParser(strict=True)

# initialise validated_urls with some patterns we don't want to check,
# because they aren't under our control, such as uploaded group agendas.
validated_urls = {'/meeting/nn/agenda/foo/': True, }

doc_types = [ t.slug for t in DocTypeName.objects.all() ]

# Running totals for the crawl.
errors = warnings = count = 0

start_time = datetime.datetime.now()

client = django.test.Client(Accept='text/html,text/plain,application/json')

# Optional logfile; log() writes to it whenever it is set.
logfile = open(args.logfile, "w") if args.logfile else None

# --- Main ---

def _enqueue(found_url, found_on):
    """Queue found_url for crawling unless it was already visited or queued.

    found_on is stored both as the queue entry's referrer (in ``urls``) and
    in the ``referrers`` map that get_referrers() walks.
    """
    if found_url not in visited and found_url not in urls:
        urls[found_url] = found_on
        referrers[found_url] = found_on

if __name__ == "__main__":
    if args.user:
        # log in as user, to have the respective HTML generated by the templates
        response = client.post('/accounts/login/',
                               {'username': args.user, 'password': 'password'},
                               secure=True, follow=True)
        if response.status_code != 200:
            log("Could not log in as %s, HTML response %d" %
                (args.user, response.status_code))
            sys.exit(1)

    # Run django system checks and checks from ietf.checks, drop the ones
    # silenced in settings, and print whatever remains.
    error_list = [ entry for entry in django.core.checks.run_checks()
                   if entry.id not in settings.SILENCED_SYSTEM_CHECKS ]
    if error_list:
        print("")
        for entry in error_list:
            print(entry)

    # Crawl loop: take one URL off the queue, fetch it, queue whatever new
    # URLs the response points at, and log one status line per URL.
    while urls:
        if args.random:
            # popitem() is documented to be random, but really isn't.
            # random.choice() needs an indexable sequence and dict key views
            # are not indexable on Python 3, so hand it a list.
            url = random.choice(list(urls))
            referrer = urls.pop(url)
        else:
            url, referrer = urls.popitem()

        visited.add(url)

        if skip_url(url):
            continue

        # Time since the crawl started, split up for the status line.
        # ('mins' rather than 'min': a module-level 'min' would shadow the
        # builtin for every function in this file.)
        timestamp = datetime.datetime.now()
        acc_secs = (timestamp - start_time).total_seconds()
        hrs = acc_secs // (60*60)
        mins = (acc_secs % (60*60)) // 60
        sec = acc_secs % 60

        request_start = datetime.datetime.now()
        try:
            if args.verbose:
                sys.stderr.write(url+'\n')
            r = client.get(url, secure=True)
            elapsed = datetime.datetime.now() - request_start
        except KeyboardInterrupt:
            log(" ... was fetching %s" % url)
            sys.exit(1)
        except Exception:
            # The view raised instead of returning a response: report it as
            # a 500 together with the referrer chain and the traceback.
            elapsed = datetime.datetime.now() - request_start
            tags = [ u"FAIL  (from [ %s ])" % (",\n\t".join(get_referrers(url))) ]
            log("%2d:%02d:%02d %7d %6d  %s %6.3fs %s %s" % (hrs,mins,sec, len(visited), len(urls), 500, elapsed.total_seconds(), url, " ".join(tags)))
            log("=============")
            log(traceback.format_exc())
            log("=============")
            errors += 1
        else:
            tags = []

            if r.status_code in (301, 302):
                # referrer is original referrer, not redirected url
                _enqueue(strip_url(r["Location"]), referrer)

            elif r.status_code == 200:
                # Media type only, without parameters such as "; charset=...".
                ctype = r["Content-Type"].partition(";")[0]

                if ctype == "text/html":
                    try:
                        if args.follow and not skip_extract_from(url):
                            for u in extract_html_urls(unicontent(r)):
                                _enqueue(u, url)

                        check_html_valid(url, r, args)

                    except Exception:
                        # Exception rather than a bare except, so that
                        # Ctrl-C and sys.exit() are not swallowed here.
                        log("error extracting HTML urls from %s" % url)
                        log("=============")
                        log(traceback.format_exc())
                        log("=============")

                elif ctype == "application/json":
                    try:
                        if args.follow:
                            for u in extract_tastypie_urls(unicontent(r)):
                                _enqueue(u, url)
                    except Exception:
                        log("error extracting urls from %s" % url)
                        log("=============")
                        log(traceback.format_exc())
                        log("=============")

            else:
                tags.append(u"FAIL  (from %s)" % (referrer, ))
                errors += 1

            if elapsed.total_seconds() > slow_threshold:
                tags.append("SLOW")

            # Repeat the column header every 100 visited URLs.
            if (len(visited) % 100) == 1:
                log("\nElapsed  Visited  Queue Code   Time  Url  ...  Notes")

            log("%2d:%02d:%02d %7d %6d  %s %6.3fs %s %s" % (hrs,mins,sec, len(visited), len(urls), r.status_code, elapsed.total_seconds(), url, " ".join(tags)))

        # In pedantic mode stop at the first problem, whether it came from a
        # response or from an exception while fetching.
        if (errors or warnings) and args.pedantic:
            sys.exit(1)

    if logfile:
        logfile.close()
        sys.stderr.write("Output written to %s\n\n" % logfile.name)

    # Report both totals before exiting, so the warnings summary is not lost
    # when there are errors as well.
    if errors > 0:
        sys.stderr.write("Found %s errors, grep output for FAIL for details\n" % errors)
    else:
        sys.stderr.write("Found no errors.\n")
    if warnings > 0:
        sys.stderr.write("Found %s warnings, grep output for WARN for details\n" % warnings)
    else:
        sys.stderr.write("Found no warnings.\n")
    if errors > 0:
        sys.exit(1)
