Thanks for visiting codestin.com
Credit goes to github.com

Skip to content

Commit 80120e8

Browse files
committed
Minor refactoring and update of sqlharvest.py
1 parent 1bcf5a6 commit 80120e8

1 file changed

Lines changed: 42 additions & 50 deletions

File tree

extra/sqlharvest/sqlharvest.py

Lines changed: 42 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -15,66 +15,62 @@
1515

1616
from operator import itemgetter
1717

18-
def main():
19-
20-
TIMEOUT = 10
21-
CONFIG_FILE = 'sqlharvest.cfg'
22-
TABLES_FILE = 'tables.txt'
23-
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; AskTB5.3)'
24-
SEARCH_URL = 'http://www.google.com/m?source=mobileproducts&dc=gorganic'
25-
MAX_FILE_SIZE = 2*1024*1024 # if a result (.sql) file for downloading is more than 2MB in size just skip it
26-
QUERY = 'CREATE TABLE ext:sql'
27-
REGEX_URLS = r';u=([^"]+)'
28-
REGEX_RESULT = r'CREATE TABLE\s*(/\*.*\*/)?\s*(IF NOT EXISTS)?\s*(?P<result>[^\(;]+)'
18+
TIMEOUT = 10
19+
CONFIG_FILE = 'sqlharvest.cfg'
20+
TABLES_FILE = 'tables.txt'
21+
USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; AskTB5.3)'
22+
SEARCH_URL = 'http://www.google.com/m?source=mobileproducts&dc=gorganic'
23+
MAX_FILE_SIZE = 2 * 1024 * 1024 # if a result (.sql) file for downloading is more than 2MB in size just skip it
24+
QUERY = 'CREATE TABLE ext:sql'
25+
REGEX_URLS = r';u=([^"]+?)&amp;q='
26+
REGEX_RESULT = r'(?i)CREATE TABLE\s*(/\*.*\*/)?\s*(IF NOT EXISTS)?\s*(?P<result>[^\(;]+)'
2927

28+
def main():
3029
tables = dict()
31-
refiles = re.compile(REGEX_URLS)
32-
retables = re.compile(REGEX_RESULT, re.I)
33-
3430
cookies = cookielib.CookieJar()
3531
cookie_processor = urllib2.HTTPCookieProcessor(cookies)
3632
opener = urllib2.build_opener(cookie_processor)
37-
opener.addheaders = [('User-Agent', USER_AGENT)]
33+
opener.addheaders = [("User-Agent", USER_AGENT)]
3834

3935
conn = opener.open(SEARCH_URL)
4036
page = conn.read() #set initial cookie values
4137

4238
config = ConfigParser.ConfigParser()
4339
config.read(CONFIG_FILE)
44-
if not config.has_section('options'):
45-
config.add_section('options')
4640

47-
if not config.has_option('options', 'index'):
48-
config.set('options', 'index', '0')
41+
if not config.has_section("options"):
42+
config.add_section("options")
43+
if not config.has_option("options", "index"):
44+
config.set("options", "index", "0")
4945

50-
i = int(config.get('options', 'index'))
46+
i = int(config.get("options", "index"))
5147

5248
try:
53-
f = open(TABLES_FILE, 'r')
54-
for line in f.xreadlines():
55-
if len(line) > 0 and ',' in line:
56-
temp = line.split(',')
57-
tables[temp[0]] = int(temp[1])
58-
f.close()
49+
with open(TABLES_FILE, 'r') as f:
50+
for line in f.xreadlines():
51+
if len(line) > 0 and ',' in line:
52+
temp = line.split(',')
53+
tables[temp[0]] = int(temp[1])
5954
except:
6055
pass
6156

6257
socket.setdefaulttimeout(TIMEOUT)
6358

64-
files, oldFiles = None, None
59+
files, old_files = None, None
6560
try:
6661
while True:
6762
abort = False
68-
oldFiles = files
63+
old_files = files
6964
files = []
7065

7166
try:
72-
conn = opener.open('%s&q=%s&start=%d&sa=N' % (SEARCH_URL, QUERY.replace(' ', '+'), i*10))
67+
conn = opener.open("%s&q=%s&start=%d&sa=N" % (SEARCH_URL, QUERY.replace(' ', '+'), i * 10))
7368
page = conn.read()
74-
for match in refiles.finditer(page):
69+
for match in re.finditer(REGEX_URLS, page):
7570
files.append(urllib.unquote(match.group(1)))
76-
if len(files) >= 10: break
77-
abort = (files == oldFiles)
71+
if len(files) >= 10:
72+
break
73+
abort = (files == old_files)
7874

7975
except KeyboardInterrupt:
8076
raise
@@ -91,23 +87,24 @@ def main():
9187

9288
for sqlfile in files:
9389
print sqlfile
90+
9491
try:
9592
req = urllib2.Request(sqlfile)
9693
response = urllib2.urlopen(req)
9794

98-
if response.headers.has_key('Content-Length'):
99-
if int(response.headers.get('Content-Length')) > MAX_FILE_SIZE:
95+
if response.headers.has_key("Content-Length"):
96+
if int(response.headers.get("Content-Length")) > MAX_FILE_SIZE:
10097
continue
10198

10299
page = response.read()
103100
found = False
104101
counter = 0
105102

106-
for match in retables.finditer(page):
103+
for match in re.finditer(REGEX_RESULT, page):
107104
counter += 1
108-
table = match.group("result").strip().strip("`").strip("\"").strip("'").replace('"."', ".").replace("].[", ".").strip('[').strip(']')
105+
table = match.group("result").strip().strip("`\"'").replace('"."', ".").replace("].[", ".").strip('[]')
109106

110-
if table and '>' not in table and '<' not in table and '--' not in table and ' ' not in table:
107+
if table and not any(_ in table for _ in ('>', '<', '--', ' ')):
111108
found = True
112109
sys.stdout.write('*')
113110

@@ -131,19 +128,14 @@ def main():
131128
pass
132129

133130
finally:
134-
f = open(TABLES_FILE, 'w+')
135-
136-
tables = sorted(tables.items(), key=itemgetter(1), reverse=True)
137-
138-
for table, count in tables:
139-
f.write("%s,%d\n" % (table, count))
140-
141-
f.close()
142-
config.set('options', 'index', str(i+1))
143-
144-
f = open(CONFIG_FILE, 'w+')
145-
config.write(f)
146-
f.close()
131+
with open(TABLES_FILE, 'w+') as f:
132+
tables = sorted(tables.items(), key=itemgetter(1), reverse=True)
133+
for table, count in tables:
134+
f.write("%s,%d\n" % (table, count))
135+
136+
config.set("options", "index", str(i + 1))
137+
with open(CONFIG_FILE, 'w+') as f:
138+
config.write(f)
147139

148140
if __name__ == "__main__":
149141
main()

0 commit comments

Comments
 (0)