forked from simsong/bulk_extractor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbe_sampler.py
More file actions
128 lines (116 loc) · 5.39 KB
/
be_sampler.py
File metadata and controls
128 lines (116 loc) · 5.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
"""This program creates a BE report that is a statistical sample of an original BE report.
The user can then grade the report. A second pass then reports the accuracy of the BE report."""
from bulk_extractor_reader import BulkReport,is_comment_line
import random,re
def get_lines_array(f):
    """Return the 1-based line numbers of the feature lines in `f` that may be sampled.

    `f` is an iterable of byte-string lines.  A line is kept when:
      * `args.pattern` is unset, or its UTF-8 encoding occurs in the text
        before the first tab of the line;
      * `args.xpattern` is unset, or its UTF-8 encoding does NOT occur in
        the text before the first tab of the line;
      * `is_comment_line()` reports that the line is not a comment.

    Reads the module-level `args` namespace for the two patterns.
    """
    include = args.pattern.encode('utf-8') if args.pattern else None
    exclude = args.xpattern.encode('utf-8') if args.xpattern else None

    selected = []
    for number, raw in enumerate(f, start=1):
        # The first tab-separated field is only split out when a filter
        # is actually active.
        if include and include not in raw.split(b"\t")[0]:
            continue
        if exclude and exclude in raw.split(b"\t")[0]:
            continue
        if not is_comment_line(raw):
            selected.append(number)
    return selected
def sample(outdir,fn):
    """Write a random, line-numbered sample of feature file `fn` to `outdir`.

    The feature file is read twice through the module-level `report`:
    once to collect the eligible line numbers (see get_lines_array) and
    once to copy the chosen lines.  At most `args.count` lines are chosen.

    The output file `outdir/fn` starts with three header comments (the
    "# Sampled N out of M" line is what calc_stats later parses), followed
    by every comment line of the original and each sampled line prefixed
    with "<line number>:<TAB>".

    Relies on module-level names: `report`, `args`, `os`, `random`.
    """
    # Close the first-pass handle as soon as the line numbers are known
    # instead of leaking it.
    with report.open(fn,"r") as f:
        line_numbers = get_lines_array(f)
    count = min(args.count,len(line_numbers))
    print("{} has {} lines".format(fn,len(line_numbers)))
    # A set gives O(1) membership tests in the copy loop below; the order
    # of the sample is irrelevant because output follows file order.
    lines_to_sample = set(random.sample(line_numbers,count))
    with open(os.path.join(outdir,fn),"w") as out:
        out.write("# -*- mode:text; truncate-lines:t -*-\n")
        out.write("# Sampled {} out of {}\n".format(count,len(line_numbers)))
        out.write("# Place '=' or 'y' in front of correct classifications and '-' or 'x' in front of incorrect ones\n")
        with report.open(fn) as f:
            for line_number, line in enumerate(f, start=1):
                if is_comment_line(line):
                    # Comment lines are always carried over verbatim.
                    out.write(line.decode('utf-8'))
                if line_number in lines_to_sample:
                    out.write("{}:\t".format(line_number))
                    out.write(line.decode('utf-8'))
def calc_stats(fn):
    """Tally the grades in a sampled file and return its statistics.

    Each line of `fn` is classified by its first character:
      * '#'                      -> comment; a "# Sampled N out of M" comment
                                    supplies the sampled/total counts
      * one of '-', '_', 'x', 'z', 'n' -> graded wrong
      * one of '+', '=', 'y'     -> graded right
      * anything else            -> ungraded; reported on stdout unless
                                    `args.quiet` is set

    Returns a dict with keys "fn", "total", "sampled", "sampling_rate",
    "accuracy", "error_rate" and "uncertainity", or None when the file
    contains no "# Sampled N out of M" header (i.e. it cannot be graded).
    """
    wrong = 0
    right = 0
    # None marks "header not seen yet" so a file without one is detected
    # instead of failing with an unbound local at the return.
    sampled = None
    total = None
    with open(fn,"r") as graded:
        for line in graded:
            if line[0]=='#':
                m = re.search(r"# Sampled (\d+) out of (\d+)",line)
                if m:
                    sampled = int(m.group(1))
                    total = int(m.group(2))
            elif line[0] in '-_xzn' :
                wrong += 1
            elif line[0] in '+=y' :
                right += 1
            else:
                if not args.quiet:
                    print("No classification:",line,end='')
    if sampled is None:
        return None
    return {"fn":os.path.basename(fn),
            "total":total,
            "sampled":sampled,
            "sampling_rate":sampled/total if total>0 else 0,
            "accuracy":(right/sampled if sampled>0 else 1),
            "error_rate":(wrong/sampled if sampled>0 else 0),
            "uncertainity":(sampled-(right+wrong))/sampled if sampled>0 else 0}
def calc_report(dirname):
    """Grade every file under `dirname` and print a summary table.

    Walks `dirname` recursively, runs calc_stats() on each file whose name
    does not end in '~' (presumably editor backup files), and appends every
    truthy result to the module-level `res` list.  It then prints a heading
    naming the directory, a column header, and one row per entry in `res`.
    """
    for (dirpath,dirnames,filenames) in os.walk(dirname):
        for filename in filenames:
            if filename.endswith("~"):
                continue
            fn = os.path.join(dirpath,filename)
            r = calc_stats(fn)
            if r:
                res.append(r)
    # Name the directory being reported; `args.calc` is a store_true flag
    # and would only ever print "True" here.
    print("Report: {}".format(dirname))
    print("{:20} {:8} {:8} {:8} {:8}".format("Feature","Total","Sampled","Accuracy","Err Rate"))
    for r in res:
        print("{:20} {:8} {:8} ({:4.0f}%) {:4.0f}% {:4.0f}%".format(
            r['fn'],r['total'],r['sampled'],r['sampling_rate']*100.0,r['accuracy']*100.0,r['error_rate']*100.0))
if __name__ == "__main__":
    # Command-line entry point.
    #   be_sampler.py REPORT OUTPUT          -> write a sampled copy of REPORT into new directory OUTPUT
    #   be_sampler.py REPORT OUTPUT --calc   -> grade the (hand-annotated) sample files found under OUTPUT
    #
    # NOTE: `args`, `res`, `report` and the `os` import below are bound at
    # module level on purpose: the functions defined above read them as globals.
    import argparse
    import sys
    import os
    arg_parser = argparse.ArgumentParser(description=(
        "Create a bulk_extractor report that is sampled from an existing report. Number each feature file line; do not copy over the histograms. Currently does not handle carved objects"))
    arg_parser.add_argument("report", metavar="report",
                            help="bulk_extractor report directory or zip file to graph")
    arg_parser.add_argument("output", type=str, help="Output directory")
    arg_parser.add_argument("--count", type=int, default=100,
                            help="Number of items to sample")
    arg_parser.add_argument("--pattern", type=str, help="Only sample lines that include this pattern in the forensic path")
    arg_parser.add_argument("--xpattern", type=str, help="Do not sample lines that include this pattern in the forensic path")
    arg_parser.add_argument("--calc", help="Compute the statistics",action="store_true")
    arg_parser.add_argument("--trials", type=int, default=5, help="Number of trials to divide into")
    arg_parser.add_argument("--quiet",action='store_true',help='do not alert on lines with no classification')
    args = arg_parser.parse_args()
    res = []

    if args.calc:
        # Second pass: tally the grades in every sample file under OUTPUT.
        for (dirpath,dirnames,filenames) in os.walk(args.output):
            for filename in filenames:
                if filename.endswith("~"):
                    continue
                fn = os.path.join(dirpath,filename)
                r = calc_stats(fn)
                print(r)
                # Skip files that produced no statistics, as calc_report does.
                if r:
                    res.append(r)
        print("{:20} {:8} {:8} {:4} {:8} {:8}".format("Feature","Total","Sampled","%","Accuracy","Err Rate"))
        for r in res:
            # Guard the percentage against an empty feature file (total == 0).
            percent = r['sampled']*100.0/r['total'] if r['total'] else 0.0
            print("{:20} {:8} {:8} {:4}% {:8} {:8}".format(
                r['fn'],r['total'],r['sampled'],percent,r['accuracy'],r['error_rate']))
        sys.exit(0)

    # First pass: build the sample.  The parser defines no `sample` option,
    # so the source report and destination come from the two positionals.
    if os.path.exists(args.output):
        raise RuntimeError(args.output+" exists")
    os.mkdir(args.output)
    report = BulkReport(args.report)
    for fn in report.feature_files():
        sample(args.output,fn)