Commit 6813901

fixed evaluation code of the classifiers

1 parent 5419df2 commit 6813901

File tree

13 files changed: 221 additions, 4445 deletions

classifier/classifier.py

Lines changed: 44 additions & 35 deletions
@@ -15,6 +15,8 @@
 from argparse import ArgumentParser
 from configparser import ConfigParser
 from pathlib import Path
+from datetime import datetime
+from sklearn import model_selection

 # from string import printable
 import pandas as pd
@@ -43,17 +45,22 @@ def __init__(self):
         self.config = self.util.load_config("config/classifier.yaml")
         self.model = None
         self.history = None
+        self.arch = ModelArchs(self.config)

     def update_config_args(self, paras):
         """create model dir to store it
         set the config args from the command line if provided """
         self.config["model"]["name"] = paras.model if paras.model else self.config["model"]["name"]
         self.config["data_file"] = paras.data if paras.data else self.config["data_file"]
+
         if self.config['debug']:
             self.config['dnn']['epochs'] = self.config['dnn']['debug_epochs']
+            time_now = ''
+        else:
+            time_now = datetime.now().strftime('%Y-%m-%d_%H.%Mm')

         mdir = self.config["model"]["name"] + "-" + str(self.config["dnn"]["epochs"]) + \
-            "-" + Path(self.config["data_file"]).stem + "/"
+            "-" + Path(self.config["data_file"]).stem + "-" + time_now + "/"

         self.config["model"]["path"] = self.config["model"]["path"] + mdir

@@ -120,7 +127,7 @@ def apply_checkpoints(self, model, cp_path, patience):

     def select_model_arch(self):
         """Choose ML model"""
-        arch = ModelArchs(self.config)
+
         model_name = self.config["model"]["name"]

         print("\n\n" + "=" * 25 + " " + model_name +
@@ -129,15 +136,15 @@ def select_model_arch(self):
         print("-" * 50)

         if model_name == "RNN":
-            model = arch.apply_RNN()
+            model = self.arch.apply_RNN()
         elif model_name == "CNN":
-            model = arch.apply_CNN()
+            model = self.arch.apply_CNN()
         elif model_name == "LSTM":
-            model = arch.apply_LSTM()
+            model = self.arch.apply_LSTM()
         elif model_name == "RF":
-            model = arch.apply_RF(df)
+            model = self.arch.apply_RF(df)
         elif model_name == "multiDNN":
-            model = arch.apply_multiDNN()
+            model = self.arch.apply_multiDNN()
         else:
             print("Invalid model! Please select a valid model!")
             exit(1)
@@ -148,18 +155,18 @@ def train_model(self, model_file, X_train, y_train, X_test, y_test):
         model_name = self.config["model"]["name"]
         epochs = self.config["dnn"]["epochs"]

-        # Select the model architecture
-        model = self.select_model_arch()
-
-        # store metadata to neptune.ai
-        if self.config["model"]["use_neptune"]:
-            from neptune.integrations.tensorflow_keras import NeptuneCallback
-            nt_run = self.init_neptune(
-                model_name, epochs, self.config["data_file"])
-
         # Apply callbacks for training to store the best model checkpoint
         # and apply early stopping.
         if model_name != "RF":
+            # Select the model architecture
+            model = self.select_model_arch()
+
+            # store metadata to neptune.ai
+            if self.config["model"]["use_neptune"]:
+                from neptune.integrations.tensorflow_keras import NeptuneCallback
+                nt_run = self.init_neptune(
+                    model_name, epochs, self.config["data_file"])
+
             tf_callbacks = self.apply_checkpoints(
                 model=model,
                 cp_path=self.config["model"]["path"],
@@ -196,18 +203,15 @@ def train_model(self, model_file, X_train, y_train, X_test, y_test):
         else:
             # TODO: log non-DNN models output to Neptune
             # nt_run["acc"] = ?? or params=dict
+            # Fitting
+            model = self.arch.apply_RF(df.code)
+            # model = self.select_model_arch()
+            model.fit(X_train, y_train)
+            acc = model.score(X_test, y_test)
+            print(f"Accuracy: {acc}")
             print(f"Trained with non-DNN model: {model_name}")
         return model

-    # def load_model(self, model_JSON, file_weights):
-    #     """Load model from disk"""
-    #     with open(model_JSON, "r") as f:
-    #         model_json = json.load(f)
-    #         model = model_from_json(model_json)
-
-    #     model.load_weights(file_weights)
-    #     return model
-
     def load_tf_model(self, model_file):
         """
         Load model from disk
@@ -230,12 +234,11 @@ def evaluate_model(self, model_file, X_eval, y_eval):
         if self.config["model"]["name"] != "RF":
             if Path(model_file).is_file():
                 model = self.load_tf_model(model_file)
-
+                print("\nEvaluating the model...\n")
                 # evaluate the model
-                # loss, acc = model.evaluate(X_eval, y_eval, verbose=0)
+                loss, acc = model.evaluate(X_eval, y_eval, verbose=1)

-                print("\nEvaluating the model...\n")
-                y_pred = model.predict(X_eval)
+                # y_pred = model.predict(X_eval)
                 # print(f'y_pred: {y_pred}')
                 # print(f'y_eval: {y_eval}')
                 # print(f'\ny_pred.shape: {y_pred.shape}')
@@ -251,14 +254,13 @@ def evaluate_model(self, model_file, X_eval, y_eval):

                 # cls_report = classification_report(y_eval, y_pred)
                 # print(f"Classification Report: \n{cls_report}")
-                # print('loss: ', loss)
-                # print('acc: ', acc)
+                print('loss: ', loss)
+                print('acc: ', acc)
             else:
                 print(f"\n\nModel file: {model_file} not found!")
                 print("Please train the model first!")
         else:
-            train_model = pickle.load(open(model_file, "RF"))
-            result = train_model.score(X_eval, y_eval)
+            result = model_file.score(X_eval, y_eval)
             print("Result: ", result)
         print("\n" + "-" * 35 + "Testing Completed" + "-" * 35 + "\n")

@@ -298,6 +300,13 @@ def parse_args(self):
     preprocess.save_model(model, model_file)

     # TODO: Evaluation of the trained model
+    df_eval = preprocess.load_data(data_file=config["eval_data"])
+
     if config["test"]:
-        output_size = len(set(list(y_train)))
-        classfr.evaluate_model(model_file, X_test, y_test)
+        if config["model"]["name"] != "RF":
+            X_eval, y_eval = preprocess.tokenize_data(
+                df=df_eval, max_len=config["preprocess"]["max_len"])
+            # output_size = len(set(list(y_train)))
+            classfr.evaluate_model(model_file, X_eval, y_eval)
+        else:
+            classfr.evaluate_model(model, X_eval, y_eval)
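
Aside: the fixed evaluate_model leans on two different evaluation contracts. Keras models return the loss plus compiled metrics from evaluate(), while fitted scikit-learn estimators report mean accuracy from score(). A minimal sketch of both paths; function names and paths below are illustrative, not taken from this repo:

    import tensorflow as tf

    def evaluate_dnn(model_path, X_eval, y_eval):
        # Keras path: load the saved model; evaluate() returns the loss
        # plus each metric the model was compiled with (accuracy here).
        model = tf.keras.models.load_model(model_path)
        loss, acc = model.evaluate(X_eval, y_eval, verbose=1)
        print('loss: ', loss)
        print('acc: ', acc)

    def evaluate_rf(model, X_eval, y_eval):
        # sklearn path: score() computes mean accuracy directly, so the
        # fitted estimator itself is passed around, not a checkpoint path.
        result = model.score(X_eval, y_eval)
        print("Result: ", result)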

classifier/models.py

Lines changed: 5 additions & 18 deletions
@@ -12,15 +12,18 @@

 import json
 import os
+import pickle
 import re
 import warnings
 from pathlib import Path
 from string import printable

+import joblib
 # import keras
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import skops.io as sio
 import tensorflow as tf
 from matplotlib import pyplot
 from nltk.tokenize.regexp import WhitespaceTokenizer
@@ -263,32 +266,21 @@ def apply_multiDNN(self):
         model = self.optimize_model(self, model)
         return model

-    def apply_RF(self, df):
+    def apply_RF(self, code_col):
         """Defining the Training Model Classifier for Binary Classification"""
-
-        code_col = df.code
-
         def preprocess4RF(code_col):
             """Cleaning-up"""
             return (
                 pd.Series(code_col)
                 .replace(r"\b([A-Za-z])\1+\b", "", regex=True)
                 .replace(r"\b[A-Za-z]\b", "", regex=True)
             )
-
         transformer = FunctionTransformer(preprocess4RF)
+
         token_pattern = r"""([A-Za-z_]\w*\b|[!\#\$%\&\*\+:\-\./<=>\?@\\\^_\|\~]+|[ \t\(\),;\{\}\[\]"'`])"""
         vectorizer = TfidfVectorizer(
             token_pattern=token_pattern, max_features=3000)

-        # Dataset split for training and testing.
-        code_train, code_test, tag_train, tag_test = train_test_split(
-            df.code,
-            df.isMalicious,
-            test_size=0.15,
-            shuffle=True,  # TODO apply random_state instead shuffle=True for reproducibility
-        )
-
         # Training Model Classifier for Multi-Class Classification
         clf = RandomForestClassifier(n_jobs=4)

@@ -305,9 +297,4 @@ def preprocess4RF(code_col):
             "clf__n_estimators": 300,
         }
         model.set_params(**best_params)
-
-        # Fitting
-        model.fit(code_train, tag_train)
-        acc = model.score(code_test, tag_test)
-        print(f"Accuracy: {acc}")
         return model
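
For context, a runnable sketch of the TF-IDF plus RandomForest pipeline that apply_RF appears to assemble. The Pipeline step names ("clean", "tfidf", "clf") are assumptions inferred from the "clf__n_estimators" parameter key, not confirmed by the diff:

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer

    def build_rf_pipeline():
        def preprocess4RF(code_col):
            # Same clean-up as above: drop repeated-letter and single-letter tokens.
            return (pd.Series(code_col)
                    .replace(r"\b([A-Za-z])\1+\b", "", regex=True)
                    .replace(r"\b[A-Za-z]\b", "", regex=True))

        token_pattern = r"""([A-Za-z_]\w*\b|[!\#\$%\&\*\+:\-\./<=>\?@\\\^_\|\~]+|[ \t\(\),;\{\}\[\]"'`])"""
        model = Pipeline([
            ("clean", FunctionTransformer(preprocess4RF)),          # assumed step name
            ("tfidf", TfidfVectorizer(token_pattern=token_pattern,  # assumed step name
                                      max_features=3000)),
            ("clf", RandomForestClassifier(n_jobs=4)),              # name implied by clf__*
        ])
        model.set_params(**{"clf__n_estimators": 300})
        return model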

classifier/preprocess.py

Lines changed: 16 additions & 5 deletions
@@ -15,6 +15,8 @@
 from pathlib import Path
 from string import printable
 import pickle
+import _pickle as cPickle
+import skops.io as sio

 import numpy as np
 import pandas as pd
@@ -40,13 +42,19 @@ def __init__(self, config):

     def load_data(self, data_file):
         """Load data code snippets"""
-        df = pd.read_csv(
-            data_file, encoding="utf-8")  # og: encoding="unicode_escape"
+        # og: encoding="unicode_escape"
+        if self.config['debug'] is True:
+            df = pd.read_csv(data_file,
+                             encoding="utf-8",
+                             nrows=int(self.config['debug_rows'])
+                             )
+        else:
+            df = pd.read_csv(data_file, encoding='utf-8')

         # Checking for duplicate rows or null values
         df = df.dropna().drop_duplicates().reset_index(drop=True)
         print(f"\nShape of the input data: {df.shape}")
-        print("Samples:")
+        print("\nSamples:")
         print("-" * 50)
         print(df.head(3))
         print("-" * 50)
@@ -68,7 +76,10 @@ def tokenize_data(self, df, max_len):

     def split_data(self, df):
         """Split data into train and test sets"""
-        X, y = self.tokenize_data(df, self.config["preprocess"]["max_len"])
+        if self.config['model']['name'] != 'RF':
+            X, y = self.tokenize_data(df, self.config["preprocess"]["max_len"])
+        else:
+            X, y = df.code, df.label

         X_train, X_test, y_train, y_test = model_selection.train_test_split(
             X, y,
@@ -99,7 +110,7 @@ def save_model(self, model, model_file):
         if self.config["model"]["name"] != "RF":
             model.save(model_file)
         else:
-            pickle.dump(model, open(model_file, "wb"))
+            sio.dumps(model_file)

         print(f"The final trained model is saved at: {model_file}")
         print("\n" + "-" * 35 + "Training Completed" + "-" * 35 + "\n")

classifier/utility.py

Lines changed: 9 additions & 0 deletions
@@ -30,3 +30,12 @@ def load_config(self, yaml_file):
             return yaml.safe_load(stream)
         except yaml.YAMLError as err:
             return err
+
+    # function to keep logging of the verbose and save log line to a file
+    def log(self, log_file, log_line, verbose):
+        """Log the verbose to a file
+        """
+        if verbose:
+            print(log_line)
+        with open(log_file, 'a') as f:
+            f.write(log_line + '\n')
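
A hypothetical call site for the new log helper; the Utility class name and log file path are assumptions for illustration:

    util = Utility()
    util.log(log_file="result/train.log",
             log_line="epoch 1/10 - loss: 0.42",
             verbose=True)  # prints to stdout and appends the line to the file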

config/classifier.yaml

Lines changed: 5 additions & 3 deletions
@@ -1,11 +1,13 @@
 # data_file: "data/iDetect_refine/DNN_Binary.csv"
-data_file: data/TinyVul-v2-function-binary.csv
+data_file: data/TinyVul-v2-statement-binary.csv
+eval_data: data/iDetect_refine/RF_Binary.csv
 # data_file: "data/contiki-master_Binary.csv"
 result_dir: result/
 apply_balancer: False # True if you want to apply the classes' balancer
-train: True
+train: False
 test: True
 debug: True # mode: turn debugging mode True for preprocessing on debug_rows samples.
+debug_rows: 400

 preprocess:
   raw_dir: data/local
@@ -18,7 +20,7 @@ preprocess:
 model: # settings for training and testing experiments
   seed: 30 # seed for reproduciability to generate same set of samples of data splits
   type: binary # binary or multiclass
-  name: RNN # RNN, CNN or RF - RandomForest
+  name: RNN # RNN, LSTM, CNN or RF - RandomForest
   k-fold: 20 # number of cross-validation folds
   split_ratio: 0.30 # ratio of training and testing ratio
   path: result/
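
A minimal sketch of consuming the new keys, using the same yaml.safe_load that load_config wraps:

    import yaml

    with open("config/classifier.yaml") as stream:
        config = yaml.safe_load(stream)

    print(config["eval_data"])      # data/iDetect_refine/RF_Binary.csv
    print(config["debug_rows"])     # 400
    print(config["model"]["name"])  # RNN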

data/iDetect_refine/DNN_Binary.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-code,isMalicious
+code,label
 ( strlen ( me ) ,1
 *pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer++ ;,1
 *pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuffer++ ;,1

data/iDetect_refine/DNN_Multi_Class.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-code,isMalicious
+code,label
 ( strlen ( me ) ,36
 *pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer++ ;,16
 *pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuffer++ ;,16

data/iDetect_refine/RF_Binary.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-code,isMalicious
+code,label
 ( strlen ( me ) ,1
 *pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer++ ;,1
 *pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuffer++ ;,1

data/iDetect_refine/RF_Multi_Class.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-code,isMalicious
+code,label
 ( strlen ( me ) ,36.0
 *pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer++ ;,16.0
 *pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuffer++ ;,16.0
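
The code,isMalicious to code,label header rename across these four CSVs matches the df.label access added to preprocess.split_data; a quick sanity check:

    import pandas as pd

    df = pd.read_csv("data/iDetect_refine/RF_Binary.csv", encoding="utf-8")
    print(df.columns.tolist())  # ['code', 'label']
    X, y = df.code, df.label    # the RF branch of split_data now resolves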

extractor/src2funs.py

Lines changed: 2 additions & 1 deletion
@@ -103,7 +103,8 @@ def src2src_functions(self, src):
     def fix_cwe_labeling(self, cwe) -> str:
         """ Extract CWE type information,
         In case of Rats tool's 'CWE-unknown' list,
-        make it just a single item."""
+        make it just a single item.
+        """
         cwe = list(set(cwe)) if isinstance(cwe, list) else [cwe]

         if len(cwe) > 1:
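
The normalization line above, run in isolation with illustrative values:

    cwe = ["CWE-unknown", "CWE-unknown", "CWE-120"]
    cwe = list(set(cwe)) if isinstance(cwe, list) else [cwe]
    # duplicates collapse to one entry each; set iteration order is arbitrary
    print(sorted(cwe))  # ['CWE-120', 'CWE-unknown']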
