Commit 6813901

fixed evaluation code of the classifiers

1 parent 5419df2 commit 6813901

File tree

13 files changed: 221 additions, 4445 deletions

classifier/classifier.py

Lines changed: 44 additions & 35 deletions
@@ -15,6 +15,8 @@
 from argparse import ArgumentParser
 from configparser import ConfigParser
 from pathlib import Path
+from datetime import datetime
+from sklearn import model_selection

 # from string import printable
 import pandas as pd
@@ -43,17 +45,22 @@ def __init__(self):
         self.config = self.util.load_config("config/classifier.yaml")
         self.model = None
         self.history = None
+        self.arch = ModelArchs(self.config)

     def update_config_args(self, paras):
         """create model dir to store it
         set the config args from the command line if provided """
         self.config["model"]["name"] = paras.model if paras.model else self.config["model"]["name"]
         self.config["data_file"] = paras.data if paras.data else self.config["data_file"]
+
         if self.config['debug']:
             self.config['dnn']['epochs'] = self.config['dnn']['debug_epochs']
+            time_now = ''
+        else:
+            time_now = datetime.now().strftime('%Y-%m-%d_%H.%Mm')

         mdir = self.config["model"]["name"] + "-" + str(self.config["dnn"]["epochs"]) + \
-            "-" + Path(self.config["data_file"]).stem + "/"
+            "-" + Path(self.config["data_file"]).stem + "-" + time_now + "/"

         self.config["model"]["path"] = self.config["model"]["path"] + mdir

@@ -120,7 +127,7 @@ def apply_checkpoints(self, model, cp_path, patience):

     def select_model_arch(self):
         """Choose ML model"""
-        arch = ModelArchs(self.config)
+
         model_name = self.config["model"]["name"]

         print("\n\n" + "=" * 25 + " " + model_name +
@@ -129,15 +136,15 @@ def select_model_arch(self):
         print("-" * 50)

         if model_name == "RNN":
-            model = arch.apply_RNN()
+            model = self.arch.apply_RNN()
         elif model_name == "CNN":
-            model = arch.apply_CNN()
+            model = self.arch.apply_CNN()
         elif model_name == "LSTM":
-            model = arch.apply_LSTM()
+            model = self.arch.apply_LSTM()
         elif model_name == "RF":
-            model = arch.apply_RF(df)
+            model = self.arch.apply_RF(df)
         elif model_name == "multiDNN":
-            model = arch.apply_multiDNN()
+            model = self.arch.apply_multiDNN()
         else:
             print("Invalid model! Please select a valid model!")
             exit(1)
@@ -148,18 +155,18 @@ def train_model(self, model_file, X_train, y_train, X_test, y_test):
         model_name = self.config["model"]["name"]
         epochs = self.config["dnn"]["epochs"]

-        # Select the model architecture
-        model = self.select_model_arch()
-
-        # store metadata to neptune.ai
-        if self.config["model"]["use_neptune"]:
-            from neptune.integrations.tensorflow_keras import NeptuneCallback
-            nt_run = self.init_neptune(
-                model_name, epochs, self.config["data_file"])
-
         # Apply callbacks for training to store the best model checkpoint
         # and apply early stopping.
         if model_name != "RF":
+            # Select the model architecture
+            model = self.select_model_arch()
+
+            # store metadata to neptune.ai
+            if self.config["model"]["use_neptune"]:
+                from neptune.integrations.tensorflow_keras import NeptuneCallback
+                nt_run = self.init_neptune(
+                    model_name, epochs, self.config["data_file"])
+
             tf_callbacks = self.apply_checkpoints(
                 model=model,
                 cp_path=self.config["model"]["path"],
@@ -196,18 +203,15 @@ def train_model(self, model_file, X_train, y_train, X_test, y_test):
         else:
             # TODO: log non-DNN models output to Neptune
             # nt_run["acc"] = ?? or params=dict
+            # Fitting
+            model = self.arch.apply_RF(df.code)
+            # model = self.select_model_arch()
+            model.fit(X_train, y_train)
+            acc = model.score(X_test, y_test)
+            print(f"Accuracy: {acc}")
             print(f"Trained with non-DNN model: {model_name}")
         return model

-    # def load_model(self, model_JSON, file_weights):
-    #     """Load model from disk"""
-    #     with open(model_JSON, "r") as f:
-    #         model_json = json.load(f)
-    #         model = model_from_json(model_json)
-
-    #     model.load_weights(file_weights)
-    #     return model
-
     def load_tf_model(self, model_file):
         """
         Load model from disk
@@ -230,12 +234,11 @@ def evaluate_model(self, model_file, X_eval, y_eval):
         if self.config["model"]["name"] != "RF":
             if Path(model_file).is_file():
                 model = self.load_tf_model(model_file)
-
+                print("\nEvaluating the model...\n")
                 # evaluate the model
-                # loss, acc = model.evaluate(X_eval, y_eval, verbose=0)
+                loss, acc = model.evaluate(X_eval, y_eval, verbose=1)

-                print("\nEvaluating the model...\n")
-                y_pred = model.predict(X_eval)
+                # y_pred = model.predict(X_eval)
                 # print(f'y_pred: {y_pred}')
                 # print(f'y_eval: {y_eval}')
                 # print(f'\ny_pred.shape: {y_pred.shape}')
@@ -251,14 +254,13 @@ def evaluate_model(self, model_file, X_eval, y_eval):

                 # cls_report = classification_report(y_eval, y_pred)
                 # print(f"Classification Report: \n{cls_report}")
-                # print('loss: ', loss)
-                # print('acc: ', acc)
+                print('loss: ', loss)
+                print('acc: ', acc)
             else:
                 print(f"\n\nModel file: {model_file} not found!")
                 print("Please train the model first!")
         else:
-            train_model = pickle.load(open(model_file, "RF"))
-            result = train_model.score(X_eval, y_eval)
+            result = model_file.score(X_eval, y_eval)
             print("Result: ", result)
         print("\n" + "-" * 35 + "Testing Completed" + "-" * 35 + "\n")

@@ -298,6 +300,13 @@ def parse_args(self):
     preprocess.save_model(model, model_file)

     # TODO: Evaluation of the trained model
+    df_eval = preprocess.load_data(data_file=config["eval_data"])
+
     if config["test"]:
-        output_size = len(set(list(y_train)))
-        classfr.evaluate_model(model_file, X_test, y_test)
+        if config["model"]["name"] != "RF":
+            X_eval, y_eval = preprocess.tokenize_data(
+                df=df_eval, max_len=config["preprocess"]["max_len"])
+            # output_size = len(set(list(y_train)))
+            classfr.evaluate_model(model_file, X_eval, y_eval)
+        else:
+            classfr.evaluate_model(model, X_eval, y_eval)
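
Aside: the fixed evaluate_model leans on two different evaluation contracts. Keras models return the loss plus compiled metrics from evaluate(), while fitted scikit-learn estimators report mean accuracy from score(). A minimal sketch of both paths; function names and paths below are illustrative, not taken from this repo:

    import tensorflow as tf

    def evaluate_dnn(model_path, X_eval, y_eval):
        # Keras path: load the saved model; evaluate() returns the loss
        # plus each metric the model was compiled with (accuracy here).
        model = tf.keras.models.load_model(model_path)
        loss, acc = model.evaluate(X_eval, y_eval, verbose=1)
        print('loss: ', loss)
        print('acc: ', acc)

    def evaluate_rf(model, X_eval, y_eval):
        # sklearn path: score() computes mean accuracy directly, so the
        # fitted estimator itself is passed around, not a checkpoint path.
        result = model.score(X_eval, y_eval)
        print("Result: ", result)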

classifier/models.py

Lines changed: 5 additions & 18 deletions
@@ -12,15 +12,18 @@

 import json
 import os
+import pickle
 import re
 import warnings
 from pathlib import Path
 from string import printable

+import joblib
 # import keras
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import skops.io as sio
 import tensorflow as tf
 from matplotlib import pyplot
 from nltk.tokenize.regexp import WhitespaceTokenizer
@@ -263,32 +266,21 @@ def apply_multiDNN(self):
         model = self.optimize_model(self, model)
         return model

-    def apply_RF(self, df):
+    def apply_RF(self, code_col):
         """Defining the Training Model Classifier for Binary Classification"""
-
-        code_col = df.code
-
         def preprocess4RF(code_col):
             """Cleaning-up"""
             return (
                 pd.Series(code_col)
                 .replace(r"\b([A-Za-z])\1+\b", "", regex=True)
                 .replace(r"\b[A-Za-z]\b", "", regex=True)
             )
-
         transformer = FunctionTransformer(preprocess4RF)
+
         token_pattern = r"""([A-Za-z_]\w*\b|[!\#\$%\&\*\+:\-\./<=>\?@\\\^_\|\~]+|[ \t\(\),;\{\}\[\]"'`])"""
         vectorizer = TfidfVectorizer(
             token_pattern=token_pattern, max_features=3000)

-        # Dataset split for training and testing.
-        code_train, code_test, tag_train, tag_test = train_test_split(
-            df.code,
-            df.isMalicious,
-            test_size=0.15,
-            shuffle=True,  # TODO apply random_state instead shuffle=True for reproducibility
-        )
-
         # Training Model Classifier for Multi-Class Classification
         clf = RandomForestClassifier(n_jobs=4)

@@ -305,9 +297,4 @@ def preprocess4RF(code_col):
             "clf__n_estimators": 300,
         }
         model.set_params(**best_params)
-
-        # Fitting
-        model.fit(code_train, tag_train)
-        acc = model.score(code_test, tag_test)
-        print(f"Accuracy: {acc}")
         return model
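
For context, a runnable sketch of the TF-IDF plus RandomForest pipeline that apply_RF appears to assemble. The Pipeline step names ("clean", "tfidf", "clf") are assumptions inferred from the "clf__n_estimators" parameter key, not confirmed by the diff:

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer

    def build_rf_pipeline():
        def preprocess4RF(code_col):
            # Same clean-up as above: drop repeated-letter and single-letter tokens.
            return (pd.Series(code_col)
                    .replace(r"\b([A-Za-z])\1+\b", "", regex=True)
                    .replace(r"\b[A-Za-z]\b", "", regex=True))

        token_pattern = r"""([A-Za-z_]\w*\b|[!\#\$%\&\*\+:\-\./<=>\?@\\\^_\|\~]+|[ \t\(\),;\{\}\[\]"'`])"""
        model = Pipeline([
            ("clean", FunctionTransformer(preprocess4RF)),          # assumed step name
            ("tfidf", TfidfVectorizer(token_pattern=token_pattern,  # assumed step name
                                      max_features=3000)),
            ("clf", RandomForestClassifier(n_jobs=4)),              # name implied by clf__*
        ])
        model.set_params(**{"clf__n_estimators": 300})
        return model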

classifier/preprocess.py

Lines changed: 16 additions & 5 deletions
@@ -15,6 +15,8 @@
 from pathlib import Path
 from string import printable
 import pickle
+import _pickle as cPickle
+import skops.io as sio

 import numpy as np
 import pandas as pd
@@ -40,13 +42,19 @@ def __init__(self, config):

     def load_data(self, data_file):
         """Load data code snippets"""
-        df = pd.read_csv(
-            data_file, encoding="utf-8")  # og: encoding="unicode_escape"
+        # og: encoding="unicode_escape"
+        if self.config['debug'] is True:
+            df = pd.read_csv(data_file,
+                             encoding="utf-8",
+                             nrows=int(self.config['debug_rows'])
+                             )
+        else:
+            df = pd.read_csv(data_file, encoding='utf-8')

         # Checking for duplicate rows or null values
         df = df.dropna().drop_duplicates().reset_index(drop=True)
         print(f"\nShape of the input data: {df.shape}")
-        print("Samples:")
+        print("\nSamples:")
         print("-" * 50)
         print(df.head(3))
         print("-" * 50)
@@ -68,7 +76,10 @@ def tokenize_data(self, df, max_len):

     def split_data(self, df):
         """Split data into train and test sets"""
-        X, y = self.tokenize_data(df, self.config["preprocess"]["max_len"])
+        if self.config['model']['name'] != 'RF':
+            X, y = self.tokenize_data(df, self.config["preprocess"]["max_len"])
+        else:
+            X, y = df.code, df.label

         X_train, X_test, y_train, y_test = model_selection.train_test_split(
             X, y,
@@ -99,7 +110,7 @@ def save_model(self, model, model_file):
         if self.config["model"]["name"] != "RF":
             model.save(model_file)
         else:
-            pickle.dump(model, open(model_file, "wb"))
+            sio.dumps(model_file)

         print(f"The final trained model is saved at: {model_file}")
         print("\n" + "-" * 35 + "Training Completed" + "-" * 35 + "\n")

classifier/utility.py

Lines changed: 9 additions & 0 deletions
@@ -30,3 +30,12 @@ def load_config(self, yaml_file):
             return yaml.safe_load(stream)
         except yaml.YAMLError as err:
             return err
+
+    # function to keep logging of the verbose and save log line to a file
+    def log(self, log_file, log_line, verbose):
+        """Log the verbose to a file
+        """
+        if verbose:
+            print(log_line)
+        with open(log_file, 'a') as f:
+            f.write(log_line + '\n')
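
A hypothetical call site for the new log helper; the Utility class name and log file path are assumptions for illustration:

    util = Utility()
    util.log(log_file="result/train.log",
             log_line="epoch 1/10 - loss: 0.42",
             verbose=True)  # prints to stdout and appends the line to the file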

config/classifier.yaml

Lines changed: 5 additions & 3 deletions
@@ -1,11 +1,13 @@
 # data_file: "data/iDetect_refine/DNN_Binary.csv"
-data_file: data/TinyVul-v2-function-binary.csv
+data_file: data/TinyVul-v2-statement-binary.csv
+eval_data: data/iDetect_refine/RF_Binary.csv
 # data_file: "data/contiki-master_Binary.csv"
 result_dir: result/
 apply_balancer: False # True if you want to apply the classes' balancer
-train: True
+train: False
 test: True
 debug: True # mode: turn debugging mode True for preprocessing on debug_rows samples.
+debug_rows: 400

 preprocess:
   raw_dir: data/local
@@ -18,7 +20,7 @@ preprocess:
 model: # settings for training and testing experiments
   seed: 30 # seed for reproduciability to generate same set of samples of data splits
   type: binary # binary or multiclass
-  name: RNN # RNN, CNN or RF - RandomForest
+  name: RNN # RNN, LSTM, CNN or RF - RandomForest
   k-fold: 20 # number of cross-validation folds
   split_ratio: 0.30 # ratio of training and testing ratio
   path: result/
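
A minimal sketch of consuming the new keys, using the same yaml.safe_load that load_config wraps:

    import yaml

    with open("config/classifier.yaml") as stream:
        config = yaml.safe_load(stream)

    print(config["eval_data"])      # data/iDetect_refine/RF_Binary.csv
    print(config["debug_rows"])     # 400
    print(config["model"]["name"])  # RNN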

data/iDetect_refine/DNN_Binary.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-code,isMalicious
+code,label
 ( strlen ( me ) ,1
 *pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer++ ;,1
 *pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuffer++ ;,1

data/iDetect_refine/DNN_Multi_Class.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-code,isMalicious
+code,label
 ( strlen ( me ) ,36
 *pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer++ ;,16
 *pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuffer++ ;,16

data/iDetect_refine/RF_Binary.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-code,isMalicious
+code,label
 ( strlen ( me ) ,1
 *pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer++ ;,1
 *pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuffer++ ;,1

data/iDetect_refine/RF_Multi_Class.csv

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-code,isMalicious
+code,label
 ( strlen ( me ) ,36.0
 *pBuffer = MQTT_PACKET_TYPE_CONNECT ; pBuffer++ ;,16.0
 *pBuffer1 = MQTT_PACKET_TYPE_CONNECT1 ; pBuffer++ ;,16.0
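
The code,isMalicious to code,label header rename across these four CSVs matches the df.label access added to preprocess.split_data; a quick sanity check:

    import pandas as pd

    df = pd.read_csv("data/iDetect_refine/RF_Binary.csv", encoding="utf-8")
    print(df.columns.tolist())  # ['code', 'label']
    X, y = df.code, df.label    # the RF branch of split_data now resolves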

extractor/src2funs.py

Lines changed: 2 additions & 1 deletion
@@ -103,7 +103,8 @@ def src2src_functions(self, src):
     def fix_cwe_labeling(self, cwe) -> str:
         """ Extract CWE type information,
         In case of Rats tool's 'CWE-unknown' list,
-        make it just a single item."""
+        make it just a single item.
+        """
         cwe = list(set(cwe)) if isinstance(cwe, list) else [cwe]

         if len(cwe) > 1:
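
The normalization line above, run in isolation with illustrative values:

    cwe = ["CWE-unknown", "CWE-unknown", "CWE-120"]
    cwe = list(set(cwe)) if isinstance(cwe, list) else [cwe]
    # duplicates collapse to one entry each; set iteration order is arbitrary
    print(sorted(cwe))  # ['CWE-120', 'CWE-unknown']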
