Merge branch 'develop' of https://github.com/zkh2016/PaddleNLP into develop

zkh2016 · zkh2016 · commit b877f92abf22 · 2021-11-05T12:22:50.000Z
diff --git a/examples/experimental/faster_bert/run_glue.py b/examples/experimental/faster_bert/run_glue.py
@@ -37,6 +37,7 @@
 from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
 from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
+from static.model_convert_util import convert_base_to_fused
 
 FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -246,123 +247,123 @@ def convert_example(example,
     else:
         return example['input_ids'], example['token_type_ids']
 
-def fused_weight(weight, num_head):
-    a = paddle.transpose(weight, perm=[1, 0])
-    return paddle.reshape(a, shape=[1, num_head, int(a.shape[0]/num_head), a.shape[1]])
-
-def fused_qkv(qkv_weight, num_head):
-    q = qkv_weight['q']
-    k = qkv_weight['k']
-    v = qkv_weight['v']
-
-    fq = fused_weight(q, num_head)
-    fk = fused_weight(k, num_head)
-    fv = fused_weight(v, num_head)
-    a = paddle.concat(x=[fq, fk, fv], axis=0)
-    return a
-
-def convert_base_to_fused(state_to_load):
-    base_to_fused = dict()
-    base_to_fused["weight"] = "scale"
-    base_to_fused["bias"] = "bias"
-
-    fused_state_to_load = dict()
-    qkv_weight = dict()
-    qkv_bias = dict()
-    qkv_count = 0
-    num_head = 16
-    layer_index = 0
-    for key, value in state_to_load.items():
-        array = key.split('.')
-        fused_array = list(array)
-        if len(array) == 6:#linear or layer_norm
-            if 'linear' in array[4]:
-                #linear1.weight -> ffn._linear1_weight
-                #linear1.bias -> ffn._linear1_bias
-                fused_array[5] = "_" + array[4] + "_" + array[5]
-                fused_array[4] = "ffn"
-                fused_key = '.'.join(fused_array)
-                fused_state_to_load[fused_key] = value
-                #print(key, fused_key)
-                #if array[3] == "0":
-                #    np.savetxt(key+".txt", value)
-
-            elif 'norm' in array[4]:
-                if array[4][-1] == '1':
-                    #norm1.weight -> fused_atten.pre_ln_scale
-                    #norm2.weight -> fused_atten.ln_scale
-                    fused_array[4] = "fused_attn"
-                    fused_array[5] = "ln_" + base_to_fused[array[5]]
-                    fused_key = '.'.join(fused_array)
-                    fused_state_to_load[fused_key] = value
-                    #print(key, fused_key)
-                    #if array[3] == "0":
-                    #    np.savetxt(key+".txt", value)
-                else:
-                    #norm1.weight -> ffn._ln1_scale
-                    fused_array[4] = "ffn"
-                    fused_array[5] = "_ln" + array[4][-1] + "_" + base_to_fused[array[5]]
-                    fused_key = '.'.join(fused_array)
-                    fused_state_to_load[fused_key] = value
-                    #print(key, fused_key)
-                    #if array[3] == "0":
-                    #    np.savetxt(key+".txt", value)
-        elif len(array) == 7:#self_atten
-            if 'q' in array[5]:
-                if array[6] == "weight":
-                    qkv_weight['q'] = value
-                else:
-                    qkv_bias['q'] = value
-                qkv_count += 1
-            elif 'k' in array[5]:
-                if array[6] == "weight":
-                    qkv_weight['k'] = value
-                else:
-                    qkv_bias['k'] = value
-                qkv_count += 1
-            elif 'v' in array[5]:
-                if array[6] == "weight":
-                    qkv_weight['v'] = value
-                else:
-                    qkv_bias['v'] = value
-                qkv_count += 1
-            else:
-                fused_array.pop()
-                fused_array[4] = "fused_attn"
-                if array[6] == "weight":
-                    fused_array[5] = "linear_weight"
-                else:
-                    fused_array[5] = "linear_bias"
-                fused_key = '.'.join(fused_array)
-                fused_state_to_load[fused_key] = value
-                #print(key, fused_key)
-                #if array[3] == "0":
-                #    np.savetxt(key+".txt", value)
-
-            if qkv_count == 6:
-                qkv_count = 0
-                fused_array.pop()
-
-                fused_array[4] = "fused_attn"
-                fused_array[5] = "qkv_weight"
-                fused_key = '.'.join(fused_array)
-                fused_state_to_load[fused_key] = fused_qkv(qkv_weight, num_head)
-                #print(key, fused_key)
-
-                fused_array[4] = "fused_attn"
-                fused_array[5] = "qkv_bias"
-                fused_key = '.'.join(fused_array)
-                a = paddle.concat(x=[qkv_bias['q'], qkv_bias['k'], qkv_bias['v']], axis=0)
-                tmp_bias = paddle.reshape(a, shape=[3, num_head, int(a.shape[0]/3/num_head)])
-                fused_state_to_load[fused_key] = tmp_bias
-                #print(key, fused_key, tmp_bias.numpy().shape)
-                #if array[3] == "0":
-                #    np.savetxt("fused_bias.txt", tmp_bias.numpy().flatten())
-                    #if array[3] == "0":
-
-        else:
-            fused_state_to_load[key] = value
-    return fused_state_to_load
+#def fused_weight(weight, num_head):
+#    a = paddle.transpose(weight, perm=[1, 0])
+#    return paddle.reshape(a, shape=[1, num_head, int(a.shape[0]/num_head), a.shape[1]])
+#
+#def fused_qkv(qkv_weight, num_head):
+#    q = qkv_weight['q']
+#    k = qkv_weight['k']
+#    v = qkv_weight['v']
+#
+#    fq = fused_weight(q, num_head)
+#    fk = fused_weight(k, num_head)
+#    fv = fused_weight(v, num_head)
+#    a = paddle.concat(x=[fq, fk, fv], axis=0)
+#    return a
+#
+#def convert_base_to_fused(state_to_load):
+#    base_to_fused = dict()
+#    base_to_fused["weight"] = "scale"
+#    base_to_fused["bias"] = "bias"
+#
+#    fused_state_to_load = dict()
+#    qkv_weight = dict()
+#    qkv_bias = dict()
+#    qkv_count = 0
+#    num_head = 16
+#    layer_index = 0
+#    for key, value in state_to_load.items():
+#        array = key.split('.')
+#        fused_array = list(array)
+#        if len(array) == 6:#linear or layer_norm
+#            if 'linear' in array[4]:
+#                #linear1.weight -> ffn._linear1_weight
+#                #linear1.bias -> ffn._linear1_bias
+#                fused_array[5] = "_" + array[4] + "_" + array[5]
+#                fused_array[4] = "ffn"
+#                fused_key = '.'.join(fused_array)
+#                fused_state_to_load[fused_key] = value
+#                #print(key, fused_key)
+#                #if array[3] == "0":
+#                #    np.savetxt(key+".txt", value)
+#
+#            elif 'norm' in array[4]:
+#                if array[4][-1] == '1':
+#                    #norm1.weight -> fused_atten.pre_ln_scale
+#                    #norm2.weight -> fused_atten.ln_scale
+#                    fused_array[4] = "fused_attn"
+#                    fused_array[5] = "ln_" + base_to_fused[array[5]]
+#                    fused_key = '.'.join(fused_array)
+#                    fused_state_to_load[fused_key] = value
+#                    #print(key, fused_key)
+#                    #if array[3] == "0":
+#                    #    np.savetxt(key+".txt", value)
+#                else:
+#                    #norm1.weight -> ffn._ln1_scale
+#                    fused_array[4] = "ffn"
+#                    fused_array[5] = "_ln" + array[4][-1] + "_" + base_to_fused[array[5]]
+#                    fused_key = '.'.join(fused_array)
+#                    fused_state_to_load[fused_key] = value
+#                    #print(key, fused_key)
+#                    #if array[3] == "0":
+#                    #    np.savetxt(key+".txt", value)
+#        elif len(array) == 7:#self_atten
+#            if 'q' in array[5]:
+#                if array[6] == "weight":
+#                    qkv_weight['q'] = value
+#                else:
+#                    qkv_bias['q'] = value
+#                qkv_count += 1
+#            elif 'k' in array[5]:
+#                if array[6] == "weight":
+#                    qkv_weight['k'] = value
+#                else:
+#                    qkv_bias['k'] = value
+#                qkv_count += 1
+#            elif 'v' in array[5]:
+#                if array[6] == "weight":
+#                    qkv_weight['v'] = value
+#                else:
+#                    qkv_bias['v'] = value
+#                qkv_count += 1
+#            else:
+#                fused_array.pop()
+#                fused_array[4] = "fused_attn"
+#                if array[6] == "weight":
+#                    fused_array[5] = "linear_weight"
+#                else:
+#                    fused_array[5] = "linear_bias"
+#                fused_key = '.'.join(fused_array)
+#                fused_state_to_load[fused_key] = value
+#                #print(key, fused_key)
+#                #if array[3] == "0":
+#                #    np.savetxt(key+".txt", value)
+#
+#            if qkv_count == 6:
+#                qkv_count = 0
+#                fused_array.pop()
+#
+#                fused_array[4] = "fused_attn"
+#                fused_array[5] = "qkv_weight"
+#                fused_key = '.'.join(fused_array)
+#                fused_state_to_load[fused_key] = fused_qkv(qkv_weight, num_head)
+#                #print(key, fused_key)
+#
+#                fused_array[4] = "fused_attn"
+#                fused_array[5] = "qkv_bias"
+#                fused_key = '.'.join(fused_array)
+#                a = paddle.concat(x=[qkv_bias['q'], qkv_bias['k'], qkv_bias['v']], axis=0)
+#                tmp_bias = paddle.reshape(a, shape=[3, num_head, int(a.shape[0]/3/num_head)])
+#                fused_state_to_load[fused_key] = tmp_bias
+#                #print(key, fused_key, tmp_bias.numpy().shape)
+#                #if array[3] == "0":
+#                #    np.savetxt("fused_bias.txt", tmp_bias.numpy().flatten())
+#                    #if array[3] == "0":
+#
+#        else:
+#            fused_state_to_load[key] = value
+#    return fused_state_to_load
 
 
 
@@ -445,7 +446,7 @@ def do_train(args):
 ####convert model to fused model
     model = fused_model
     #model = base_model
-    #model.set_state_dict(state_to_load)
+    #model.set_state_dict(base_state_to_load)
 
     if paddle.distributed.get_world_size() > 1:
         model = paddle.DataParallel(model)
diff --git a/examples/experimental/faster_bert/static/modeling.py b/examples/experimental/faster_bert/static/modeling.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/examples/experimental/faster_bert/static/run_glue.py b/examples/experimental/faster_bert/static/run_glue.py
@@ -28,12 +28,15 @@
 from paddle.metric import Accuracy
 from paddlenlp.data import Stack, Tuple, Pad
 from paddlenlp.data.sampler import SamplerHelper
-from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
+from paddlenlp.transformers import BertTokenizer
+from modeling import BertForSequenceClassification
 from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
 from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.metrics import Mcc, PearsonAndSpearman
 from paddlenlp.utils.log import logger
 
+from model_convert_util import convert_base_to_fused
+
 METRIC_CLASSES = {
     "cola": Mcc,
     "sst-2": Accuracy,
@@ -168,14 +171,15 @@ def create_data_holder(task_name):
 
 def reset_program_state_dict(args, model, state_dict, pretrained_state_dict):
     """
-    Initialize the parameter from the bert config, and set the parameter by 
+    Initialize the parameter from the bert config, and set the parameter by
     reseting the state dict."
     """
     reset_state_dict = {}
     scale = model.initializer_range if hasattr(model, "initializer_range")\
         else getattr(model, args.model_type).config["initializer_range"]
     reset_parameter_names = []
     for n, p in state_dict.items():
+        print(n)
         if n in pretrained_state_dict:
             reset_state_dict[p.name] = np.array(pretrained_state_dict[n])
             reset_parameter_names.append(n)
@@ -208,7 +212,7 @@ def set_seed(args):
 def evaluate(exe, metric, loss, correct, dev_program, data_loader,
              phase="eval"):
     """
-    The evaluate process, calcluate the eval loss and metric. 
+    The evaluate process, calculate the eval loss and metric.
     """
     metric.reset()
     returns = [loss]
@@ -295,7 +299,7 @@ def do_train(args):
 
     batchify_fn = lambda samples, fn=Tuple(
         Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
-        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type 
+        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type
         Stack(dtype="int64" if train_ds.label_list else "float32")  # label
     ): fn(samples)
 
@@ -357,8 +361,14 @@ def do_train(args):
     with paddle.static.program_guard(main_program, startup_program):
         num_class = 1 if train_ds.label_list is None else len(
             train_ds.label_list)
-        model, pretrained_state_dict = model_class.from_pretrained(
+        base_model, pretrained_state_dict = model_class.from_pretrained(
+            args.model_name_or_path, num_classes=num_class)
+
+        fused_model, fused_pretrained_state_dict = model_class.from_pretrained(
             args.model_name_or_path, num_classes=num_class)
+
+        model = fused_model
+
         loss_fct = paddle.nn.loss.CrossEntropyLoss(
         ) if train_ds.label_list else paddle.nn.loss.MSELoss()
         logits = model(input_ids, token_type_ids)
@@ -395,11 +405,16 @@ def do_train(args):
     # Initialize the fine-tuning parameter, we will load the parameters in
     # pre-training model. And initialize the parameter which not in pre-training model
     # by the normal distribution.
+
+    # Begin: convert the pretrained base state dict into the fused model's state dict.
+    fused_pretrained_state_dict = convert_base_to_fused(pretrained_state_dict)
+    # End: conversion of the pretrained base state dict for the fused model.
+
     exe = paddle.static.Executor(place)
     exe.run(startup_program)
     state_dict = model.state_dict()
     reset_state_dict = reset_program_state_dict(args, model, state_dict,
-                                                pretrained_state_dict)
+                                                fused_pretrained_state_dict)
     paddle.static.set_program_state(main_program, reset_state_dict)
 
     global_step = 0

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.`
	`1`	`+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
`2`	`2`	`#`
`3`	`3`	`# Licensed under the Apache License, Version 2.0 (the "License");`
`4`	`4`	`# you may not use this file except in compliance with the License.`