From e4a645649744bf03fa2c3d90b771d48a8dc204fc Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 28 Mar 2024 21:22:34 +0800 Subject: [PATCH 001/199] update oneDNN to 769a481dcf on main (#2707) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 25b6363bd..5bd1a28af 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 25b6363bdb4fc6dca3b291aebf02abb03b863625 +Subproject commit 5bd1a28afb0a8a9c949564fe31800d3692586fd8 From acb37cf35a4a9cb12493de920c852738a82938fa Mon Sep 17 00:00:00 2001 From: blzheng Date: Fri, 29 Mar 2024 10:05:19 +0800 Subject: [PATCH 002/199] enable accuracy test for git and llava (#2709) --- .../run_accuracy_with_deepspeed.py | 647 +++++++++++++++++- .../run_generation_with_deepspeed.py | 13 +- .../llm/single_instance/run_accuracy.py | 539 ++++++++++++++- .../python/llm/tools/prepare_llava.sh | 5 +- 4 files changed, 1133 insertions(+), 71 deletions(-) diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index 76426b00b..cab50385a 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -20,12 +20,30 @@ AutoTokenizer, LlamaTokenizer, T5ForConditionalGeneration, + AutoProcessor, ) import sys sys.path.append(sys.path[0] + '/../../') +try: + from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM + from llava.model.builder import load_pretrained_model + from llava.conversation import conv_templates + from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token + from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + import lmms_eval + from lmms_eval.api.instance import Instance + from lmms_eval.api.model import lmms + from lmms_eval.api.registry import register_model + from lmms_eval import evaluator as lmms_evaluator + from lmms_eval import utils as lmms_utils + from lmms_eval.api.registry import ALL_TASKS + from lmms_eval.tasks import initialize_tasks +except ImportError: + pass + MODEL_CLASSES = { "gpt-j": (AutoModelForCausalLM, AutoTokenizer), "gpt-neox": (AutoModelForCausalLM, AutoTokenizer), @@ -43,12 +61,14 @@ "mpt": (AutoModelForCausalLM, AutoTokenizer), "stablelm": (AutoModelForCausalLM, AutoTokenizer), "qwen": (AutoModelForCausalLM, AutoTokenizer), + "git": (AutoModelForCausalLM, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } parser = argparse.ArgumentParser() parser.add_argument("--model", nargs="?", default="EleutherAI/gpt-j-6b") parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--output_path", nargs="?", default="./logs") parser.add_argument("--device", default="cpu", type=str, help="cpu") parser.add_argument( "--dtype", default="bfloat16", type=str, help="float32 or bfloat16 or int8" @@ -744,39 +764,600 @@ def _loglikelihood_tokens( return results -task_dict = lm_eval.tasks.get_task_dict(args.tasks) -torch._C._jit_set_texpr_fuser_enabled(False) -if args.model in ["google/flan-t5-xl"]: - hfmodel = T5ModelLambada( - pretrained=args.model, - device="cpu", - batch_size=args.batch_size, - with_ipex=use_ipex, - with_jit=not args.disable_jit, - dtype=args.dtype, - tp_number=world_size, - config=args.config_file, - add_special_tokens=True, + 
+@register_model("test") +class LMMS(lmms): + def __init__( + self, + pretrained: str, + device: Optional[str] = "cpu", + with_ipex=True, + with_jit=True, with_greedy=False, - ) -else: - hfmodel = HuggingFaceModel( - pretrained=args.model, - device="cpu", - batch_size=args.batch_size, - with_ipex=use_ipex, - with_jit=not args.disable_jit, - dtype=args.dtype, - tp_number=world_size, - config=args.config_file, - add_special_tokens=False - ) + batch_size=1, + dtype: Optional[Union[str, torch.dtype]] = "auto", + tp_number=1, + config=None, + add_special_tokens = True, + ) -> None: + super().__init__() + self._device = torch.device(device) + self._batch_size = int(batch_size) + self._with_jit = with_jit + self._with_ipex = with_ipex + self._with_greedy = with_greedy + self._tp_number = tp_number + self._dtype = dtype + self.add_special_tokens = add_special_tokens + load_dtype = torch.float32 + infer_dtype = torch.float32 + if args.quant_with_amp or dtype == "bfloat16": + load_dtype = torch.bfloat16 + infer_dtype = torch.bfloat16 + else: + if dtype == "float16": + load_dtype = torch.half + infer_dtype = torch.half + elif dtype == "int8": + load_dtype = torch.float32 + infer_dtype = torch.int8 + self.amp_dtype = torch.bfloat16 if args.quant_with_amp or self._dtype == "bfloat16" else torch.float32 + if re.search("llava", pretrained, re.IGNORECASE): + self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, get_model_name_from_path(pretrained)) + model_name = get_model_name_from_path(pretrained) + if 'llama-2' in model_name.lower(): + conv_mode = "llava_llama_2" + elif "v1" in model_name.lower(): + conv_mode = "llava_v1" + elif "mpt" in model_name.lower(): + conv_mode = "mpt" + else: + conv_mode = "llava_v0" + self.conv_template = conv_mode + elif re.search("git", pretrained, re.IGNORECASE): + model_class = MODEL_CLASSES["git"] + self._image_processor = model_class[1].from_pretrained( + pretrained, trust_remote_code=True + ) + self._tokenizer = self._image_processor.tokenizer + self._config = AutoConfig.from_pretrained( + pretrained if config is None else config, torchscript=with_jit, trust_remote_code=True + ) + self._model = model_class[0].from_pretrained( + pretrained, + low_cpu_mem_usage=True, + config=self.config, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + self._config = self._model.config + self._config.torchscript = self._with_jit + self._model.eval() -results = evaluator.evaluate( - hfmodel, - task_dict, - # bootstrap_iters=1000, - # limit=100 -) + checkpoints_json = "checkpoints.json" -print(evaluator.make_table(results)) + def print_rank0(*msg): + if local_rank != 0: + return + print(*msg) + + def get_repo_root(model_name_or_path): + if os.path.exists(model_name_or_path): + # local path + return model_name_or_path + # checks if online or not + if is_offline_mode(): + print_rank0("Offline mode: forcing local_files_only=True") + # download only on first process + allow_patterns = ["*.bin", "*.model", "*.json", "*.txt", "*.py", "*LICENSE"] + if local_rank == 0: + snapshot_download( + model_name_or_path, + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + allow_patterns=allow_patterns, + # ignore_patterns=["*.safetensors"], + ) + + dist.barrier() + + return snapshot_download( + model_name_or_path, + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + allow_patterns=allow_patterns, + # ignore_patterns=["*.safetensors"], + ) + + def 
get_checkpoint_files(model_name_or_path): + cached_repo_dir = get_repo_root(model_name_or_path) + + # extensions: .bin | .pt + # creates a list of paths from all downloaded files in cache dir + file_list = [ + str(entry) + for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") + if entry.is_file() + ] + return file_list + + def write_checkpoints_json(): + checkpoint_files = get_checkpoint_files(pretrained) + if local_rank == 0: + # model.config.model_type.upper() + data = { + "type": "BLOOM", + "checkpoints": checkpoint_files, + "version": 1.0, + } + json.dump(data, open(checkpoints_json, "w")) + + repo_root = get_repo_root(pretrained) + write_checkpoints_json() + dist.barrier() + self._model = deepspeed.init_inference( + self._model, + mp_size=tp_number, + base_dir=repo_root, + dtype=infer_dtype, + checkpoint=checkpoints_json, + ) + + self._model = self._model.module + + if self._with_ipex: + ipex_woq_enabled = args.ipex_weight_only_quantization + if ipex_woq_enabled: + from intel_extension_for_pytorch.quantization import WoqWeightDtype + weight_dtype = ( + WoqWeightDtype.INT4 if args.weight_dtype == "INT4" else WoqWeightDtype.INT8 + ) + + if args.lowp_mode == "INT8": + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + elif args.lowp_mode == "FP32": + lowp_mode = ipex.quantization.WoqLowpMode.NONE + elif args.lowp_mode == "FP16": + lowp_mode = ipex.quantization.WoqLowpMode.FP16 + elif args.lowp_mode == "BF16": + lowp_mode = ipex.quantization.WoqLowpMode.BF16 + else: # AUTO + if weight_dtype == WoqWeightDtype.INT4: + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + else: + lowp_mode = ipex.quantization.WoqLowpMode.BF16 + + act_quant_mode_dict = { + "PER_TENSOR": ipex.quantization.WoqActQuantMode.PER_TENSOR, + "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, + "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + } + qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + act_quant_mode=act_quant_mode_dict[args.act_quant_mode], + group_size=args.group_size, + ) + self._model = ipex.llm.optimize( + self._model.eval(), + dtype=infer_dtype, + quantization_config=qconfig if ipex_woq_enabled else None, + inplace=True, + deployment_mode=False, + ) + + self._base_model = self._model + + self.iter = 0 + self.num_beams = 1 if with_greedy else 4 + if self._with_jit: + input_ids = torch.ones(32).to(torch.long).unsqueeze(0) + attention_mask = torch.ones_like(input_ids) + past_key_values = tuple( + [ + ( + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + ) + for i in range(self.model.config.num_hidden_layers) + ] + ) + sample_inputs = { + "attention_mask": attention_mask, + "past_key_values": past_key_values, + } + if re.search("llava", pretrained, re.IGNORECASE): + sample_inputs["inputs_embeds"] = torch.zeros(batch_size, 1, 4096).to(self.amp_dtype) + elif re.search("git", pretrained, re.IGNORECASE): + sample_inputs["input_ids"] = input_ids.repeat(self.batch_size, 1) + sample_inputs["attention_mask"] = attention_mask.repeat(self.batch_size, 1) + sample_inputs["pixel_values"] = torch.zeros(batch_size, 3, 224, 224) + num_head = self.model.git.encoder.layer[0].attention.self.num_attention_heads + head_dim = int(self.model.git.encoder.layer[0].attention.self.hidden_size / num_head) + past_key_values = 
tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([batch_size, num_head, 1, head_dim]).contiguous(), + torch.zeros([batch_size, num_head, 1, head_dim]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(self.model.config.num_hidden_layers) + ] + ) + sample_inputs["past_key_values"] = past_key_values + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False,): + if self._dtype != "int8": + traced_model = torch.jit.trace( + self._model.eval(), + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + traced_model = torch.jit.freeze(traced_model) + else: + traced_model = torch.jit.load(args.quantized_model_path) + traced_model = torch.jit.freeze(traced_model) + + traced_model(**sample_inputs) + traced_model(**sample_inputs) + ipex._set_optimized_model_for_generation(self._model, optimized_model=traced_model) + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. + return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + def pad_sequence(self, input_ids, batch_first, padding_value): + if self.tokenizer.padding_side == "left": + input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids] + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value) + if self.tokenizer.padding_side == "left": + input_ids = torch.flip(input_ids, [1]) + return input_ids + + @property + def batch_size(self): + return self._batch_size + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + if type(doc_to_target) == str: + continuation = doc_to_target + else: + continuation = doc_to_target(self.task_dict[task][split][doc_id]) + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + if visuals: + image = process_images(visuals, self._image_processor, self._config) + if type(image) is list: + image = [_image.to(dtype=torch.float16, device=self.device) for _image in image] + else: + image = image.to(dtype=torch.float16, device=self.device) + else: + image = None + + prompts_input = contexts[0] + + if image is not None and 
len(image) != 0 and DEFAULT_IMAGE_TOKEN not in prompts_input: + """ + Three senarios: + 1. No image, and there for, no image token should be added. + 2. image token is already specified in the context, so we don't need to add it. + 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + """ + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visuals) + image_tokens = " ".join(image_tokens) + prompts_input = image_tokens + "\n" + contexts[0] + + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], prompts_input) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + # Add the answer of the second role + conv.messages[1][1] = continuation + + prompt = conv.get_prompt() + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + labels = input_ids.clone() + # Context part no need to calculate for loss + labels[0, : contxt_id.shape[1]] = -100 + with torch.inference_mode(): + outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True) + loss = outputs["loss"] + # loss = torch.exp(loss) + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq] + greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]] # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + pbar.close() + return res + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. 
+ re_ords = lmms_utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] + visuals = self.flatten(visuals) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + if re.search("llava", self.model.config.architectures[0], re.IGNORECASE): + # Set default values for until and max_new_tokens + until = [self.tok_decode(self.eot_token_id)] + + # Update values from gen_kwargs if present + if "until" in gen_kwargs: + until = gen_kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + + if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__: + # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation + self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio") + # encode, pad, and truncate contexts for this batch + if visuals: + image_tensor = process_images(visuals, self._image_processor, self._config) + else: + image_tensor = None + + # prompts_input = contexts[0] + + question_input = [] + + for visual, context in zip(visuals, contexts): + if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: + """ + Three senarios: + 1. No image, and there for, no image token should be added. + 2. image token is already specified in the context, so we don't need to add it. + 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + """ + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] + image_tokens = " ".join(image_tokens) + question = image_tokens + "\n" + context + else: + question = context + + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt_question = conv.get_prompt() + question_input.append(prompt_question) + + # The above for loop has bugs. When there is no visuals, e.g. 
pure text, + # there will be no for loop execute resulting in an empty question_input (because no visuals) + # Scenario 1 won't even be execute + if len(visuals) == 0: + for context in contexts: + question = context + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt_question = conv.get_prompt() + question_input.append(prompt_question) + + # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + # preconfigure gen_kwargs with defaults + gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input] + pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(get_accelerator().current_device_name()) + attention_masks = input_ids.ne(pad_token_ids).to(get_accelerator().current_device_name()) + input_dict = { + "input_ids":input_ids, + "attention_mask": attention_masks, + "pad_token_id": pad_token_ids, + "images": image_tensor.to(self.amp_dtype), + "do_sample": True if gen_kwargs["temperature"] > 0 else False, + "temperature": gen_kwargs["temperature"], + "top_p": gen_kwargs["top_p"], + "num_beams": gen_kwargs["num_beams"], + "max_new_tokens": gen_kwargs["max_new_tokens"], + } + elif re.search("git", self.model.config.architectures[0], re.IGNORECASE): + input_ids=self._image_processor(images=visuals, return_tensors="pt").pixel_values + gen_kwargs.pop("until", None) + input_dict = { + "pixel_values": input_ids.to(self.amp_dtype), + **gen_kwargs, + } + try: + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + cont = self.model.generate(**input_dict) + text_outputs = self.tokenizer.batch_decode( + cont[:, input_ids.shape[1]:] if re.search("llava", self.model.config.architectures[0], re.IGNORECASE) else cont, + skip_special_tokens=True) + except Exception as e: + print(f"Error {e} in generating") + cont = "" + text_outputs = [""] + res.extend(text_outputs) + # self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs) + pbar.update(1) + res = re_ords.get_original(res) + + pbar.close() + return res + + +lm_tasks = [] +lmms_tasks = [] +lm_all_tasks = lm_eval.tasks.ALL_TASKS +try: + initialize_tasks() +except Exception as e: + print(e) +for task in args.tasks: + if task in lm_all_tasks: + lm_tasks.append(task) + elif task in ALL_TASKS: + lmms_tasks.append(task) + else: + print(f"Task {task} in not supported by lm_eval and lmms_eval") + exit(0) +torch._C._jit_set_texpr_fuser_enabled(False) + +if len(lm_tasks) != 0: + lm_task_dict = lm_eval.tasks.get_task_dict(lm_tasks) + if args.model in ["google/flan-t5-xl"]: + hfmodel = T5ModelLambada( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=use_ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + tp_number=world_size, + config=args.config_file, + 
add_special_tokens=True, + with_greedy=False, + ) + else: + hfmodel = HuggingFaceModel( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=use_ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + tp_number=world_size, + config=args.config_file, + add_special_tokens=False + ) + + results = evaluator.evaluate( + hfmodel, + lm_task_dict, + # bootstrap_iters=1000, + # limit=100 + ) + print(evaluator.make_table(results)) +elif len(lmms_tasks) != 0: + task_names = lmms_utils.pattern_match(lmms_tasks, ALL_TASKS) + lm = LMMS(pretrained=args.model, device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + tp_number=world_size, + config=args.config_file, + add_special_tokens=False + ) + + task_dict = lmms_eval.tasks.get_task_dict(task_names, model_name="test") + for task_name in task_dict.keys(): + task_obj = task_dict[task_name] + if type(task_obj) == tuple: + group, task_obj = task_obj + if task_obj is None: + continue + lm.task_dict[task_name] = task_obj.dataset + + config = task_obj._config + + results = lmms_evaluator.evaluate( + lm=lm, + task_dict=task_dict, + # limit=10, + # bootstrap_iters=100, + cli_args=args + ) + print(lmms_evaluator.make_table(results)) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 8ed70b538..9b52f93a3 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -339,7 +339,11 @@ def get_checkpoint_files(model_name_or_path): # For now, Falcon, baichuan, baichuan2, and gptbigcode have accuracy issue with from_config with deepspeed meta device load. 
# TODO: we will change the scope once deepspeed providing the support -if world_size == 1 or model_type in ["falcon", "baichuan", "baichuan2", "gptbigcode", "git", "qwen"]: + +if model_type in ["llava"]: + tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) + model.config = config +elif world_size == 1 or model_type in ["falcon", "baichuan", "baichuan2", "gptbigcode", "git", "qwen"]: model = model_class[0].from_pretrained( model_name, config=config, @@ -347,9 +351,6 @@ def get_checkpoint_files(model_name_or_path): torch_dtype=load_dtype, trust_remote_code=True, ) -elif model_type in ["llava"]: - tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) - model.config = config else: # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load with deepspeed.OnDevice(dtype=load_dtype, device="meta"): if model_type in ["t5"]: @@ -529,7 +530,7 @@ def load_image(image_file): conv.append_message(conv.roles[0], prompt) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() - prompt = [prompt] * args.batch_size + inputs = [prompt] * args.batch_size else: # input tokens input_sentences = [] @@ -573,7 +574,7 @@ def generate(): input_tokens = tokenizer(images=inputs, return_tensors="pt") input_ids = input_tokens.pixel_values elif model_type == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) + input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in inputs]) image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(infer_dtype) for img in image] input_tokens = {"input_ids": input_ids, "images": image_tensor} else: diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 0cad7bf98..d8d7233d8 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -13,6 +13,7 @@ AutoTokenizer, LlamaTokenizer, T5ForConditionalGeneration, + AutoProcessor, ) MODEL_CLASSES = { @@ -32,12 +33,14 @@ "mpt": (AutoModelForCausalLM, AutoTokenizer), "stablelm": (AutoModelForCausalLM, AutoTokenizer), "qwen": (AutoModelForCausalLM, AutoTokenizer), + "git": (AutoModelForCausalLM, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } parser = argparse.ArgumentParser() parser.add_argument("-m", "--model", nargs="?", default="EleutherAI/gpt-j-6b") parser.add_argument("--output_dir", nargs="?", default="./saved_results") +parser.add_argument("--output_path", nargs="?", default="./logs") parser.add_argument("--device", default="cpu", type=str, help="cpu") parser.add_argument( "--dtype", default="bfloat16", type=str, help="float32 or bfloat16 or int8 or int4 or nf4" @@ -57,7 +60,7 @@ parser.add_argument("--torch-compile", action="store_true") parser.add_argument("--backend", default="ipex", type=str, help="backend of torch.compile") parser.add_argument("--quant-with-amp", action="store_true", help="by default static quant is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)") -parser.add_argument("--quantized-model-path", default="./saved_result/best_model.pt") +parser.add_argument("--quantized-model-path", default="./saved_results/best_model.pt") parser.add_argument( "--tasks", nargs="+", @@ -81,6 +84,22 @@ from typing import Union, 
List, Optional, Tuple from transformers import BatchEncoding import transformers +try: + from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM + from llava.model.builder import load_pretrained_model + from llava.conversation import conv_templates + from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token + from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + import lmms_eval + from lmms_eval.api.instance import Instance + from lmms_eval.api.model import lmms + from lmms_eval.api.registry import register_model + from lmms_eval import evaluator as lmms_evaluator + from lmms_eval import utils as lmms_utils + from lmms_eval.api.registry import ALL_TASKS + from lmms_eval.tasks import initialize_tasks +except ImportError: + pass TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] @@ -540,37 +559,495 @@ def _loglikelihood_tokens( self.cache_hook.add_partial("loglikelihood", cache_key, answer) return results -task_dict = lm_eval.tasks.get_task_dict(args.tasks) -torch._C._jit_set_texpr_fuser_enabled(False) -if args.model in ["google/flan-t5-xl"]: - hfmodel = T5ModelLambada( - pretrained=args.model, - device="cpu", - batch_size=args.batch_size, - with_ipex=args.ipex, - with_jit=not args.disable_jit, - dtype=args.dtype, - config=args.config_file, - add_special_tokens=True, + +@register_model("test") +class LMMS(lmms): + def __init__( + self, + pretrained: str, + device: Optional[str] = "cpu", + with_ipex=True, + with_jit=True, with_greedy=False, + batch_size=1, + dtype: Optional[Union[str, torch.dtype]] = "auto", + config=None, + add_special_tokens = True, + ) -> None: + super().__init__() + self._device = torch.device(device) + self._batch_size = int(batch_size) + self._with_jit = with_jit + self._with_ipex = with_ipex + self._with_greedy = with_greedy + self._dtype = dtype + self.add_special_tokens = add_special_tokens + load_dtype = torch.float32 + infer_dtype = torch.float32 + if dtype == "float16": + load_dtype = torch.half + infer_dtype = torch.half + elif dtype == "bfloat16": + load_dtype = torch.bfloat16 + infer_dtype = torch.bfloat16 + elif dtype in ["int8", "int4", "nf4"]: + load_dtype = torch.float32 + infer_dtype = torch.int8 + self.amp_dtype = torch.bfloat16 if args.quant_with_amp or self._dtype == "bfloat16" else torch.float32 + if re.search("llava", pretrained, re.IGNORECASE): + self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, get_model_name_from_path(pretrained)) + model_name = get_model_name_from_path(pretrained) + if 'llama-2' in model_name.lower(): + conv_mode = "llava_llama_2" + elif "v1" in model_name.lower(): + conv_mode = "llava_v1" + elif "mpt" in model_name.lower(): + conv_mode = "mpt" + else: + conv_mode = "llava_v0" + self.conv_template = conv_mode + elif re.search("git", pretrained, re.IGNORECASE): + model_class = MODEL_CLASSES["git"] + self._image_processor = model_class[1].from_pretrained( + pretrained, trust_remote_code=True + ) + self._tokenizer = self._image_processor.tokenizer + self._config = AutoConfig.from_pretrained( + pretrained if config is None else config, torchscript=with_jit, trust_remote_code=True + ) + self._model = model_class[0].from_pretrained( + pretrained, + low_cpu_mem_usage=True, + config=self.config, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + self._config = self._model.config + self._config.torchscript = self._with_jit + 
self._model.eval() + if with_ipex and dtype not in ["int8", "int4", "nf4"]: + self._model = ipex.llm.optimize( + self._model.eval(), + dtype=infer_dtype, + inplace=True, + deployment_mode=False, + ) + + if args.torch_compile: + if dtype in ["int8", "int4", "nf4"]: + raise SystemExit("[ERROR] Currently this script does not support torch.compile with int8/int4/nf4 datatype, please set dtype to float32 or bfloat16 if want to use torch.compile.") + if with_jit: + raise SystemExit("[ERROR] JIT cannot co-work with torch.compile, please set jit to False if want to use torch.compile.") + self._model.forward = torch.compile(self._model.forward, dynamic=True, backend=args.backend) + + self._base_model = self._model + + self.iter = 0 + self.num_beams = 1 if with_greedy else 4 + self.tp_number = 1 + if self._with_jit: + input_ids = torch.ones(32).to(torch.long).unsqueeze(0) + attention_mask = torch.ones_like(input_ids) + past_key_values = tuple( + [ + ( + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + ) + for i in range(self.model.config.num_hidden_layers) + ] + ) + sample_inputs = { + "attention_mask": attention_mask, + "past_key_values": past_key_values, + } + if re.search("llava", pretrained, re.IGNORECASE): + sample_inputs["inputs_embeds"] = torch.zeros(batch_size, 1, 4096).to(self.amp_dtype) + elif re.search("git", pretrained, re.IGNORECASE): + sample_inputs["input_ids"] = input_ids.repeat(self.batch_size, 1) + sample_inputs["attention_mask"] = attention_mask.repeat(self.batch_size, 1) + sample_inputs["pixel_values"] = torch.zeros(batch_size, 3, 224, 224) + num_head = self.model.config.num_attention_heads + head_dim = int(self.model.config.hidden_size / num_head) + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([batch_size, num_head, 1, head_dim]).contiguous(), + torch.zeros([batch_size, num_head, 1, head_dim]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(self.model.config.num_hidden_layers) + ] + ) + sample_inputs["past_key_values"] = past_key_values + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False,): + if self._dtype != "int8": + traced_model = torch.jit.trace( + self._model.eval(), + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + traced_model = torch.jit.freeze(traced_model.eval()) + else: + traced_model = torch.jit.load(args.quantized_model_path) + traced_model = torch.jit.freeze(traced_model.eval()) + + traced_model(**sample_inputs) + traced_model(**sample_inputs) + ipex._set_optimized_model_for_generation(self._model, optimized_model=traced_model) + + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. 
+ return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + def pad_sequence(self, input_ids, batch_first, padding_value): + if self.tokenizer.padding_side == "left": + input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids] + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value) + if self.tokenizer.padding_side == "left": + input_ids = torch.flip(input_ids, [1]) + return input_ids + + @property + def batch_size(self): + return self._batch_size + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + if type(doc_to_target) == str: + continuation = doc_to_target + else: + continuation = doc_to_target(self.task_dict[task][split][doc_id]) + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + if visuals: + image = process_images(visuals, self._image_processor, self._config) + if type(image) is list: + image = [_image.to(dtype=torch.float16, device=self.device) for _image in image] + else: + image = image.to(dtype=torch.float16, device=self.device) + else: + image = None + + prompts_input = contexts[0] + + if image is not None and len(image) != 0 and DEFAULT_IMAGE_TOKEN not in prompts_input: + """ + Three senarios: + 1. No image, and there for, no image token should be added. + 2. image token is already specified in the context, so we don't need to add it. + 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. 
+ """ + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visuals) + image_tokens = " ".join(image_tokens) + prompts_input = image_tokens + "\n" + contexts[0] + + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], prompts_input) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + # Add the answer of the second role + conv.messages[1][1] = continuation + + prompt = conv.get_prompt() + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + labels = input_ids.clone() + # Context part no need to calculate for loss + labels[0, : contxt_id.shape[1]] = -100 + with torch.inference_mode(): + outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True) + loss = outputs["loss"] + # loss = torch.exp(loss) + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq] + greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]] # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + pbar.close() + return res + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = lmms_utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] + visuals = self.flatten(visuals) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. 
+ gen_kwargs = all_gen_kwargs[0] + if re.search("llava", self.model.config.architectures[0], re.IGNORECASE): + # Set default values for until and max_new_tokens + until = [self.tok_decode(self.eot_token_id)] + + # Update values from gen_kwargs if present + if "until" in gen_kwargs: + until = gen_kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + + if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__: + # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation + self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio") + # encode, pad, and truncate contexts for this batch + if visuals: + image_tensor = process_images(visuals, self._image_processor, self._config) + else: + image_tensor = None + + # prompts_input = contexts[0] + + question_input = [] + + for visual, context in zip(visuals, contexts): + if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: + """ + Three senarios: + 1. No image, and there for, no image token should be added. + 2. image token is already specified in the context, so we don't need to add it. + 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + """ + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] + image_tokens = " ".join(image_tokens) + question = image_tokens + "\n" + context + else: + question = context + + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt_question = conv.get_prompt() + question_input.append(prompt_question) + + # The above for loop has bugs. When there is no visuals, e.g. 
pure text, + # there will be no for loop execute resulting in an empty question_input (because no visuals) + # Scenario 1 won't even be execute + if len(visuals) == 0: + for context in contexts: + question = context + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt_question = conv.get_prompt() + question_input.append(prompt_question) + + # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + # preconfigure gen_kwargs with defaults + gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input] + pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device) + attention_masks = input_ids.ne(pad_token_ids).to(self.device) + input_dict = { + "input_ids":input_ids, + "attention_mask": attention_masks, + "pad_token_id": pad_token_ids, + "images": image_tensor.to(self.amp_dtype), + "do_sample": True if gen_kwargs["temperature"] > 0 else False, + "temperature": gen_kwargs["temperature"], + "top_p": gen_kwargs["top_p"], + "num_beams": gen_kwargs["num_beams"], + "max_new_tokens": gen_kwargs["max_new_tokens"], + } + elif re.search("git", self.model.config.architectures[0], re.IGNORECASE): + input_ids=self._image_processor(images=visuals, return_tensors="pt").pixel_values + gen_kwargs.pop("until", None) + input_dict = { + "pixel_values": input_ids.to(self.amp_dtype), + **gen_kwargs, + } + try: + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False,): + cont = self.model.generate(**input_dict) + text_outputs = self.tokenizer.batch_decode( + cont[:, input_ids.shape[1]:] if re.search("llava", self.model.config.architectures[0], re.IGNORECASE) else cont, + skip_special_tokens=True) + except Exception as e: + print(f"Error {e} in generating") + cont = "" + text_outputs = [""] + res.extend(text_outputs) + # self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs) + pbar.update(1) + res = re_ords.get_original(res) + + pbar.close() + return res + +lm_tasks = [] +lmms_tasks = [] +lm_all_tasks = lm_eval.tasks.ALL_TASKS +try: + initialize_tasks() +except Exception as e: + print(e) +for task in args.tasks: + if task in lm_all_tasks: + lm_tasks.append(task) + elif task in ALL_TASKS: + lmms_tasks.append(task) + else: + print(f"Task {task} in not supported by lm_eval and lmms_eval") + exit(0) +torch._C._jit_set_texpr_fuser_enabled(False) + +if len(lm_tasks) != 0: + lm_task_dict = lm_eval.tasks.get_task_dict(lm_tasks) + if args.model in ["google/flan-t5-xl"]: + hfmodel = T5ModelLambada( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + config=args.config_file, + add_special_tokens=True, + with_greedy=False, + ) + else: + hfmodel = HuggingFaceModel( + 
pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + config=args.config_file, + add_special_tokens=False + ) + + results = evaluator.evaluate( + hfmodel, + lm_task_dict, + # bootstrap_iters=1000, + # limit=100 ) -else: - hfmodel = HuggingFaceModel( - pretrained=args.model, - device="cpu", - batch_size=args.batch_size, - with_ipex=args.ipex, - with_jit=not args.disable_jit, - dtype=args.dtype, - config=args.config_file, - add_special_tokens=False - ) + print(evaluator.make_table(results)) +elif len(lmms_tasks) != 0: + task_names = lmms_utils.pattern_match(lmms_tasks, ALL_TASKS) + lm = LMMS(pretrained=args.model, device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + config=args.config_file, + add_special_tokens=False + ) -results = evaluator.evaluate( - hfmodel, - task_dict, - # bootstrap_iters=1000, - # limit=100 -) + task_dict = lmms_eval.tasks.get_task_dict(task_names, model_name="test") + for task_name in task_dict.keys(): + task_obj = task_dict[task_name] + if type(task_obj) == tuple: + group, task_obj = task_obj + if task_obj is None: + continue + lm.task_dict[task_name] = task_obj.dataset -print(evaluator.make_table(results)) + config = task_obj._config + + results = lmms_evaluator.evaluate( + lm=lm, + task_dict=task_dict, + # limit=10, + # bootstrap_iters=100, + cli_args=args + ) + print(lmms_evaluator.make_table(results)) diff --git a/examples/cpu/inference/python/llm/tools/prepare_llava.sh b/examples/cpu/inference/python/llm/tools/prepare_llava.sh index 2bc461517..3b00b9abd 100644 --- a/examples/cpu/inference/python/llm/tools/prepare_llava.sh +++ b/examples/cpu/inference/python/llm/tools/prepare_llava.sh @@ -8,4 +8,7 @@ cd LLaVA pip install einops pillow sentencepiece protobuf --no-deps git checkout intel git apply ../llava.patch -pip install -e . --no-deps \ No newline at end of file +pip install -e . 
--no-deps + +pip install tenacity hf_transfer lmms-eval --no-deps +conda install -y openjdk=8 \ No newline at end of file From 758ee06872e507b9a6a30be05f3e22f435448800 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Fri, 29 Mar 2024 13:08:14 +0800 Subject: [PATCH 003/199] Update dependency_version.yml 20240329 (#2712) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 1e23cc4eb..66616e1b9 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240325+cpu + version: 2.4.0.dev20240328+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240325+cpu + version: 2.2.0.dev20240328+cpu torchvision: - version: 0.19.0.dev20240325+cpu + version: 0.19.0.dev20240328+cpu transformers: version: 4.38.1 From 35bf2c0768ad1831ee2c2acfda7a56dd429f3b65 Mon Sep 17 00:00:00 2001 From: Cao E Date: Fri, 29 Mar 2024 17:03:50 +0800 Subject: [PATCH 004/199] Modify bf16/fp16 ISA assert information (#2703) --- intel_extension_for_pytorch/frontend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/intel_extension_for_pytorch/frontend.py b/intel_extension_for_pytorch/frontend.py index 29450a35e..679e500e4 100644 --- a/intel_extension_for_pytorch/frontend.py +++ b/intel_extension_for_pytorch/frontend.py @@ -555,13 +555,13 @@ def xpu_check_channel_last(): if opt_properties.weights_prepack and device_type == "cpu": if dtype == torch.bfloat16: assert core.onednn_has_bf16_support(), ( - "BF16 weight prepack needs the cpu support avx512bw, avx512vl and avx512dq, " + "BF16 weight prepack needs the cpu support avx_ne_convert or avx512bw, avx512vl and avx512dq, " + "but the desired instruction sets are not available. " + "Please set dtype to torch.float or set weights_prepack to False." ) if dtype == torch.half: assert core.onednn_has_fp16_support(), ( - "FP16 weight prepack needs the cpu support avx512_core_fp16, " + "FP16 weight prepack needs the cpu support avx_ne_convert or avx512_core_fp16, " + "but the desired instruction sets are not available. " + "Please set dtype to torch.float or set weights_prepack to False." 
) From 9500a8721d8e6e27ffce0d85afdfac580ac6c49b Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Fri, 29 Mar 2024 23:04:14 +0800 Subject: [PATCH 005/199] ipex.llm submodules model level examples (#2710) --- .../inference/python/llm-modeling/README.md | 97 +++ .../python/llm-modeling/modeling_gptj.py | 531 +++++++++++++ .../python/llm-modeling/modeling_llama.py | 526 +++++++++++++ .../python/llm-modeling/modeling_opt.py | 696 ++++++++++++++++++ .../cpu/inference/python/llm-modeling/run.py | 175 +++++ intel_extension_for_pytorch/llm/__init__.py | 1 + .../llm/functional/__init__.py | 7 + .../llm/functional/fusions.py | 211 ++++++ .../llm/modules/mha_fusion.py | 29 +- .../models/cpu/fusions/mha_fusion.py | 12 +- tests/cpu/test_ipex_llm_module.py | 20 +- 11 files changed, 2278 insertions(+), 27 deletions(-) create mode 100644 examples/cpu/inference/python/llm-modeling/README.md create mode 100644 examples/cpu/inference/python/llm-modeling/modeling_gptj.py create mode 100644 examples/cpu/inference/python/llm-modeling/modeling_llama.py create mode 100644 examples/cpu/inference/python/llm-modeling/modeling_opt.py create mode 100644 examples/cpu/inference/python/llm-modeling/run.py create mode 100644 intel_extension_for_pytorch/llm/functional/__init__.py create mode 100644 intel_extension_for_pytorch/llm/functional/fusions.py diff --git a/examples/cpu/inference/python/llm-modeling/README.md b/examples/cpu/inference/python/llm-modeling/README.md new file mode 100644 index 000000000..857d85940 --- /dev/null +++ b/examples/cpu/inference/python/llm-modeling/README.md @@ -0,0 +1,97 @@ +# 1. LLM Optimization Overview + +ipex.llm provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. +To further provide optimized modules or functions to help build modelings, ipex supports the following module/function level APIs: + +``` +import intel_extension_for_pytorch as ipex +``` + +### linear post-op fusions +``` +#using module init and forward +ipex.llm.modules.linearMul +ipex.llm.modules.linearGelu +ipex.llm.modules.linearNewGelu +ipex.llm.modules.linearAdd +ipex.llm.modules.linearAddAdd +ipex.llm.modules.linearSilu +ipex.llm.modules.linearSiluMul +ipex.llm.modules.linear2SiluMul +ipex.llm.modules.linearRelu +``` + +### Attention related fusions +``` +#using module init and forward +ipex.llm.modules.RotaryEmbedding +ipex.llm.modules.RMSNorm +ipex.llm.modules.FastLayerNorm +ipex.llm.modules.VarlenAttention +ipex.llm.modules.PagedAttention +ipex.llm.modules.IndirectAccessKVCache + +#using as functions +ipex.llm.functional.rotary_embedding +ipex.llm.functional.rms_norm +ipex.llm.functional.fast_layer_norm +ipex.llm.functional.indirect_access_kv_cache +ipex.llm.functional.varlen_attention + +``` + +### Generation related fusions +``` +ipex.llm.generation.hf_beam_search +ipex.llm.generation.hf_greedy_search +ipex.llm.generation.hf_sample +``` + +
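As a quick illustration, the sketch below shows how the attention-related modules listed above fit together inside an attention block, following the GPT-J modeling example included in this set (see `modeling_gptj.py`). The sizes, the `backbone="GPTJForCausalLM"` string, and the `attention` wrapper are illustrative placeholders only; refer to the modeling files for the complete, validated usage.

```
import intel_extension_for_pytorch as ipex

# Illustrative GPT-J-6B sizes; real values come from the model config.
max_positions, num_heads, head_dim, rotary_dim = 2048, 16, 256, 64

rope = ipex.llm.modules.RotaryEmbedding(
    max_positions, rotary_dim, backbone="GPTJForCausalLM"
)
kv_cache = ipex.llm.modules.IndirectAccessKVCache(max_positions)

def attention(query, key, value, position_ids, scale_attn,
              layer_past=None, head_mask=None, attention_mask=None):
    # query/key/value: [batch, seq, num_heads, head_dim]
    # scale_attn: sqrt(head_dim), as in the GPT-J example
    key = rope(key, position_ids.contiguous(), num_heads, head_dim, 1, rotary_dim)
    query = rope(query, position_ids.contiguous(), num_heads, head_dim, 1, rotary_dim)
    attn_output, attn_weights, present = kv_cache(
        query, key, value, scale_attn, layer_past, head_mask, attention_mask
    )
    return attn_output, present
```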
+ +# 2. Show cases of ipex.llm optimized modules and functions based modeling +We provide LLAMA, GPTJ and OPT modeling as show cases that apply the optimized modules or functions from ipex.llm layers. + +| MODEL FAMILY | MODEL NAME (Huggingface hub) | +|:---:|:---:| +|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", etc. | +|GPT-J| "EleutherAI/gpt-j-6b", etc. | +|OPT| "facebook/opt-30b", "facebook/opt-1.3b", etc. | + +## How To Run LLM with ipex.llm + +**ipex.llm provides a single script to facilitate running generation tasks as below:** +Note that please setup ENV according to the ../llm/README.md + +``` +python run.py --help # for more detailed usages +``` + +| Key args of run.py | Notes | +|---|---| +| model name | use "-m MODEL_NAME" to choose models to run | +| generation | default: beam search (beam size = 4), "--greedy" for greedy search | +| input tokens | default: 32, provide fixed sizes for input prompt size, use "--input-tokens" for [32, 64, 128, 256, 512, 1024, 2016, 2017, 2048, 4096, 8192]; if "--input-tokens" is not used, use "--prompt" to choose other strings as inputs| +| output tokens | default: 32, use "--max-new-tokens" to choose any other size | +| batch size | default: 1, use "--batch-size" to choose any other size | +| generation iterations | use "--num-iter" and "--num-warmup" to control the repeated iterations of generation, default: 100-iter/10-warmup | +| ipex prepack | apply ipex weight prepack optimization by "--use-ipex-optimize"| +| profiling | enable pytorch profiling by " --profile"| + +*Note:* You may need to log in your HuggingFace account to access the model files. Please refer to [HuggingFace login](https://huggingface.co/docs/huggingface_hub/quick-start#login). + + +## Run commands + +```bash +# The following "OMP_NUM_THREADS" and "numactl" settings are based on the assumption that +# the target server has 56 physical cores per numa socket, and we benchmark with 1 socket. +# Please adjust the settings per your hardware. 
+ +# Running FP32 model +OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype float32 --use-ipex-optimize + +# Running BF16 model +OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --use-ipex-optimize + +``` diff --git a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py new file mode 100644 index 000000000..7445d0817 --- /dev/null +++ b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py @@ -0,0 +1,531 @@ +from typing import Optional, Tuple, Union + +import torch +import intel_extension_for_pytorch as ipex +import torch.fx +import torch.utils.checkpoint +from torch import nn +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.models.gptj.modeling_gptj import GPTJPreTrainedModel + + +class GPTJAttention(nn.Module): + def __init__(self, config): + super().__init__() + max_positions = config.max_position_embeddings + + self.embed_dim = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_attention_heads + self.scale_attn = torch.sqrt( + torch.tensor(self.head_dim, dtype=torch.float32) + ).to(torch.get_default_dtype()) + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False) + self.rotary_dim = config.rotary_dim + pos_embd_dim = self.rotary_dim or self.embed_dim + + # ==================== Changes to apply ipex.llm layers ==================== + self.ipex_rotary_emb = ipex.llm.modules.RotaryEmbedding( + max_positions, + pos_embd_dim, + backbone=config.architectures[0], + ) + self._IPEXIndirectAccessKVCache = ipex.llm.modules.IndirectAccessKVCache( + max_positions + ) + # ========================================================================== + + def _split_heads(self, tensor, num_attention_heads, attn_head_size, rotary): + new_shape = tensor.size()[:-1] + (num_attention_heads, attn_head_size) + tensor = tensor.view(new_shape) + if rotary: + return tensor + if len(tensor.shape) == 5: + return tensor.permute(0, 1, 3, 2, 4) + elif len(tensor.shape) == 4: + return tensor.permute(0, 2, 1, 3) + else: + raise ValueError( + f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}" + ) + + def _merge_heads(self, tensor, num_attention_heads, attn_head_size): + if len(tensor.shape) == 5: + tensor = tensor.permute(0, 1, 3, 2, 4).contiguous() + elif len(tensor.shape) == 4: + tensor = tensor.permute(0, 2, 1, 3).contiguous() + else: + raise ValueError( + f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}" + ) + new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,) + return tensor.view(new_shape) + + def forward( + self, + hidden_states: torch.FloatTensor, + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ) -> Union[ + Tuple[torch.Tensor, Tuple[torch.Tensor]], + Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]], + ]: + query = self.q_proj(hidden_states) + key = 
self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + query = self._split_heads(query, self.num_attention_heads, self.head_dim, True) + key = self._split_heads(key, self.num_attention_heads, self.head_dim, True) + + # ==================== Changes to apply ipex.llm layers ==================== + key = self.ipex_rotary_emb( + key, + position_ids.contiguous(), + self.num_attention_heads, + self.head_dim, + 1, + 64, + ) + query = self.ipex_rotary_emb( + query, + position_ids.contiguous(), + self.num_attention_heads, + self.head_dim, + 1, + 64, + ) + value = self._split_heads(value, self.num_attention_heads, self.head_dim, True) + + ( + attn_output, + attn_weights, + present, + ) = self._IPEXIndirectAccessKVCache( + query, + key, + value, + self.scale_attn, + layer_past, + head_mask, + attention_mask, + ) + # ========================================================================== + + attn_output = self._merge_heads( + attn_output, self.num_attention_heads, self.head_dim + ) + attn_output = self.out_proj(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class GPTJMLP(nn.Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.n_embd + self.fc_in = nn.Linear(embed_dim, intermediate_size) + self.fc_out = nn.Linear(intermediate_size, embed_dim) + + def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor: + # ==================== orignal path ==================== + # hidden_states = NewGelu(self.fc_in(hidden_states)) + # ==================== Changes to apply ipex.llm layers ==================== + if not hasattr(self, "ipex_fusion"): + self.ipex_fusion = ipex.llm.modules.LinearNewGelu(self.fc_in) + del self.__dict__["_modules"]["fc_in"] + hidden_states = self.ipex_fusion(hidden_states) + # move self.fc_out to GPTJBlock to enable linear+add+add fusion + # hidden_states = self.fc_out(hidden_states) + # ========================================================================== + + return hidden_states + + +class GPTJBlock(nn.Module): + def __init__(self, config): + super().__init__() + inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd + self.n_embd = config.n_embd + self.eps = config.layer_norm_epsilon + self.ln_1 = nn.LayerNorm(self.n_embd, eps=self.eps) + self.attn = GPTJAttention(config) + self.mlp = GPTJMLP(inner_dim, config) + + def forward( + self, + hidden_states: Optional[torch.FloatTensor], + layer_past: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + ) -> Union[ + Tuple[torch.Tensor], + Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]], + ]: + residual = hidden_states + + # ==================== orignal path ==================== + # hidden_states = self.ln_1(hidden_states) + # ==================== Changes to apply ipex.llm layers ==================== + # option 1 : replace module + # if not hasattr(self, "ipex_layernorm"): + # self.ipex_layernorm = ipex.llm.modules.FastLayerNorm( + # self.n_embd, + # eps=self.eps, + # weight=self.ln_1.weight, + # bias=self.ln_1.bias if hasattr(self, "ln_1") else None, + # ) + # del self.ln_1 + # hidden_states = self.ipex_layernorm(hidden_states) + # + # option 2 : use function call + hidden_states = ipex.llm.functional.fast_layer_norm( + 
hidden_states, [self.n_embd], self.ln_1.weight, self.ln_1.bias, self.eps + ) + # ========================================================================== + + attn_outputs = self.attn( + hidden_states=hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + outputs = attn_outputs[1:] + + feed_forward_hidden_states = self.mlp(hidden_states) + + # ==================== orignal path ==================== + # hidden_states = attn_output + feed_forward_hidden_states + residual + # ==================== Changes to apply ipex.llm layers ==================== + if not hasattr(self, "ipex_fusion"): + self.ipex_fusion = ipex.llm.modules.LinearAddAdd(self.mlp.fc_out) + del self.__dict__["_modules"]["mlp"].fc_out + hidden_states = self.ipex_fusion( + feed_forward_hidden_states, residual, attn_output + ) + # ========================================================================== + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs + + +class GPTJModel(GPTJPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.embed_dim = config.n_embd + self.vocab_size = config.vocab_size + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.h = nn.ModuleList([GPTJBlock(config) for _ in range(config.n_layer)]) + self.eps = config.layer_norm_epsilon + self.ln_f = nn.LayerNorm(self.embed_dim, eps=self.eps) + self.post_init() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + + if past_key_values is None: + 
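+            # First (prefill) step: no KV cache exists yet, so decoding starts at
+            # position 0 and every GPTJBlock receives None as its layer_past.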
past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].size(-2) + + if position_ids is None: + position_ids = torch.arange( + past_length, + input_shape[-1] + past_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0) + + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + attention_mask = attention_mask[:, None, None, :] + attention_mask = attention_mask.to(dtype=self.dtype) + attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min + + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + + hidden_states = inputs_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),) + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states=hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + ( + outputs[2 if use_cache else 1], + ) + + # ==================== orignal path ==================== + # hidden_states = self.ln_f(hidden_states) + + # ==================== Changes to apply ipex.llm layers ==================== + # option 1 : replace module + # if not hasattr(self, "ipex_layernorm"): + # self.ipex_layernorm = ipex.llm.modules.FastLayerNorm( + # self.embed_dim, + # eps=self.eps, + # weight=self.ln_f.weight, + # bias=self.ln_f.bias, + # ) + # del self.ln_f + # hidden_states = self.ipex_layernorm(hidden_states) + # + # option 2 : use a function call + hidden_states = ipex.llm.functional.fast_layer_norm( + hidden_states, [self.embed_dim], self.ln_f.weight, self.ln_f.bias, self.eps + ) + # ========================================================================== + + hidden_states = hidden_states.view(output_shape) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + presents, + all_hidden_states, + all_self_attentions, + ] + if v is not None + ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class IPEXGPTJForCausalLM(GPTJPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = GPTJModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size) + self.post_init() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs + ): + token_type_ids = 
kwargs.get("token_type_ids", None) + if past_key_values: + past_length = past_key_values[0][0].shape[2] + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -input_ids.shape[1] :] + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + ) + + return model_inputs + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states).to(torch.float32) + + loss = None + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCache ==================== + def _reorder_cache( + self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + ) -> Tuple[Tuple[torch.Tensor]]: + if len(past_key_values[0]) == 4 and past_key_values[0][0].shape[-1] == 1: + for layer_past in past_key_values: + layer_past[3][layer_past[0].size(-2) - 1] = beam_idx + return past_key_values + else: + return tuple( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ) + for layer_past in past_key_values + ) diff --git a/examples/cpu/inference/python/llm-modeling/modeling_llama.py b/examples/cpu/inference/python/llm-modeling/modeling_llama.py new file mode 100644 index 000000000..697de66ea --- 
/dev/null +++ b/examples/cpu/inference/python/llm-modeling/modeling_llama.py @@ -0,0 +1,526 @@ +import math +from typing import List, Optional, Tuple, Union + +import torch +import intel_extension_for_pytorch as ipex +import torch.utils.checkpoint +from torch import nn +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.models.llama.configuration_llama import LlamaConfig +from transformers.models.llama.modeling_llama import LlamaPreTrainedModel + + +from transformers.modeling_attn_mask_utils import ( + _prepare_4d_causal_attention_mask, +) + + +class LlamaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + + def forward(self, x): + # ==================== orignal path ==================== + # down_proj = self.down_proj(torch.nn.functional.silu(self.gate_proj(x))*self.up_proj)) + # ==================== Changes to apply ipex.llm layers ==================== + if not hasattr(self, "ipex_fusion"): + self.ipex_fusion = ipex.llm.modules.Linear2SiluMul( + self.gate_proj, self.up_proj + ) + del self.__dict__["_modules"]["gate_proj"] + del self.__dict__["_modules"]["up_proj"] + down_proj = self.ipex_fusion(x) + # self.down_proj is move to LlamaDecoderLayer to enable linear+add fusion + # ========================================================================== + return down_proj + + +class LlamaAttention(nn.Module): + def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + self.q_proj = nn.Linear( + self.hidden_size, self.num_heads * self.head_dim, bias=False + ) + self.k_proj = nn.Linear( + self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False + ) + self.v_proj = nn.Linear( + self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False + ) + self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + # ==================== Changes to apply ipex.llm layers ==================== + self._IPEXIndirectAccessKVCache = ipex.llm.modules.IndirectAccessKVCache( + self.max_position_embeddings + ) + self.ipex_rotary_emb = ipex.llm.modules.RotaryEmbedding( + self.max_position_embeddings, + self.head_dim, + self.rope_theta, + self.config.architectures[0], + ) + # ========================================================================== + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[List[torch.FloatTensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], 
Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + + kv_seq_len = ( + q_len + past_key_value[0].size(-2) if past_key_value is not None else q_len + ) + # ==================== Changes to apply ipex.llm layers ==================== + key_states = self.ipex_rotary_emb( + key_states, + position_ids, + self.num_key_value_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + query_states = self.ipex_rotary_emb( + query_states, + position_ids, + self.num_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + + (attn_output, attn_weights, past_key_value) = self._IPEXIndirectAccessKVCache( + query_states, + key_states, + value_states, + math.sqrt(self.head_dim), + past_key_value, + None, + attention_mask, + ) + # ========================================================================== + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + # move to LlamaDecoderLayer to enable linear+add fusion + # attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + return attn_output, attn_weights, past_key_value + + +class LlamaDecoderLayer(nn.Module): + def __init__(self, config: LlamaConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx) + self.mlp = LlamaMLP(config) + self.input_layernorm = ipex.llm.modules.RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = ipex.llm.modules.RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + # ==================== origal path ==================== + # hidden_states = self.self_attn.o_proj(hidden_states) + # hidden_states = residual + hidden_states + # ==================== Changes to apply ipex.llm layers ==================== + if not hasattr(self, "ipex_fusion_1"): + self.ipex_fusion_1 = ipex.llm.modules.LinearAdd(self.self_attn.o_proj) + del self.__dict__["_modules"]["self_attn"].o_proj + hidden_states = self.ipex_fusion_1(hidden_states, residual) + # ========================================================================== + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + # ==================== origal path 
==================== + # hidden_states = self.mlp.down_proj(hidden_states) + # hidden_states = residual + hidden_states + # ==================== Changes to apply ipex.llm layers ==================== + if not hasattr(self, "ipex_fusion_2"): + self.ipex_fusion_2 = ipex.llm.modules.LinearAdd(self.mlp.down_proj) + del self.__dict__["_modules"]["mlp"].down_proj + hidden_states = self.ipex_fusion_2(hidden_states, residual) + # ========================================================================== + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class LlamaModel(PreTrainedModel): + def __init__(self, config: LlamaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + LlamaDecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + + # ==================== Changes to apply ipex.llm layers ==================== + self.norm = ipex.llm.modules.RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + # ========================================================================== + + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).repeat(batch_size, 1) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if hasattr(self, "_prepare_decoder_attention_mask"): + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + attention_mask = 
_prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = ( + past_key_values[idx] if past_key_values is not None else None + ) + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class IPEXLlamaForCausalLM(LlamaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = LlamaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None 
else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # ==================== rewrite to prepare_inputs_for_generation to work with ipex.llm.modules.IndirectAccessKVCache ==================== + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values is not None: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + if ( + attention_mask is not None + and attention_mask.shape[1] > input_ids.shape[1] + ): + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCache ==================== + def _reorder_cache( + self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + ) -> Tuple[Tuple[torch.Tensor]]: + if ( + len(past_key_values[0]) == 4 and past_key_values[0][0].shape[-1] == 1 + ): # discrete kv_cache + for layer_past in past_key_values: + layer_past[3][layer_past[0].size(-2) - 1] = beam_idx + return past_key_values + else: + return tuple( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ) + for layer_past in past_key_values + ) diff --git a/examples/cpu/inference/python/llm-modeling/modeling_opt.py b/examples/cpu/inference/python/llm-modeling/modeling_opt.py new file mode 100644 index 000000000..e648d7ee4 --- /dev/null +++ b/examples/cpu/inference/python/llm-modeling/modeling_opt.py @@ -0,0 +1,696 @@ +from typing import List, Optional, Tuple, Union + +import torch +import intel_extension_for_pytorch as ipex +import torch.utils.checkpoint +from torch import nn + +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, +) +from transformers.models.opt.configuration_opt import OPTConfig +from transformers.models.opt.modeling_opt import OPTPreTrainedModel + + +class OPTAttention(nn.Module): + def __init__( + self, + config: OPTConfig, + is_decoder: bool = False, + **kwargs, + ): + super().__init__() + self.config = config + + def _handle_deprecated_argument(config_arg_name, config, fn_arg_name, kwargs): + val = None + if fn_arg_name in kwargs: + val = kwargs.pop(fn_arg_name) + else: + val = getattr(config, config_arg_name) + return val + + self.embed_dim = _handle_deprecated_argument( + 
"hidden_size", config, "embed_dim", kwargs + ) + self.num_heads = _handle_deprecated_argument( + "num_attention_heads", config, "num_heads", kwargs + ) + self.dropout = _handle_deprecated_argument( + "attention_dropout", config, "dropout", kwargs + ) + self.enable_bias = _handle_deprecated_argument( + "enable_bias", config, "bias", kwargs + ) + + self.head_dim = self.embed_dim // self.num_heads + self.is_causal = True + + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) + # ==================== Changes to apply ipex.llm layers ==================== + self._IPEXIndirectAccessKVCache = ipex.llm.modules.IndirectAccessKVCache( + config.max_position_embeddings + ) + # ========================================================================== + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + if is_cross_attention and past_key_value is not None: + key = ( + past_key_value[0] + .view(bsz, tgt_len, self.num_heads, self.head_dim) + .contiguous() + ) + value = ( + past_key_value[1] + .view(bsz, tgt_len, self.num_heads, self.head_dim) + .contiguous() + ) + elif is_cross_attention: + key = ( + self.k_proj(key_value_states) + .view(bsz, tgt_len, self.num_heads, self.head_dim) + .contiguous() + ) + value = ( + self.v_proj(key_value_states) + .view(bsz, tgt_len, self.num_heads, self.head_dim) + .contiguous() + ) + else: + key = ( + self.k_proj(hidden_states) + .view(bsz, tgt_len, self.num_heads, self.head_dim) + .contiguous() + ) + value = ( + self.v_proj(hidden_states) + .view(bsz, tgt_len, self.num_heads, self.head_dim) + .contiguous() + ) + query = ( + self.q_proj(hidden_states) + .view(bsz, tgt_len, self.num_heads, self.head_dim) + .contiguous() + ) + # ==================== Changes to apply ipex.llm layers ==================== + ( + attn_output, + attn_weights, + past_key_value_decoder, + ) = self._IPEXIndirectAccessKVCache( + query, + key, + value, + 1 / self.scaling, + past_key_value, + layer_head_mask, + attention_mask, + ) + # ========================================================================== + if self.is_decoder: + past_key_value = past_key_value_decoder + + if not output_attentions: + attn_weights_reshaped = None + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + + attn_output = attn_output.transpose(1, 2) + + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + # move to OPTDecoderLayer to enable linear+add fusion + # attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class OPTDecoderLayer(nn.Module): + def __init__(self, config: OPTConfig): + 
super().__init__() + self.embed_dim = config.hidden_size + self.eps = config.layer_norm_elementwise_affine + self.self_attn = OPTAttention(config=config, is_decoder=True) + self.do_layer_norm_before = config.do_layer_norm_before + self.self_attn_layer_norm = nn.LayerNorm( + self.embed_dim, elementwise_affine=self.eps + ) + self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias) + self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias) + self.final_layer_norm = nn.LayerNorm( + self.embed_dim, elementwise_affine=self.eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + residual = hidden_states + + if self.do_layer_norm_before: + # ==================== orignal path ==================== + hidden_states = self.self_attn_layer_norm(hidden_states) + # ==================== Changes to apply ipex.llm layers ==================== + # option 1 : replace module + # if not hasattr(self, "ipex_layernorm_1"): + # self.ipex_layernorm_1 = ipex.llm.modules.FastLayerNorm( + # self.embed_dim, + # eps=self.eps, + # weight=self.self_attn_layer_norm.weight, + # bias=self.self_attn_layer_norm.bias, + # ) + # del self.self_attn_layer_norm + # hidden_states = self.ipex_layernorm_1(hidden_states) + # + # option 2 : use function call + # hidden_states = ipex.llm.functional.fast_layer_norm(hidden_states, [self.embed_dim], self.self_attn_layer_norm.weight, self.self_attn_layer_norm.bias, self.eps) + # ========================================================================== + + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + # ==================== orignal path ==================== + # hidden_states = self.self_attn.out_proj(hidden_states) +residual + # ==================== Changes to apply ipex.llm layers ==================== + if not hasattr(self, "ipex_fusion_0"): + self.ipex_fusion_0 = ipex.llm.modules.LinearAdd(self.self_attn.out_proj) + hidden_states = self.ipex_fusion_0(hidden_states, residual) + # ========================================================================== + + if not self.do_layer_norm_before: + # ==================== orignal path ==================== + hidden_states = self.self_attn_layer_norm(hidden_states) + # ==================== Changes to apply ipex.llm layers ==================== + # option 1 : replace module + # if not hasattr(self, "ipex_layernorm_1"): + # self.ipex_layernorm_1 = ipex.llm.modules.FastLayerNorm( + # self.embed_dim, + # eps=self.eps, + # weight=self.self_attn_layer_norm.weight, + # bias=self.self_attn_layer_norm.bias, + # ) + # del self.self_attn_layer_norm + # hidden_states = self.ipex_layernorm_1(hidden_states) + # + # option 2 : use function call + # hidden_states = ipex.llm.functional.fast_layer_norm(hidden_states, [self.embed_dim], self.self_attn_layer_norm.weight, self.self_attn_layer_norm.bias, self.eps) + # ========================================================================== + + hidden_states_shape = hidden_states.shape + residual = hidden_states + + if self.do_layer_norm_before: + # 
==================== orignal path ==================== + hidden_states = self.final_layer_norm(hidden_states) + # ==================== Changes to apply ipex.llm layers ==================== + # option 1 : replace module + # if not hasattr(self, "ipex_layernorm_2"): + # self.ipex_layernorm_2 = ipex.llm.modules.FastLayerNorm( + # self.embed_dim, + # eps=self.eps, + # weight=self.final_layer_norm.weight, + # bias=self.final_layer_norm.bias, + # ) + # del self.final_layer_norm + # hidden_states = self.ipex_layernorm_2(hidden_states) + # + # option 2 : use function call + # hidden_states = ipex.llm.functional.fast_layer_norm(hidden_states, [self.embed_dim], self.final_layer_norm.weight, self.final_layer_norm.bias, self.eps) + # ========================================================================== + + # ==================== orignal path ==================== + # hidden_states = torch.nn.functional.relu(self.fc1(hidden_states)) + # ==================== Changes to apply ipex.llm layers ==================== + if not hasattr(self, "ipex_fusion_1"): + self.ipex_fusion_1 = ipex.llm.modules.LinearRelu(self.fc1) + hidden_states = self.ipex_fusion_1(hidden_states) + # ========================================================================== + + # ==================== orignal path ==================== + # hidden_states = self.fc2(hidden_states) + residual + # ==================== Changes to apply ipex.llm layers ==================== + if not hasattr(self, "ipex_fusion_2"): + self.ipex_fusion_2 = ipex.llm.modules.LinearAdd(self.fc2) + hidden_states = self.ipex_fusion_2(hidden_states, residual) + # ========================================================================== + + hidden_states = hidden_states.view(hidden_states_shape) + + if not self.do_layer_norm_before: + # ==================== orignal path ==================== + hidden_states = self.final_layer_norm(hidden_states) + # ==================== Changes to apply ipex.llm layers ==================== + # option 1 : replace module + # if not hasattr(self, "ipex_layernorm_2"): + # self.ipex_layernorm_2 = ipex.llm.modules.FastLayerNorm( + # self.embed_dim, + # eps=self.eps, + # weight=self.final_layer_norm.weight, + # bias=self.final_layer_norm.bias, + # ) + # del self.final_layer_norm + # hidden_states = self.ipex_layernorm_2(hidden_states) + # + # option 2 : use function call + # hidden_states = ipex.llm.functional.fast_layer_norm(hidden_states, [self.embed_dim], self.final_layer_norm.weight, self.final_layer_norm.bias, self.eps) + # ========================================================================== + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class OPTLearnedPositionalEmbedding(nn.Embedding): + def __init__(self, num_embeddings: int, embedding_dim: int): + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward( + self, attention_mask: torch.LongTensor, past_key_values_length: int = 0 + ): + attention_mask = attention_mask.long() + positions = ( + torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask + ).long() - 1 + positions = positions[:, past_key_values_length:] + return super().forward(positions + self.offset) + + +class OPTDecoder(OPTPreTrainedModel): + def __init__(self, config: OPTConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.layerdrop + self.padding_idx = config.pad_token_id + 
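+        # Token embeddings use word_embed_proj_dim while positional embeddings use
+        # hidden_size; project_in/project_out below bridge the two when they differ.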
self.max_target_positions = config.max_position_embeddings + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.word_embed_proj_dim, self.padding_idx + ) + self.embed_positions = OPTLearnedPositionalEmbedding( + config.max_position_embeddings, config.hidden_size + ) + + if config.word_embed_proj_dim != config.hidden_size: + self.project_out = nn.Linear( + config.hidden_size, config.word_embed_proj_dim, bias=False + ) + else: + self.project_out = None + + if config.word_embed_proj_dim != config.hidden_size: + self.project_in = nn.Linear( + config.word_embed_proj_dim, config.hidden_size, bias=False + ) + else: + self.project_in = None + + if config.do_layer_norm_before and not config._remove_final_layer_norm: + self.final_layer_norm = nn.LayerNorm( + config.hidden_size, + elementwise_affine=config.layer_norm_elementwise_affine, + ) + else: + self.final_layer_norm = None + + self.layers = nn.ModuleList( + [OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)] + ) + + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + batch_size, seq_length = input_shape + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + mask_seq_length = past_key_values_length + seq_length + + if attention_mask is None: + attention_mask = torch.ones( + batch_size, mask_seq_length, device=inputs_embeds.device + ) + elif attention_mask.shape[1] != mask_seq_length: + raise ValueError( + f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be " + f"{mask_seq_length} (sum of the lengths of current and past inputs)" + ) + causal_attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + pos_embeds = self.embed_positions(attention_mask, past_key_values_length) + + if self.project_in is not None: + inputs_embeds = self.project_in(inputs_embeds) + + hidden_states = inputs_embeds + 
pos_embeds + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = ( + past_key_values[idx] if past_key_values is not None else None + ) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if self.final_layer_norm is not None: + hidden_states = self.final_layer_norm(hidden_states) + + if self.project_out is not None: + hidden_states = self.project_out(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class OPTModel(OPTPreTrainedModel): + def __init__(self, config: OPTConfig): + super().__init__(config) + self.decoder = OPTDecoder(config) + self.post_init() + + def get_input_embeddings(self): + return self.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.decoder.embed_tokens = value + + def get_decoder(self): + return self.decoder + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + decoder_outputs = self.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + + return BaseModelOutputWithPast( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + ) + + +class IPEXOPTForCausalLM(OPTPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = OPTModel(config) + + self.lm_head = nn.Linear( + 
config.word_embed_proj_dim, config.vocab_size, bias=False + ) + + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]).contiguous() + + loss = None + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCache ==================== + def _reorder_cache( + self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + ) -> Tuple[Tuple[torch.Tensor]]: + if ( + len(past_key_values[0]) == 4 and past_key_values[0][0].shape[-1] == 1 + ): # discrete kv_cache + for layer_past in past_key_values: + layer_past[3][layer_past[0].size(-2) - 1] = beam_idx + return past_key_values + else: + return tuple( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ) + for layer_past in past_key_values + ) diff --git a/examples/cpu/inference/python/llm-modeling/run.py 
b/examples/cpu/inference/python/llm-modeling/run.py new file mode 100644 index 000000000..c28003287 --- /dev/null +++ b/examples/cpu/inference/python/llm-modeling/run.py @@ -0,0 +1,175 @@ +import torch +import time +import json +import pathlib +import argparse +from transformers import ( + AutoTokenizer, + LlamaTokenizer, + AutoModelForCausalLM, +) +import transformers +import intel_extension_for_pytorch as ipex +from modeling_llama import IPEXLlamaForCausalLM + +transformers.models.llama.modeling_llama.LlamaForCausalLM = IPEXLlamaForCausalLM +from modeling_gptj import IPEXGPTJForCausalLM + +transformers.models.gptj.modeling_gptj.GPTJForCausalLM = IPEXGPTJForCausalLM +from modeling_opt import IPEXOPTForCausalLM + +transformers.models.opt.modeling_opt.OPTForCausalLM = IPEXOPTForCausalLM + +MODEL_CLASSES = { + "gpt-j": (AutoModelForCausalLM, AutoTokenizer), + "llama": (AutoModelForCausalLM, LlamaTokenizer), + "opt": (AutoModelForCausalLM, AutoTokenizer), +} + +parser = argparse.ArgumentParser("Generation script (fp32/bf16 path)", add_help=False) +parser.add_argument( + "-m", + "--model-id", + type=str, + default="EleutherAI/gpt-j-6B", + help="the huggingface mdoel id", +) +parser.add_argument( + "--dtype", + type=str, + choices=["float32", "bfloat16"], + default="bfloat16", + help="bfloat16, float32", +) +parser.add_argument( + "--input-tokens", + default="32", + type=str, + help="input tokens length if needed from prompt.json", +) +parser.add_argument( + "--max-new-tokens", default=32, type=int, help="output max new tokens" +) +parser.add_argument( + "--prompt", default=None, type=str, help="input prompt for self-defined if needed" +) +parser.add_argument("--greedy", action="store_true") +parser.add_argument("--profile", action="store_true") +parser.add_argument("--use-ipex-optimize", action="store_true") +parser.add_argument("--num-iter", default=100, type=int, help="num iter") +parser.add_argument("--num-warmup", default=10, type=int, help="num warmup") +parser.add_argument("--batch-size", default=1, type=int, help="batch size") +args = parser.parse_args() +print(args) + +model_type = next( + (x for x in MODEL_CLASSES.keys() if x in args.model_id.lower()), "auto" +) +model_class = MODEL_CLASSES[model_type] + +amp_enabled = True if args.dtype != "float32" else False +amp_dtype = getattr(torch, args.dtype) + +model = model_class[0].from_pretrained( + args.model_id, + torch_dtype=amp_dtype, + low_cpu_mem_usage=True, + attn_implementation="eager", +) +tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True) + +num_beams = 1 if args.greedy else 4 +generate_kwargs = dict( + do_sample=False, + temperature=0.9, + num_beams=num_beams, + max_new_tokens=args.max_new_tokens, + min_new_tokens=args.max_new_tokens, +) + +model = model.eval() + +if args.use_ipex_optimize: + from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( + _enable_tpp, + _disable_tpp, + ) + + _disable_tpp() + if args.dtype == "bfloat16": + _enable_tpp() + model = ipex.optimize(model.eval(), dtype=torch.bfloat16, inplace=True) + else: + model = ipex.optimize( + model.eval(), + dtype=torch.float32, + inplace=True, + auto_kernel_selection=True, + ) + + +def trace_handler(prof): + print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) + + +# input prompt +current_path = pathlib.Path(__file__).parent.resolve() +with open(str(current_path) + "/prompt.json") as f: + prompt_pool = json.load(f) +if args.prompt is not None: + prompt = args.prompt +elif model_type == "auto": + 
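+    # The model id did not match any supported family (llama / gpt-j / opt), so
+    # prompt.json has no canned prompt for it; a custom --prompt must be given.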
raise SystemExit( + "[ERROR] model prompt is not supported, please use --prompt for this model: " + + args.model_id + ) +elif int(args.input_tokens) > 8192: + prompt = prompt_pool[model_type]["8192"] * int(int(args.input_tokens) / 8192) +elif args.input_tokens in prompt_pool[model_type]: + prompt = prompt_pool[model_type][args.input_tokens] +else: + raise SystemExit("[ERROR] Plese use --prompt if want to use custom input.") + +input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1) +print("---- Prompt size:", input_size) + +# start +total_time = 0.0 +num_iter = args.num_iter +num_warmup = args.num_warmup +prompt = [prompt] * args.batch_size +total_list = [] +with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=amp_enabled +): + if args.profile: + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), + on_trace_ready=trace_handler, + ) as prof: + for i in range(5): + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + output = model.generate(input_ids, **generate_kwargs) + prof.step() + + for i in range(num_iter): + tic = time.time() + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + output = model.generate(input_ids, **generate_kwargs) + gen_ids = output + gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True) + toc = time.time() + input_tokens_lengths = [x.shape[0] for x in input_ids] + output_tokens_lengths = [x.shape[0] for x in gen_ids] + total_new_tokens = [ + o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths) + ] + print(gen_text, total_new_tokens, flush=True) + print("Iteration: %d, Time: %.6f sec" % (i, toc - tic), flush=True) + if i >= num_warmup: + total_time += toc - tic + +print("\n", "-" * 10, "Summary:", "-" * 10) +latency = total_time / (num_iter - num_warmup) +print("Inference latency: %.3f sec." % latency) diff --git a/intel_extension_for_pytorch/llm/__init__.py b/intel_extension_for_pytorch/llm/__init__.py index 793a26dfc..725a1257e 100644 --- a/intel_extension_for_pytorch/llm/__init__.py +++ b/intel_extension_for_pytorch/llm/__init__.py @@ -1,6 +1,7 @@ import warnings from .frontend import optimize from . import modules +from . import functional try: from . import generation diff --git a/intel_extension_for_pytorch/llm/functional/__init__.py b/intel_extension_for_pytorch/llm/functional/__init__.py new file mode 100644 index 000000000..429679f49 --- /dev/null +++ b/intel_extension_for_pytorch/llm/functional/__init__.py @@ -0,0 +1,7 @@ +from .fusions import ( + rotary_embedding, + rms_norm, + fast_layer_norm, + indirect_access_kv_cache, + varlen_attention, +) diff --git a/intel_extension_for_pytorch/llm/functional/fusions.py b/intel_extension_for_pytorch/llm/functional/fusions.py new file mode 100644 index 000000000..4109cfc08 --- /dev/null +++ b/intel_extension_for_pytorch/llm/functional/fusions.py @@ -0,0 +1,211 @@ +from typing import Optional, Tuple +import torch +from intel_extension_for_pytorch.llm.modules import ( + RotaryEmbedding, + RMSNorm, + FastLayerNorm, + IndirectAccessKVCache, + VarlenAttention, +) + + +def rotary_embedding( + query: torch.Tensor, + key: torch.Tensor, + sin: torch.Tensor, + cos: torch.Tensor, + rotary_dim: int, + rotary_half: bool, + position_ids: torch.Tensor = None, +): + r""" + Applies RotaryEmbedding (see https://huggingface.co/papers/2104.09864) + on the `query ` or `key` before their multi-head attention computation. 
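+    This functional form is a thin wrapper that simply forwards its arguments to
+    ipex.llm.modules.RotaryEmbedding.apply_function.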
+ Args: + - query, key (torch.Tensor) : inputs to be applied with position embeddings, taking shape of + [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). + - sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor generated to be applied on query/key. + - rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. + - head_dim (int) : head dim from the input shape. + - rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, + so the offset is 1. + if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, + so the offset is rotary_dim/2. + - position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. the according position_ids + for the input. The shape should be [batch size, sequence length]. + Return + - query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim]. + + """ + return RotaryEmbedding.apply_function( + query, key, sin, cos, rotary_dim, rotary_half, position_ids + ) + + +def rms_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float): + r""" + Applies RMSnorm on the input (hidden states). + (see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L76) + Args: + - hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. + - weight (torch.Tensor): the weight to apply RMSnorm. + - eps (float) : the variance_epsilon to apply RMSnorm. + + """ + return RMSNorm.apply_function(hidden_states, weight, eps) + + +def fast_layer_norm( + hidden_states: torch.Tensor, + normalized_shape: Tuple[int, ...], + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, +): + r""" + Applies PyTorch Layernorm (see https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) + on the input (hidden states). + Args: + - hidden_states(torch.Tensor) : the input tensor to apply normalization. + - normalized_shape (int or list) or torch.Size) input shape from an expected input of size. + - weight (torch.Tensor): the weight to apply normalization. + - bias (torch.Tensor): an additive bias for normalization. + - eps (float): a value added to the denominator for numerical stability. + + """ + + return FastLayerNorm.apply_function( + hidden_states, normalized_shape, weight, bias, eps + ) + + +def indirect_access_kv_cache( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale_attn: float, + layer_past: Optional[Tuple[torch.Tensor]] = None, + head_mask: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[Tuple[torch.Tensor]] = None, + alibi: Optional[torch.Tensor] = None, + add_casual_mask: Optional[bool] = True, + seq_info: Optional[torch.Tensor] = None, + text_max_length: Optional[int] = 0, +): + r""" + kv_cache is used to reduce computation for **Decoder** layer but it also brings memory overheads, + for example, when using beam search, the kv_cache should be reordered according to the latest beam + idx and the current key/value should also be concat with kv_cache in the attention layer to get entire + context to do scale dot product. When the sequence is very long, the memory overhead will be the + performance bottleneck. 
This module provides an Indirect Access KV_cache(IAKV), Firstly, IAKV pre-allocates + buffers(key and value use different buffers) to store all key/value hidden states and beam index information. + It can use beam index history to decide which beam should be used by a timestamp and this information will + generate an offset to access the kv_cache buffer. + Data Format: + - The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], + the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. + All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. + + forward + - query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). + - key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). + - value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). + - scale_attn (float):scale used by the attention layer. should be the sqrt(head_size). + - layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). + key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + beam-idx: history beam idx, shape:(max_seq, beam*batch); + seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). + - head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. + - attention_mask(torch.Tensor): Attention mask information. + - text_max_length (int) : the max length of kv cache to be used for generation (allocate the pre-cache buffer). + + Return: + - attn_output: weighted value which is the output of scale dot product. shape (beam*batch, seq_len, head_num, head_size). + - attn_weights: The output tensor of the first matmul in scale dot product which is not supported by kernel now. + - new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). + + Notes: + - How to reorder KV cache when using the format of IndirectAccessKVCache (e.g., on llama model + see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + def _reorder_cache( + self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + ) -> Tuple[Tuple[torch.Tensor]]: + if ( + len(past_key_values[0]) == 4 and past_key_values[0][0].shape[-1] == 1 + ): + for layer_past in past_key_values: + layer_past[3][layer_past[0].size(-2) - 1] = beam_idx + return past_key_values + + """ + return IndirectAccessKVCache.apply_function( + query, + key, + value, + scale_attn, + layer_past, + head_mask, + attention_mask, + alibi, + add_casual_mask, + seq_info, + text_max_length, + ) + + +def varlen_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + out: torch.Tensor, + seqlen_q: torch.Tensor, + seqlen_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + pdropout: float, + softmax_scale: float, + zero_tensors: bool, + is_causal: bool, + return_softmax: bool, + gen_: torch.Generator, +): + r""" + Applies PyTorch scaled_dot_product_attention on the inputs of query, key and value + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), + and accept the variant (different) sequence length among the query, key and value. 
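[Editor's note] The IndirectAccessKVCache data format documented above can be hard to picture. The sketch below shows how an "empty" layer_past tuple is seeded elsewhere in this patch series (compare get_dummy_input added to examples/cpu/inference/python/llm-modeling/run.py in a later commit); the num_layers value is illustrative, and the tuple order follows the documented (seq_info, key_cache, value_cache, beam_idx) layout.

import torch

num_layers = 2  # illustrative; use the model's actual layer count
past_key_values = tuple(
    (
        torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),  # seq_info placeholder
        torch.zeros(1, 1, 1, 1).contiguous(),                    # key cache placeholder
        torch.zeros(1, 1, 1, 1).contiguous(),                    # value cache placeholder
        torch.zeros(1, 4, dtype=torch.long),                     # beam index placeholder
    )
    for _ in range(num_layers)
)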
+ + Args: + module init: this module does not have args for module init + forward: + - query (torch.Tensor): shape [query_tokens, num_head, head_size], where tokens is total sequence length among batch size. + - key (torch.Tensor): shape [key_tokens, num_head, head_size], where tokens is total sequence length among batch size. + - value (torch.Tensor): shape [value_tokens, num_head, head_size], where tokens is total sequence length among batch size. + - out (torch.Tensor): buffer to get the results, the shape is the same as query. + - seqlen_q (torch.Tensor): shape [batch_size + 1], points the current query_tokens among total sequence length. + - seqlen_k (torch.Tensor): shape [batch_size + 1], points the current key_tokens among total sequence length. + - max_seqlen_q (int): max/total sequence length of query. + - max_seqlen_k (int): max/total sequence length of key. + - pdropout (float): dropout probability; if greater than 0.0, dropout is applied, default is 0.0. + - softmax_scale (float): scaling factor applied is prior to softmax. + - is_causal (bool): whether to apply causal attention masking, default is True. + + """ + return VarlenAttention.apply_function( + query, + key, + value, + out, + seqlen_q, + seqlen_k, + max_seqlen_q, + max_seqlen_k, + pdropout, + softmax_scale, + zero_tensors, + is_causal, + return_softmax, + gen_, + ) diff --git a/intel_extension_for_pytorch/llm/modules/mha_fusion.py b/intel_extension_for_pytorch/llm/modules/mha_fusion.py index b39512bc8..fa15fc66d 100644 --- a/intel_extension_for_pytorch/llm/modules/mha_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/mha_fusion.py @@ -40,7 +40,7 @@ class RotaryEmbedding(nn.Module): >>> position_ids = torch.arange(32).unsqueeze(0) >>> query_rotery = rope_module(query, position_ids, 16, 256, 1, 64) - [Direct function call] This module also provides a `.apply` function call to be used on query and key + [Direct function call] This module also provides a `.apply_function` function call to be used on query and key at the same time without initializing the module (assume rotary embedding sin/cos values are provided). Args: @@ -120,7 +120,7 @@ def forward( ) @classmethod - def apply( + def apply_function( cls, query: torch.Tensor, key: torch.Tensor, @@ -168,7 +168,7 @@ class FastLayerNorm(nn.Module): >>> input = torch.randn(1, 32, 4096) >>> result = layernorm_module(input) - [Direct function call] This module also provides a `.apply` function call to apply fast layernorm + [Direct function call] This module also provides a `.apply_function` function call to apply fast layernorm without initializing the module. Args: - hidden_states(torch.Tensor) : the input tensor to apply normalization. @@ -195,10 +195,10 @@ def __init__( self.bias = bias @classmethod - def apply(cls, hidden_states, normalized_shape, weight, bias, eps): + def apply_function(cls, hidden_states, normalized_shape, weight, bias, eps): return cls.runtime_ops.get_module_from_device( hidden_states.device.type, IPEXCustomOpType.FAST_LAYERNORM, False - ).apply(hidden_states, normalized_shape, weight, bias, eps) + ).apply_function(hidden_states, normalized_shape, weight, bias, eps) def forward(self, hidden_states: torch.Tensor): runtime_module = self.runtime_ops.get_module_from_device( @@ -235,7 +235,8 @@ class RMSNorm(nn.Module): >>> input = torch.randn(1, 32, 4096) >>> result = rmsnorm_module(input) - [Direct function call] This module also provides a `.apply` function call to apply RMSNorm without initializing the module. 
+ [Direct function call] This module also provides a `.apply_function` function call to apply RMSNorm without + initializing the module. Args: - hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. - weight (torch.Tensor): the weight to apply RMSnorm. @@ -255,10 +256,10 @@ def __init__( ) @classmethod - def apply(cls, hidden_states, weight, eps): + def apply_function(cls, hidden_states, weight, eps): return cls.runtime_ops.get_module_from_device( hidden_states.device.type, IPEXCustomOpType.RMS_NORM, False - ).apply(hidden_states, weight, eps) + ).apply_function(hidden_states, weight, eps) def forward(self, x: torch.Tensor): runtime_module = self.runtime_ops.get_module_from_device( @@ -304,7 +305,7 @@ class VarlenAttention(nn.Module): >>> softmax_scale = 0.5 >>> varlenAttention_module(query, key, value, out, seqlen_q, seqlen_k, max_seqlen_q, max_seqlen_k, pdropout, softmax_scale) - [Direct function call] This module also provides a `.apply` function call to apply VarlenAttention without + [Direct function call] This module also provides a `.apply_function` function call to apply VarlenAttention without initializing the module. Args: - The parameters are the same as the forward call. @@ -317,7 +318,7 @@ def __init__(self): super().__init__() @classmethod - def apply( + def apply_function( cls, query: torch.Tensor, key: torch.Tensor, @@ -336,7 +337,7 @@ def apply( ): return cls.runtime_ops.get_module_from_device( query.device.type, IPEXCustomOpType.VARLEN_ATTENTION, False - ).apply( + ).apply_function( query, key, value, @@ -551,7 +552,7 @@ def _reorder_cache( layer_past[3][layer_past[0].size(-2) - 1] = beam_idx return past_key_values - [Direct function call] This module also provides a `.apply` function call to apply IndirectAccessKVCache + [Direct function call] This module also provides a `.apply_function` function call to apply IndirectAccessKVCache without initializing the module. Args: - The parameters are the same as the forward call. 
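[Editor's note] A short sketch of the renamed entry points this hunk series introduces, using RMSNorm as the example. The module path, the classmethod (now `apply_function` rather than `apply`), and the new ipex.llm.functional wrapper are expected to produce the same result; the eps value is an illustrative assumption, and the calls require an IPEX build that provides these CPU ops.

import torch
import intel_extension_for_pytorch as ipex

hidden = torch.randn(1, 32, 4096)
weight = torch.ones(4096)
eps = 1e-6  # illustrative

# Module path: construct once, then call like a regular nn.Module.
rmsnorm_module = ipex.llm.modules.RMSNorm(4096)
out_module = rmsnorm_module(hidden)

# Direct paths after the rename: the classmethod is `apply_function`,
# and a thin wrapper is exposed as ipex.llm.functional.rms_norm.
out_classmethod = ipex.llm.modules.RMSNorm.apply_function(hidden, weight, eps)
out_functional = ipex.llm.functional.rms_norm(hidden, weight, eps)

The same apply-to-apply_function rename applies to RotaryEmbedding, FastLayerNorm, VarlenAttention and IndirectAccessKVCache, as the surrounding hunks show.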
@@ -565,7 +566,7 @@ def __init__(self, text_max_length=2048): self.text_max_length = text_max_length @classmethod - def apply( + def apply_function( cls, query: torch.Tensor, key: torch.Tensor, @@ -581,7 +582,7 @@ def apply( ): return cls.runtime_ops.get_module_from_device( query.device.type, IPEXCustomOpType.INDIRECTACCESS_KVCACHE, False - ).apply( + ).apply_function( query, key, value, diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py index 308af317b..ede6ba186 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py @@ -142,7 +142,7 @@ def __init__(self, text_max_length): self.text_max_length = text_max_length @classmethod - def apply( + def apply_function( cls, query: torch.Tensor, key: torch.Tensor, @@ -278,7 +278,7 @@ def forward( cutoff: Optional[torch.Tensor] = None, vision: Optional[torch.Tensor] = False, ): - return self.apply( + return self.apply_function( query, key, value, @@ -312,7 +312,7 @@ def forward(self, hidden_states): ) @classmethod - def apply(cls, hidden_states, weight, eps): + def apply_function(cls, hidden_states, weight, eps): return torch.ops.torch_ipex.rmsnorm(hidden_states, weight=weight, eps=eps) @@ -327,7 +327,7 @@ def forward(self, hidden_states): return self.module(hidden_states) @classmethod - def apply(cls, hidden_states, normalized_shape, weight, bias, eps): + def apply_function(cls, hidden_states, normalized_shape, weight, bias, eps): return torch.nn.functional.layer_norm( hidden_states, normalized_shape, weight=weight, bias=bias, eps=eps ) @@ -375,7 +375,7 @@ def __init__(self): super().__init__() @classmethod - def apply( + def apply_functions( cls, query, # [total_q, num_head, head_size] key, # [total_k, num_head_k, head_size] @@ -471,7 +471,7 @@ def forward( return_softmax, gen_, ): - self.apply( + self.apply_function( query, key, value, diff --git a/tests/cpu/test_ipex_llm_module.py b/tests/cpu/test_ipex_llm_module.py index 00697d6e4..96fe6f1fc 100644 --- a/tests/cpu/test_ipex_llm_module.py +++ b/tests/cpu/test_ipex_llm_module.py @@ -263,7 +263,7 @@ def test_rmsnorm(self): target_m = ipex.llm.modules.RMSNorm(4096).to(dtype) ref_out = ref_m(x1.to(dtype)) out = target_m(x2.to(dtype)) - out_2 = ipex.llm.modules.RMSNorm.apply( + out_2 = ipex.llm.functional.rms_norm( x2.to(dtype), ref_m.weight, ref_m.variance_epsilon ) self.assertEqual(out, ref_out) @@ -272,16 +272,22 @@ def test_rmsnorm(self): def test_modules_naming(self): # below ipex.llm modeules has thier own UTs, here only test their access of naming from ipex.llm.modules assert ipex.llm.modules.RotaryEmbedding is not None - assert ipex.llm.modules.RotaryEmbedding.apply is not None + assert ipex.llm.modules.RotaryEmbedding.apply_function is not None assert ipex.llm.modules.PagedAttention is not None assert ipex.llm.modules.IndirectAccessKVCache is not None - assert ipex.llm.modules.IndirectAccessKVCache.apply is not None + assert ipex.llm.modules.IndirectAccessKVCache.apply_function is not None assert ipex.llm.modules.VarlenAttention is not None - assert ipex.llm.modules.VarlenAttention.apply is not None + assert ipex.llm.modules.VarlenAttention.apply_function is not None assert ipex.llm.modules.FastLayerNorm is not None - assert ipex.llm.modules.FastLayerNorm.apply is not None + assert ipex.llm.modules.FastLayerNorm.apply_function is not None assert ipex.llm.modules.RMSNorm is 
not None - assert ipex.llm.modules.RMSNorm.apply is not None + assert ipex.llm.modules.RMSNorm.apply_function is not None + # below only test their access of naming from ipex.llm functional + assert ipex.llm.functional.rotary_embedding is not None + assert ipex.llm.functional.rms_norm is not None + assert ipex.llm.functional.fast_layer_norm is not None + assert ipex.llm.functional.indirect_access_kv_cache is not None + assert ipex.llm.functional.varlen_attention is not None def test_rotary_embedding_tgi(self): test_tensor_size = [ @@ -299,7 +305,7 @@ def test_rotary_embedding_tgi(self): ref_q = apply(q, cos, sin) ref_k = apply(k, cos, sin) - ipex_q, ipex_k = ipex.llm.modules.RotaryEmbedding.apply( + ipex_q, ipex_k = ipex.llm.functional.rotary_embedding( q, k, sin, cos, rotary_dim, True ) From e252bd2662e51489c46b29471778f8271ad11869 Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 2 Apr 2024 10:59:23 +0800 Subject: [PATCH 006/199] LLM script: optimize prepare_llava.sh (#2717) --- .../python/llm/distributed/run_accuracy_with_deepspeed.py | 6 ++++++ .../inference/python/llm/single_instance/run_accuracy.py | 6 ++++++ examples/cpu/inference/python/llm/tools/prepare_llava.sh | 8 ++++++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index cab50385a..5c4f38b78 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -42,6 +42,12 @@ from lmms_eval.api.registry import ALL_TASKS from lmms_eval.tasks import initialize_tasks except ImportError: + def register_model(name): + def decorator(func): + return func + return decorator + from abc import ABC as lmms + Instance = None pass MODEL_CLASSES = { diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index d8d7233d8..21b85c10c 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -99,6 +99,12 @@ from lmms_eval.api.registry import ALL_TASKS from lmms_eval.tasks import initialize_tasks except ImportError: + def register_model(name): + def decorator(func): + return func + return decorator + from abc import ABC as lmms + Instance = None pass TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] diff --git a/examples/cpu/inference/python/llm/tools/prepare_llava.sh b/examples/cpu/inference/python/llm/tools/prepare_llava.sh index 3b00b9abd..5c4c8133d 100644 --- a/examples/cpu/inference/python/llm/tools/prepare_llava.sh +++ b/examples/cpu/inference/python/llm/tools/prepare_llava.sh @@ -1,5 +1,9 @@ #!/usr/bin/env bash set -e +llava_patch=`pwd`/llava.patch +if ! [ -f $llava_patch ]; then + llava_patch=`pwd`/tools/llava.patch +fi pip uninstall llava -y rm -rf LLaVA git clone https://github.com/haotian-liu/LLaVA.git @@ -7,8 +11,8 @@ cd LLaVA pip install einops pillow sentencepiece protobuf --no-deps git checkout intel -git apply ../llava.patch +git apply ${llava_patch} pip install -e . 
--no-deps -pip install tenacity hf_transfer lmms-eval --no-deps +pip install tenacity hf_transfer lmms-eval evaluate sqlitedict pycocoevalcap pycocotools --no-deps conda install -y openjdk=8 \ No newline at end of file From 41964856c80889f479cd1d1f16cf73f7cc3e978d Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Tue, 2 Apr 2024 12:36:14 +0800 Subject: [PATCH 007/199] enable more perf for llm modeling (#2719) (#2722) --- .../python/llm-modeling/modeling_gptj.py | 35 +++-- .../python/llm-modeling/modeling_llama.py | 34 ++++- .../python/llm-modeling/modeling_opt.py | 82 +++++++++--- .../cpu/inference/python/llm-modeling/run.py | 126 +++++++++++++++++- .../llm/generation/__init__.py | 2 +- .../llm/generation/hf_function.py | 4 + 6 files changed, 246 insertions(+), 37 deletions(-) diff --git a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py index 7445d0817..d36ce6c91 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py @@ -226,10 +226,11 @@ def forward( ) # ========================================================================== - if use_cache: - outputs = (hidden_states,) + outputs - else: - outputs = (hidden_states,) + outputs[1:] + # use cache always to be true for generation + # if use_cache: + outputs = (hidden_states,) + outputs + # else: + # outputs = (hidden_states,) + outputs[1:] return outputs @@ -354,8 +355,9 @@ def forward( ) hidden_states = outputs[0] - if use_cache is True: - presents = presents + (outputs[1],) + # use cache always to be true for generation + # if use_cache is True: + presents = presents + (outputs[1],) if output_attentions: all_self_attentions = all_self_attentions + ( @@ -466,10 +468,10 @@ def prepare_inputs_for_generation( def forward( self, input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, attention_mask: Optional[torch.FloatTensor] = None, - token_type_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, position_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, head_mask: Optional[torch.FloatTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, @@ -496,10 +498,25 @@ def forward( return_dict=return_dict, ) hidden_states = transformer_outputs[0] - + # ==================== for generation, lm head only needs last token as input ==================== + if ( + hasattr(self, "config") + and hasattr(self.config, "lm_head_generation") + and self.config.lm_head_generation + and hidden_states.size(1) != 1 + ): + hidden_states = hidden_states[:, -1:, :] lm_logits = self.lm_head(hidden_states).to(torch.float32) loss = None + if ( + hasattr(self, "config") + and hasattr(self.config, "use_ipex_optimize") + and self.config.use_ipex_optimize + ): + # return dict is handled by ipex._set_optimized_model_for_generation + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output if not return_dict: output = (lm_logits,) + transformer_outputs[1:] diff --git a/examples/cpu/inference/python/llm-modeling/modeling_llama.py b/examples/cpu/inference/python/llm-modeling/modeling_llama.py index 697de66ea..67fccfc11 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_llama.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_llama.py @@ -216,8 +216,9 @@ def 
forward( if output_attentions: outputs += (self_attn_weights,) - if use_cache: - outputs += (present_key_value,) + # if use_cache: + # use cache always to be true for generation + outputs += (present_key_value,) return outputs @@ -346,8 +347,9 @@ def forward( hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + # if use_cache: + # use cache always to be true for generation + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) if output_attentions: all_self_attns += (layer_outputs[1],) @@ -357,7 +359,9 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - next_cache = next_decoder_cache if use_cache else None + next_cache = next_decoder_cache # if use_cache else None + # use cache always to be true for generation + if not return_dict: return tuple( v @@ -440,16 +444,32 @@ def forward( ) hidden_states = outputs[0] + # ==================== for generation, lm head only needs last token as input ==================== + if ( + hasattr(self, "config") + and hasattr(self.config, "lm_head_generation") + and self.config.lm_head_generation + and hidden_states.size(1) != 1 + ): + hidden_states = hidden_states[:, -1:, :] logits = self.lm_head(hidden_states) logits = logits.float() loss = None - if not return_dict: + if ( + hasattr(self, "config") + and hasattr(self.config, "use_ipex_optimize") + and self.config.use_ipex_optimize + ): + # return dict is handled by ipex._set_optimized_model_for_generation output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output return CausalLMOutputWithPast( loss=loss, logits=logits, @@ -458,7 +478,7 @@ def forward( attentions=outputs.attentions, ) - # ==================== rewrite to prepare_inputs_for_generation to work with ipex.llm.modules.IndirectAccessKVCache ==================== + # ======== rewrite to prepare_inputs_for_generation to work with ipex.llm.modules.IndirectAccessKVCache ========= def prepare_inputs_for_generation( self, input_ids, diff --git a/examples/cpu/inference/python/llm-modeling/modeling_opt.py b/examples/cpu/inference/python/llm-modeling/modeling_opt.py index e648d7ee4..f521a9505 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_opt.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_opt.py @@ -155,16 +155,15 @@ class OPTDecoderLayer(nn.Module): def __init__(self, config: OPTConfig): super().__init__() self.embed_dim = config.hidden_size - self.eps = config.layer_norm_elementwise_affine self.self_attn = OPTAttention(config=config, is_decoder=True) self.do_layer_norm_before = config.do_layer_norm_before self.self_attn_layer_norm = nn.LayerNorm( - self.embed_dim, elementwise_affine=self.eps + self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine ) self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=config.enable_bias) self.fc2 = nn.Linear(config.ffn_dim, self.embed_dim, bias=config.enable_bias) self.final_layer_norm = nn.LayerNorm( - self.embed_dim, elementwise_affine=self.eps + self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine ) def forward( @@ -182,7 +181,7 @@ def forward( if self.do_layer_norm_before: # ==================== orignal path ==================== - hidden_states = self.self_attn_layer_norm(hidden_states) + # hidden_states = self.self_attn_layer_norm(hidden_states) # ==================== Changes to apply 
ipex.llm layers ==================== # option 1 : replace module # if not hasattr(self, "ipex_layernorm_1"): @@ -196,7 +195,13 @@ def forward( # hidden_states = self.ipex_layernorm_1(hidden_states) # # option 2 : use function call - # hidden_states = ipex.llm.functional.fast_layer_norm(hidden_states, [self.embed_dim], self.self_attn_layer_norm.weight, self.self_attn_layer_norm.bias, self.eps) + hidden_states = ipex.llm.functional.fast_layer_norm( + hidden_states, + [self.embed_dim], + self.self_attn_layer_norm.weight, + self.self_attn_layer_norm.bias, + 1e-05, + ) # ========================================================================== hidden_states, self_attn_weights, present_key_value = self.self_attn( @@ -211,12 +216,13 @@ def forward( # ==================== Changes to apply ipex.llm layers ==================== if not hasattr(self, "ipex_fusion_0"): self.ipex_fusion_0 = ipex.llm.modules.LinearAdd(self.self_attn.out_proj) + del self.__dict__["_modules"]["self_attn"].out_proj hidden_states = self.ipex_fusion_0(hidden_states, residual) # ========================================================================== if not self.do_layer_norm_before: # ==================== orignal path ==================== - hidden_states = self.self_attn_layer_norm(hidden_states) + # hidden_states = self.self_attn_layer_norm(hidden_states) # ==================== Changes to apply ipex.llm layers ==================== # option 1 : replace module # if not hasattr(self, "ipex_layernorm_1"): @@ -230,7 +236,13 @@ def forward( # hidden_states = self.ipex_layernorm_1(hidden_states) # # option 2 : use function call - # hidden_states = ipex.llm.functional.fast_layer_norm(hidden_states, [self.embed_dim], self.self_attn_layer_norm.weight, self.self_attn_layer_norm.bias, self.eps) + hidden_states = ipex.llm.functional.fast_layer_norm( + hidden_states, + [self.embed_dim], + self.self_attn_layer_norm.weight, + self.self_attn_layer_norm.bias, + 1e-05, + ) # ========================================================================== hidden_states_shape = hidden_states.shape @@ -238,7 +250,7 @@ def forward( if self.do_layer_norm_before: # ==================== orignal path ==================== - hidden_states = self.final_layer_norm(hidden_states) + # hidden_states = self.final_layer_norm(hidden_states) # ==================== Changes to apply ipex.llm layers ==================== # option 1 : replace module # if not hasattr(self, "ipex_layernorm_2"): @@ -252,7 +264,13 @@ def forward( # hidden_states = self.ipex_layernorm_2(hidden_states) # # option 2 : use function call - # hidden_states = ipex.llm.functional.fast_layer_norm(hidden_states, [self.embed_dim], self.final_layer_norm.weight, self.final_layer_norm.bias, self.eps) + hidden_states = ipex.llm.functional.fast_layer_norm( + hidden_states, + [self.embed_dim], + self.final_layer_norm.weight, + self.final_layer_norm.bias, + 1e-05, + ) # ========================================================================== # ==================== orignal path ==================== @@ -260,6 +278,7 @@ def forward( # ==================== Changes to apply ipex.llm layers ==================== if not hasattr(self, "ipex_fusion_1"): self.ipex_fusion_1 = ipex.llm.modules.LinearRelu(self.fc1) + del self.__dict__["_modules"]["fc1"] hidden_states = self.ipex_fusion_1(hidden_states) # ========================================================================== @@ -268,6 +287,7 @@ def forward( # ==================== Changes to apply ipex.llm layers ==================== if not hasattr(self, 
"ipex_fusion_2"): self.ipex_fusion_2 = ipex.llm.modules.LinearAdd(self.fc2) + del self.__dict__["_modules"]["fc2"] hidden_states = self.ipex_fusion_2(hidden_states, residual) # ========================================================================== @@ -275,7 +295,7 @@ def forward( if not self.do_layer_norm_before: # ==================== orignal path ==================== - hidden_states = self.final_layer_norm(hidden_states) + # hidden_states = self.final_layer_norm(hidden_states) # ==================== Changes to apply ipex.llm layers ==================== # option 1 : replace module # if not hasattr(self, "ipex_layernorm_2"): @@ -289,7 +309,13 @@ def forward( # hidden_states = self.ipex_layernorm_2(hidden_states) # # option 2 : use function call - # hidden_states = ipex.llm.functional.fast_layer_norm(hidden_states, [self.embed_dim], self.final_layer_norm.weight, self.final_layer_norm.bias, self.eps) + hidden_states = ipex.llm.functional.fast_layer_norm( + hidden_states, + [self.embed_dim], + self.final_layer_norm.weight, + self.final_layer_norm.bias, + 1e-05, + ) # ========================================================================== outputs = (hidden_states,) @@ -297,8 +323,9 @@ def forward( if output_attentions: outputs += (self_attn_weights,) - if use_cache: - outputs += (present_key_value,) + # if use_cache: + # use cache always to be true for generation + outputs += (present_key_value,) return outputs @@ -464,8 +491,9 @@ def forward( hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + # if use_cache: + # use cache always to be true for generation + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) if output_attentions: all_self_attns += (layer_outputs[1],) @@ -478,8 +506,8 @@ def forward( if output_hidden_states: all_hidden_states += (hidden_states,) - - next_cache = next_decoder_cache if use_cache else None + # use cache always to be true for generation + next_cache = next_decoder_cache # if use_cache else None if not return_dict: return tuple( v @@ -628,10 +656,28 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, ) + hidden_states = outputs[0] + # ==================== for generation, lm head only needs last token as input ==================== + if ( + hasattr(self, "config") + and hasattr(self.config, "lm_head_generation") + and self.config.lm_head_generation + and hidden_states.size(1) != 1 + ): + hidden_states = hidden_states[:, -1:, :] - logits = self.lm_head(outputs[0]).contiguous() + logits = self.lm_head(hidden_states).contiguous() loss = None + if ( + hasattr(self, "config") + and hasattr(self.config, "use_ipex_optimize") + and self.config.use_ipex_optimize + ): + # return dict is handled by ipex._set_optimized_model_for_generation + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output diff --git a/examples/cpu/inference/python/llm-modeling/run.py b/examples/cpu/inference/python/llm-modeling/run.py index c28003287..2090797eb 100644 --- a/examples/cpu/inference/python/llm-modeling/run.py +++ b/examples/cpu/inference/python/llm-modeling/run.py @@ -26,6 +26,69 @@ "opt": (AutoModelForCausalLM, AutoTokenizer), } + +def get_dummy_input(_model, return_dict=False): + sample_inputs = None + + if hasattr(_model.config, "n_layer"): + model_num_layers = _model.config.n_layer + elif hasattr(_model.config, 
"num_hidden_layers"): + model_num_layers = _model.config.num_hidden_layers + elif hasattr(_model.config, "num_layers"): + model_num_layers = _model.config.num_layers + elif hasattr(_model.config, "n_layers"): + model_num_layers = _model.config.n_layers + else: + AssertionError( + False, + "Cannot support the dummy sample_inputs for your model, please use your sample_inputs as the inputs and run again", + ) + past_key_values = tuple( + [ + ( + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + ) + for i in range(model_num_layers) + ] + ) + + input_ids = torch.ones(32).to(torch.long).unsqueeze(0) + attention_mask = torch.ones_like(input_ids) + model_inputs = _model.prepare_inputs_for_generation( + input_ids, attention_mask=attention_mask + ) + has_position_ids = model_inputs.get("position_ids", None) is not None + position_ids = torch.arange(input_ids.shape[-1]).unsqueeze(0) + if has_position_ids: + sample_inputs = ( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "position_ids": position_ids, + } + if return_dict + else (input_ids, attention_mask, past_key_values, position_ids) + ) + else: + sample_inputs = ( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + } + if return_dict + else (input_ids, attention_mask, past_key_values) + ) + + return sample_inputs + + parser = argparse.ArgumentParser("Generation script (fp32/bf16 path)", add_help=False) parser.add_argument( "-m", @@ -56,6 +119,7 @@ parser.add_argument("--greedy", action="store_true") parser.add_argument("--profile", action="store_true") parser.add_argument("--use-ipex-optimize", action="store_true") +parser.add_argument("--token-latency", action="store_true") parser.add_argument("--num-iter", default=100, type=int, help="num iter") parser.add_argument("--num-warmup", default=10, type=int, help="num warmup") parser.add_argument("--batch-size", default=1, type=int, help="batch size") @@ -75,6 +139,7 @@ torch_dtype=amp_dtype, low_cpu_mem_usage=True, attn_implementation="eager", + # torchscript=True if args.use_ipex_optimize else False, ) tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True) @@ -90,6 +155,9 @@ model = model.eval() if args.use_ipex_optimize: + if not hasattr(model.config, "use_ipex_optimize"): + model.config.use_ipex_optimize = True + # 1) using ipex weight prepack to work with IPEX linear module and their fusions from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( _enable_tpp, _disable_tpp, @@ -107,6 +175,44 @@ auto_kernel_selection=True, ) + # 2) using ipex geneartion function to get prompt sharing and first token optimizations + hf_beam_search = ipex.llm.generation.hf_beam_search.__get__(model, model.__class__) + hf_greedy_search = ipex.llm.generation.hf_greedy_search.__get__( + model, model.__class__ + ) + hf_sample = ipex.llm.generation.hf_sample.__get__(model, model.__class__) + hf_beam_sample = ipex.llm.generation.hf_beam_sample.__get__(model, model.__class__) + + setattr(model, "beam_search", hf_beam_search) # noqa: B010 + setattr(model, "greedy_search", hf_greedy_search) # noqa: B010 + setattr(model, "sample", hf_sample) # noqa: B010 + setattr(model, "beam_sample", hf_beam_sample) # noqa: B010 + + if not hasattr(model.config, "lm_head_generation"): + model.config.lm_head_generation = True + + # 3) using PyTorch jit to 
further reduce dispatch overhead + sample_inputs = get_dummy_input(model, return_dict=True) + with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): + trace_model = torch.jit.trace( + model, + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + trace_model = torch.jit.freeze(trace_model) + model = ipex._set_optimized_model_for_generation( + model, optimized_model=trace_model + ) + + +if ( + args.token_latency + and args.use_ipex_optimize + and not hasattr(model.config, "token_latency") +): + model.config.token_latency = True + def trace_handler(prof): print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) @@ -132,7 +238,6 @@ def trace_handler(prof): input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1) print("---- Prompt size:", input_size) - # start total_time = 0.0 num_iter = args.num_iter @@ -157,7 +262,7 @@ def trace_handler(prof): tic = time.time() input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) - gen_ids = output + gen_ids = output[0] if args.token_latency else output gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True) toc = time.time() input_tokens_lengths = [x.shape[0] for x in input_ids] @@ -169,7 +274,24 @@ def trace_handler(prof): print("Iteration: %d, Time: %.6f sec" % (i, toc - tic), flush=True) if i >= num_warmup: total_time += toc - tic + if args.token_latency: + total_list.append(output[1]) print("\n", "-" * 10, "Summary:", "-" * 10) latency = total_time / (num_iter - num_warmup) print("Inference latency: %.3f sec." % latency) + +if args.token_latency: + import numpy as np + from itertools import chain + + first_latency = np.mean([x[0] for x in total_list]) + average_2n = list(chain(*[x[1:] for x in total_list])) + average_2n.sort() + average_2n_latency = np.mean(average_2n) + p90_latency = average_2n[int(len(average_2n) * 0.9)] + p99_latency = average_2n[int(len(average_2n) * 0.99)] + print("First token average latency: %.3f sec." % first_latency) + print("Average 2... latency: %.3f sec." % average_2n_latency) + print("P90 2... latency: %.3f sec." % p90_latency) + print("P99 2... latency: %.3f sec." 
% p99_latency) diff --git a/intel_extension_for_pytorch/llm/generation/__init__.py b/intel_extension_for_pytorch/llm/generation/__init__.py index 25b227bff..e47bba746 100644 --- a/intel_extension_for_pytorch/llm/generation/__init__.py +++ b/intel_extension_for_pytorch/llm/generation/__init__.py @@ -1 +1 @@ -from .hf_function import hf_greedy_search, hf_beam_search +from .hf_function import hf_greedy_search, hf_beam_search, hf_beam_sample, hf_sample diff --git a/intel_extension_for_pytorch/llm/generation/hf_function.py b/intel_extension_for_pytorch/llm/generation/hf_function.py index 2f9cfa714..b3ff44783 100644 --- a/intel_extension_for_pytorch/llm/generation/hf_function.py +++ b/intel_extension_for_pytorch/llm/generation/hf_function.py @@ -5,7 +5,11 @@ from intel_extension_for_pytorch.transformers.generation.sample import ( _sample, ) +from intel_extension_for_pytorch.transformers.generation.beam_sample import ( + _beam_sample, +) hf_greedy_search = _greedy_search hf_beam_search = _beam_search hf_sample = _sample +hf_beam_sample = _beam_sample From 25548f0b0bc0089af225077bb8d68ae252d8e1a2 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Wed, 3 Apr 2024 17:45:28 +0800 Subject: [PATCH 008/199] update oneDNN to 1bd41a0e63 on main (#2727) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 5bd1a28af..d83040111 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 5bd1a28afb0a8a9c949564fe31800d3692586fd8 +Subproject commit d83040111fe211ac9496a6125f31ff3c66a9c194 From 6acdb760a0df9cb018e91d18cad090af9d47fc6b Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Sun, 7 Apr 2024 16:21:33 +0800 Subject: [PATCH 009/199] GQA concat support (#2734) * support concat linear for gqa * reduce overhead (#2726) * Extend ipex.llm.optimize dtype with half and bf32 (#2728) --- csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp | 4 +- .../models/reference/modules/attentions.py | 24 ++-------- .../transformers/optimize.py | 48 +++++++++++++++---- 3 files changed, 47 insertions(+), 29 deletions(-) diff --git a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp index 95c54e87f..32a316a94 100644 --- a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp @@ -249,7 +249,9 @@ void single_query_cached_kv_attention_kernel( attn_out_start, head_size, flag_access[thread_id][seq_id][head_id]); - flag_access[thread_id][seq_id][head_id] = 1; + if (flag_access[thread_id][seq_id][head_id] == 0) { + flag_access[thread_id][seq_id][head_id] = 1; + } } // for token_id } // for head_id } // for seq_id diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 548751776..072c4defa 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -1935,25 +1935,11 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): or isinstance(module.v_proj, WeightOnlyQuantizedLinear) ) ) and not (hasattr(self, "use_qk_layernorm") and self.use_qk_layernorm): - - def get_weight_shape(mod): - if hasattr(mod, "in_features") and hasattr(mod, "out_features"): - return [mod.in_features, mod.out_features] - elif hasattr(mod, "weight") and hasattr(mod.weight, "shape"): - return list(mod.weight.shape) - return None - - 
weight_shapes = [ - get_weight_shape(mod) - for mod in [module.q_proj, module.k_proj, module.v_proj] - ] - if weight_shapes[0] is not None and all( - weight_shapes[0] == shape for shape in weight_shapes[1:] - ): - self.concat_qkv = _IPEXConcatLinearRef( - [module.q_proj, module.k_proj, module.v_proj] - ) - del module.q_proj, module.k_proj, module.v_proj + # we support MHA, GQA, MQA for concat linear + self.concat_qkv = _IPEXConcatLinearRef( + [module.q_proj, module.k_proj, module.v_proj] + ) + del module.q_proj, module.k_proj, module.v_proj self._IPEXScaleDotProduct = _IPEXScaleDotProductRef(module, config) diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 6a9288004..7297f2ae1 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -903,7 +903,7 @@ def ipex_quantization_flow( print("ipex.llm.optimize is doing the weight only quantization") with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.half] else False, dtype=dtype ): convert_model = convert(prepared_model.eval(), inplace=True).eval() if is_woq and dtype is torch.bfloat16: @@ -930,8 +930,6 @@ def model_convert_lowering( _disable_tpp() if not is_quantization: - if dtype is torch.bfloat16: - _enable_tpp() if ipex._C.is_llga_fp32_bf16_enabled(): _disable_tpp() _model = ipex.optimize( @@ -942,13 +940,22 @@ def model_convert_lowering( ) else: if dtype is torch.float32: + # this call also support bf32 path _model = ipex.optimize( _model.eval(), dtype=dtype, inplace=True, auto_kernel_selection=True, ) - else: + elif dtype is torch.half: + _model = ipex.optimize( + _model.eval(), + dtype=dtype, + inplace=True, + auto_kernel_selection=True, + ) + elif dtype is torch.bfloat16: + _enable_tpp() _model = ipex.optimize(_model.eval(), dtype=dtype, inplace=True) if not is_quantization or woq: @@ -1019,7 +1026,8 @@ def model_convert_lowering( else sample_inputs ) with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.half] else False, + dtype=dtype, ): trace_model = torch.jit.trace( _model, @@ -1076,13 +1084,13 @@ def optimize( For the model that is not in the scope of supported model family above, will try to apply default ipex.optimize transparently to get benifits (not include quantizations, - only works for dtypes of torch.bfloat16 and torch.float). + only works for dtypes of torch.bfloat16 and torch.half and torch.float). Args: model (torch.nn.Module): User model to apply optimizations. optimizer (torch.optim.Optimizer): User optimizer to apply optimizations on, such as SGD. The default value is ``None``, meaning inference case. - dtype (torch.dtype): Now it works for ``torch.bfloat16`` and ``torch.float``. + dtype (torch.dtype): Now it works for ``torch.bfloat16``, ``torch.half`` and ``torch.float``. The default value is ``torch.float``. When working with quantization, it means the mixed dtype with quantization. inplace (bool): Whether to perform inplace optimization. Default value is ``False``. device (str): Specifying the device on which the optimization will be performed. 
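[Editor's note] A minimal sketch of the dtype extension referenced in this commit ("Extend ipex.llm.optimize dtype with half and bf32"). The model id is only an example taken from the scripts in this patch series, and ipex.set_fp32_math_mode is assumed to be the existing IPEX API for switching the fp32 math mode, so it is left commented out.

import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM

# Example model id; any generation model covered by ipex.llm.optimize works the same way.
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-j-6B", torch_dtype=torch.half
)
model = model.eval()

# float16 path, newly accepted by ipex.llm.optimize.
model = ipex.llm.optimize(model, dtype=torch.half, inplace=True)

# bf32 path: keep dtype=torch.float and flip the fp32 math mode beforehand, which
# makes the float32 branch above enable auto_kernel_selection.
# ipex.set_fp32_math_mode(ipex.FP32MathMode.BF32, device="cpu")  # assumed API spelling
# model = ipex.llm.optimize(model, dtype=torch.float, inplace=True)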
@@ -1171,7 +1179,26 @@ def optimize( + "fallback to origin model" ) return model - _model = ipex.optimize(model.eval(), dtype=dtype, inplace=inplace) + + if dtype is torch.float: + _model = ipex.optimize( + model.eval(), + dtype=dtype, + inplace=inplace, + auto_kernel_selection=True + if ipex.get_fp32_math_mode() == ipex.FP32MathMode.BF32 + else False, + ) + elif dtype is torch.bfloat16: + _model = ipex.optimize(model.eval(), dtype=dtype, inplace=inplace) + elif dtype is torch.half: + _model = ipex.optimize( + model.eval(), + dtype=dtype, + auto_kernel_selection=True, + inplace=inplace, + ) + return _model if not inplace: @@ -1245,7 +1272,10 @@ def optimize( else sample_inputs ) with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True + if dtype in [torch.bfloat16, torch.half] + else False, + dtype=dtype, ): trace_model = torch.jit.trace( _model, From 4335b0f7441efa418bb108e6594760853e127c71 Mon Sep 17 00:00:00 2001 From: blzheng Date: Mon, 8 Apr 2024 09:03:21 +0800 Subject: [PATCH 010/199] Fix scan issues (#2729) (#2730) --- csrc/cpu/aten/Linear.cpp | 6 +- csrc/cpu/aten/MergedEmbeddingBag.h | 4 +- csrc/cpu/aten/kernels/WoqLinearKrnl.cpp | 2 +- csrc/cpu/aten/kernels/WoqTppKrnl.cpp | 6 +- csrc/cpu/aten/utils/woq.h | 8 +- csrc/cpu/comm/shm_reduction.h | 4 +- csrc/cpu/tpp/jit_compile.cpp | 4 +- csrc/cpu/tpp/par_loop_generator.cpp | 240 ++++++++++++------ csrc/cpu/tpp/xsmm_functors.h | 3 +- csrc/cpu/utils/robin_hood.h | 2 +- .../python/llm-modeling/modeling_gptj.py | 4 +- .../python/llm-modeling/modeling_llama.py | 13 +- .../python/llm-modeling/modeling_opt.py | 4 +- .../llm/modules/linear_fusion.py | 2 +- .../quantization/_quantize_utils.py | 3 + .../transformers/models/reference/models.py | 25 +- .../models/reference/modules/attentions.py | 19 +- tests/cpu/hf_configs/qwen/modeling_qwen.py | 2 + 18 files changed, 212 insertions(+), 139 deletions(-) diff --git a/csrc/cpu/aten/Linear.cpp b/csrc/cpu/aten/Linear.cpp index e94ed466e..c29f4de53 100644 --- a/csrc/cpu/aten/Linear.cpp +++ b/csrc/cpu/aten/Linear.cpp @@ -392,10 +392,10 @@ at::Tensor woq_linear_pack_weight( int64_t weight_int4_size_bytes = weight_int4.numel(); int64_t pad_size_bytes = weight_int4_size_bytes - weight_size_bytes; std::memcpy(weight_int4.data_ptr(), weight.data_ptr(), weight_size_bytes); - std::memset( + std::fill_n( (uint8_t*)weight_int4.data_ptr() + weight_size_bytes, - 0, - pad_size_bytes); + pad_size_bytes, + 0); return woq_tpp_gemm_packB_stub( kCPU, weight_int4, weight_dtype, block_n, block_k, lowp_mode); } diff --git a/csrc/cpu/aten/MergedEmbeddingBag.h b/csrc/cpu/aten/MergedEmbeddingBag.h index 2330b86bc..1687ce21f 100644 --- a/csrc/cpu/aten/MergedEmbeddingBag.h +++ b/csrc/cpu/aten/MergedEmbeddingBag.h @@ -24,7 +24,7 @@ class EMBROW { length = len; arr.resize(length); data = &arr[0]; - memset(data, 0, len * sizeof(T)); + std::fill_n(data, len, T(0)); } }; @@ -36,7 +36,7 @@ class EMBROWFixLen { EMBROWFixLen(int32_t len) { data = &arr[0]; - memset(data, 0, emb_dim * sizeof(T)); + std::fill_n(data, emb_dim, T(0)); } }; diff --git a/csrc/cpu/aten/kernels/WoqLinearKrnl.cpp b/csrc/cpu/aten/kernels/WoqLinearKrnl.cpp index 18778ef22..a770f4456 100644 --- a/csrc/cpu/aten/kernels/WoqLinearKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqLinearKrnl.cpp @@ -1507,7 +1507,7 @@ void dot_update( template void zero_fill(T* C, int M, int N, int stride) { for (int m = 0; m < M; m++) { - memset(C + m * stride, 0, sizeof(T) * N); + std::fill_n(C + m * stride, N, T(0)); } } diff --git 
a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp index 110d5695b..30403b90b 100644 --- a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp @@ -2016,10 +2016,8 @@ void qlinear_woq_affine_impl( 64, num_threads * M * N * sizeof(TGemmOut)); y_private_valid = (bool*)std::aligned_alloc( 64, num_threads * (M / BLOCK_M) * Nc * sizeof(bool)); - memset( - y_private_valid, - 0, - sizeof(bool) * num_threads * (M / BLOCK_M) * Nc); + std::fill_n( + y_private_valid, num_threads * (M / BLOCK_M) * Nc, false); } auto y_private_ptr = GetVLAPtr(y_private, {M, Nc, Nb}); auto y_private_valid_ptr = diff --git a/csrc/cpu/aten/utils/woq.h b/csrc/cpu/aten/utils/woq.h index f81654e1c..82e0d75c5 100644 --- a/csrc/cpu/aten/utils/woq.h +++ b/csrc/cpu/aten/utils/woq.h @@ -74,12 +74,16 @@ class DotMicroKernel { (trans_a ? LIBXSMM_GEMM_FLAG_TRANS_A : LIBXSMM_GEMM_FLAG_NONE) | (trans_b ? LIBXSMM_GEMM_FLAG_TRANS_B : LIBXSMM_GEMM_FLAG_NONE); libxsmm_gemm_batch_reduce_config brconfig; - memset(&brconfig, 0, sizeof(libxsmm_gemm_batch_reduce_config)); + std::fill_n( + reinterpret_cast(&brconfig), + sizeof(libxsmm_gemm_batch_reduce_config), + 0); brconfig.br_type = LIBXSMM_GEMM_BATCH_REDUCE_NONE; kernel_func_ = libxsmm_dispatch_brgemm_v2( brshape, brflags, /*prefetch_flags=*/0, brconfig); - memset(&gemm_param_, 0, sizeof(libxsmm_gemm_param)); + std::fill_n( + reinterpret_cast(&gemm_param_), sizeof(libxsmm_gemm_param), 0); } void operator()(void* A, void* B, void* C) { diff --git a/csrc/cpu/comm/shm_reduction.h b/csrc/cpu/comm/shm_reduction.h index eb1717ff3..7bca980a0 100644 --- a/csrc/cpu/comm/shm_reduction.h +++ b/csrc/cpu/comm/shm_reduction.h @@ -149,8 +149,8 @@ class ShmReduction { shmCtx_.nblocks = MAX_SHM_BLOCK_COUNT; if (rank_ == 0) { torch_ipex::cpu::create_shm(&shmCtx_); - memset(shmCtx_.state, 0, shmCtx_.nstates * sizeof(int)); - memset((void*)shmCtx_.blockState, 0, shmCtx_.nstates * shmCtx_.nblocks); + std::fill_n(shmCtx_.state, shmCtx_.nstates, 0); + std::fill_n(shmCtx_.blockState, shmCtx_.nstates * shmCtx_.nblocks, 0); } callback(shmCtx_.pid_fd, 2); diff --git a/csrc/cpu/tpp/jit_compile.cpp b/csrc/cpu/tpp/jit_compile.cpp index 29091604b..2341efd99 100644 --- a/csrc/cpu/tpp/jit_compile.cpp +++ b/csrc/cpu/tpp/jit_compile.cpp @@ -18,7 +18,7 @@ void* jit_compile_and_load( int fd = mkstemp(libname); unlink(libname); char fdname[50]; - sprintf(fdname, "/proc/self/fd/%d", fd); + snprintf(fdname, sizeof(fdname), "/proc/self/fd/%d", fd); auto cmd = std::string("g++ -shared -fPIC -x c++ ") + flags; cmd = cmd + " -o " + fdname + " " + filename; printf("JIT COMPILE: %s\n", cmd.c_str()); @@ -66,7 +66,7 @@ void* jit_from_str( int fd = mkstemp(filename); unlink(filename); char fdname[50]; - sprintf(fdname, "/proc/self/fd/%d", fd); + snprintf(fdname, sizeof(fdname), "/proc/self/fd/%d", fd); write(fd, src.c_str(), src.length()); return jit_from_file(fdname, flags, func_name); #else diff --git a/csrc/cpu/tpp/par_loop_generator.cpp b/csrc/cpu/tpp/par_loop_generator.cpp index 07f901375..52e4f0fcf 100644 --- a/csrc/cpu/tpp/par_loop_generator.cpp +++ b/csrc/cpu/tpp/par_loop_generator.cpp @@ -75,7 +75,11 @@ loop_param_t find_loop_param_at_pos(loop_param_t* i_loop_params, int pos) { } void add_buf_to_code(loop_code* i_code, char* buf) { - sprintf(i_code->buf + i_code->cur_pos, "%s", buf); + snprintf( + i_code->buf + i_code->cur_pos, + (MAX_CODE_SIZE - i_code->cur_pos) * sizeof(char), + "%s", + buf); i_code->cur_pos += strlen(buf); } @@ -104,9 +108,13 @@ void 
emit_parallel_for(loop_code* i_code, int collapse_level) { char tmp_buf[512]; align_line(i_code); if (collapse_level > 1) { - sprintf(tmp_buf, "#pragma omp for collapse(%d) nowait\n", collapse_level); + snprintf( + tmp_buf, + sizeof(tmp_buf), + "#pragma omp for collapse(%d) nowait\n", + collapse_level); } else { - sprintf(tmp_buf, "#pragma omp for nowait\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "#pragma omp for nowait\n"); } add_buf_to_code(i_code, tmp_buf); return; @@ -115,36 +123,38 @@ void emit_parallel_for(loop_code* i_code, int collapse_level) { void emit_loop_header(loop_code* i_code) { char tmp_buf[512]; align_line(i_code); - sprintf(tmp_buf, "#pragma omp parallel\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "#pragma omp parallel\n"); } void emit_parallel_region(loop_code* i_code) { char tmp_buf[512]; align_line(i_code); - sprintf(tmp_buf, "#pragma omp parallel\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "#pragma omp parallel\n"); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf(tmp_buf, "{\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "{\n"); add_buf_to_code(i_code, tmp_buf); increase_nest_level(i_code); if (i_code->use_2d_par > 0) { align_line(i_code); - sprintf(tmp_buf, "int tid = omp_get_thread_num();\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "int tid = omp_get_thread_num();\n"); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf(tmp_buf, "int row_teams = %d;\n", i_code->n_row_teams); + snprintf( + tmp_buf, sizeof(tmp_buf), "int row_teams = %d;\n", i_code->n_row_teams); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf(tmp_buf, "int col_teams = %d;\n", i_code->n_col_teams); + snprintf( + tmp_buf, sizeof(tmp_buf), "int col_teams = %d;\n", i_code->n_col_teams); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf(tmp_buf, "int row_id = tid/col_teams;\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "int row_id = tid/col_teams;\n"); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf(tmp_buf, "int col_id = tid%%col_teams;\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "int col_id = tid%%col_teams;\n"); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf(tmp_buf, "if (tid < row_teams * col_teams) {\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "if (tid < row_teams * col_teams) {\n"); add_buf_to_code(i_code, tmp_buf); increase_nest_level(i_code); } @@ -155,7 +165,7 @@ void close_parallel_region(loop_code* i_code) { char tmp_buf[512]; decrease_nest_level(i_code); align_line(i_code); - sprintf(tmp_buf, "}\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "}\n"); add_buf_to_code(i_code, tmp_buf); return; } @@ -168,40 +178,41 @@ void emit_loop_header(loop_code* i_code, loop_param_t* i_loop_param) { char str_step[512]; if (strcmp(i_loop_param->idx_name, "") == 0) { - sprintf(str_idx, "i%d", i_loop_param->idx_id); + snprintf(str_idx, sizeof(str_idx), "i%d", i_loop_param->idx_id); } else { - sprintf(str_idx, "%s", i_loop_param->idx_name); + snprintf(str_idx, sizeof(str_idx), "%s", i_loop_param->idx_name); } if (strcmp(i_loop_param->start_var_name, "") == 0) { - sprintf(str_start, "%ld", i_loop_param->start); + snprintf(str_start, sizeof(str_start), "%ld", i_loop_param->start); } else { - sprintf(str_start, "%s", i_loop_param->start_var_name); + snprintf(str_start, sizeof(str_start), "%s", i_loop_param->start_var_name); } if (strcmp(i_loop_param->end_var_name, "") == 0) { - sprintf(str_end, "%ld", i_loop_param->end); + snprintf(str_end, sizeof(str_end), "%ld", i_loop_param->end); } else { - sprintf(str_end, "%s", i_loop_param->end_var_name); + 
snprintf(str_end, sizeof(str_end), "%s", i_loop_param->end_var_name); } if (strcmp(i_loop_param->step_var_name, "") == 0) { - sprintf(str_step, "%ld", i_loop_param->step); + snprintf(str_step, sizeof(str_step), "%ld", i_loop_param->step); } else { - sprintf(str_step, "%s", i_loop_param->step_var_name); + snprintf(str_step, sizeof(str_step), "%s", i_loop_param->step_var_name); } if ((i_loop_param->is_par_across_col_teams > 0) || (i_loop_param->is_par_across_row_teams > 0)) { char prefix[16]; if (i_loop_param->is_par_across_col_teams > 0) { - sprintf(prefix, "col"); + snprintf(prefix, sizeof(prefix), "col"); } else { - sprintf(prefix, "row"); + snprintf(prefix, sizeof(prefix), "row"); } align_line(i_code); - sprintf( + snprintf( tmp_buf, + sizeof(tmp_buf), "int %s_tasks = ((%s) - (%s) + ((%s) - 1))/(%s);\n", prefix, str_end, @@ -210,8 +221,9 @@ void emit_loop_header(loop_code* i_code, loop_param_t* i_loop_param) { str_step); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf( + snprintf( tmp_buf, + sizeof(tmp_buf), "int %s_tasks_chunksize = (%s_tasks + %s_teams - 1)/%s_teams;\n", prefix, prefix, @@ -219,8 +231,9 @@ void emit_loop_header(loop_code* i_code, loop_param_t* i_loop_param) { prefix); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf( + snprintf( tmp_buf, + sizeof(tmp_buf), "int my_%s_start = (%s_id * %s_tasks_chunksize < %s_tasks) ? %s + (%s_id * %s_tasks_chunksize) * %s : %s;\n", prefix, prefix, @@ -233,8 +246,9 @@ void emit_loop_header(loop_code* i_code, loop_param_t* i_loop_param) { str_end); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf( + snprintf( tmp_buf, + sizeof(tmp_buf), "int my_%s_end = ((%s_id+1) * %s_tasks_chunksize < %s_tasks) ? %s + ((%s_id+1) * %s_tasks_chunksize) * %s : %s;\n", prefix, prefix, @@ -247,8 +261,9 @@ void emit_loop_header(loop_code* i_code, loop_param_t* i_loop_param) { str_end); add_buf_to_code(i_code, tmp_buf); align_line(i_code); - sprintf( + snprintf( tmp_buf, + sizeof(tmp_buf), "for (int %s = my_%s_start; %s < my_%s_end; %s += %s) {\n", str_idx, prefix, @@ -260,8 +275,9 @@ void emit_loop_header(loop_code* i_code, loop_param_t* i_loop_param) { increase_nest_level(i_code); } else { align_line(i_code); - sprintf( + snprintf( tmp_buf, + sizeof(tmp_buf), "for (int %s = %s; %s < %s; %s += %s) {\n", str_idx, str_start, @@ -285,8 +301,9 @@ void emit_func_signature( char tmp_buf[512]; // int i; align_line(i_code); - sprintf( + snprintf( tmp_buf, + sizeof(tmp_buf), "#include \nextern \"C\" void par_nested_loops(loop_rt_spec_t *%s, std::function %s, std::function %s, std::function %s) {\n", spec_func_name, body_func_name, @@ -300,12 +317,12 @@ void emit_func_termination(loop_code* i_code) { char tmp_buf[512]; decrease_nest_level(i_code); align_line(i_code); - sprintf(tmp_buf, "}\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "}\n"); add_buf_to_code(i_code, tmp_buf); if (i_code->use_2d_par > 0) { decrease_nest_level(i_code); align_line(i_code); - sprintf(tmp_buf, "}\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "}\n"); add_buf_to_code(i_code, tmp_buf); } return; @@ -314,7 +331,7 @@ void emit_func_termination(loop_code* i_code) { void emit_void_function(loop_code* i_code, char* func_name) { char tmp_buf[512]; align_line(i_code); - sprintf(tmp_buf, "if (%s) %s();\n", func_name, func_name); + snprintf(tmp_buf, sizeof(tmp_buf), "if (%s) %s();\n", func_name, func_name); add_buf_to_code(i_code, tmp_buf); return; } @@ -323,18 +340,23 @@ void emit_loop_body(loop_code* i_code, char* body_func_name) { char tmp_buf[512]; int i; 
align_line(i_code); - sprintf(tmp_buf, "int idx[%d];\n", i_code->n_logical_loops); + snprintf(tmp_buf, sizeof(tmp_buf), "int idx[%d];\n", i_code->n_logical_loops); add_buf_to_code(i_code, tmp_buf); /* Here we set the idx array to be used by function called */ for (i = 0; i < i_code->n_logical_loops; i++) { char str_idx[64]; - sprintf(str_idx, "%c%d", 'a' + i, i_code->occurence_map['a' + i] - 1); + snprintf( + str_idx, + sizeof(tmp_buf), + "%c%d", + 'a' + i, + i_code->occurence_map['a' + i] - 1); align_line(i_code); - sprintf(tmp_buf, "idx[%d] = %s;\n", i, str_idx); + snprintf(tmp_buf, sizeof(tmp_buf), "idx[%d] = %s;\n", i, str_idx); add_buf_to_code(i_code, tmp_buf); } align_line(i_code); - sprintf(tmp_buf, "%s(idx);\n", body_func_name); + snprintf(tmp_buf, sizeof(tmp_buf), "%s(idx);\n", body_func_name); add_buf_to_code(i_code, tmp_buf); return; } @@ -343,7 +365,7 @@ void emit_loop_termination(loop_code* i_code) { char tmp_buf[512]; decrease_nest_level(i_code); align_line(i_code); - sprintf(tmp_buf, "}\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "}\n"); add_buf_to_code(i_code, tmp_buf); return; } @@ -351,7 +373,7 @@ void emit_loop_termination(loop_code* i_code) { void emit_barrier(loop_code* i_code) { char tmp_buf[512]; align_line(i_code); - sprintf(tmp_buf, "#pragma omp barrier\n"); + snprintf(tmp_buf, sizeof(tmp_buf), "#pragma omp barrier\n"); add_buf_to_code(i_code, tmp_buf); return; } @@ -364,10 +386,16 @@ void set_loop_param( const char* step_name, int pos) { io_param->pos_in_loopnest = pos; - sprintf(io_param->idx_name, "%s", idx_name); - sprintf(io_param->start_var_name, "%s", s_name); - sprintf(io_param->end_var_name, "%s", e_name); - sprintf(io_param->step_var_name, "%s", step_name); + snprintf(io_param->idx_name, sizeof(io_param->idx_name), "%s", idx_name); + snprintf( + io_param->start_var_name, sizeof(io_param->start_var_name), "%s", s_name); + snprintf( + io_param->end_var_name, sizeof(io_param->end_var_name), "%s", e_name); + snprintf( + io_param->step_var_name, + sizeof(io_param->step_var_name), + "%s", + step_name); return; } @@ -413,21 +441,21 @@ void parse_jit_info(char* jit_info_str, loop_param_t* loop_param) { if (i == 0) { /* Empty token */ if (token_id == 0) { - sprintf(token_start, ""); + snprintf(token_start, sizeof(token_start), ""); } else if (token_id == 1) { - sprintf(token_end, ""); + snprintf(token_end, sizeof(token_end), ""); } else if (token_id == 2) { - sprintf(token_step, ""); + snprintf(token_step, sizeof(token_step), ""); } token_id++; } else if (jit_info_str[i - 1] == ',') { /* Empty token */ if (token_id == 0) { - sprintf(token_start, ""); + snprintf(token_start, sizeof(token_start), ""); } else if (token_id == 1) { - sprintf(token_end, ""); + snprintf(token_end, sizeof(token_end), ""); } else if (token_id == 2) { - sprintf(token_step, ""); + snprintf(token_step, sizeof(token_step), ""); } token_id++; } else { @@ -435,11 +463,11 @@ void parse_jit_info(char* jit_info_str, loop_param_t* loop_param) { cur_token[j] = '\0'; j = 0; if (token_id == 0) { - sprintf(token_start, "%s", cur_token); + snprintf(token_start, sizeof(token_start), "%s", cur_token); } else if (token_id == 1) { - sprintf(token_end, "%s", cur_token); + snprintf(token_end, sizeof(token_end), "%s", cur_token); } else if (token_id == 2) { - sprintf(token_step, "%s", cur_token); + snprintf(token_step, sizeof(token_step), "%s", cur_token); } token_id++; } @@ -589,11 +617,12 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { char barrier_positions[256]; int jit_loop_spec = 0; 
int use_2d_par = 0; - char _loop_nest_desc_extended[strlen(__loop_nest_desc_extended)]; - char loop_nest_desc_extended[strlen(_loop_nest_desc_extended)]; + size_t src_len = strlen(__loop_nest_desc_extended); + char _loop_nest_desc_extended[src_len]; + char loop_nest_desc_extended[src_len]; /* Extract explicit 2D parallelization info */ - for (i = 0; i < strlen(__loop_nest_desc_extended); i++) { + for (i = 0; i < src_len; i++) { if (__loop_nest_desc_extended[i] == '{') { use_2d_par = 1; break; @@ -609,11 +638,12 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { loop_params, &l_code); } else { - strcpy(_loop_nest_desc_extended, __loop_nest_desc_extended); + strncpy(_loop_nest_desc_extended, __loop_nest_desc_extended, src_len); + _loop_nest_desc_extended[src_len] = '\0'; } /* Check if we have to jit the loop specs */ - for (i = 0; i < strlen(_loop_nest_desc_extended); i++) { + for (i = 0; i < src_len; i++) { if (_loop_nest_desc_extended[i] == '[') { jit_loop_spec = 1; break; @@ -621,18 +651,19 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { } l_code.jit_loop_spec = jit_loop_spec; - memset(loop_params_map, 0, 256 * sizeof(loop_param_t)); + std::fill_n(loop_params_map, 256, loop_param_t{}); if (jit_loop_spec > 0) { extract_jit_info( _loop_nest_desc_extended, loop_nest_desc_extended, loop_params_map); } else { - strcpy(loop_nest_desc_extended, _loop_nest_desc_extended); + strncpy(loop_nest_desc_extended, _loop_nest_desc_extended, src_len); + loop_nest_desc_extended[src_len] = '\0'; } /* Cleanup input descriptor to exclude barriers */ k = 0; - memset(barrier_positions, 0, 256); - for (i = 0; i < strlen(loop_nest_desc_extended); i++) { + std::fill_n(barrier_positions, 256, 0); + for (i = 0; i < src_len; i++) { if (loop_nest_desc_extended[i] == '|') { if (k - 1 >= 0) { barrier_positions[k - 1] = 1; @@ -662,13 +693,13 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { /* Count how many times each loop occurs (lower case and upper case are * equivalent for that matter) */ - memset(loop_map, 0, 256 * sizeof(char)); + std::fill_n(loop_map, 256, 0); for (i = 0; i < n_loops; i++) { loop_map[tolower(loop_nest_desc[i])]++; } /* Set up loop properties */ - memset(occurence_map, 0, 256 * sizeof(char)); + std::fill_n(occurence_map, 256, 0); for (i = 0; i < n_loops; i++) { int is_blocked = (loop_map[tolower(loop_nest_desc[i])] > 1) ? 1 : 0; int is_parallelizable = @@ -685,39 +716,67 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { is_blocked_outer = (occurence_id == 0) ? 
1 : 0; occurence_map[tolower(loop_nest_desc[i])]++; - sprintf(spec_array_name, "%s", spec_func_name); + snprintf(spec_array_name, sizeof(spec_array_name), "%s", spec_func_name); - sprintf(idx_name, "%c%d", tolower(loop_nest_desc[i]), occurence_id); + snprintf( + idx_name, + sizeof(idx_name), + "%c%d", + tolower(loop_nest_desc[i]), + occurence_id); if (occurence_id == 0) { if (loop_params_map[loop_abs_index].jit_start > 0) { - sprintf(start_var_name, "%d", loop_params_map[loop_abs_index].start); + snprintf( + start_var_name, + sizeof(start_var_name), + "%d", + loop_params_map[loop_abs_index].start); } else { - sprintf( - start_var_name, "%s[%d].start", spec_array_name, loop_abs_index); + snprintf( + start_var_name, + sizeof(start_var_name), + "%s[%d].start", + spec_array_name, + loop_abs_index); } } else { - sprintf( - start_var_name, "%c%d", tolower(loop_nest_desc[i]), occurence_id - 1); + snprintf( + start_var_name, + sizeof(start_var_name), + "%c%d", + tolower(loop_nest_desc[i]), + occurence_id - 1); } if (occurence_id == 0) { if (loop_params_map[loop_abs_index].jit_end > 0) { - sprintf(end_var_name, "%d", loop_params_map[loop_abs_index].end); + snprintf( + end_var_name, + sizeof(end_var_name), + "%d", + loop_params_map[loop_abs_index].end); } else { - sprintf(end_var_name, "%s[%d].end", spec_array_name, loop_abs_index); + snprintf( + end_var_name, + sizeof(end_var_name), + "%s[%d].end", + spec_array_name, + loop_abs_index); } } else { if (loop_params_map[loop_abs_index].jit_block_sizes > 0) { - sprintf( + snprintf( end_var_name, + sizeof(end_var_name), "%c%d + %d", tolower(loop_nest_desc[i]), occurence_id - 1, loop_params_map[loop_abs_index].block_size[occurence_id - 1]); } else { - sprintf( + snprintf( end_var_name, + sizeof(end_var_name), "%c%d + %s[%d].block_size[%d]", tolower(loop_nest_desc[i]), occurence_id - 1, @@ -730,20 +789,30 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { if (is_blocked) { if (occurence_id == loop_map[tolower(loop_nest_desc[i])] - 1) { if (loop_params_map[loop_abs_index].jit_step > 0) { - sprintf(step_var_name, "%d", loop_params_map[loop_abs_index].step); + snprintf( + step_var_name, + sizeof(step_var_name), + "%d", + loop_params_map[loop_abs_index].step); } else { - sprintf( - step_var_name, "%s[%d].step", spec_array_name, loop_abs_index); + snprintf( + step_var_name, + sizeof(step_var_name), + "%s[%d].step", + spec_array_name, + loop_abs_index); } } else { if (loop_params_map[loop_abs_index].jit_block_sizes > 0) { - sprintf( + snprintf( step_var_name, + sizeof(step_var_name), "%d", loop_params_map[loop_abs_index].block_size[occurence_id]); } else { - sprintf( + snprintf( step_var_name, + sizeof(step_var_name), "%s[%d].block_size[%d]", spec_array_name, loop_abs_index, @@ -752,9 +821,18 @@ std::string loop_generator(const char* __loop_nest_desc_extended) { } } else { if (loop_params_map[loop_abs_index].jit_step > 0) { - sprintf(step_var_name, "%d", loop_params_map[loop_abs_index].step); + snprintf( + step_var_name, + sizeof(step_var_name), + "%d", + loop_params_map[loop_abs_index].step); } else { - sprintf(step_var_name, "%s[%d].step", spec_array_name, loop_abs_index); + snprintf( + step_var_name, + sizeof(step_var_name), + "%s[%d].step", + spec_array_name, + loop_abs_index); } } diff --git a/csrc/cpu/tpp/xsmm_functors.h b/csrc/cpu/tpp/xsmm_functors.h index 69c96486e..dc851c202 100644 --- a/csrc/cpu/tpp/xsmm_functors.h +++ b/csrc/cpu/tpp/xsmm_functors.h @@ -1903,7 +1903,8 @@ class BrgemmTPP { uint64_t count, bool no_tile_cfg = false) { 
libxsmm_gemm_param gemm_param; - memset(&gemm_param, 0, sizeof(libxsmm_gemm_param)); + std::fill_n( + reinterpret_cast(&gemm_param), sizeof(libxsmm_gemm_param), 0); gemm_param.op.tertiary = &count; gemm_param.c.primary = (void*)C; gemm_param.a.primary = (void*)B; diff --git a/csrc/cpu/utils/robin_hood.h b/csrc/cpu/utils/robin_hood.h index 896ffa2a7..08fc09521 100644 --- a/csrc/cpu/utils/robin_hood.h +++ b/csrc/cpu/utils/robin_hood.h @@ -2499,7 +2499,7 @@ class Table mKeyVals = reinterpret_cast( detail::assertNotNull(std::malloc(numBytesTotal))); mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); - std::memset(mInfo, 0, numBytesTotal - numElementsWithBuffer * sizeof(Node)); + std::fill_n(mInfo, numBytesTotal - numElementsWithBuffer * sizeof(Node), 0); // set sentinel mInfo[numElementsWithBuffer] = 1; diff --git a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py index d36ce6c91..bf8def64c 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py @@ -516,11 +516,11 @@ def forward( ): # return dict is handled by ipex._set_optimized_model_for_generation output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output + return output if not return_dict: output = (lm_logits,) + transformer_outputs[1:] - return ((loss,) + output) if loss is not None else output + return output return CausalLMOutputWithPast( loss=loss, diff --git a/examples/cpu/inference/python/llm-modeling/modeling_llama.py b/examples/cpu/inference/python/llm-modeling/modeling_llama.py index 67fccfc11..27c9439a9 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_llama.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_llama.py @@ -465,11 +465,11 @@ def forward( ): # return dict is handled by ipex._set_optimized_model_for_generation output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output + return output if not return_dict: output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output + return output return CausalLMOutputWithPast( loss=loss, logits=logits, @@ -488,8 +488,7 @@ def prepare_inputs_for_generation( **kwargs, ): if past_key_values is not None: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None + past_length = past_key_values[0][0].shape[2] if ( attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1] @@ -497,12 +496,6 @@ def prepare_inputs_for_generation( input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] elif past_length < input_ids.shape[1]: input_ids = input_ids[:, past_length:] - if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: diff --git a/examples/cpu/inference/python/llm-modeling/modeling_opt.py b/examples/cpu/inference/python/llm-modeling/modeling_opt.py index f521a9505..7b3247a22 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_opt.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_opt.py @@ -676,11 +676,11 @@ def forward( ): # return dict is handled by ipex._set_optimized_model_for_generation output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output + 
return output if not return_dict: output = (logits,) + outputs[1:] - return (loss,) + output if loss is not None else output + return output return CausalLMOutputWithPast( loss=loss, diff --git a/intel_extension_for_pytorch/llm/modules/linear_fusion.py b/intel_extension_for_pytorch/llm/modules/linear_fusion.py index 5da28f737..380cf8de4 100644 --- a/intel_extension_for_pytorch/llm/modules/linear_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/linear_fusion.py @@ -42,7 +42,7 @@ def init_on_device(self, x, op_type): self.linear_1, self.linear_2, tpp=( - self.linear_1.use_tpp and self.linear_1.use_tpp + self.linear_1.use_tpp and self.linear_2.use_tpp if isinstance(self.linear_1, _IPEXLinear) and isinstance(self.linear_2, _IPEXLinear) else False diff --git a/intel_extension_for_pytorch/quantization/_quantize_utils.py b/intel_extension_for_pytorch/quantization/_quantize_utils.py index 7b6ecc7cb..f07f4617f 100644 --- a/intel_extension_for_pytorch/quantization/_quantize_utils.py +++ b/intel_extension_for_pytorch/quantization/_quantize_utils.py @@ -1035,6 +1035,7 @@ def get_qparams(scales, zps): if not is_sym_quant(dtype): zps = torch.cat([zps, zps_rem], dim=-1) if not is_4bit(dtype): + assert zps is not None zps -= 128 return scales, zps @@ -1071,12 +1072,14 @@ def get_qparams(scales, zps): if not is_sym_quant(dtype): zps_rem = zps[:, Kc - has_rem :] if dtype == WoqWeightDtype.INT8: + assert zps_rem is not None qt_rem = torch.clamp( torch.round(t_rem * inv_scales_rem) + zps_rem.unsqueeze(-1), min=qmin, max=qmax, ) elif dtype == WoqWeightDtype.INT4: + assert zps_rem is not None qt_rem = torch.clamp( torch.round(t_rem * inv_scales_rem) + zps_rem.unsqueeze(-1), min=qmin, diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index 3fac7718d..679bce1e9 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -2435,11 +2435,12 @@ def GitVisionEncoder_forward( encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None - bias = None - if causal_attention_mask is not None: - bias += causal_attention_mask + bias = causal_attention_mask if attention_mask is not None: - bias += attention_mask + if bias is not None: + bias += attention_mask + else: + bias = attention_mask hidden_states = inputs_embeds for idx, encoder_layer in enumerate(self.layers): if output_hidden_states: @@ -2880,8 +2881,7 @@ def prepare_inputs_for_generation_llama( **kwargs, ): if past_key_values is not None: - cache_length = past_length = past_key_values[0][0].shape[2] - max_cache_length = None + past_length = past_key_values[0][0].shape[2] # Keep only the unprocessed tokens: # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where @@ -2895,14 +2895,6 @@ def prepare_inputs_for_generation_llama( input_ids = input_ids[:, past_length:] # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. - # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
- if ( - max_cache_length is not None - and attention_mask is not None - and cache_length + input_ids.shape[1] > max_cache_length - ): - attention_mask = attention_mask[:, -max_cache_length:] - position_ids = kwargs.get("position_ids", None) if attention_mask is not None and position_ids is None: # create position_ids on the fly for batch generation @@ -2993,8 +2985,11 @@ def prepare_inputs_labels_for_multimodal_llavallama( dtype=attention_mask.dtype, device=attention_mask.device, ) + input_embeds = self.model.embed_tokens(input_ids) + if images is not None: + input_embeds = input_embeds.to(images[0].dtype) model_inputs = { - "inputs_embeds": self.model.embed_tokens(input_ids).to(images[0].dtype), + "inputs_embeds": input_embeds, "attention_mask": attention_mask, "past_key_values": past_key_values, } diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 072c4defa..e3b96d813 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -1543,16 +1543,15 @@ def _QWenAttention_forward( 3, ) - if use_cache: - (attn_output, attn_weights, present) = self._IPEXScaleDotProduct( - query, - key, - value, - math.sqrt(self.head_dim) if self.scale_attn_weights else 1, - layer_past, - None, - attention_mask, - ) + (attn_output, attn_weights, present) = self._IPEXScaleDotProduct( + query, + key, + value, + math.sqrt(self.head_dim) if self.scale_attn_weights else 1, + layer_past, + None, + attention_mask, + ) attn_output = attn_output.transpose(1, 2) attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) diff --git a/tests/cpu/hf_configs/qwen/modeling_qwen.py b/tests/cpu/hf_configs/qwen/modeling_qwen.py index 8bbbc3e27..103d907e0 100644 --- a/tests/cpu/hf_configs/qwen/modeling_qwen.py +++ b/tests/cpu/hf_configs/qwen/modeling_qwen.py @@ -1286,6 +1286,7 @@ def chat_stream( get_stop_words_ids(generation_config.chat_format, tokenizer) ) if stop_words_ids is not None: + assert hasattr(generation_config, "eos_token_id") stop_words_logits_processor = StopWordsLogitsProcessor( stop_words_ids=stop_words_ids, eos_token_id=generation_config.eos_token_id, @@ -1352,6 +1353,7 @@ def generate( stop_words_ids = getattr(generation_config, "stop_words_ids", None) if stop_words_ids is not None: + assert hasattr(generation_config, "eos_token_id") stop_words_logits_processor = StopWordsLogitsProcessor( stop_words_ids=stop_words_ids, eos_token_id=generation_config.eos_token_id, From 3406cd21c64965b139253783679edb4fbc363763 Mon Sep 17 00:00:00 2001 From: Zaili Wang <109502517+ZailiWang@users.noreply.github.com> Date: Mon, 8 Apr 2024 12:22:54 +0800 Subject: [PATCH 011/199] backport pre-r2.3 doc updates to main (#2736) --- README.md | 3 + docs/_static/htmls/tbl_deepspeed.html | 4 +- docs/_static/htmls/tbl_single.html | 27 + docs/design_doc/cpu/isa_dyndisp.md | 480 +----------------- docs/tutorials/api_doc.rst | 2 +- docs/tutorials/features.rst | 5 +- .../features/isa_dynamic_dispatch.md | 399 ++++++++++++++- docs/tutorials/known_issues.md | 12 +- docs/tutorials/llm.rst | 2 - .../python/bert_eager_mode_inference_bf16.py | 1 + .../bert_torchdynamo_mode_inference_bf16.py | 1 + .../bert_torchscript_mode_inference_bf16.py | 1 + examples/cpu/inference/python/llm/README.md | 4 +- .../inference/python/llm/tools/env_setup.sh | 2 +- .../resnet50_eager_mode_inference_bf16.py | 
1 + ...esnet50_torchdynamo_mode_inference_bf16.py | 1 + ...esnet50_torchscript_mode_inference_bf16.py | 1 + .../training/single_instance_training_bf16.py | 1 + 18 files changed, 432 insertions(+), 515 deletions(-) diff --git a/README.md b/README.md index 7cda301ba..93fa2e794 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,9 @@ In the current technological landscape, Generative AI (GenAI) workloads and mode |T5| google/flan-t5-xl | 🟩 | 🟩 | 🟨 | 🟩 | | |Mistral| mistralai/Mistral-7B-v0.1 | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 | |MPT| mosaicml/mpt-7b | 🟩 | 🟩 | 🟨 | 🟩 | 🟩 | +|Mixtral| mistralai/Mixtral-8x7B-v0.1 | 🟩 | 🟩 | | 🟩 | | +|Stablelm| stabilityai/stablelm-2-1_6b | 🟩 | 🟩 | | 🟨 | | +|Qwen| Qwen/Qwen-7B-Chat | 🟩 | 🟩 | | 🟩 | | - 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32). diff --git a/docs/_static/htmls/tbl_deepspeed.html b/docs/_static/htmls/tbl_deepspeed.html index 73eb68eb4..2b0dd5bc7 100644 --- a/docs/_static/htmls/tbl_deepspeed.html +++ b/docs/_static/htmls/tbl_deepspeed.html @@ -47,8 +47,8 @@

FALCON

tiiuae/falcon-40b

-

🟨

-

🟨

+

🟩

+

🟩

OPT

diff --git a/docs/_static/htmls/tbl_single.html b/docs/_static/htmls/tbl_single.html index 54f9f9a3e..8eaefecf8 100644 --- a/docs/_static/htmls/tbl_single.html +++ b/docs/_static/htmls/tbl_single.html @@ -191,6 +191,33 @@

🟩

🟩

+ +

Mixtral

+

mistralai/Mixtral-8x7B-v0.1

+

🟩

+

🟩

+

+

🟩

+

+ + +

Stablelm

+

stabilityai/stablelm-2-1_6b

+

🟩

+

🟩

+

+

🟨

+

+ + +

Qwen

+

Qwen/Qwen-7B-Chat

+

🟩

+

🟩

+

+

🟩

+

+
    diff --git a/docs/design_doc/cpu/isa_dyndisp.md b/docs/design_doc/cpu/isa_dyndisp.md index 20c7931a8..9dd9dc150 100644 --- a/docs/design_doc/cpu/isa_dyndisp.md +++ b/docs/design_doc/cpu/isa_dyndisp.md @@ -1,481 +1,3 @@ # Intelยฎ Extension for PyTorch\* CPU ISA Dynamic Dispatch Design Doc -This document explains the dynamic kernel dispatch mechanism for Intelยฎ Extension for PyTorch\* (IPEX) based on CPU ISA. It is an extension to the similar mechanism in PyTorch. - -## Overview - -IPEX dyndisp is forked from **PyTorch:** `ATen/native/DispatchStub.h` and `ATen/native/DispatchStub.cpp`. IPEX adds additional CPU ISA level support, such as `AVX512_VNNI`, `AVX512_BF16` and `AMX`. - -PyTorch & IPEX CPU ISA support statement: - - | | DEFAULT | AVX2 | AVX2_VNNI | AVX512 | AVX512_VNNI | AVX512_BF16 | AMX | AVX512_FP16 - | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | - | PyTorch | โœ” | โœ” | โœ˜ | โœ” | โœ˜ | โœ˜ | โœ˜ | โœ˜ | - | IPEX-1.11 | โœ˜ | โœ” | โœ˜ | โœ” | โœ˜ | โœ˜ | โœ˜ | โœ˜ | - | IPEX-1.12 | โœ˜ | โœ” | โœ˜ | โœ” | โœ” | โœ” | โœ” | โœ˜ | - | IPEX-1.13 | โœ˜ | โœ” | โœ” | โœ” | โœ” | โœ” | โœ” | โœ˜ | - | IPEX-2.1 | โœ˜ | โœ” | โœ” | โœ” | โœ” | โœ” | โœ” | โœ” | - | IPEX-2.2 | โœ˜ | โœ” | โœ” | โœ” | โœ” | โœ” | โœ” | โœ” | - -\* Current IPEX DEFAULT level implemented as same as AVX2 level. - -### CPU ISA build compiler requirement - | ISA Level | GCC requirement | - | ---- | ---- | - | AVX2 | Any | - | AVX512 | GCC 9.2+ | - | AVX512_VNNI | GCC 9.2+ | - | AVX512_BF16 | GCC 10.3+ | - | AVX2_VNNI | GCC 11.2+ | - | AMX | GCC 11.2+ | - | AVX512_FP16 | GCC 12.1+ | - -\* Check with `cmake/Modules/FindAVX.cmake` for detailed compiler checks. - -## Dynamic Dispatch Design - -Dynamic dispatch copies the kernel implementation source files to multiple folders for each ISA level. It then builds each file using its ISA specific parameters. Each generated object file will contain its function body (**Kernel Implementation**). - -Kernel Implementation uses an anonymous namespace so that different CPU versions won't conflict. - -**Kernel Stub** is a "virtual function" with polymorphic kernel implementations pertaining to ISA levels. - -At the runtime, **Dispatch Stub implementation** will check CPUIDs and OS status to determins which ISA level pointer best matches the function body. - -### Code Folder Struct ->#### **Kernel implementation:** `csrc/cpu/aten/kernels/xyzKrnl.cpp` ->#### **Kernel Stub:** `csrc/cpu/aten/xyz.cpp` and `csrc/cpu/aten/xyz.h` ->#### **Dispatch Stub implementation:** `csrc/cpu/dyndisp/DispatchStub.cpp` and `csrc/cpu/dyndisp/DispatchStub.h` - -### CodeGen Process -IPEX build system will generate code for each ISA level with specifiy complier parameters. The CodeGen script is located at `cmake/cpu/IsaCodegen.cmake`. - -The CodeGen will copy each cpp files from **Kernel implementation**, and then add ISA level as new file suffix. 
- -> **Sample:** -> -> ---- -> -> **Origin file:** -> -> `csrc/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp` -> -> **Generate files:** -> -> DEFAULT: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.DEFAULT.cpp -O3 -D__AVX__ -DCPU_CAPABILITY_AVX2 -mavx2 -mfma -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -DCPU_CAPABILITY=DEFAULT -DCPU_CAPABILITY_DEFAULT` -> -> AVX2: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX2.cpp -O3 -D__AVX__ -mavx2 -mfma -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -DCPU_CAPABILITY=AVX2 -DCPU_CAPABILITY_AVX2` -> -> AVX512: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512.cpp -O3 -D__AVX512F__ -mavx512f -mavx512bw -mavx512vl -mavx512dq -mfma -DCPU_CAPABILITY=AVX512 -DCPU_CAPABILITY_AVX512` -> -> AVX512_VNNI: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_VNNI.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mfma -DCPU_CAPABILITY=AVX512_VNNI -DCPU_CAPABILITY_AVX512_VNNI` -> -> AVX512_BF16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_BF16.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -DCPU_CAPABILITY=AVX512_BF16 -DCPU_CAPABILITY_AVX512_BF16` -> -> AMX: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AMX.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -DCPU_CAPABILITY=AMX -DCPU_CAPABILITY_AMX` -> -> AVX512_FP16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_FP16.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16 -DCPU_CAPABILITY_AMX -DCPU_CAPABILITY=AVX512_FP16 -DCPU_CAPABILITY_AVX512_FP16` ---- - ->**Note:** ->1. DEFAULT level kernels is not fully implemented in IPEX. In order to align to PyTorch, we build default use AVX2 parameters in stead of that. So, IPEX minimal required executing machine support AVX2. ->2. `-D__AVX__` and `-D__AVX512F__` is defined for depends library [sleef](https://sleef.org/) . ->3. `-DCPU_CAPABILITY_AVX512` and `-DCPU_CAPABILITY_AVX2` are must to be defined for **PyTorch:** `aten/src/ATen/cpu/vec`, it determins vec register width. ->4. `-DCPU_CAPABILITY=[ISA_NAME]` is must to be defined for **PyTorch:** `aten/src/ATen/cpu/vec`, it is used as inline namespace name. ->5. Higher ISA level is compatible to lower ISA levels, so it needs to contains level ISA feature definitions. Such as AVX512_BF16 need contains `-DCPU_CAPABILITY_AVX512` `-DCPU_CAPABILITY_AVX512_VNNI`. But AVX512 don't contains AVX2 definitions, due to there are different vec register width. - -## Add Custom Kernel - -If you want to add a new custom kernel, and the kernel uses CPU ISA instructions, refer to these tips: - -1. Add CPU ISA related kernel implementation to the folder: `csrc/cpu/aten/kernels/NewKernelKrnl.cpp` -2. Add kernel stub to the folder: `csrc/cpu/aten/NewKernel.cpp` -3. Include header file: `csrc/cpu/dyndisp/DispatchStub.h`, and reference to the comment in the header file. 
-```c++ -// Implements instruction set specific function dispatch. -// -// Kernels that may make use of specialized instruction sets (e.g. AVX2) are -// compiled multiple times with different compiler flags (e.g. -mavx2). A -// DispatchStub contains a table of function pointers for a kernel. At runtime, -// the fastest available kernel is chosen based on the features reported by -// cpuinfo. -// -// Example: -// -// In csrc/cpu/aten/MyKernel.h: -// using fn_type = void(*)(const Tensor& x); -// IPEX_DECLARE_DISPATCH(fn_type, stub); -// -// In csrc/cpu/aten/MyKernel.cpp -// IPEX_DEFINE_DISPATCH(stub); -// -// In csrc/cpu/aten/kernels/MyKernel.cpp: -// namespace { -// // use anonymous namespace so that different cpu versions won't conflict -// void kernel(const Tensor& x) { ... } -// } -// IPEX_REGISTER_DISPATCH(stub, &kernel); -// -// To call: -// stub(kCPU, tensor); -``` -4. Write the kernel follow the guide. It contains: declare function type, register stub, call stub, etc. - ->**Note:** -> ->1. Some kernels only call **oneDNN** or **iDeep** implementation, or other backend implementation, which is not needed to add kernel implementations. (Refer: `BatchNorm.cpp`) ->2. Vec related header file must be included in kernel implementation files, but can not be included in kernel stub. Kernel stub is common code for all ISA level, and can't pass ISA related compiler parameters. ->3. For more intrinsics, check the [Intelยฎ Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). - -### ISA intrinics specific kernel example: - -This is a FP32 convert to BF16 function example, and it is implemented for `AVX512_BF16`, `AVX512` and `DEFAULT` ISA levels. - -```c++ -//csrc/cpu/aten/CvtFp32ToBf16.h - -#pragma once - -#include - -namespace torch_ipex { -namespace cpu { - -void cvt_fp32_to_bf16(at::BFloat16* dst, const float* src, int len); - -namespace { - -void cvt_fp32_to_bf16_kernel_impl(at::BFloat16* dst, const float* src, int len); - -} - -using cvt_fp32_to_bf16_kernel_fn = void (*)(at::BFloat16*, const float*, int); -IPEX_DECLARE_DISPATCH(cvt_fp32_to_bf16_kernel_fn, cvt_fp32_to_bf16_kernel_stub); -} // namespace cpu -} // namespace torch_ipex - -``` -```c++ -//csrc/cpu/aten/CvtFp32ToBf16.cpp - -#include "CvtFp32ToBf16.h" - -namespace torch_ipex { -namespace cpu { - -IPEX_DEFINE_DISPATCH(cvt_fp32_to_bf16_kernel_stub); - -void cvt_fp32_to_bf16(at::BFloat16* dst, const float* src, int len) { - return cvt_fp32_to_bf16_kernel_stub(kCPU, dst, src, len); -} - -} // namespace cpu -} // namespace torch_ipex - -``` -Macro `CPU_CAPABILITY_AVX512` and `CPU_CAPABILITY_AVX512_BF16` are defined by compiler check, it is means that current compiler havs capability to generate defined ISA level code. - -Because of `AVX512_BF16` is higher level than `AVX512`, and it compatible to `AVX512`. `CPU_CAPABILITY_AVX512_BF16` can be contained in `CPU_CAPABILITY_AVX512` region. -```c++ -//csrc/cpu/aten/kernels/CvtFp32ToBf16Krnl.cpp - -#include -#include "csrc/aten/cpu/CvtFp32ToBf16.h" - -namespace torch_ipex { -namespace cpu { - -namespace { - -#if defined(CPU_CAPABILITY_AVX512) -#include -#else -#include -#endif -using namespace at::vec; - -#if defined(CPU_CAPABILITY_AVX512) -#include - -inline __m256i _cvt_fp32_to_bf16(const __m512 src) { -#if (defined CPU_CAPABILITY_AVX512_BF16) // AVX512_BF16 ISA implementation. - return reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(src)); -#else // AVX512 ISA implementation. 
- __m512i value = _mm512_castps_si512(src); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - // uint32_t lsb = (input >> 16) & 1; - auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); - // uint32_t rounding_bias = 0x7fff + lsb; - t_value = _mm512_add_epi32(t_value, vec_bias); - // input += rounding_bias; - t_value = _mm512_add_epi32(t_value, value); - // input = input >> 16; - t_value = _mm512_srli_epi32(t_value, 16); - // Check NaN before converting back to bf16 - t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); - return _mm512_cvtusepi32_epi16(t_value); -#endif -} - -void cvt_fp32_to_bf16_kernel_impl( - at::BFloat16* dst, - const float* src, - int len) { - int i = 0; - for (; i < len - 15; i += 16) { - auto f32 = _mm512_loadu_ps(src + i); - _mm256_storeu_si256((__m256i*)(dst + i), _cvt_fp32_to_bf16(f32)); - } - if (i < len) { - auto mask = (1 << (len - i)) - 1; - auto f32 = _mm512_maskz_loadu_ps(mask, src + i); - _mm256_mask_storeu_epi16(dst + i, mask, _cvt_fp32_to_bf16(f32)); - } -} - -#else // DEFAULT ISA implementation. - -void cvt_fp32_to_bf16_kernel_impl( - at::BFloat16* dst, - const float* src, - int len) { - for (int j = 0; j < len; j++) { - *(dst + j) = *(src + j); - } -} - -#endif - -} // anonymous namespace - -IPEX_REGISTER_DISPATCH(cvt_fp32_to_bf16_kernel_stub, &cvt_fp32_to_bf16_kernel_impl); - -} // namespace cpu -} // namespace torch_ipex - -``` - -### Vec specific kernel example: -This example shows how to get the data type size and its Vec size. In different ISA, Vec has a different register width and a different Vec size. - -```c++ -//csrc/cpu/aten/GetVecLength.h -#pragma once - -#include - -namespace torch_ipex { -namespace cpu { - -std::tuple get_cpp_typesize_and_vecsize(at::ScalarType dtype); - -namespace { - -std::tuple get_cpp_typesize_and_vecsize_kernel_impl( - at::ScalarType dtype); -} - -using get_cpp_typesize_and_vecsize_kernel_fn = - std::tuple (*)(at::ScalarType); -IPEX_DECLARE_DISPATCH( - get_cpp_typesize_and_vecsize_kernel_fn, - get_cpp_typesize_and_vecsize_kernel_stub); - -} // namespace cpu -} // namespace torch_ipex - -``` - -```c++ -//csrc/cpu/aten/GetVecLength.cpp - -#include "GetVecLength.h" - -namespace torch_ipex { -namespace cpu { - -IPEX_DEFINE_DISPATCH(get_cpp_typesize_and_vecsize_kernel_stub); - -// get cpp typesize and vectorsize by at::ScalarType -std::tuple get_cpp_typesize_and_vecsize(at::ScalarType dtype) { - return get_cpp_typesize_and_vecsize_kernel_stub(kCPU, dtype); -} - -} // namespace cpu -} // namespace torch_ipex - -``` - -```c++ -//csrc/cpu/aten/kernels/GetVecLengthKrnl.cpp - -#include -#include "csrc/cpu/aten/GetVecLength.h" - -namespace torch_ipex { -namespace cpu { - -namespace { - -std::tuple get_cpp_typesize_and_vecsize_kernel_impl( - at::ScalarType dtype) { - switch (dtype) { - case at::ScalarType::Double: - return std::make_tuple( - sizeof(double), at::vec::Vectorized::size()); - case at::ScalarType::Float: - return std::make_tuple(sizeof(float), at::vec::Vectorized::size()); - case at::ScalarType::ComplexDouble: - return std::make_tuple( - sizeof(c10::complex), - at::vec::Vectorized>::size()); - case at::ScalarType::ComplexFloat: - return std::make_tuple( - sizeof(c10::complex), - at::vec::Vectorized>::size()); - case at::ScalarType::BFloat16: - return std::make_tuple( - sizeof(decltype( - c10::impl::ScalarTypeToCPPType::t)), - 
at::vec::Vectorized::t)>::size()); - case at::ScalarType::Half: - return std::make_tuple( - sizeof(decltype( - c10::impl::ScalarTypeToCPPType::t)), - at::vec::Vectorized::t)>::size()); - default: - TORCH_CHECK( - false, - "Currently only floating and complex ScalarType are supported."); - } -} - -} // anonymous namespace - -IPEX_REGISTER_DISPATCH( - get_cpp_typesize_and_vecsize_kernel_stub, - &get_cpp_typesize_and_vecsize_kernel_impl); - -} // namespace cpu -} // namespace torch_ipex - -``` -## Private Debug APIs - -Here are three ISA-related private APIs that can help debugging:: -1. Query current ISA level. -2. Query max CPU supported ISA level. -3. Query max binary supported ISA level. ->**Note:** -> ->1. Max CPU supported ISA level only depends on CPU features. ->2. Max binary supported ISA level only depends on built complier version. ->3. Current ISA level, it is the smaller of `max CPU ISA level` and `max binary ISA level`. - -### Example: -```bash -python -Python 3.9.7 (default, Sep 16 2021, 13:09:58) -[GCC 7.5.0] :: Anaconda, Inc. on linux -Type "help", "copyright", "credits" or "license" for more information. ->>> import intel_extension_for_pytorch._C as core ->>> core._get_current_isa_level() -'AMX' ->>> core._get_highest_cpu_support_isa_level() -'AMX' ->>> core._get_highest_binary_support_isa_level() -'AMX' ->>> quit() -``` - -## Select ISA level manually. - -By default, IPEX dispatches to the kernels with the maximum ISA level supported by the underlying CPU hardware. This ISA level can be overridden by the environment variable `ATEN_CPU_CAPABILITY` (same environment variable as PyTorch). The available values are {`avx2`, `avx512`, `avx512_vnni`, `avx512_bf16`, `amx`, `avx512_fp16`}. The effective ISA level would be the minimal level between `ATEN_CPU_CAPABILITY` and the maximum level supported by the hardware. -### Example: -```bash -$ python -c 'import intel_extension_for_pytorch._C as core;print(core._get_current_isa_level())' -AMX -$ ATEN_CPU_CAPABILITY=avx2 python -c 'import intel_extension_for_pytorch._C as core;print(core._get_current_isa_level())' -AVX2 -``` ->**Note:** -> ->`core._get_current_isa_level()` is an IPEX internal function used for checking the current effective ISA level. It is used for debugging purpose only and subject to change. - -## CPU feature check - -An addtional CPU feature check tool in the subfolder: `tests/cpu/isa` - -```bash -$ cmake . 
--- The C compiler identification is GNU 11.2.1 --- The CXX compiler identification is GNU 11.2.1 --- Detecting C compiler ABI info --- Detecting C compiler ABI info - done --- Check for working C compiler: /opt/rh/gcc-toolset-11/root/usr/bin/cc - skipped --- Detecting C compile features --- Detecting C compile features - done --- Detecting CXX compiler ABI info --- Detecting CXX compiler ABI info - done --- Check for working CXX compiler: /opt/rh/gcc-toolset-11/root/usr/bin/c++ - skipped --- Detecting CXX compile features --- Detecting CXX compile features - done --- Configuring done --- Generating done --- Build files have been written to: tests/cpu/isa -$ make -[ 33%] Building CXX object CMakeFiles/cpu_features.dir/intel_extension_for_pytorch/csrc/cpu/isa/cpu_feature.cpp.o -[ 66%] Building CXX object CMakeFiles/cpu_features.dir/intel_extension_for_pytorch/csrc/cpu/isa/cpu_feature_main.cpp.o -[100%] Linking CXX executable cpu_features -[100%] Built target cpu_features -$ ./cpu_features -XCR0: 00000000000602e7 -os --> avx: true -os --> avx2: true -os --> avx512: true -os --> amx: true -mmx: true -sse: true -sse2: true -sse3: true -ssse3: true -sse4_1: true -sse4_2: true -aes_ni: true -sha: true -xsave: true -fma: true -f16c: true -avx: true -avx2: true -avx_vnni: true -avx512_f: true -avx512_cd: true -avx512_pf: false -avx512_er: false -avx512_vl: true -avx512_bw: true -avx512_dq: true -avx512_ifma: true -avx512_vbmi: true -avx512_vpopcntdq: true -avx512_4fmaps: false -avx512_4vnniw: false -avx512_vbmi2: true -avx512_vpclmul: true -avx512_vnni: true -avx512_bitalg: true -avx512_fp16: true -avx512_bf16: true -avx512_vp2intersect: true -amx_bf16: true -amx_tile: true -amx_int8: true -prefetchw: true -prefetchwt1: false -``` +The design document has been merged with [the ISA Dynamic Dispatch feature introduction](../../tutorials/features/isa_dynamic_dispatch.md). \ No newline at end of file diff --git a/docs/tutorials/api_doc.rst b/docs/tutorials/api_doc.rst index cd47004aa..8dac4c1f1 100644 --- a/docs/tutorials/api_doc.rst +++ b/docs/tutorials/api_doc.rst @@ -15,7 +15,7 @@ General .. automodule:: intel_extension_for_pytorch.llm .. autofunction:: optimize -.. automodule:: intel_extension_for_pytorch +.. currentmodule:: intel_extension_for_pytorch .. autoclass:: verbose diff --git a/docs/tutorials/features.rst b/docs/tutorials/features.rst index dea2454e6..d7d98b81c 100644 --- a/docs/tutorials/features.rst +++ b/docs/tutorials/features.rst @@ -169,7 +169,9 @@ Intelยฎ Extension for PyTorch* provides built-in quantization recipes to deliver Users are always recommended to try quantization with the built-in quantization recipe first with Intelยฎ Extension for PyTorch* quantization APIs. For even higher accuracy demandings, users can try with separate `recipe tuning APIs `_. The APIs are powered by Intelยฎ Neural Compressor to take advantage of its tuning feature. -Check more detailed information for `INT8 Quantization `_ and `INT8 recipe tuning API guide (Prototype, *NEW feature in 1.13.0*) `_. +Smooth quantization (SmoothQuant) is a more recent post-training quantization (PTQ) solution which tackles the quantization error problem caused by systematic outliers in activations. SmoothQuant is commonly used for LLM quantization, and Intelยฎ Extension for PyTorch* has provided built-in support for this solution. + +Check more detailed information for `INT8 Quantization `_ and `INT8 recipe tuning API guide (Prototype) `_. 
In addition, SmoothQuant specific argument introduction and examples can be checked in `SmoothQuant recipe tuning API guide (Prototype) `_. .. toctree:: :hidden: @@ -177,6 +179,7 @@ Check more detailed information for `INT8 Quantization #### **Kernel implementation:** `csrc/cpu/aten/kernels/xyzKrnl.cpp` +>#### **Kernel Stub:** `csrc/cpu/aten/xyz.cpp` and `csrc/cpu/aten/xyz.h` +>#### **Dispatch Stub implementation:** `csrc/cpu/dyndisp/DispatchStub.cpp` and `csrc/cpu/dyndisp/DispatchStub.h` + +### CodeGen Process +IPEX build system will generate code for each ISA level with specifiy complier parameters. The CodeGen script is located at `cmake/cpu/IsaCodegen.cmake`. + +The CodeGen will copy each cpp files from **Kernel implementation**, and then add ISA level as new file suffix. + +> **Sample:** +> +> ---- +> +> **Origin file:** +> +> `csrc/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp` +> +> **Generate files:** +> +> DEFAULT: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.DEFAULT.cpp -O3 -D__AVX__ -DCPU_CAPABILITY_AVX2 -mavx2 -mfma -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -DCPU_CAPABILITY=DEFAULT -DCPU_CAPABILITY_DEFAULT` +> +> AVX2: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX2.cpp -O3 -D__AVX__ -mavx2 -mfma -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store -DCPU_CAPABILITY=AVX2 -DCPU_CAPABILITY_AVX2` +> +> AVX512: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512.cpp -O3 -D__AVX512F__ -mavx512f -mavx512bw -mavx512vl -mavx512dq -mfma -DCPU_CAPABILITY=AVX512 -DCPU_CAPABILITY_AVX512` +> +> AVX512_VNNI: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_VNNI.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mfma -DCPU_CAPABILITY=AVX512_VNNI -DCPU_CAPABILITY_AVX512_VNNI` +> +> AVX512_BF16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_BF16.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -DCPU_CAPABILITY=AVX512_BF16 -DCPU_CAPABILITY_AVX512_BF16` +> +> AMX: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AMX.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -DCPU_CAPABILITY=AMX -DCPU_CAPABILITY_AMX` +> +> AVX512_FP16: `build/Release/csrc/isa_codegen/cpu/aten/kernels/AdaptiveAveragePoolingKrnl.cpp.AVX512_FP16.cpp -O3 -D__AVX512F__ -DCPU_CAPABILITY_AVX512 -DCPU_CAPABILITY_AVX512_VNNI -DCPU_CAPABILITY_AVX512_BF16 -mavx512f -mavx512bw -mavx512vl -mavx512dq -mavx512vnni -mavx512bf16 -mfma -mamx-tile -mamx-int8 -mamx-bf16 -mavx512fp16 -DCPU_CAPABILITY_AMX -DCPU_CAPABILITY=AVX512_FP16 -DCPU_CAPABILITY_AVX512_FP16` +--- + +>**Note:** +>1. DEFAULT level kernels is not fully implemented in IPEX. In order to align to PyTorch, we build default use AVX2 parameters in stead of that. So, IPEX minimal required executing machine support AVX2. +>2. `-D__AVX__` and `-D__AVX512F__` is defined for depends library [sleef](https://sleef.org/) . +>3. `-DCPU_CAPABILITY_AVX512` and `-DCPU_CAPABILITY_AVX2` are must to be defined for **PyTorch:** `aten/src/ATen/cpu/vec`, it determins vec register width. +>4. 
`-DCPU_CAPABILITY=[ISA_NAME]` is must to be defined for **PyTorch:** `aten/src/ATen/cpu/vec`, it is used as inline namespace name. +>5. Higher ISA level is compatible to lower ISA levels, so it needs to contains level ISA feature definitions. Such as AVX512_BF16 need contains `-DCPU_CAPABILITY_AVX512` `-DCPU_CAPABILITY_AVX512_VNNI`. But AVX512 don't contains AVX2 definitions, due to there are different vec register width. + +## Add Custom Kernel + +If you want to add a new custom kernel, and the kernel uses CPU ISA instructions, refer to these tips: + +1. Add CPU ISA related kernel implementation to the folder: `csrc/cpu/aten/kernels/NewKernelKrnl.cpp` +2. Add kernel stub to the folder: `csrc/cpu/aten/NewKernel.cpp` +3. Include header file: `csrc/cpu/dyndisp/DispatchStub.h`, and reference to the comment in the header file. +```c++ +// Implements instruction set specific function dispatch. +// +// Kernels that may make use of specialized instruction sets (e.g. AVX2) are +// compiled multiple times with different compiler flags (e.g. -mavx2). A +// DispatchStub contains a table of function pointers for a kernel. At runtime, +// the fastest available kernel is chosen based on the features reported by +// cpuinfo. +// +// Example: +// +// In csrc/cpu/aten/MyKernel.h: +// using fn_type = void(*)(const Tensor& x); +// IPEX_DECLARE_DISPATCH(fn_type, stub); +// +// In csrc/cpu/aten/MyKernel.cpp +// IPEX_DEFINE_DISPATCH(stub); +// +// In csrc/cpu/aten/kernels/MyKernel.cpp: +// namespace { +// // use anonymous namespace so that different cpu versions won't conflict +// void kernel(const Tensor& x) { ... } +// } +// IPEX_REGISTER_DISPATCH(stub, &kernel); +// +// To call: +// stub(kCPU, tensor); +``` +4. Write the kernel follow the guide. It contains: declare function type, register stub, call stub, etc. + +>**Note:** +> +>1. Some kernels only call **oneDNN** or **iDeep** implementation, or other backend implementation, which is not needed to add kernel implementations. (Refer: `BatchNorm.cpp`) +>2. Vec related header file must be included in kernel implementation files, but can not be included in kernel stub. Kernel stub is common code for all ISA level, and can't pass ISA related compiler parameters. +>3. For more intrinsics, check the [Intelยฎ Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html). + +### ISA intrinics specific kernel example: + +This is a FP32 convert to BF16 function example, and it is implemented for `AVX512_BF16`, `AVX512` and `DEFAULT` ISA levels. 
+ +```c++ +//csrc/cpu/aten/CvtFp32ToBf16.h + +#pragma once + +#include + +namespace torch_ipex { +namespace cpu { + +void cvt_fp32_to_bf16(at::BFloat16* dst, const float* src, int len); + +namespace { + +void cvt_fp32_to_bf16_kernel_impl(at::BFloat16* dst, const float* src, int len); + +} + +using cvt_fp32_to_bf16_kernel_fn = void (*)(at::BFloat16*, const float*, int); +IPEX_DECLARE_DISPATCH(cvt_fp32_to_bf16_kernel_fn, cvt_fp32_to_bf16_kernel_stub); +} // namespace cpu +} // namespace torch_ipex + +``` +```c++ +//csrc/cpu/aten/CvtFp32ToBf16.cpp + +#include "CvtFp32ToBf16.h" + +namespace torch_ipex { +namespace cpu { + +IPEX_DEFINE_DISPATCH(cvt_fp32_to_bf16_kernel_stub); + +void cvt_fp32_to_bf16(at::BFloat16* dst, const float* src, int len) { + return cvt_fp32_to_bf16_kernel_stub(kCPU, dst, src, len); +} + +} // namespace cpu +} // namespace torch_ipex + +``` +Macro `CPU_CAPABILITY_AVX512` and `CPU_CAPABILITY_AVX512_BF16` are defined by compiler check, it is means that current compiler havs capability to generate defined ISA level code. + +Because of `AVX512_BF16` is higher level than `AVX512`, and it compatible to `AVX512`. `CPU_CAPABILITY_AVX512_BF16` can be contained in `CPU_CAPABILITY_AVX512` region. +```c++ +//csrc/cpu/aten/kernels/CvtFp32ToBf16Krnl.cpp + +#include +#include "csrc/aten/cpu/CvtFp32ToBf16.h" + +namespace torch_ipex { +namespace cpu { -By default, Intelยฎ Extension for PyTorch\* dispatches to kernels with the maximum ISA level supported on the underlying CPU hardware. This ISA level can be overridden by an environment variable `ATEN_CPU_CAPABILITY` (same environment variable as PyTorch). Available values are {`avx2`, `avx512`, `avx512_vnni`, `avx512_bf16`, `amx`}. The effective ISA level would be the minimal level between `ATEN_CPU_CAPABILITY` and the maximum level supported by the hardware. +namespace { + +#if defined(CPU_CAPABILITY_AVX512) +#include +#else +#include +#endif +using namespace at::vec; + +#if defined(CPU_CAPABILITY_AVX512) +#include + +inline __m256i _cvt_fp32_to_bf16(const __m512 src) { +#if (defined CPU_CAPABILITY_AVX512_BF16) // AVX512_BF16 ISA implementation. + return reinterpret_cast<__m256i>(_mm512_cvtneps_pbh(src)); +#else // AVX512 ISA implementation. + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); +#endif +} + +void cvt_fp32_to_bf16_kernel_impl( + at::BFloat16* dst, + const float* src, + int len) { + int i = 0; + for (; i < len - 15; i += 16) { + auto f32 = _mm512_loadu_ps(src + i); + _mm256_storeu_si256((__m256i*)(dst + i), _cvt_fp32_to_bf16(f32)); + } + if (i < len) { + auto mask = (1 << (len - i)) - 1; + auto f32 = _mm512_maskz_loadu_ps(mask, src + i); + _mm256_mask_storeu_epi16(dst + i, mask, _cvt_fp32_to_bf16(f32)); + } +} + +#else // DEFAULT ISA implementation. 
+ +void cvt_fp32_to_bf16_kernel_impl( + at::BFloat16* dst, + const float* src, + int len) { + for (int j = 0; j < len; j++) { + *(dst + j) = *(src + j); + } +} + +#endif + +} // anonymous namespace + +IPEX_REGISTER_DISPATCH(cvt_fp32_to_bf16_kernel_stub, &cvt_fp32_to_bf16_kernel_impl); + +} // namespace cpu +} // namespace torch_ipex + +``` + +### Vec specific kernel example: +This example shows how to get the data type size and its Vec size. In different ISA, Vec has a different register width and a different Vec size. + +```c++ +//csrc/cpu/aten/GetVecLength.h +#pragma once + +#include + +namespace torch_ipex { +namespace cpu { + +std::tuple get_cpp_typesize_and_vecsize(at::ScalarType dtype); + +namespace { + +std::tuple get_cpp_typesize_and_vecsize_kernel_impl( + at::ScalarType dtype); +} + +using get_cpp_typesize_and_vecsize_kernel_fn = + std::tuple (*)(at::ScalarType); +IPEX_DECLARE_DISPATCH( + get_cpp_typesize_and_vecsize_kernel_fn, + get_cpp_typesize_and_vecsize_kernel_stub); + +} // namespace cpu +} // namespace torch_ipex + +``` + +```c++ +//csrc/cpu/aten/GetVecLength.cpp + +#include "GetVecLength.h" + +namespace torch_ipex { +namespace cpu { + +IPEX_DEFINE_DISPATCH(get_cpp_typesize_and_vecsize_kernel_stub); + +// get cpp typesize and vectorsize by at::ScalarType +std::tuple get_cpp_typesize_and_vecsize(at::ScalarType dtype) { + return get_cpp_typesize_and_vecsize_kernel_stub(kCPU, dtype); +} + +} // namespace cpu +} // namespace torch_ipex + +``` + +```c++ +//csrc/cpu/aten/kernels/GetVecLengthKrnl.cpp + +#include +#include "csrc/cpu/aten/GetVecLength.h" + +namespace torch_ipex { +namespace cpu { + +namespace { + +std::tuple get_cpp_typesize_and_vecsize_kernel_impl( + at::ScalarType dtype) { + switch (dtype) { + case at::ScalarType::Double: + return std::make_tuple( + sizeof(double), at::vec::Vectorized::size()); + case at::ScalarType::Float: + return std::make_tuple(sizeof(float), at::vec::Vectorized::size()); + case at::ScalarType::ComplexDouble: + return std::make_tuple( + sizeof(c10::complex), + at::vec::Vectorized>::size()); + case at::ScalarType::ComplexFloat: + return std::make_tuple( + sizeof(c10::complex), + at::vec::Vectorized>::size()); + case at::ScalarType::BFloat16: + return std::make_tuple( + sizeof(decltype( + c10::impl::ScalarTypeToCPPType::t)), + at::vec::Vectorized::t)>::size()); + case at::ScalarType::Half: + return std::make_tuple( + sizeof(decltype( + c10::impl::ScalarTypeToCPPType::t)), + at::vec::Vectorized::t)>::size()); + default: + TORCH_CHECK( + false, + "Currently only floating and complex ScalarType are supported."); + } +} + +} // anonymous namespace + +IPEX_REGISTER_DISPATCH( + get_cpp_typesize_and_vecsize_kernel_stub, + &get_cpp_typesize_and_vecsize_kernel_impl); + +} // namespace cpu +} // namespace torch_ipex + +``` +## Private Debug APIs + +Here are three ISA-related private APIs that can help debugging:: +1. Query current ISA level. +2. Query max CPU supported ISA level. +3. Query max binary supported ISA level. +>**Note:** +> +>1. Max CPU supported ISA level only depends on CPU features. +>2. Max binary supported ISA level only depends on built complier version. +>3. Current ISA level, it is the smaller of `max CPU ISA level` and `max binary ISA level`. ### Example: +```bash +python +Python 3.9.7 (default, Sep 16 2021, 13:09:58) +[GCC 7.5.0] :: Anaconda, Inc. on linux +Type "help", "copyright", "credits" or "license" for more information. 
+>>> import intel_extension_for_pytorch._C as core +>>> core._get_current_isa_level() +'AMX' +>>> core._get_highest_cpu_support_isa_level() +'AMX' +>>> core._get_highest_binary_support_isa_level() +'AMX' +>>> quit() +``` + +## Select ISA level manually. +By default, IPEX dispatches to the kernels with the maximum ISA level supported by the underlying CPU hardware. This ISA level can be overridden by the environment variable `ATEN_CPU_CAPABILITY` (same environment variable as PyTorch). The available values are {`avx2`, `avx512`, `avx512_vnni`, `avx512_bf16`, `amx`, `avx512_fp16`}. The effective ISA level would be the minimal level between `ATEN_CPU_CAPABILITY` and the maximum level supported by the hardware. +### Example: ```bash $ python -c 'import intel_extension_for_pytorch._C as core;print(core._get_current_isa_level())' AMX @@ -44,7 +407,7 @@ AVX2 ``` >**Note:** > ->`core._get_current_isa_level()` is an Intelยฎ Extension for PyTorch\* internal function used for checking the current effective ISA level. It is used for debugging purpose only and subject to change. +>`core._get_current_isa_level()` is an IPEX internal function used for checking the current effective ISA level. It is used for debugging purpose only and subject to change. ## CPU feature check @@ -67,13 +430,11 @@ $ cmake . -- Configuring done -- Generating done -- Build files have been written to: tests/cpu/isa - $ make [ 33%] Building CXX object CMakeFiles/cpu_features.dir/intel_extension_for_pytorch/csrc/cpu/isa/cpu_feature.cpp.o [ 66%] Building CXX object CMakeFiles/cpu_features.dir/intel_extension_for_pytorch/csrc/cpu/isa/cpu_feature_main.cpp.o [100%] Linking CXX executable cpu_features [100%] Built target cpu_features - $ ./cpu_features XCR0: 00000000000602e7 os --> avx: true @@ -104,7 +465,7 @@ avx512_bw: true avx512_dq: true avx512_ifma: true avx512_vbmi: true -avx512_vpopcntdq: true +avx512_vpopcntdq: true avx512_4fmaps: false avx512_4vnniw: false avx512_vbmi2: true @@ -113,10 +474,10 @@ avx512_vnni: true avx512_bitalg: true avx512_fp16: true avx512_bf16: true -avx512_vp2intersect: true +avx512_vp2intersect: true amx_bf16: true amx_tile: true amx_int8: true prefetchw: true prefetchwt1: false -``` +``` \ No newline at end of file diff --git a/docs/tutorials/known_issues.md b/docs/tutorials/known_issues.md index 3992c464a..0aff2be20 100644 --- a/docs/tutorials/known_issues.md +++ b/docs/tutorials/known_issues.md @@ -7,7 +7,7 @@ Troubleshooting - **Cause**: Certain Python packages may have PyTorch as a hard dependency. If you installed the `+cpu` version of PyTorch, installation of these packages might replace the `+cpu` version with the default version released on Pypi.org. - **Solution**: Reinstall the `+cpu` version back. - **Problem**: The workload running with Intelยฎ Extension for PyTorch\* occupies a remarkably large amount of memory. - - **Solution**: Try to reduce the occupied memory size by setting the `--weights_prepack` parameter of the `ipex.optimize()` function to `False`. + - **Solution**: Try to reduce the occupied memory size by setting the `weights_prepack` parameter of the `ipex.optimize()` function to `False`. - **Problem**: The `conv+bn` folding feature of the `ipex.optimize()` function does not work if inference is done with a custom function: ``` @@ -108,6 +108,9 @@ Troubleshooting - **Problem**: BF16 AMP(auto-mixed-precision) runs abnormally with the extension on the AVX2-only machine if the topology contains `Conv`, `Matmul`, `Linear`, and `BatchNormalization`. 
- **Solution**: TBD +- **Problem**: A PyTorch* model containing `torch.nn.TransformerEncoderLayer` component may encounter a RuntimeError in BF16 training or inference process if the model is optimized by `ipex.optimize()` with arguments set to default values. + - **Solution**: `TransformerEncoderLayer` optimized by `ipex.optimize()` with weight prepacking functionality enabled may encounter a weight dimension issue. The error can be avoided by disabling weight prepacking, `model = ipex.optimize(model, weights_prepack=False)`. + ## Runtime Extension The following limitations currently exist: @@ -118,9 +121,4 @@ The following limitations currently exist: ## Result Correctness - **Problem**: Incorrect Conv and Linear result if the number of OMP threads is changed at runtime. - - **Cause**: The oneDNN memory layout depends on the number of OMP threads, which requires the caller to detect the changes for the # of OMP threads while this release has not implemented it yet. - -## Float32 Training - -- **Problem**: Low throughput with DLRM FP32 Train. - - **Solution**: A 'Sparse Add' [PR](https://github.com/pytorch/pytorch/pull/23057) is pending on review. The issue will be fixed when the PR is merged. + - **Cause**: The oneDNN memory layout depends on the number of OMP threads, which requires the caller to detect the changes for the # of OMP threads while this release has not implemented it yet. \ No newline at end of file diff --git a/docs/tutorials/llm.rst b/docs/tutorials/llm.rst index 4bdee9430..72eb62c2a 100644 --- a/docs/tutorials/llm.rst +++ b/docs/tutorials/llm.rst @@ -30,8 +30,6 @@ Verified for distributed inference mode via DeepSpeed *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future. -*Note*: The accuracy drop issue in distributed inference mode for "tiiuae/falcon-40b" has been fixed by DeepSpeed in a recent patch release `v0.13.1 `_. - Please check `LLM best known practice <../../examples/cpu/inference/python/llm>`_ for instructions to install/setup environment and example scripts. 
diff --git a/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py b/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py index 61fec7022..50a639560 100644 --- a/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py +++ b/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py @@ -14,6 +14,7 @@ model = ipex.optimize(model, dtype=torch.bfloat16) ###################################################### # noqa F401 +# Note: bf16 inference requires amp.autocast() context # noqa F401 with torch.no_grad(), torch.cpu.amp.autocast(): model(data) diff --git a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py index d2388cdaf..860e18b25 100644 --- a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py +++ b/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py @@ -16,6 +16,7 @@ model = torch.compile(model, backend="ipex") ###################################################### # noqa F401 +# Note: bf16 inference requires amp.autocast() context # noqa F401 with torch.no_grad(), torch.cpu.amp.autocast(): model(data) diff --git a/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py index 02c384e6a..3c5d3bc05 100644 --- a/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py +++ b/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py @@ -14,6 +14,7 @@ model = ipex.optimize(model, dtype=torch.bfloat16) ###################################################### # noqa F401 +# Note: bf16 inference requires amp.autocast() context # noqa F401 with torch.no_grad(), torch.cpu.amp.autocast(): d = torch.randint(vocab_size, size=[batch_size, seq_length]) model = torch.jit.trace(model, (d,), check_trace=False, strict=False) diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index bfc1de09d..1e0c03d06 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -44,7 +44,7 @@ ipex.llm provides dedicated optimization for running Large Language Models (LLM) |GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸจ | ๐ŸŸฉ | |GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸจ | ๐ŸŸฉ | |DOLLY| databricks/dolly-v2-12b | ๐ŸŸจ | ๐ŸŸฉ | -|FALCON| tiiuae/falcon-40b | ๐ŸŸจ | ๐ŸŸจ | +|FALCON| tiiuae/falcon-40b | ๐ŸŸฉ | ๐ŸŸฉ | |OPT| facebook/opt-30b | ๐ŸŸจ | ๐ŸŸฉ | |OPT| facebook/opt-1.3b | ๐ŸŸฉ | ๐ŸŸฉ | |Bloom| bigscience/bloom-1b7 | ๐ŸŸจ | ๐ŸŸฉ | @@ -64,8 +64,6 @@ ipex.llm provides dedicated optimization for running Large Language Models (LLM) *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future. -*Note*: The accuracy drop issue in distributed inference mode for "tiiuae/falcon-40b" has been fixed by DeepSpeed in a recent patch release [v0.13.1](https://github.com/microsoft/DeepSpeed/tree/v0.13.1). -
    # 3. Environment Setup diff --git a/examples/cpu/inference/python/llm/tools/env_setup.sh b/examples/cpu/inference/python/llm/tools/env_setup.sh index 2f3d7d0b1..1546e4cbd 100644 --- a/examples/cpu/inference/python/llm/tools/env_setup.sh +++ b/examples/cpu/inference/python/llm/tools/env_setup.sh @@ -81,7 +81,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then fi # Install deps - conda install -y cmake ninja + conda install -y cmake ninja unzip echo "#!/bin/bash" > ${AUX_INSTALL_SCRIPT} if [ $((${MODE} & 0x04)) -ne 0 ]; then diff --git a/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py index e819d135f..423b139e3 100644 --- a/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py +++ b/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py @@ -10,6 +10,7 @@ model = ipex.optimize(model, dtype=torch.bfloat16) ###################################################### # noqa F401 +# Note: bf16 inference requires amp.autocast() context # noqa F401 with torch.no_grad(), torch.cpu.amp.autocast(): model(data) diff --git a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py index 3eec7a3b0..3656cd868 100644 --- a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py +++ b/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py @@ -12,6 +12,7 @@ model = torch.compile(model, backend="ipex") ###################################################### # noqa F401 +# Note: bf16 inference requires amp.autocast() context # noqa F401 with torch.no_grad(), torch.cpu.amp.autocast(): model(data) diff --git a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py index 985634559..988e83163 100644 --- a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py +++ b/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py @@ -10,6 +10,7 @@ model = ipex.optimize(model, dtype=torch.bfloat16) ###################################################### # noqa F401 +# Note: bf16 inference requires amp.autocast() context # noqa F401 with torch.no_grad(), torch.cpu.amp.autocast(): model = torch.jit.trace(model, torch.rand(128, 3, 224, 224)) model = torch.jit.freeze(model) diff --git a/examples/cpu/training/single_instance_training_bf16.py b/examples/cpu/training/single_instance_training_bf16.py index b4d513cb8..9a7c5cdcd 100644 --- a/examples/cpu/training/single_instance_training_bf16.py +++ b/examples/cpu/training/single_instance_training_bf16.py @@ -33,6 +33,7 @@ for batch_idx, (data, target) in enumerate(train_loader): optimizer.zero_grad() + # Note: bf16 training requires amp.autocast() context # noqa F401 with torch.cpu.amp.autocast(): output = model(data) loss = criterion(output, target) From 9d707e75ff8ae8767ffd47a13c27c4cb0831465a Mon Sep 17 00:00:00 2001 From: Xu Han Date: Wed, 10 Apr 2024 09:45:52 +0800 Subject: [PATCH 012/199] upgrade black (#2738) --- .../_inductor/compiler.py | 16 ++-- .../_meta_registrations.py | 6 +- .../cpu/autocast/_grad_scaler.py | 12 +-- .../cpu/hypertune/conf/config.py | 6 +- .../cpu/runtime/multi_stream.py | 8 +- .../nn/modules/merged_embeddingbag.py | 3 + .../optim/_functional.py | 1 + .../quantization/_quantization_state.py | 12 +-- .../quantization/_quantize.py | 6 +- .../quantization/_recipe.py | 6 +- 
.../quantization/_utils.py | 94 +++++++++---------- .../quantization/fp8/recipe.py | 1 + .../quantization/fp8/util.py | 1 + .../models/cpu/fusions/linear_fusion.py | 80 ++++++++++------ .../models/reference/modules/attentions.py | 10 +- .../models/reference/modules/decoder.py | 8 +- .../transformers/optimize.py | 14 +-- intel_extension_for_pytorch/xpu/__init__.py | 1 + scripts/tools/setup/requirements-flake8.txt | 2 +- setup.py | 4 +- tests/cpu/common_nn.py | 4 +- tests/cpu/hf_configs/qwen/modeling_qwen.py | 48 ++++++---- tests/cpu/test_inductor.py | 8 +- tests/cpu/test_weight_prepack.py | 32 ++++--- 24 files changed, 219 insertions(+), 164 deletions(-) diff --git a/intel_extension_for_pytorch/_inductor/compiler.py b/intel_extension_for_pytorch/_inductor/compiler.py index 37776aad0..b98d943cb 100644 --- a/intel_extension_for_pytorch/_inductor/compiler.py +++ b/intel_extension_for_pytorch/_inductor/compiler.py @@ -28,15 +28,19 @@ def defake(x): return x if x._has_symbolic_sizes_strides: size = [ - s.node.shape_env.size_hint(s.node.expr) - if isinstance(s, torch.SymInt) - else s + ( + s.node.shape_env.size_hint(s.node.expr) + if isinstance(s, torch.SymInt) + else s + ) for s in x.size() ] stride = [ - s.node.shape_env.size_hint(s.node.expr) - if isinstance(s, torch.SymInt) - else s + ( + s.node.shape_env.size_hint(s.node.expr) + if isinstance(s, torch.SymInt) + else s + ) for s in x.stride() ] else: diff --git a/intel_extension_for_pytorch/_meta_registrations.py b/intel_extension_for_pytorch/_meta_registrations.py index e46162dce..ed11f5913 100644 --- a/intel_extension_for_pytorch/_meta_registrations.py +++ b/intel_extension_for_pytorch/_meta_registrations.py @@ -425,9 +425,9 @@ def meta_ROIAlign_backward( is_channels_last, ): return grad.new_empty((batch_size, channels, height, width)).to( - memory_format=torch.channels_last - if is_channels_last - else torch.contiguous_format + memory_format=( + torch.channels_last if is_channels_last else torch.contiguous_format + ) ) diff --git a/intel_extension_for_pytorch/cpu/autocast/_grad_scaler.py b/intel_extension_for_pytorch/cpu/autocast/_grad_scaler.py index 64661e415..5958b9cfa 100644 --- a/intel_extension_for_pytorch/cpu/autocast/_grad_scaler.py +++ b/intel_extension_for_pytorch/cpu/autocast/_grad_scaler.py @@ -159,9 +159,9 @@ def scale(self, outputs): return outputs * self._scale.to(device=outputs.device, non_blocking=True) # Invoke the more complex machinery only if we're treating multiple outputs. 
- stash: List[ - _MultiDeviceReplicator - ] = [] # holds a reference that can be overwritten by apply_scale + stash: List[_MultiDeviceReplicator] = ( + [] + ) # holds a reference that can be overwritten by apply_scale def apply_scale(val): if isinstance(val, torch.Tensor): @@ -522,9 +522,9 @@ def _check_inf_per_device(self, optimizer): ) found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=_scale.device) - self._per_optimizer_states[id(optimizer)][ - "found_inf_per_device" - ] = self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True) + self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = ( + self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True) + ) return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] diff --git a/intel_extension_for_pytorch/cpu/hypertune/conf/config.py b/intel_extension_for_pytorch/cpu/hypertune/conf/config.py index 8ac2a2b6d..c15a8745f 100644 --- a/intel_extension_for_pytorch/cpu/hypertune/conf/config.py +++ b/intel_extension_for_pytorch/cpu/hypertune/conf/config.py @@ -122,9 +122,9 @@ def input_str_to_list_int(data): ), Optional( "use_all_nodes", - default=[True, False] - if len(set([c.node for c in cpuinfo])) > 1 - else [True], + default=( + [True, False] if len(set([c.node for c in cpuinfo])) > 1 else [True] + ), ): And(list, lambda s: all(isinstance(i, bool) for i in s)), Optional( "use_logical_core", diff --git a/intel_extension_for_pytorch/cpu/runtime/multi_stream.py b/intel_extension_for_pytorch/cpu/runtime/multi_stream.py index 74c0ebb8f..ad2e37922 100644 --- a/intel_extension_for_pytorch/cpu/runtime/multi_stream.py +++ b/intel_extension_for_pytorch/cpu/runtime/multi_stream.py @@ -512,10 +512,10 @@ def forward(self, *args, **kwargs): # For self._generate_outputs: here we put results_raw_future[stream_id].get() into a \ # [results_raw_future[stream_id].get()] # to align the multi_stream_module_concat_hint structure. - self._generate_outputs( - [results_raw_future[stream_id].get()], stream_id - ) if self.concat_output else results_raw.append( - results_raw_future[stream_id].get() + ( + self._generate_outputs([results_raw_future[stream_id].get()], stream_id) + if self.concat_output + else results_raw.append(results_raw_future[stream_id].get()) ) # If we need to concat the output, for each position, we will concat the result in the list \ # (generate in self._generate_outputs). diff --git a/intel_extension_for_pytorch/nn/modules/merged_embeddingbag.py b/intel_extension_for_pytorch/nn/modules/merged_embeddingbag.py index 6013d3267..152bdd5ac 100644 --- a/intel_extension_for_pytorch/nn/modules/merged_embeddingbag.py +++ b/intel_extension_for_pytorch/nn/modules/merged_embeddingbag.py @@ -259,6 +259,7 @@ class MergedEmbeddingBag(nn.Module): Now `MergedEmbeddingBagWithSGD` is the only option running with an optimizer. We plan to add more optimizer support in the future. Visit `MergedEmbeddingBagWithSGD` for introduction of `MergedEmbeddingBagWith[Optimizer]`. """ + embedding_specs: List[EmbeddingSpec] def __init__( @@ -402,6 +403,7 @@ class MergedEmbeddingBagWithSGD(MergedEmbeddingBag): gradients from the backward step and thus the memory access pattern becomes more friendly. Data access will happen on cache more than on memory. 
""" + embedding_specs: List[EmbeddingSpec] def __init__( @@ -625,6 +627,7 @@ class MergedEmbeddingBagWithCat(MergedEmbeddingBag): >>> merged_emb = MergedEmbeddingBagWithCat.from_embeddingbag_list(EmbLists) >>> cat_out = MergedEmbeddingBagWithCat(dense_feature, inputs) """ + embedding_specs: List[EmbeddingSpec] def __init__( diff --git a/intel_extension_for_pytorch/optim/_functional.py b/intel_extension_for_pytorch/optim/_functional.py index 9879b1eb3..9655da958 100644 --- a/intel_extension_for_pytorch/optim/_functional.py +++ b/intel_extension_for_pytorch/optim/_functional.py @@ -1,4 +1,5 @@ r"""Functional interface, port from torch/optim/_function.py""" + import torch from torch import Tensor from typing import List, Optional diff --git a/intel_extension_for_pytorch/quantization/_quantization_state.py b/intel_extension_for_pytorch/quantization/_quantization_state.py index 09ab4595d..8bd86d88d 100644 --- a/intel_extension_for_pytorch/quantization/_quantization_state.py +++ b/intel_extension_for_pytorch/quantization/_quantization_state.py @@ -90,9 +90,9 @@ def get_extra_state(self): def set_extra_state(self, state): self.tensor_id_to_scale_zp = state["tensor_id_to_scale_zp"] for _, seen_q_op_info in self.idx_to_seen_q_op_infos.items(): - self.idx_to_op_convert_info[ - seen_q_op_info.idx - ] = self.calculate_op_convert_info(seen_q_op_info) + self.idx_to_op_convert_info[seen_q_op_info.idx] = ( + self.calculate_op_convert_info(seen_q_op_info) + ) def has_at_least_one_seen_q_op_info(self) -> bool: return len(self.idx_to_seen_q_op_infos) > 0 @@ -1122,9 +1122,9 @@ def _maybe_insert_output_observers( continue else: output_tensor_id = tensor_info.id - self.tensor_id_to_observer[ - str(output_tensor_id) - ] = qconfig.activation() + self.tensor_id_to_observer[str(output_tensor_id)] = ( + qconfig.activation() + ) def insert_observers(self, root_module: torch.nn.Module): for _, seen_q_op_info in self.idx_to_seen_q_op_infos.items(): diff --git a/intel_extension_for_pytorch/quantization/_quantize.py b/intel_extension_for_pytorch/quantization/_quantize.py index a37c055e7..c21b867b3 100644 --- a/intel_extension_for_pytorch/quantization/_quantize.py +++ b/intel_extension_for_pytorch/quantization/_quantize.py @@ -423,9 +423,9 @@ def convert(model, inplace=False): torch.nn.GRUCell: convert_model.q_config, } module_mappings = get_default_dynamic_quant_module_mappings().copy() - module_mappings[ - torch.nn.Linear - ] = nn.modules.weight_only_quantization.WeightOnlyQuantizedLinear + module_mappings[torch.nn.Linear] = ( + nn.modules.weight_only_quantization.WeightOnlyQuantizedLinear + ) module_mappings, qconfig_spec = may_quantize_deepspeed_modules( IPEX_WEIGHT_ONLY_QUANTIZATION_MODULE_CPU(), diff --git a/intel_extension_for_pytorch/quantization/_recipe.py b/intel_extension_for_pytorch/quantization/_recipe.py index 20bce355f..dbff3b10d 100644 --- a/intel_extension_for_pytorch/quantization/_recipe.py +++ b/intel_extension_for_pytorch/quantization/_recipe.py @@ -143,9 +143,9 @@ def _default_recipe_init(nodes): for idx, tensor_info in enumerate(node.input_tensor_infos): if tensor_info is not None: tensor_info.inf_dtype = tensor_info.orig_dtype - node.input_tensor_force_inf_dtype[ - idx - ] = tensor_info.inf_dtype + node.input_tensor_force_inf_dtype[idx] = ( + tensor_info.inf_dtype + ) # For LSTM, if it's input is a PackedSequence, we don't support ot now. # TODO: support PackedSequence input for quantization LSTM. 
diff --git a/intel_extension_for_pytorch/quantization/_utils.py b/intel_extension_for_pytorch/quantization/_utils.py index 7b194983c..553e31629 100644 --- a/intel_extension_for_pytorch/quantization/_utils.py +++ b/intel_extension_for_pytorch/quantization/_utils.py @@ -289,12 +289,12 @@ def attach_op_convert_info_to_model( if hasattr(module, "_auto_quant_state"): qstate: AutoQuantizationState = module._auto_quant_state # type: ignore[assignment] for _, seen_q_op_info in qstate.idx_to_seen_q_op_infos.items(): - qstate.idx_to_op_convert_info[ - seen_q_op_info.idx - ] = qstate.calculate_op_convert_info(seen_q_op_info) - qstate.idx_to_op_weight_convert_info[ - seen_q_op_info.idx - ] = qstate.calculate_op_weight_convert_info(seen_q_op_info) + qstate.idx_to_op_convert_info[seen_q_op_info.idx] = ( + qstate.calculate_op_convert_info(seen_q_op_info) + ) + qstate.idx_to_op_weight_convert_info[seen_q_op_info.idx] = ( + qstate.calculate_op_weight_convert_info(seen_q_op_info) + ) _map_smooth_quant_info_to_idx(module) for _, child in module.named_children(): @@ -515,9 +515,9 @@ def _reset_post_node_input_infos(node): if post_node.qconfig is not None: for idx, tensor_info in enumerate(post_node.input_tensor_infos): if tensor_info in node.output_tensor_infos: - post_node.input_tensor_force_inf_dtype[ - idx - ] = tensor_info.orig_dtype + post_node.input_tensor_force_inf_dtype[idx] = ( + tensor_info.orig_dtype + ) elif post_node.type == str(nn.Identity): _reset_post_node_input_infos(post_node) @@ -569,9 +569,9 @@ def _reset_post_node_input_infos(node): in [torch.qint8, torch.quint8] and not post_node_are_quantized ): - node.output_tensor_infos[ - 0 - ].inf_dtype = node.input_tensor_force_inf_dtype[0] + node.output_tensor_infos[0].inf_dtype = ( + node.input_tensor_force_inf_dtype[0] + ) node.insert_fake_quant_after_outputs[0] = True _reset_post_node_input_infos(node) @@ -699,9 +699,9 @@ def save_quant_state(quant_state_map, configure_file): cur_tensor_infos["scale"] = v.tensor_id_to_scale_zp[ tensor_info.id ][0].tolist() - cur_tensor_infos[ - "zero_point" - ] = v.tensor_id_to_scale_zp[tensor_info.id][1].tolist() + cur_tensor_infos["zero_point"] = ( + v.tensor_id_to_scale_zp[tensor_info.id][1].tolist() + ) else: scales_dict = v.tensor_id_to_scale_zp[tensor_info.id][0] zp_dict = v.tensor_id_to_scale_zp[tensor_info.id][1] @@ -733,9 +733,9 @@ def save_quant_state(quant_state_map, configure_file): scaling_factors_to_save = {} for key, val in scaling_factor_dict.items(): scaling_factors_to_save.update({key: val.tolist()}) - cur_tensor_infos[ - "smooth_quant_scaling_factor" - ] = scaling_factors_to_save + cur_tensor_infos["smooth_quant_scaling_factor"] = ( + scaling_factors_to_save + ) smooth_quant_enabled = True input_tensor_infos.append(cur_tensor_infos) info["input_tensor_infos"] = input_tensor_infos @@ -751,9 +751,9 @@ def save_quant_state(quant_state_map, configure_file): cur_tensor_infos["scale"] = v.weight_tensor_id_to_scale_zp[ weight_idx ][0].tolist() - cur_tensor_infos[ - "zero_point" - ] = v.weight_tensor_id_to_scale_zp[weight_idx][1].tolist() + cur_tensor_infos["zero_point"] = ( + v.weight_tensor_id_to_scale_zp[weight_idx][1].tolist() + ) if ( weight_idx in v.weight_tensor_id_to_smooth_quant_scaling_factor @@ -764,11 +764,11 @@ def save_quant_state(quant_state_map, configure_file): ] is not None ): - cur_tensor_infos[ - "smooth_quant_scaling_factor" - ] = v.weight_tensor_id_to_smooth_quant_scaling_factor[ - weight_idx - ].tolist() + cur_tensor_infos["smooth_quant_scaling_factor"] = ( + 
v.weight_tensor_id_to_smooth_quant_scaling_factor[ + weight_idx + ].tolist() + ) weight_tensor_infos.append(cur_tensor_infos) info["weight_tensor_infos"] = weight_tensor_infos # output infos @@ -786,9 +786,9 @@ def save_quant_state(quant_state_map, configure_file): cur_tensor_infos["scale"] = v.tensor_id_to_scale_zp[ tensor_info.id ][0].tolist() - cur_tensor_infos[ - "zero_point" - ] = v.tensor_id_to_scale_zp[tensor_info.id][1].tolist() + cur_tensor_infos["zero_point"] = ( + v.tensor_id_to_scale_zp[tensor_info.id][1].tolist() + ) else: scales_dict = v.tensor_id_to_scale_zp[tensor_info.id][0] zp_dict = v.tensor_id_to_scale_zp[tensor_info.id][1] @@ -820,9 +820,9 @@ def save_quant_state(quant_state_map, configure_file): scaling_factors_to_save = {} for key, val in scaling_factors.items(): scaling_factors_to_save.update({key: val.tolist()}) - cur_tensor_infos[ - "smooth_quant_scaling_factor" - ] = scaling_factors_to_save + cur_tensor_infos["smooth_quant_scaling_factor"] = ( + scaling_factors_to_save + ) output_tensor_infos.append(cur_tensor_infos) info["output_tensor_infos"] = output_tensor_infos # qconfig @@ -838,9 +838,9 @@ def save_quant_state(quant_state_map, configure_file): info["activation_observer"]["act_observer"] = _get_observer_setting( op_info.qconfig.activation().act_obs ) - info["activation_observer"][ - "act_ic_observer" - ] = _get_observer_setting(op_info.qconfig.activation().ic_obs) + info["activation_observer"]["act_ic_observer"] = ( + _get_observer_setting(op_info.qconfig.activation().ic_obs) + ) info["share_weight_observers"] = getattr( op_info.qconfig, "share_weight_observers", True ) @@ -1064,13 +1064,13 @@ def load_qconf_summary_to_model(model, qconf_summary): ) # overide the cur model's info v.idx_to_seen_q_op_infos[int(i)].input_tensor_infos = input_tensor_infos - v.idx_to_seen_q_op_infos[ - int(i) - ].input_tensor_force_inf_dtype = input_force_dtype_infos + v.idx_to_seen_q_op_infos[int(i)].input_tensor_force_inf_dtype = ( + input_force_dtype_infos + ) v.idx_to_seen_q_op_infos[int(i)].output_tensor_infos = output_tensor_infos - v.idx_to_seen_q_op_infos[ - int(i) - ].insert_fake_quant_after_outputs = insert_fake_quant_after_outputs + v.idx_to_seen_q_op_infos[int(i)].insert_fake_quant_after_outputs = ( + insert_fake_quant_after_outputs + ) v.idx_to_seen_q_op_infos[int(i)].weight_tensor_infos = weight_tensor_infos v.idx_to_seen_q_op_infos[int(i)].qconfig = qconfig @@ -1377,9 +1377,9 @@ def _map_smooth_quant_info_to_idx(module): tensor_id = str(input_arg.id) if tensor_id in qstate.tensor_id_to_smooth_quant_scaling_factor: key = str(seen_q_op_info.idx) - qstate.idx_to_smooth_quant_scaling_factor[ - key - ] = qstate.tensor_id_to_smooth_quant_scaling_factor[tensor_id] + qstate.idx_to_smooth_quant_scaling_factor[key] = ( + qstate.tensor_id_to_smooth_quant_scaling_factor[tensor_id] + ) # Linear has only one weight. Key is not changed. 
for weight_arg in seen_q_op_info.weight_tensor_infos: if weight_arg is None: @@ -1387,6 +1387,6 @@ def _map_smooth_quant_info_to_idx(module): tensor_id = str(seen_q_op_info.idx) + "_" + str(weight_arg.id) if tensor_id in qstate.weight_tensor_id_to_smooth_quant_scaling_factor: key = str(seen_q_op_info.idx) + "_" + str(weight_arg.id) - qstate.idx_to_smooth_quant_scaling_factor[ - key - ] = qstate.weight_tensor_id_to_smooth_quant_scaling_factor[tensor_id] + qstate.idx_to_smooth_quant_scaling_factor[key] = ( + qstate.weight_tensor_id_to_smooth_quant_scaling_factor[tensor_id] + ) diff --git a/intel_extension_for_pytorch/quantization/fp8/recipe.py b/intel_extension_for_pytorch/quantization/fp8/recipe.py index 29b8e997a..3ba3eb434 100644 --- a/intel_extension_for_pytorch/quantization/fp8/recipe.py +++ b/intel_extension_for_pytorch/quantization/fp8/recipe.py @@ -1,4 +1,5 @@ """This module provides predefined FP8 recipes.""" + from __future__ import annotations from enum import Enum from typing import Literal, NamedTuple diff --git a/intel_extension_for_pytorch/quantization/fp8/util.py b/intel_extension_for_pytorch/quantization/fp8/util.py index 527d91d0a..a27922f00 100644 --- a/intel_extension_for_pytorch/quantization/fp8/util.py +++ b/intel_extension_for_pytorch/quantization/fp8/util.py @@ -1,4 +1,5 @@ """Utility functions for IPEX FP8 modules""" + import torch from intel_extension_for_pytorch.frontend import _copy_model_and_optimizer diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py index 07278b50b..f7fe68b10 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py @@ -34,9 +34,11 @@ def forward(self, x): return torch.ops.torch_ipex.tpp_linear_silu( x, self.linear.weight.detach(), - self.linear.bias.detach() - if self.linear.bias is not None - else x.new_empty(0), + ( + self.linear.bias.detach() + if self.linear.bias is not None + else x.new_empty(0) + ), self.linear.out_features, ) else: # fallback path @@ -54,9 +56,11 @@ def forward(self, x): return torch.ops.torch_ipex.tpp_linear_relu( x, self.linear.weight.detach(), - self.linear.bias.detach() - if self.linear.bias is not None - else x.new_empty(0), + ( + self.linear.bias.detach() + if self.linear.bias is not None + else x.new_empty(0) + ), self.linear.out_features, ) else: # fallback path @@ -76,9 +80,11 @@ def forward(self, x, y): x, y, self.linear.weight.detach(), - self.linear.bias.detach() - if self.linear.bias is not None - else x.new_empty(0), + ( + self.linear.bias.detach() + if self.linear.bias is not None + else x.new_empty(0) + ), self.linear.out_features, ) else: # fallback path @@ -98,9 +104,11 @@ def forward(self, x, y): x, y, self.linear.weight.detach(), - self.linear.bias.detach() - if self.linear.bias is not None - else x.new_empty(0), + ( + self.linear.bias.detach() + if self.linear.bias is not None + else x.new_empty(0) + ), 1.0, self.linear.out_features, ) @@ -133,9 +141,11 @@ def forward(self, x, y, z): y, z, self.linear.weight.detach(), - self.linear.bias.detach() - if self.linear.bias is not None - else x.new_empty(0), + ( + self.linear.bias.detach() + if self.linear.bias is not None + else x.new_empty(0) + ), 1.0, self.linear.out_features, ) @@ -164,9 +174,11 @@ def forward(self, x): return torch.ops.torch_ipex.tpp_linear_gelu( x, self.linear.weight.detach(), - self.linear.bias.detach() 
- if self.linear.bias is not None - else x.new_empty(0), + ( + self.linear.bias.detach() + if self.linear.bias is not None + else x.new_empty(0) + ), self.linear.out_features, ) elif ( @@ -204,9 +216,11 @@ def forward(self, x): return torch.ops.torch_ipex.tpp_linear_gelu( x, self.linear.weight.detach(), - self.linear.bias.detach() - if self.linear.bias is not None - else x.new_empty(0), + ( + self.linear.bias.detach() + if self.linear.bias is not None + else x.new_empty(0) + ), self.linear.out_features, ) if ( @@ -386,13 +400,17 @@ def forward(self, x): return torch.ops.torch_ipex.tpp_fused_gate_up_proj( x, self.linear_s.weight.detach(), - self.linear_s.bias.detach() - if self.linear_s.bias is not None - else x.new_empty(0), + ( + self.linear_s.bias.detach() + if self.linear_s.bias is not None + else x.new_empty(0) + ), self.linear_m.weight.detach(), - self.linear_m.bias.detach() - if self.linear_m.bias is not None - else x.new_empty(0), + ( + self.linear_m.bias.detach() + if self.linear_m.bias is not None + else x.new_empty(0) + ), ) else: # fallback path return nn.functional.silu(self.linear_s(x)) * self.linear_m(x) @@ -412,9 +430,11 @@ def forward(self, x, y): x1 = torch.ops.torch_ipex.tpp_linear_silu( x, self.linear.weight.detach(), - self.linear.bias.detach() - if self.linear.bias is not None - else x.new_empty(0), + ( + self.linear.bias.detach() + if self.linear.bias is not None + else x.new_empty(0) + ), self.linear.out_features, ) return x1 * y diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index e3b96d813..1ea329f13 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -457,10 +457,12 @@ def _FalconAttention_forward( math.sqrt(self.head_dim), layer_past, head_mask, - attention_mask_float - + alibi.view(batch_size, self.num_heads, 1, -1) * self.inv_norm_factor - if alibi is not None - else attention_mask_float, + ( + attention_mask_float + + alibi.view(batch_size, self.num_heads, 1, -1) * self.inv_norm_factor + if alibi is not None + else attention_mask_float + ), alibi, ) diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index 7247d3498..1b04ccb97 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -813,9 +813,11 @@ def MixtralDecoderLayer_forward( expert_layer.w1.weight, expert_layer.w3.weight, expert_layer.w2.weight, - expert_layer.w1.tpp_fallback - if hasattr(expert_layer.w1, "tpp_fallback") - else True, + ( + expert_layer.w1.tpp_fallback + if hasattr(expert_layer.w1, "tpp_fallback") + else True + ), routing_weights, final_hidden_states, ) diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 7297f2ae1..bc40c22c5 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -1185,9 +1185,11 @@ def optimize( model.eval(), dtype=dtype, inplace=inplace, - auto_kernel_selection=True - if ipex.get_fp32_math_mode() == ipex.FP32MathMode.BF32 - else False, + auto_kernel_selection=( + True + if ipex.get_fp32_math_mode() == ipex.FP32MathMode.BF32 + else False + 
), ) elif dtype is torch.bfloat16: _model = ipex.optimize(model.eval(), dtype=dtype, inplace=inplace) @@ -1272,9 +1274,9 @@ def optimize( else sample_inputs ) with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True - if dtype in [torch.bfloat16, torch.half] - else False, + enabled=( + True if dtype in [torch.bfloat16, torch.half] else False + ), dtype=dtype, ): trace_model = torch.jit.trace( diff --git a/intel_extension_for_pytorch/xpu/__init__.py b/intel_extension_for_pytorch/xpu/__init__.py index 1289eed5c..0d1b068e4 100644 --- a/intel_extension_for_pytorch/xpu/__init__.py +++ b/intel_extension_for_pytorch/xpu/__init__.py @@ -235,6 +235,7 @@ class StreamContext(object): ``None``. .. note:: Streams are per-device. """ + cur_stream: Optional["Stream"] def __init__(self, stream: Optional["Stream"]): diff --git a/scripts/tools/setup/requirements-flake8.txt b/scripts/tools/setup/requirements-flake8.txt index 453c3d9a3..5689e9542 100644 --- a/scripts/tools/setup/requirements-flake8.txt +++ b/scripts/tools/setup/requirements-flake8.txt @@ -7,4 +7,4 @@ flake8-pyi==20.5.0 mccabe pycodestyle==2.6.0 pyflakes==2.2.0 -black==23.3.0 +black==24.3.0 diff --git a/setup.py b/setup.py index bd052cc5a..22eefef8d 100644 --- a/setup.py +++ b/setup.py @@ -149,9 +149,7 @@ def get_build_type(): return ( "RelWithDebInfo" if _check_env_flag("REL_WITH_DEB_INFO") - else "Debug" - if _check_env_flag("DEBUG") - else "Release" + else "Debug" if _check_env_flag("DEBUG") else "Release" ) diff --git a/tests/cpu/common_nn.py b/tests/cpu/common_nn.py index e810c8655..ec4ee589c 100644 --- a/tests/cpu/common_nn.py +++ b/tests/cpu/common_nn.py @@ -4496,9 +4496,7 @@ def padding3d_circular(input, pad): / ( i.numel() if get_reduction(m) == "mean" - else i.size(1) - if get_reduction(m) == "sum" - else 1 + else i.size(1) if get_reduction(m) == "sum" else 1 ), desc="weights", check_sum_reduction=True, diff --git a/tests/cpu/hf_configs/qwen/modeling_qwen.py b/tests/cpu/hf_configs/qwen/modeling_qwen.py index 103d907e0..e05797fd3 100644 --- a/tests/cpu/hf_configs/qwen/modeling_qwen.py +++ b/tests/cpu/hf_configs/qwen/modeling_qwen.py @@ -372,17 +372,23 @@ def _attn( shape = query.shape[:-1] + (qk.shape[-2],) attn_weights = torch.zeros(shape, dtype=torch.float16, device=device) self.cache_kernels.vecquant8matmul_batched_faster_old( - query.contiguous() - if query.dtype == torch.float16 - else query.to(torch.float16).contiguous(), + ( + query.contiguous() + if query.dtype == torch.float16 + else query.to(torch.float16).contiguous() + ), qk.transpose(-1, -2).contiguous(), attn_weights, - qk_scale.contiguous() - if qk_scale.dtype == torch.float16 - else qk_scale.to(torch.float16).contiguous(), - qk_zero.contiguous() - if qk_zero.dtype == torch.float16 - else qk_zero.to(torch.float16).contiguous(), + ( + qk_scale.contiguous() + if qk_scale.dtype == torch.float16 + else qk_scale.to(torch.float16).contiguous() + ), + ( + qk_zero.contiguous() + if qk_zero.dtype == torch.float16 + else qk_zero.to(torch.float16).contiguous() + ), ) # attn_weights = attn_weights.to(query.dtype).contiguous() else: @@ -424,17 +430,23 @@ def _attn( shape = attn_weights.shape[:-1] + (query.shape[-1],) attn_output = torch.zeros(shape, dtype=torch.float16, device=device) self.cache_kernels.vecquant8matmul_batched_column_compression_faster_old( - attn_weights.contiguous() - if attn_weights.dtype == torch.float16 - else attn_weights.to(torch.float16).contiguous(), + ( + attn_weights.contiguous() + if attn_weights.dtype == torch.float16 + else 
attn_weights.to(torch.float16).contiguous() + ), qv.contiguous(), # dtype: int32 attn_output, - qv_scale.contiguous() - if qv_scale.dtype == torch.float16 - else qv_scale.to(torch.float16).contiguous(), - qv_zero.contiguous() - if qv_zero.dtype == torch.float16 - else qv_zero.to(torch.float16).contiguous(), + ( + qv_scale.contiguous() + if qv_scale.dtype == torch.float16 + else qv_scale.to(torch.float16).contiguous() + ), + ( + qv_zero.contiguous() + if qv_zero.dtype == torch.float16 + else qv_zero.to(torch.float16).contiguous() + ), ) if attn_output.dtype != query.dtype: attn_output = attn_output.to(query.dtype) diff --git a/tests/cpu/test_inductor.py b/tests/cpu/test_inductor.py index 53fcf2a2b..833ed0ab6 100644 --- a/tests/cpu/test_inductor.py +++ b/tests/cpu/test_inductor.py @@ -119,9 +119,11 @@ def run(*ex, **kwargs): actual_flat, _ = tree_flatten(actual) if reference_in_float: correct_flat = tuple( - y.to(x.dtype) - if isinstance(y, torch.Tensor) and y.dtype.is_floating_point - else y + ( + y.to(x.dtype) + if isinstance(y, torch.Tensor) and y.dtype.is_floating_point + else y + ) for x, y in zip(actual_flat, correct_flat) ) correct = tree_unflatten(correct_flat, correct_spec) diff --git a/tests/cpu/test_weight_prepack.py b/tests/cpu/test_weight_prepack.py index 02c6624c3..91e69d885 100644 --- a/tests/cpu/test_weight_prepack.py +++ b/tests/cpu/test_weight_prepack.py @@ -1427,9 +1427,11 @@ def forward(self, x): self.assertTrue( module_found( ipex_model, - torch.nn.ConvTranspose2d - if dims == 2 - else torch.nn.ConvTranspose3d, + ( + torch.nn.ConvTranspose2d + if dims == 2 + else torch.nn.ConvTranspose3d + ), ) ) continue @@ -1437,9 +1439,11 @@ def forward(self, x): self.assertFalse( module_found( ipex_model, - torch.nn.ConvTranspose2d - if dims == 2 - else torch.nn.ConvTranspose3d, + ( + torch.nn.ConvTranspose2d + if dims == 2 + else torch.nn.ConvTranspose3d + ), ) ) @@ -1477,9 +1481,11 @@ def forward(self, x): self.assertTrue( module_found( ipex_model, - torch.nn.ConvTranspose2d - if dims == 2 - else torch.nn.ConvTranspose3d, + ( + torch.nn.ConvTranspose2d + if dims == 2 + else torch.nn.ConvTranspose3d + ), ) ) continue @@ -1487,9 +1493,11 @@ def forward(self, x): self.assertFalse( module_found( ipex_model, - torch.nn.ConvTranspose2d - if dims == 2 - else torch.nn.ConvTranspose3d, + ( + torch.nn.ConvTranspose2d + if dims == 2 + else torch.nn.ConvTranspose3d + ), ) ) From 4e5114d3cb93ca6e3a8e394145d53599e920574a Mon Sep 17 00:00:00 2001 From: ZhaoqiongZ <106125927+ZhaoqiongZ@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:45:30 +0800 Subject: [PATCH 013/199] Cherrypick PR llm script/zzq (#2724) to master (#2745) * Update llm script/zzq (#2724) * add transformers_stream_generator package for Qwen * add tiktoken * add intermediate step for compile script, compare torch version and decide continue or quit * fix interactive step log * add variable in {} * adjust warning format * update log * rename compare function * compare torch version with patch version and platform suffix * change version to main --- .../inference/python/llm/tools/env_setup.sh | 2 +- scripts/compile_bundle.sh | 46 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/examples/cpu/inference/python/llm/tools/env_setup.sh b/examples/cpu/inference/python/llm/tools/env_setup.sh index 1546e4cbd..314adaefe 100644 --- a/examples/cpu/inference/python/llm/tools/env_setup.sh +++ b/examples/cpu/inference/python/llm/tools/env_setup.sh @@ -158,7 +158,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then rm -rf 
compile_bundle.sh llvm-project llvm-release torch-ccl fi - echo "python -m pip install cpuid accelerate datasets sentencepiece protobuf==${VER_PROTOBUF} transformers==${VER_TRANSFORMERS} neural-compressor==${VER_INC}" >> ${AUX_INSTALL_SCRIPT} + echo "python -m pip install cpuid accelerate datasets sentencepiece protobuf==${VER_PROTOBUF} transformers==${VER_TRANSFORMERS} neural-compressor==${VER_INC} transformers_stream_generator tiktoken" >> ${AUX_INSTALL_SCRIPT} # Used for accuracy test only if [ -d lm-evaluation-harness ]; then diff --git a/scripts/compile_bundle.sh b/scripts/compile_bundle.sh index 05debf5ac..fb3594a67 100644 --- a/scripts/compile_bundle.sh +++ b/scripts/compile_bundle.sh @@ -134,6 +134,52 @@ fi # Install dependencies python -m pip install cmake + +# Compare the torch torchvision and torchaudio version +function ver_compare_eq() { + RET=0 + if [[ "$1" == "$2" ]]; then + RET=1 + fi + echo ${RET} +} +# Check if PyTorch is installed, if installed, compare the current version with compiling version +if python -c "import torch; print(torch.__version__)" &> /dev/null; then + torch_version=$(python -c "import torch; print(torch.__version__)") + VER_COMP_TORCH=$(ver_compare_eq ${torch_version} ${VER_TORCH}) + if python -c "import torchvision; print(torchvision.__version__)" &> /dev/null; then + torchvision_version=$(python -c "import torchvision; print(torchvision.__version__)") + VER_COMP_VISION=$(ver_compare_eq ${torchvision_version} ${VER_TORCHVISION}) + fi + if python -c "import torchaudio; print(torchaudio.__version__)" &> /dev/null; then + torchaudio_version=$(python -c "import torchaudio; print(torchaudio.__version__)") + VER_COMP_AUDIO=$(ver_compare_eq ${torchaudio_version} ${VER_TORCHAUDIO}) + fi + if [ ${VER_COMP_TORCH} -ne 1 ] || [ ${VER_COMP_VISION} -ne 1 ] || [ ${VER_COMP_AUDIO} -ne 1 ]; then + if [ ${VER_COMP_TORCH} -ne 1 ]; then + printf "WARNING: Found installed torch version ${torch_version}, the required version for compiling is ${VER_TORCH}\\n" + fi + if [ ${VER_COMP_VISION} -ne 1 ]; then + printf " Found installed torchvision version ${torchvision_version}, the required version for compiling is ${VER_TORCHVISION}\\n" + fi + if [ ${VER_COMP_AUDIO} -ne 1 ]; then + printf " Found installed torchaudio version ${torchaudio_version}, the required version for compiling is ${VER_COMP_AUDIO}\\n" + fi + printf "Continue to run the compile script will replace the current torch/torchvision/torchaudio package\\n" + printf "Are sure you want to continue the compilation? yes for continue, no for quit. 
[yes|no]\\n" + printf "[yes] >>> " + read -r ans + ans=$(echo "${ans}" | tr '[:lower:]' '[:upper:]') + if [ "${ans}" != "YES" ] && [ "${ans}" != "Y" ] + then + printf "Aborting compilation\\n" + exit 2 + fi + fi +fi + + + python -m pip uninstall -y torch torchvision torchaudio intel-extension-for-pytorch oneccl_bind_pt set +e echo ${VER_TORCH} | grep "dev" > /dev/null From 197ef108d98ff660276ad5c7978b6886474ecc85 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 11 Apr 2024 16:39:53 +0800 Subject: [PATCH 014/199] update oneDNN to 1a5163791c on main (#2754) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index d83040111..8eb97f88f 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit d83040111fe211ac9496a6125f31ff3c66a9c194 +Subproject commit 8eb97f88f6cd4290e93dbead94fb22c68b9d1d5a From 451a79cd857d45c96f6d24ec4c58b827e45bc906 Mon Sep 17 00:00:00 2001 From: zhuhaozhe Date: Fri, 12 Apr 2024 19:36:49 +0800 Subject: [PATCH 015/199] make fused adam/addagrad more close with non-fused kernel (#2741) (#2758) * make fused adam/addagrad more close with non-fused kernel * add more comment --- .../optimizer/AdagradFusedStepKrnl.cpp | 4 +- .../kernels/optimizer/AdamFusedStepKrnl.cpp | 295 ++++++++++++------ 2 files changed, 200 insertions(+), 99 deletions(-) diff --git a/csrc/cpu/aten/kernels/optimizer/AdagradFusedStepKrnl.cpp b/csrc/cpu/aten/kernels/optimizer/AdagradFusedStepKrnl.cpp index d9666b84d..4d104f2e6 100644 --- a/csrc/cpu/aten/kernels/optimizer/AdagradFusedStepKrnl.cpp +++ b/csrc/cpu/aten/kernels/optimizer/AdagradFusedStepKrnl.cpp @@ -54,7 +54,7 @@ void adagrad_fused_step_kernel( sum_vec.store(state_sum_ptr + d); Vec std_vec = sum_vec.sqrt() + Vec(scalar_t(eps)); - param_vec = param_vec - grad_vec / std_vec * Vec(scalar_t(clr)); + param_vec = param_vec + Vec(scalar_t(-clr)) * grad_vec / std_vec; param_vec.store(param_ptr + d); } for (; d < size; d++) { @@ -64,7 +64,7 @@ void adagrad_fused_step_kernel( state_sum_ptr[d] += grad_val * grad_val; scalar_t std_val = std::sqrt(state_sum_ptr[d]) + eps; - param_ptr[d] -= grad_val / std_val * clr; + param_ptr[d] -= clr * grad_val / std_val; } }); } diff --git a/csrc/cpu/aten/kernels/optimizer/AdamFusedStepKrnl.cpp b/csrc/cpu/aten/kernels/optimizer/AdamFusedStepKrnl.cpp index e11fb36ca..e98a39820 100644 --- a/csrc/cpu/aten/kernels/optimizer/AdamFusedStepKrnl.cpp +++ b/csrc/cpu/aten/kernels/optimizer/AdamFusedStepKrnl.cpp @@ -19,30 +19,30 @@ void adam_fused_step_kernel( const at::Tensor& grad, const at::Tensor& param2, bool amsgrad, - double step, - double beta1_double, double beta2_double, double learning_rate_double, double weight_decay_double, - double eps_double) { + double eps_double, + double step_size_double, + double bias_correction2_sqrt_double, + double exp_avg_grad_coefficient_double, + double exp_avg_sq_grad_coefficient_double) { scalar_t* param_data = param.data_ptr(); scalar_t* exp_avg_data = exp_avg.data_ptr(); scalar_t* exp_avg_sq_data = exp_avg_sq.data_ptr(); scalar_t* max_exp_avg_sq_data = max_exp_avg_sq.data_ptr(); scalar_t* grad_data = grad.data_ptr(); - scalar_t bias_correction1 = 1 - std::pow(beta1_double, step); - scalar_t step_size = learning_rate_double / bias_correction1; - scalar_t bias_correction2 = 1 - std::pow(beta2_double, step); - // cast all scalar value to the same dtype with parameters - scalar_t beta1 = scalar_t(beta1_double); scalar_t beta2 = scalar_t(beta2_double); - scalar_t exp_avg_grad_coefficient = 
scalar_t(1 - beta1_double); - scalar_t exp_avg_sq_grad_coefficient = scalar_t(1 - beta2_double); scalar_t learning_rate = scalar_t(learning_rate_double); scalar_t weight_decay = scalar_t(weight_decay_double); scalar_t eps = scalar_t(eps_double); + scalar_t step_size = scalar_t(step_size_double); + scalar_t bias_correction2_sqrt = scalar_t(bias_correction2_sqrt_double); + scalar_t exp_avg_grad_coefficient = scalar_t(exp_avg_grad_coefficient_double); + scalar_t exp_avg_sq_grad_coefficient = + scalar_t(exp_avg_sq_grad_coefficient_double); using Vec = at::vec::Vectorized; int64_t grain_size = 512; @@ -63,12 +63,25 @@ void adam_fused_step_kernel( int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec param_vec = Vec::loadu(param_ptr + d); - Vec grad_vec = - Vec::loadu(grad_ptr + d) + param_vec * Vec(weight_decay); - Vec exp_avg_vec = Vec::loadu(exp_avg_ptr + d) * Vec(beta1) + - grad_vec * Vec(exp_avg_grad_coefficient); + Vec grad_vec = Vec::loadu(grad_ptr + d); + if (weight_decay != 0.f) { + // only accumulate weight decay when weight_decay != 0 to avoid NaN + // propagation from param to grad + grad_vec += param_vec * Vec(weight_decay); + } + + Vec exp_avg_vec = Vec::loadu(exp_avg_ptr + d); + // exp_avg.lerp_(grad, 1 - beta1) + // exactly match + // https://github.com/pytorch/pytorch/blob/d04957c0c682d766987cad07dce20986ca4a5b78/aten/src/ATen/native/cpu/LerpKernel.cpp#L99-L110 + const Vec lerp_weight = Vec(exp_avg_grad_coefficient); + auto mask = lerp_weight.abs() < Vec(0.5); + auto coeff = Vec::blendv(lerp_weight - Vec(1), lerp_weight, mask); + auto base = Vec::blendv(grad_vec, exp_avg_vec, mask); + exp_avg_vec = fmadd(coeff, grad_vec - exp_avg_vec, base); + Vec exp_avg_sq_vec = Vec::loadu(exp_avg_sq_ptr + d) * Vec(beta2) + - grad_vec * grad_vec * Vec(exp_avg_sq_grad_coefficient); + Vec(exp_avg_sq_grad_coefficient) * grad_vec * grad_vec; exp_avg_vec.store(exp_avg_ptr + d); exp_avg_sq_vec.store(exp_avg_sq_ptr + d); @@ -77,30 +90,44 @@ void adam_fused_step_kernel( Vec max_exp_avg_sq_vec = maximum(Vec::loadu(max_exp_avg_sq_ptr + d), exp_avg_sq_vec); max_exp_avg_sq_vec.store(max_exp_avg_sq_ptr + d); - denom_vec = - (max_exp_avg_sq_vec / Vec(bias_correction2)).sqrt() + Vec(eps); + denom_vec = max_exp_avg_sq_vec.sqrt() / Vec(bias_correction2_sqrt) + + Vec(eps); } else { denom_vec = - (exp_avg_sq_vec / Vec(bias_correction2)).sqrt() + Vec(eps); + exp_avg_sq_vec.sqrt() / Vec(bias_correction2_sqrt) + Vec(eps); } - param_vec = param_vec - Vec(step_size) * exp_avg_vec / denom_vec; param_vec.store(param_ptr + d); } for (; d < size; d++) { - scalar_t grad_val = grad_ptr[d] + param_ptr[d] * weight_decay; - exp_avg_ptr[d] = - exp_avg_ptr[d] * beta1 + grad_val * exp_avg_grad_coefficient; + scalar_t grad_val = grad_ptr[d]; + if (weight_decay != 0.f) { + // only accumulate weight decay when weight_decay != 0 to avoid NaN + // propagation from param to grad + grad_val += param_ptr[d] * weight_decay; + } + // exp_avg.lerp_(grad, 1 - beta1) + // exactly match + // https://github.com/pytorch/pytorch/blob/d04957c0c682d766987cad07dce20986ca4a5b78/aten/src/ATen/native/cpu/LerpKernel.cpp#L99-L110 + auto is_lerp_weight_small = std::abs(exp_avg_grad_coefficient) < 0.5; + if (is_lerp_weight_small) { + exp_avg_ptr[d] = exp_avg_ptr[d] + + exp_avg_grad_coefficient * (grad_val - exp_avg_ptr[d]); + } else { + exp_avg_ptr[d] = grad_val - + (grad_val - exp_avg_ptr[d]) * (1 - exp_avg_grad_coefficient); + } exp_avg_sq_ptr[d] = exp_avg_sq_ptr[d] * beta2 + - grad_val * grad_val * 
(exp_avg_sq_grad_coefficient); + exp_avg_sq_grad_coefficient * grad_val * grad_val; scalar_t demon_val; if (amsgrad) { max_exp_avg_sq_ptr[d] = std::max(max_exp_avg_sq_ptr[d], exp_avg_sq_ptr[d]); demon_val = - std::sqrt(max_exp_avg_sq_ptr[d] / bias_correction2) + eps; + std::sqrt(max_exp_avg_sq_ptr[d]) / bias_correction2_sqrt + eps; } else { - demon_val = std::sqrt(exp_avg_sq_ptr[d] / bias_correction2) + eps; + demon_val = + std::sqrt(exp_avg_sq_ptr[d]) / bias_correction2_sqrt + eps; } param_ptr[d] = param_ptr[d] - step_size * exp_avg_ptr[d] / demon_val; } @@ -116,12 +143,14 @@ void adam_fused_step_kernel( const at::Tensor& grad, const at::Tensor& param2, bool amsgrad, - double step, - double beta1_double, double beta2_double, double learning_rate_double, double weight_decay_double, - double eps_double) { + double eps_double, + double step_size_double, + double bias_correction2_sqrt_double, + double exp_avg_grad_coefficient_double, + double exp_avg_sq_grad_coefficient_double) { TORCH_CHECK( param.scalar_type() == at::kBFloat16, "adam_fused_step_kernel: expect param to be at::BFloat16"); @@ -148,18 +177,15 @@ void adam_fused_step_kernel( at::BFloat16* grad_data = grad.data_ptr(); at::BFloat16* param2_data = param2.data_ptr(); - float bias_correction1 = 1 - std::pow(beta1_double, step); - float step_size = learning_rate_double / bias_correction1; - float bias_correction2 = 1 - std::pow(beta2_double, step); - // cast all scalar value to float for computation - float beta1 = float(beta1_double); float beta2 = float(beta2_double); - float exp_avg_grad_coefficient = float(1 - beta1_double); - float exp_avg_sq_grad_coefficient = float(1 - beta2_double); float learning_rate = float(learning_rate_double); float weight_decay = float(weight_decay_double); float eps = float(eps_double); + float step_size = float(step_size_double); + float bias_correction2_sqrt = float(bias_correction2_sqrt_double); + float exp_avg_grad_coefficient = float(exp_avg_grad_coefficient_double); + float exp_avg_sq_grad_coefficient = float(exp_avg_sq_grad_coefficient_double); using bVec = at::vec::Vectorized; using fVec = at::vec::Vectorized; @@ -191,21 +217,34 @@ void adam_fused_step_kernel( std::tie(param_fvec, param_fvec2) = at::vec::pack_bfloat16_float(param_bvec, param2_bvec); // weight decay - grad_fvec = grad_fvec + param_fvec * fVec(weight_decay); - grad_fvec2 = grad_fvec2 + param_fvec2 * fVec(weight_decay); + if (weight_decay != 0.f) { + // only accumulate weight decay when weight_decay != 0 to avoid NaN + // propagation from param to grad + grad_fvec = grad_fvec + param_fvec * fVec(weight_decay); + grad_fvec2 = grad_fvec2 + param_fvec2 * fVec(weight_decay); + } + // update exp_avg, exp_avg_sq - fVec exp_avg_fvec = fVec::loadu(exp_avg_ptr + d) * fVec(beta1) + - grad_fvec * fVec(exp_avg_grad_coefficient); - fVec exp_avg_fvec2 = - fVec::loadu(exp_avg_ptr + d + fVec::size()) * fVec(beta1) + - grad_fvec2 * fVec(exp_avg_grad_coefficient); + // exp_avg.lerp_(grad, 1 - beta1) + // exactly match + // https://github.com/pytorch/pytorch/blob/d04957c0c682d766987cad07dce20986ca4a5b78/aten/src/ATen/native/cpu/LerpKernel.cpp#L99-L110 + fVec exp_avg_fvec = fVec::loadu(exp_avg_ptr + d); + fVec exp_avg_fvec2 = fVec::loadu(exp_avg_ptr + d + fVec::size()); + fVec lerp_weight = fVec(exp_avg_grad_coefficient); + auto mask = lerp_weight.abs() < fVec(0.5); + auto coeff = fVec::blendv(lerp_weight - fVec(1), lerp_weight, mask); + auto base = fVec::blendv(grad_fvec, exp_avg_fvec, mask); + exp_avg_fvec = fmadd(coeff, grad_fvec - exp_avg_fvec, 
base); + auto base2 = fVec::blendv(grad_fvec2, exp_avg_fvec2, mask); + exp_avg_fvec2 = fmadd(coeff, grad_fvec2 - exp_avg_fvec2, base2); exp_avg_fvec.store(exp_avg_ptr + d); exp_avg_fvec2.store(exp_avg_ptr + d + fVec::size()); + fVec exp_avg_sq_fvec = fVec::loadu(exp_avg_sq_ptr + d) * fVec(beta2) + - grad_fvec * grad_fvec * fVec(exp_avg_sq_grad_coefficient); + fVec(exp_avg_sq_grad_coefficient) * grad_fvec * grad_fvec; fVec exp_avg_sq_fvec2 = fVec::loadu(exp_avg_sq_ptr + d + fVec::size()) * fVec(beta2) + - grad_fvec2 * grad_fvec2 * fVec(exp_avg_sq_grad_coefficient); + fVec(exp_avg_sq_grad_coefficient) * grad_fvec2 * grad_fvec2; exp_avg_sq_fvec.store(exp_avg_sq_ptr + d); exp_avg_sq_fvec2.store(exp_avg_sq_ptr + d + fVec::size()); // amsgrad @@ -218,16 +257,18 @@ void adam_fused_step_kernel( exp_avg_sq_fvec2); max_exp_avg_sq_fvec.store(max_exp_avg_sq_ptr + d); max_exp_avg_sq_fvec2.store(max_exp_avg_sq_ptr + d + fVec::size()); - denom_fvec = (max_exp_avg_sq_fvec / fVec(bias_correction2)).sqrt() + + denom_fvec = + max_exp_avg_sq_fvec.sqrt() / fVec(bias_correction2_sqrt) + fVec(eps); denom_fvec2 = - (max_exp_avg_sq_fvec2 / fVec(bias_correction2)).sqrt() + + max_exp_avg_sq_fvec2.sqrt() / fVec(bias_correction2_sqrt) + fVec(eps); } else { - denom_fvec = - (exp_avg_sq_fvec / fVec(bias_correction2)).sqrt() + fVec(eps); + denom_fvec = exp_avg_sq_fvec.sqrt() / fVec(bias_correction2_sqrt) + + fVec(eps); denom_fvec2 = - (exp_avg_sq_fvec2 / fVec(bias_correction2)).sqrt() + fVec(eps); + exp_avg_sq_fvec2.sqrt() / fVec(bias_correction2_sqrt) + + fVec(eps); } // update param param_fvec = param_fvec - fVec(step_size) * exp_avg_fvec / denom_fvec; @@ -241,19 +282,34 @@ void adam_fused_step_kernel( for (; d < size; d++) { float param_val = at::vec::pack_bfloat16_float(param_ptr[d], param2_ptr[d]); - float grad_val = float(grad_ptr[d]) + param_val * weight_decay; - exp_avg_ptr[d] = - exp_avg_ptr[d] * beta1 + grad_val * exp_avg_grad_coefficient; + float grad_val = grad_ptr[d]; + if (weight_decay != 0.f) { + // only accumulate weight decay when weight_decay != 0 to avoid NaN + // propagation from param to grad + grad_val = grad_val + param_val * weight_decay; + } + // exp_avg.lerp_(grad, 1 - beta1) + // exactly match + // https://github.com/pytorch/pytorch/blob/d04957c0c682d766987cad07dce20986ca4a5b78/aten/src/ATen/native/cpu/LerpKernel.cpp#L99-L110 + auto is_lerp_weight_small = std::abs(exp_avg_grad_coefficient) < 0.5; + if (is_lerp_weight_small) { + exp_avg_ptr[d] = exp_avg_ptr[d] + + exp_avg_grad_coefficient * (grad_val - exp_avg_ptr[d]); + } else { + exp_avg_ptr[d] = grad_val - + (grad_val - exp_avg_ptr[d]) * (1 - exp_avg_grad_coefficient); + } exp_avg_sq_ptr[d] = exp_avg_sq_ptr[d] * beta2 + - grad_val * grad_val * exp_avg_sq_grad_coefficient; + exp_avg_sq_grad_coefficient * grad_val * grad_val; float demon_val; if (amsgrad) { max_exp_avg_sq_ptr[d] = std::max(max_exp_avg_sq_ptr[d], exp_avg_sq_ptr[d]); demon_val = - std::sqrt(max_exp_avg_sq_ptr[d] / bias_correction2) + eps; + std::sqrt(max_exp_avg_sq_ptr[d]) / bias_correction2_sqrt + eps; } else { - demon_val = std::sqrt(exp_avg_sq_ptr[d] / bias_correction2) + eps; + demon_val = + std::sqrt(exp_avg_sq_ptr[d]) / bias_correction2_sqrt + eps; } param_val = param_val - step_size * exp_avg_ptr[d] / demon_val; std::tie(param_ptr[d], param2_ptr[d]) = @@ -271,12 +327,14 @@ void adam_fused_step_kernel( const at::Tensor& grad, const at::Tensor& param2, bool amsgrad, - double step, - double beta1_double, double beta2_double, double learning_rate_double, double 
weight_decay_double, - double eps_double) { + double eps_double, + double step_size_double, + double bias_correction2_sqrt_double, + double exp_avg_grad_coefficient_double, + double exp_avg_sq_grad_coefficient_double) { TORCH_CHECK( param.scalar_type() == at::kFloat, "adam_fused_step_kernel: expect param to be at::Float"); @@ -303,18 +361,15 @@ void adam_fused_step_kernel( at::BFloat16* grad_data = grad.data_ptr(); at::BFloat16* param2_data = param2.data_ptr(); - float bias_correction1 = 1 - std::pow(beta1_double, step); - float step_size = learning_rate_double / bias_correction1; - float bias_correction2 = 1 - std::pow(beta2_double, step); - // cast all scalar value to float for computation - float beta1 = float(beta1_double); float beta2 = float(beta2_double); - float exp_avg_grad_coefficient = float(1 - beta1_double); - float exp_avg_sq_grad_coefficient = float(1 - beta2_double); float learning_rate = float(learning_rate_double); float weight_decay = float(weight_decay_double); float eps = float(eps_double); + float step_size = float(step_size_double); + float bias_correction2_sqrt = float(bias_correction2_sqrt_double); + float exp_avg_grad_coefficient = float(exp_avg_grad_coefficient_double); + float exp_avg_sq_grad_coefficient = float(exp_avg_sq_grad_coefficient_double); using bVec = at::vec::Vectorized; using fVec = at::vec::Vectorized; @@ -343,21 +398,33 @@ void adam_fused_step_kernel( fVec param_fvec = fVec::loadu(param_ptr + d); fVec param_fvec2 = fVec::loadu(param_ptr + d + fVec::size()); // weight decay - grad_fvec = grad_fvec + param_fvec * fVec(weight_decay); - grad_fvec2 = grad_fvec2 + param_fvec2 * fVec(weight_decay); + if (weight_decay != 0.f) { + // only accumulate weight decay when weight_decay != 0 to avoid NaN + // propagation from param to grad + grad_fvec = grad_fvec + param_fvec * fVec(weight_decay); + grad_fvec2 = grad_fvec2 + param_fvec2 * fVec(weight_decay); + } // update exp_avg, exp_avg_sq - fVec exp_avg_fvec = fVec::loadu(exp_avg_ptr + d) * fVec(beta1) + - grad_fvec * fVec(exp_avg_grad_coefficient); - fVec exp_avg_fvec2 = - fVec::loadu(exp_avg_ptr + d + fVec::size()) * fVec(beta1) + - grad_fvec2 * fVec(exp_avg_grad_coefficient); + // exp_avg.lerp_(grad, 1 - beta1) + // exactly match + // https://github.com/pytorch/pytorch/blob/d04957c0c682d766987cad07dce20986ca4a5b78/aten/src/ATen/native/cpu/LerpKernel.cpp#L99-L110 + fVec exp_avg_fvec = fVec::loadu(exp_avg_ptr + d); + fVec exp_avg_fvec2 = fVec::loadu(exp_avg_ptr + d + fVec::size()); + fVec lerp_weight = fVec(exp_avg_grad_coefficient); + auto mask = lerp_weight.abs() < fVec(0.5); + auto coeff = fVec::blendv(lerp_weight - fVec(1), lerp_weight, mask); + auto base = fVec::blendv(grad_fvec, exp_avg_fvec, mask); + exp_avg_fvec = fmadd(coeff, grad_fvec - exp_avg_fvec, base); + auto base2 = fVec::blendv(grad_fvec2, exp_avg_fvec2, mask); + exp_avg_fvec2 = fmadd(coeff, grad_fvec2 - exp_avg_fvec2, base2); exp_avg_fvec.store(exp_avg_ptr + d); exp_avg_fvec2.store(exp_avg_ptr + d + fVec::size()); + fVec exp_avg_sq_fvec = fVec::loadu(exp_avg_sq_ptr + d) * fVec(beta2) + - grad_fvec * grad_fvec * fVec(exp_avg_sq_grad_coefficient); + fVec(exp_avg_sq_grad_coefficient) * grad_fvec * grad_fvec; fVec exp_avg_sq_fvec2 = fVec::loadu(exp_avg_sq_ptr + d + fVec::size()) * fVec(beta2) + - grad_fvec2 * grad_fvec2 * fVec(exp_avg_sq_grad_coefficient); + fVec(exp_avg_sq_grad_coefficient) * grad_fvec2 * grad_fvec2; exp_avg_sq_fvec.store(exp_avg_sq_ptr + d); exp_avg_sq_fvec2.store(exp_avg_sq_ptr + d + fVec::size()); // amsgrad @@ -370,16 
+437,18 @@ void adam_fused_step_kernel( exp_avg_sq_fvec2); max_exp_avg_sq_fvec.store(max_exp_avg_sq_ptr + d); max_exp_avg_sq_fvec2.store(max_exp_avg_sq_ptr + d + fVec::size()); - denom_fvec = (max_exp_avg_sq_fvec / fVec(bias_correction2)).sqrt() + + denom_fvec = + max_exp_avg_sq_fvec.sqrt() / fVec(bias_correction2_sqrt) + fVec(eps); denom_fvec2 = - (max_exp_avg_sq_fvec2 / fVec(bias_correction2)).sqrt() + + max_exp_avg_sq_fvec2.sqrt() / fVec(bias_correction2_sqrt) + fVec(eps); } else { - denom_fvec = - (exp_avg_sq_fvec / fVec(bias_correction2)).sqrt() + fVec(eps); + denom_fvec = exp_avg_sq_fvec.sqrt() / fVec(bias_correction2_sqrt) + + fVec(eps); denom_fvec2 = - (exp_avg_sq_fvec2 / fVec(bias_correction2)).sqrt() + fVec(eps); + exp_avg_sq_fvec2.sqrt() / fVec(bias_correction2_sqrt) + + fVec(eps); } // update param param_fvec = param_fvec - fVec(step_size) * exp_avg_fvec / denom_fvec; @@ -392,19 +461,34 @@ void adam_fused_step_kernel( param2_bvec.store(param2_ptr + d); } for (; d < size; d++) { - float grad_val = float(grad_ptr[d]) + param_ptr[d] * weight_decay; - exp_avg_ptr[d] = - exp_avg_ptr[d] * beta1 + grad_val * exp_avg_grad_coefficient; + float grad_val = grad_ptr[d]; + if (weight_decay != 0.f) { + // only accumulate weight decay when weight_decay != 0 to avoid NaN + // propagation from param to grad + grad_val = grad_val + param_ptr[d] * weight_decay; + } + // exp_avg.lerp_(grad, 1 - beta1) + // exactly match + // https://github.com/pytorch/pytorch/blob/d04957c0c682d766987cad07dce20986ca4a5b78/aten/src/ATen/native/cpu/LerpKernel.cpp#L99-L110 + auto is_lerp_weight_small = std::abs(exp_avg_grad_coefficient) < 0.5; + if (is_lerp_weight_small) { + exp_avg_ptr[d] = exp_avg_ptr[d] + + exp_avg_grad_coefficient * (grad_val - exp_avg_ptr[d]); + } else { + exp_avg_ptr[d] = grad_val - + (grad_val - exp_avg_ptr[d]) * (1 - exp_avg_grad_coefficient); + } exp_avg_sq_ptr[d] = exp_avg_sq_ptr[d] * beta2 + - grad_val * grad_val * exp_avg_sq_grad_coefficient; + exp_avg_sq_grad_coefficient * grad_val * grad_val; float demon_val; if (amsgrad) { max_exp_avg_sq_ptr[d] = std::max(max_exp_avg_sq_ptr[d], exp_avg_sq_ptr[d]); demon_val = - std::sqrt(max_exp_avg_sq_ptr[d] / bias_correction2) + eps; + std::sqrt(max_exp_avg_sq_ptr[d]) / bias_correction2_sqrt + eps; } else { - demon_val = std::sqrt(exp_avg_sq_ptr[d] / bias_correction2) + eps; + demon_val = + std::sqrt(exp_avg_sq_ptr[d]) / bias_correction2_sqrt + eps; } param_ptr[d] = param_ptr[d] - step_size * exp_avg_ptr[d] / demon_val; param2_ptr[d] = at::BFloat16(param_ptr[d]); @@ -435,6 +519,15 @@ void adam_fused_step_kernel_impl( auto grad_dtype = grad_.scalar_type(); auto param_dtype = param_.scalar_type(); + + // make sure all scalar args are computationed with double precision + double bias_correction1 = 1 - std::pow(beta1, step); + double step_size = learning_rate / bias_correction1; + double bias_correction2 = 1 - std::pow(beta2, step); + double bias_correction2_sqrt = std::sqrt(bias_correction2); + double exp_avg_grad_coefficient = 1 - beta1; + double exp_avg_sq_grad_coefficient = 1 - beta2; + if (at::ScalarType::Float == grad_dtype) { adam_fused_step_kernel( param, @@ -444,12 +537,14 @@ void adam_fused_step_kernel_impl( grad, param2, amsgrad, - step, - beta1, beta2, learning_rate, weight_decay, - eps); + eps, + step_size, + bias_correction2_sqrt, + exp_avg_grad_coefficient, + exp_avg_sq_grad_coefficient); } else if (at::ScalarType::Double == grad_dtype) { adam_fused_step_kernel( param, @@ -459,12 +554,14 @@ void adam_fused_step_kernel_impl( grad, param2, 
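A quick sanity check of the two identities the kernel refactor above relies on: the denominator rewrite (sqrt(v / bias_correction2) equals sqrt(v) / sqrt(bias_correction2)) and the lerp form of the first-moment update (m + (1 - beta1) * (g - m) equals beta1 * m + (1 - beta1) * g). The sketch below is illustrative Python only:

import torch

beta1, beta2, step, eps = 0.9, 0.999, 10, 1e-8
v = torch.rand(16)                       # second moment (exp_avg_sq)
bc2 = 1.0 - beta2 ** step                # bias_correction2
print(torch.allclose((v / bc2).sqrt() + eps, v.sqrt() / (bc2 ** 0.5) + eps))

m, g = torch.rand(16), torch.rand(16)    # first moment and gradient
lerp_form = m + (1.0 - beta1) * (g - m)  # what the blendv/fmadd path computes
classic = beta1 * m + (1.0 - beta1) * g
print(torch.allclose(lerp_form, classic))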
amsgrad, - step, - beta1, beta2, learning_rate, weight_decay, - eps); + eps, + step_size, + bias_correction2_sqrt, + exp_avg_grad_coefficient, + exp_avg_sq_grad_coefficient); } else if ( at::ScalarType::BFloat16 == grad_dtype && at::ScalarType::BFloat16 == param_dtype) { @@ -476,12 +573,14 @@ void adam_fused_step_kernel_impl( grad, param2, amsgrad, - step, - beta1, beta2, learning_rate, weight_decay, - eps); + eps, + step_size, + bias_correction2_sqrt, + exp_avg_grad_coefficient, + exp_avg_sq_grad_coefficient); } else if ( at::ScalarType::BFloat16 == grad_dtype && at::ScalarType::Float == param_dtype) { @@ -493,12 +592,14 @@ void adam_fused_step_kernel_impl( grad, param2, amsgrad, - step, - beta1, beta2, learning_rate, weight_decay, - eps); + eps, + step_size, + bias_correction2_sqrt, + exp_avg_grad_coefficient, + exp_avg_sq_grad_coefficient); } else { TORCH_CHECK(false, "expect bfloat16 or float or double param"); } From e2d4be319aed60a09a2e405f54a03ee02499d9a6 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Sat, 13 Apr 2024 05:50:56 -0700 Subject: [PATCH 016/199] INT4 GPT-J MLPerf example: Fix example input issue; fix logger issue; update KMP settings (#2742) Co-authored-by: Chunyuan WU --- .../run_int4_gpt-j_on_cnndailymail.py | 54 +++++++++---------- .../run_int4_gpt-j_on_cnndailymail.sh | 3 +- 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py b/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py index 47e53aa8d..df3a5ae52 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py +++ b/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py @@ -54,10 +54,8 @@ assert not (args.fp32 and args.bf16), "--fp32 and --bf16 cannot be used at the same time" random.seed(9973) -logging.basicConfig( - format='%(asctime)s %(levelname)-8s %(message)s', - level=logging.INFO, - datefmt='%Y-%m-%d %H:%M:%S') +logger = logging.getLogger("INT4 GPT-J") +logger.setLevel(logging.INFO) PROMPT_DICT = { "prompt_input": ( "Below is an instruction that describes a task, paired with an input that provides further context. 
" @@ -161,13 +159,13 @@ def collate_batch(self, batch): Path(args.output_dir).mkdir(parents=True, exist_ok=True) def load_original_model(args): - logging.info("Loading model {}...".format(args.model)) + logger.info("Loading model {}...".format(args.model)) config = AutoConfig.from_pretrained(args.model, torchscript=True) user_model = AutoModelForCausalLM.from_pretrained( args.model, torch_dtype=torch.float, config=config, low_cpu_mem_usage=True ) tokenizer = AutoTokenizer.from_pretrained(args.model) - logging.info("model loaded.") + logger.info("model loaded.") return user_model, tokenizer dataset_id = 'cnn_dailymail' @@ -175,7 +173,7 @@ def load_original_model(args): dataset_split = "validation" if args.dataset_path == "": instruction_template = "Summarize the following news article:" - logging.info("Loading {} split of {} dataset...".format(dataset_split, args.model)) + logger.info("Loading {} split of {} dataset...".format(dataset_split, args.model)) dataset = load_dataset(dataset_id, name=dataset_version, split=dataset_split) train = dict((x['id'], x) for x in dataset) inputs = [] @@ -191,16 +189,16 @@ def load_original_model(args): with open(val_data_path, 'w') as write_f: json.dump(inputs, write_f, indent=4, ensure_ascii=False) - logging.info("{} data saved at {}".format(dataset_split, val_data_path)) + logger.info("{} data saved at {}".format(dataset_split, val_data_path)) else: - logging.info("Use the given dataset {}".format(args.dataset_path)) + logger.info("Use the given dataset {}".format(args.dataset_path)) val_data_path = args.dataset_path num_beams = 4 batch_size = 1 if args.fp32 or args.bf16: user_model, tokenizer = load_original_model(args) - logging.info("Optimize model by ipex.llm.optimize") + logger.info("Optimize model by ipex.llm.optimize") user_model = user_model.eval() user_model = user_model.to(memory_format=torch.channels_last) inf_dtype = torch.float if args.fp32 else torch.bfloat16 @@ -212,8 +210,8 @@ def load_original_model(args): ) elif args.int4_model == "": if args.low_precision_checkpoint == "": - logging.info("Do calibration with GPTQ to generate lowp-precision checkpoint.") - logging.info("Calibration with GPTQ will take an hour or so. Please wait.") + logger.info("Do calibration with GPTQ to generate lowp-precision checkpoint.") + logger.info("Calibration with GPTQ will take an hour or so. Please wait.") user_model, tokenizer = load_original_model(args) calib_iters = 128 calib_dataset = CNNDAILYMAIL(args.model, val_data_path, is_calib=True, num_samples=calib_iters) @@ -234,15 +232,15 @@ def load_original_model(args): scale_dtype=torch.float16, save_dir=args.output_dir) - logging.info("Calibration finished. Low-precision checkpoint generated as {}.".format(args.output_dir)) + logger.info("Calibration finished. Low-precision checkpoint generated as {}.".format(args.output_dir)) # Quit here because we want to use different environment variables to run GPTQ and benchmark. # So, run this script twice and specify the GPTQ checkpoint file for the second run. quit() else: - logging.info("low_precision_checkpoint is given. Calibration skipped.") + logger.info("low_precision_checkpoint is given. 
Calibration skipped.") low_precision_checkpoint_file_path = args.low_precision_checkpoint - logging.info("Loading low_precision_checkpoint...") + logger.info("Loading low_precision_checkpoint...") low_precision_checkpoint = torch.load(low_precision_checkpoint_file_path) config_dict = { "weight_key": "qweight", @@ -252,11 +250,11 @@ def load_original_model(args): "g_idx_key": "g_idx" } state_dict_and_config = (low_precision_checkpoint, config_dict) - logging.info("low_precision_checkpoint loaded.") + logger.info("low_precision_checkpoint loaded.") user_model, tokenizer = load_original_model(args) - logging.info("Quantize model to INT4.") + logger.info("Quantize model to INT4.") beam_idx_tmp = torch.zeros( (2048, int(batch_size * num_beams)), dtype=torch.long ).contiguous() @@ -294,7 +292,7 @@ def load_original_model(args): qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( weight_dtype=weight_dtype, lowp_mode=lowp_mode ) - logging.info("Start quantizing model to INT4 by ipex.llm.optimize.") + logger.info("Start quantizing model to INT4 by ipex.llm.optimize.") user_model = ipex.llm.optimize( user_model.eval(), dtype=torch.bfloat16, @@ -310,25 +308,25 @@ def load_original_model(args): example_inputs = ( input_ids.unsqueeze(0), attention_mask.unsqueeze(0), - position_ids.unsqueeze(0), tuple(global_past_key_value), + position_ids.unsqueeze(0), ) with torch.no_grad(), torch.cpu.amp.autocast(enabled=True): self_jit = torch.jit.trace(user_model.eval(), example_inputs, strict=False) self_jit = torch.jit.freeze(self_jit.eval()) Path(args.output_dir).mkdir(parents=True, exist_ok=True) self_jit.save(args.output_dir + "/int4_model.pt") - logging.info("Quantization finished. INT4 model saved to {}.".format(args.output_dir + "/int4_model.pt")) + logger.info("Quantization finished. INT4 model saved to {}.".format(args.output_dir + "/int4_model.pt")) else: user_model, tokenizer = load_original_model(args) - logging.info("INT4 model is given. Quantization skipped.") - logging.info("Loading INT4 model...") + logger.info("INT4 model is given. 
Quantization skipped.") + logger.info("Loading INT4 model...") self_jit = torch.jit.load(args.int4_model) self_jit = torch.jit.freeze(self_jit.eval()) ipex._set_optimized_model_for_generation(user_model, optimized_model=self_jit) - logging.info("INT4 model loaded.") + logger.info("INT4 model loaded.") -logging.info("Ready to run accuracy task.") +logger.info("Ready to run accuracy task.") generate_kwargs = { "early_stopping": True, "max_new_tokens": 128, @@ -354,8 +352,8 @@ def postprocess_text(preds, targets): val_dataset = CNNDAILYMAIL(args.model, val_data_path, is_calib=False, max_len=max_len, num_samples=iters) sources = val_dataset.sources targets = val_dataset.targets -logging.info("Start running accuracy task...") -logging.info("Number of samples to run = {}".format(iters)) +logger.info("Start running accuracy task...") +logger.info("Number of samples to run = {}".format(iters)) with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( enabled=(False if args.fp32 else True), dtype=(None if args.fp32 else torch.bfloat16) @@ -383,5 +381,5 @@ def postprocess_text(preds, targets): result = metric.compute(predictions=predictions, references=ground_truths, use_stemmer=True, use_aggregator=False) result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()} -logging.info("Accuracy test results:") -logging.info(result) +logger.info("Accuracy test results:") +logger.info(result) diff --git a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.sh b/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.sh index a27e1f21c..495ecdde6 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.sh +++ b/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.sh @@ -32,9 +32,8 @@ if [ $retVal -ne 0 ]; then fi # Set a few env variables to get best performance -export KMP_BLOCKTIME=INF +export KMP_BLOCKTIME=1 export KMP_TPAUSE=0 -export KMP_SETTINGS=1 export KMP_FORKJOIN_BARRIER_PATTERN=dist,dist export KMP_PLAIN_BARRIER_PATTERN=dist,dist export KMP_REDUCTION_BARRIER_PATTERN=dist,dist From 2f3b008682fdb5a47c22739586de628e4b0e4205 Mon Sep 17 00:00:00 2001 From: Zaili Wang <109502517+ZailiWang@users.noreply.github.com> Date: Mon, 15 Apr 2024 10:51:19 +0800 Subject: [PATCH 017/199] update transformers ver. 
to pass security check (#2762) --- examples/cpu/serving/triton/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cpu/serving/triton/requirements.txt b/examples/cpu/serving/triton/requirements.txt index ffcd3ef85..3dcefb4b2 100644 --- a/examples/cpu/serving/triton/requirements.txt +++ b/examples/cpu/serving/triton/requirements.txt @@ -2,7 +2,7 @@ torch==2.2.0 --index-url https://download.pytorch.org/whl/cpu torchvision==0.17.0 --index-url https://download.pytorch.org/whl/cpu torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cpu intel_extension_for_pytorch==2.2.0 -transformers==4.36.0 +transformers==4.38.1 tritonclient[all]==2.41.1 intel-openmp==2024.0.2 configargparse==1.7 From 1d104877203a0737545185b690ceadaa80330342 Mon Sep 17 00:00:00 2001 From: jiayisunx Date: Mon, 15 Apr 2024 16:01:08 +0800 Subject: [PATCH 018/199] backport several fixes from release branch to cpu-device (#2761) * fix concat_bn_relu (#2756) * fix concat_bn_relu * fix clangformat issue * fix replace_dropout_with_identity (#2757) * fix numpy (#2759) --- csrc/cpu/aten/kernels/ConcatBnReluKrnl.cpp | 3 ++- .../nn/functional/_tensor_method.py | 6 +++--- .../nn/utils/_model_convert.py | 1 + tests/cpu/test_dropout.py | 1 + tests/cpu/test_jit.py | 1 + tests/cpu/test_tensor_method.py | 21 ++++++++++--------- 6 files changed, 19 insertions(+), 14 deletions(-) diff --git a/csrc/cpu/aten/kernels/ConcatBnReluKrnl.cpp b/csrc/cpu/aten/kernels/ConcatBnReluKrnl.cpp index 51a038380..f8017cf09 100644 --- a/csrc/cpu/aten/kernels/ConcatBnReluKrnl.cpp +++ b/csrc/cpu/aten/kernels/ConcatBnReluKrnl.cpp @@ -92,7 +92,8 @@ at::Tensor concat_bn_relu_kernel_impl( } #if defined(CPU_CAPABILITY_AVX512) if (tensor_check) { - at::Tensor output = at::empty(output_dim, a[0].options()); + at::Tensor output = at::empty( + output_dim, a[0].options().memory_format(a[0].suggest_memory_format())); if (a[0].scalar_type() == at::kBFloat16) { ConcatBnReluKernelImpl_ChannelsLast( a, bn_scale, bn_beta, output); diff --git a/intel_extension_for_pytorch/nn/functional/_tensor_method.py b/intel_extension_for_pytorch/nn/functional/_tensor_method.py index 3f5a8cccd..d34c795a1 100644 --- a/intel_extension_for_pytorch/nn/functional/_tensor_method.py +++ b/intel_extension_for_pytorch/nn/functional/_tensor_method.py @@ -3,14 +3,14 @@ from ...utils._logger import logger -def _numpy(x): +def _numpy(x, force=False): if x.dtype == torch.bfloat16: logger.warning( "calling in ipex numpy which is not share memory with torch tensor for bfloat16 input." 
) - return torch._C._TensorBase.numpy(x.float()) + return torch._C._TensorBase.numpy(x.float(), force=force) else: - return torch._C._TensorBase.numpy(x) + return torch._C._TensorBase.numpy(x, force=force) # Fix https://github.com/pytorch/pytorch/issues/82764 diff --git a/intel_extension_for_pytorch/nn/utils/_model_convert.py b/intel_extension_for_pytorch/nn/utils/_model_convert.py index 80a042a35..a53b2c6d5 100644 --- a/intel_extension_for_pytorch/nn/utils/_model_convert.py +++ b/intel_extension_for_pytorch/nn/utils/_model_convert.py @@ -33,6 +33,7 @@ def replace_dropout_with_identity(model): for child_name, child in model.named_children(): if isinstance(child, torch.nn.Dropout): setattr(model, child_name, torch.nn.Identity()) + getattr(model, child_name).p = child.p else: replace_dropout_with_identity(child) diff --git a/tests/cpu/test_dropout.py b/tests/cpu/test_dropout.py index e4805d345..e71640e4f 100644 --- a/tests/cpu/test_dropout.py +++ b/tests/cpu/test_dropout.py @@ -35,6 +35,7 @@ def test_replace_dropout_with_identity(self): x = torch.randn(2, 3) named_children = dict(optimized_model.named_children()) self.assertTrue(isinstance(named_children["dropout"], torch.nn.Identity)) + self.assertEqual(optimized_model.dropout.p, model.dropout.p) optimized_model = ipex.optimize(model, replace_dropout_with_identity=False) named_children = dict(optimized_model.named_children()) diff --git a/tests/cpu/test_jit.py b/tests/cpu/test_jit.py index cb9469eb2..bfab04bf5 100644 --- a/tests/cpu/test_jit.py +++ b/tests/cpu/test_jit.py @@ -2164,6 +2164,7 @@ def test_concat_bn_relu(self): result = model(a[0], a[1], a[2]) trace_model = torch.jit.trace(model, (a[0], a[1], a[2])).eval() trace_model = torch.jit.freeze(trace_model) + trace_model(a[0], a[1], a[2]) tresult = trace_model(a[0], a[1], a[2]) trace_graph = trace_model.graph_for(a[0], a[1], a[2]) diff --git a/tests/cpu/test_tensor_method.py b/tests/cpu/test_tensor_method.py index ca73262b7..6486535fb 100644 --- a/tests/cpu/test_tensor_method.py +++ b/tests/cpu/test_tensor_method.py @@ -5,16 +5,17 @@ class TestTesorMethod(TestCase): def test_numpy(self): - # float tensor, numpy array will share memory with torch tensor. - x = torch.randn(2, 3) - y = torch.from_numpy(x.numpy()) - self.assertEqual(x, y) - self.assertEqual(x.data_ptr(), y.data_ptr()) - # bfloat16 tensor, numpy array will not share memory with torch tensor. - x = torch.randn(2, 3).bfloat16() - y = torch.from_numpy(x.numpy()) - self.assertEqual(x, y.bfloat16()) - self.assertNotEqual(x.data_ptr(), y.data_ptr()) + for force in [True, False]: + # float tensor, numpy array will share memory with torch tensor. + x = torch.randn(2, 3) + y = torch.from_numpy(x.numpy(force=force)) + self.assertEqual(x, y) + self.assertEqual(x.data_ptr(), y.data_ptr()) + # bfloat16 tensor, numpy array will not share memory with torch tensor. 
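The force keyword threaded through above follows the stock torch.Tensor.numpy(force=...) semantics (roughly t.detach().cpu().resolve_conj().resolve_neg().numpy()); for bfloat16 input the helper still upcasts to float32 first, since NumPy has no bfloat16 dtype, so the result never shares memory, which is exactly what the updated test asserts. A small illustration:

import torch

x = torch.randn(2, 3, requires_grad=True)
# plain x.numpy() raises because x participates in autograd;
# force=True detaches (and copies if necessary) before converting
a = x.numpy(force=True)

y = torch.randn(2, 3).bfloat16()
# NumPy has no bfloat16 dtype, so the helper converts to float32 first, always producing a copy
b = y.float().numpy()
print(a.shape, b.dtype)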
+ x = torch.randn(2, 3).bfloat16() + y = torch.from_numpy(x.numpy(force=force)) + self.assertEqual(x, y.bfloat16()) + self.assertNotEqual(x.data_ptr(), y.data_ptr()) if __name__ == "__main__": From 3fb2454a1ce5270bf8b10d13116e3bb662c3d0a2 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Mon, 15 Apr 2024 19:48:57 +0800 Subject: [PATCH 019/199] rename iakv to iakv_attn (#2753) * rename iakv to iakv_attn * tests * format --------- Co-authored-by: jianan-gu --- examples/cpu/inference/python/llm-modeling/README.md | 2 +- .../cpu/inference/python/llm-modeling/modeling_gptj.py | 6 +++--- .../inference/python/llm-modeling/modeling_llama.py | 8 ++++---- .../cpu/inference/python/llm-modeling/modeling_opt.py | 6 +++--- intel_extension_for_pytorch/llm/functional/fusions.py | 6 +++--- intel_extension_for_pytorch/llm/modules/__init__.py | 2 +- intel_extension_for_pytorch/llm/modules/mha_fusion.py | 10 +++++----- intel_extension_for_pytorch/llm/modules/utils.py | 4 ++-- tests/cpu/test_ipex_llm_module.py | 6 ++++-- 9 files changed, 26 insertions(+), 24 deletions(-) diff --git a/examples/cpu/inference/python/llm-modeling/README.md b/examples/cpu/inference/python/llm-modeling/README.md index 857d85940..15d3e2762 100644 --- a/examples/cpu/inference/python/llm-modeling/README.md +++ b/examples/cpu/inference/python/llm-modeling/README.md @@ -29,7 +29,7 @@ ipex.llm.modules.RMSNorm ipex.llm.modules.FastLayerNorm ipex.llm.modules.VarlenAttention ipex.llm.modules.PagedAttention -ipex.llm.modules.IndirectAccessKVCache +ipex.llm.modules.IndirectAccessKVCacheAttention #using as functions ipex.llm.functional.rotary_embedding diff --git a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py index bf8def64c..a9c3dd034 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py @@ -37,7 +37,7 @@ def __init__(self, config): pos_embd_dim, backbone=config.architectures[0], ) - self._IPEXIndirectAccessKVCache = ipex.llm.modules.IndirectAccessKVCache( + self._IPEXIndirectAccessKVCacheAttention = ipex.llm.modules.IndirectAccessKVCacheAttention( max_positions ) # ========================================================================== @@ -111,7 +111,7 @@ def forward( attn_output, attn_weights, present, - ) = self._IPEXIndirectAccessKVCache( + ) = self._IPEXIndirectAccessKVCacheAttention( query, key, value, @@ -530,7 +530,7 @@ def forward( attentions=transformer_outputs.attentions, ) - # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCache ==================== + # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention ==================== def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: diff --git a/examples/cpu/inference/python/llm-modeling/modeling_llama.py b/examples/cpu/inference/python/llm-modeling/modeling_llama.py index 27c9439a9..3558e07cc 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_llama.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_llama.py @@ -71,7 +71,7 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) # ==================== Changes to apply ipex.llm layers ==================== - self._IPEXIndirectAccessKVCache = 
ipex.llm.modules.IndirectAccessKVCache( + self._IPEXIndirectAccessKVCacheAttention = ipex.llm.modules.IndirectAccessKVCacheAttention( self.max_position_embeddings ) self.ipex_rotary_emb = ipex.llm.modules.RotaryEmbedding( @@ -128,7 +128,7 @@ def forward( kv_seq_len, ) - (attn_output, attn_weights, past_key_value) = self._IPEXIndirectAccessKVCache( + (attn_output, attn_weights, past_key_value) = self._IPEXIndirectAccessKVCacheAttention( query_states, key_states, value_states, @@ -478,7 +478,7 @@ def forward( attentions=outputs.attentions, ) - # ======== rewrite to prepare_inputs_for_generation to work with ipex.llm.modules.IndirectAccessKVCache ========= + # ======== rewrite to prepare_inputs_for_generation to work with ipex.llm.modules.IndirectAccessKVCacheAttention ========= def prepare_inputs_for_generation( self, input_ids, @@ -519,7 +519,7 @@ def prepare_inputs_for_generation( ) return model_inputs - # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCache ==================== + # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention ==================== def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: diff --git a/examples/cpu/inference/python/llm-modeling/modeling_opt.py b/examples/cpu/inference/python/llm-modeling/modeling_opt.py index 7b3247a22..3847df848 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_opt.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_opt.py @@ -56,7 +56,7 @@ def _handle_deprecated_argument(config_arg_name, config, fn_arg_name, kwargs): self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) # ==================== Changes to apply ipex.llm layers ==================== - self._IPEXIndirectAccessKVCache = ipex.llm.modules.IndirectAccessKVCache( + self._IPEXIndirectAccessKVCacheAttention = ipex.llm.modules.IndirectAccessKVCacheAttention( config.max_position_embeddings ) # ========================================================================== @@ -124,7 +124,7 @@ def forward( attn_output, attn_weights, past_key_value_decoder, - ) = self._IPEXIndirectAccessKVCache( + ) = self._IPEXIndirectAccessKVCacheAttention( query, key, value, @@ -722,7 +722,7 @@ def prepare_inputs_for_generation( ) return model_inputs - # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCache ==================== + # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention ==================== def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: diff --git a/intel_extension_for_pytorch/llm/functional/fusions.py b/intel_extension_for_pytorch/llm/functional/fusions.py index 4109cfc08..59ecf6b7c 100644 --- a/intel_extension_for_pytorch/llm/functional/fusions.py +++ b/intel_extension_for_pytorch/llm/functional/fusions.py @@ -4,7 +4,7 @@ RotaryEmbedding, RMSNorm, FastLayerNorm, - IndirectAccessKVCache, + IndirectAccessKVCacheAttention, VarlenAttention, ) @@ -128,7 +128,7 @@ def indirect_access_kv_cache( - new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). 
Notes: - - How to reorder KV cache when using the format of IndirectAccessKVCache (e.g., on llama model + - How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor @@ -141,7 +141,7 @@ def _reorder_cache( return past_key_values """ - return IndirectAccessKVCache.apply_function( + return IndirectAccessKVCacheAttention.apply_function( query, key, value, diff --git a/intel_extension_for_pytorch/llm/modules/__init__.py b/intel_extension_for_pytorch/llm/modules/__init__.py index 0d02f0706..d701ce281 100644 --- a/intel_extension_for_pytorch/llm/modules/__init__.py +++ b/intel_extension_for_pytorch/llm/modules/__init__.py @@ -13,7 +13,7 @@ RotaryEmbedding, RMSNorm, FastLayerNorm, - IndirectAccessKVCache, + IndirectAccessKVCacheAttention, PagedAttention, VarlenAttention, ) diff --git a/intel_extension_for_pytorch/llm/modules/mha_fusion.py b/intel_extension_for_pytorch/llm/modules/mha_fusion.py index fa15fc66d..fc9d3e62b 100644 --- a/intel_extension_for_pytorch/llm/modules/mha_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/mha_fusion.py @@ -501,7 +501,7 @@ def single_query_cached_kv_attention( ) -class IndirectAccessKVCache(nn.Module): +class IndirectAccessKVCacheAttention(nn.Module): r""" kv_cache is used to reduce computation for **Decoder** layer but it also brings memory overheads, for example, when using beam search, the kv_cache should be reordered according to the latest beam @@ -540,7 +540,7 @@ class IndirectAccessKVCache(nn.Module): - new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). Notes: - - How to reorder KV cache when using the format of IndirectAccessKVCache (e.g., on llama model + - How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor @@ -552,7 +552,7 @@ def _reorder_cache( layer_past[3][layer_past[0].size(-2) - 1] = beam_idx return past_key_values - [Direct function call] This module also provides a `.apply_function` function call to apply IndirectAccessKVCache + [Direct function call] This module also provides a `.apply_function` function call to apply IndirectAccessKVCacheAttention without initializing the module. Args: - The parameters are the same as the forward call. 
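For code that tracks this rename: only the class name changes, the constructor still takes the maximum context length positionally, the .apply_function entry point is kept, and the functional wrapper remains ipex.llm.functional.indirect_access_kv_cache. A minimal sketch of the new spelling (assumes intel_extension_for_pytorch with CPU support is installed):

import intel_extension_for_pytorch as ipex

# the former ipex.llm.modules.IndirectAccessKVCache symbol is now spelled:
iakv_attn = ipex.llm.modules.IndirectAccessKVCacheAttention(2048)  # max positions, as in the modeling examples above
assert hasattr(ipex.llm.modules.IndirectAccessKVCacheAttention, "apply_function")
# the functional form is unchanged: ipex.llm.functional.indirect_access_kv_cache(...)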
@@ -581,7 +581,7 @@ def apply_function( text_max_length: Optional[int] = 0, ): return cls.runtime_ops.get_module_from_device( - query.device.type, IPEXCustomOpType.INDIRECTACCESS_KVCACHE, False + query.device.type, IPEXCustomOpType.INDIRECTACCESS_KVCACHE_ATTENTION, False ).apply_function( query, key, @@ -627,7 +627,7 @@ def forward( runtime_module = self.runtime_ops.get_module_from_device( query.device.type, - IPEXCustomOpType.INDIRECTACCESS_KVCACHE, + IPEXCustomOpType.INDIRECTACCESS_KVCACHE_ATTENTION, True, self.text_max_length, ) diff --git a/intel_extension_for_pytorch/llm/modules/utils.py b/intel_extension_for_pytorch/llm/modules/utils.py index 60d6fa8d7..1312e8c34 100644 --- a/intel_extension_for_pytorch/llm/modules/utils.py +++ b/intel_extension_for_pytorch/llm/modules/utils.py @@ -36,7 +36,7 @@ class IPEXCustomOpType(Enum): PAGED_ATTENTION: int = 11 FAST_LAYERNORM: int = 12 VARLEN_ATTENTION: int = 13 - INDIRECTACCESS_KVCACHE: int = 14 + INDIRECTACCESS_KVCACHE_ATTENTION: int = 14 CPU_fusion_modules = { @@ -45,7 +45,7 @@ class IPEXCustomOpType(Enum): IPEXCustomOpType.PAGED_ATTENTION: _IPEXPagedAttentionCPU, IPEXCustomOpType.FAST_LAYERNORM: _IPEXFastLayerNormCPU, IPEXCustomOpType.VARLEN_ATTENTION: _IPEXVarlenScaledDotProductCPU, - IPEXCustomOpType.INDIRECTACCESS_KVCACHE: _IPEXScaleDotProductCPU, + IPEXCustomOpType.INDIRECTACCESS_KVCACHE_ATTENTION: _IPEXScaleDotProductCPU, IPEXCustomOpType.LINEAR_SILU: _IPEXlinearSiluCPU, IPEXCustomOpType.LINEAR_SILU_MUL: _IPEXlinearSiluAndMulCPU, IPEXCustomOpType.LINEAR2_SILU_MUL: _IPEXlinearSiluMulCPU, diff --git a/tests/cpu/test_ipex_llm_module.py b/tests/cpu/test_ipex_llm_module.py index 96fe6f1fc..3ad9cee44 100644 --- a/tests/cpu/test_ipex_llm_module.py +++ b/tests/cpu/test_ipex_llm_module.py @@ -274,8 +274,10 @@ def test_modules_naming(self): assert ipex.llm.modules.RotaryEmbedding is not None assert ipex.llm.modules.RotaryEmbedding.apply_function is not None assert ipex.llm.modules.PagedAttention is not None - assert ipex.llm.modules.IndirectAccessKVCache is not None - assert ipex.llm.modules.IndirectAccessKVCache.apply_function is not None + assert ipex.llm.modules.IndirectAccessKVCacheAttention is not None + assert ( + ipex.llm.modules.IndirectAccessKVCacheAttention.apply_function is not None + ) assert ipex.llm.modules.VarlenAttention is not None assert ipex.llm.modules.VarlenAttention.apply_function is not None assert ipex.llm.modules.FastLayerNorm is not None From 9ebb83916495af6ab0cfd7b17114b03b3db07827 Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 16 Apr 2024 12:12:23 +0800 Subject: [PATCH 020/199] Enable optimized Yuan2 (#2760) --- .../run_accuracy_with_deepspeed.py | 1 + .../run_generation_with_deepspeed.py | 15 +- examples/cpu/inference/python/llm/run.py | 1 + .../llm/single_instance/run_accuracy.py | 1 + .../llm/single_instance/run_generation.py | 4 +- .../python/llm/utils/create_shard_model.py | 1 + .../transformers/generation/beam_sample.py | 3 + .../transformers/generation/beam_search.py | 11 +- .../transformers/generation/greedy_search.py | 8 +- .../transformers/generation/sample.py | 8 +- .../models/cpu/modules/decoder.py | 1 + .../models/reference/fusions/mha_fusion.py | 3 + .../transformers/models/reference/models.py | 239 +++ .../models/reference/modules/attentions.py | 157 ++ .../models/reference/modules/decoder.py | 69 + .../transformers/optimize.py | 175 ++- .../transformers/tensor_parallel.py | 287 +++- tests/cpu/hf_configs/yuan/config.json | 39 + .../cpu/hf_configs/yuan/configuration_yuan.py | 42 + 
tests/cpu/hf_configs/yuan/yuan_hf_model.py | 1389 +++++++++++++++++ ...test_ipex_optimize_transformers_nightly.py | 8 + 21 files changed, 2413 insertions(+), 49 deletions(-) create mode 100644 tests/cpu/hf_configs/yuan/config.json create mode 100644 tests/cpu/hf_configs/yuan/configuration_yuan.py create mode 100644 tests/cpu/hf_configs/yuan/yuan_hf_model.py diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index 5c4f38b78..e2eaebf50 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -68,6 +68,7 @@ def decorator(func): "stablelm": (AutoModelForCausalLM, AutoTokenizer), "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), + "yuan": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 9b52f93a3..1acec3412 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -54,6 +54,7 @@ "stablelm": (AutoModelForCausalLM, AutoTokenizer), "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), + "yuan": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -146,7 +147,7 @@ ) parser.add_argument( "--weight-dtype", - choices=["INT8", "INT4"], + choices=["INT8", "INT4", "NF4"], default="INT8", type=str, help="weight data type for weight only quantization. 
Unrelated to activation data type or lowp-mode.", @@ -343,7 +344,7 @@ def get_checkpoint_files(model_name_or_path): if model_type in ["llava"]: tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) model.config = config -elif world_size == 1 or model_type in ["falcon", "baichuan", "baichuan2", "gptbigcode", "git", "qwen"]: +elif world_size == 1 or model_type in ["falcon", "baichuan", "baichuan2", "gptbigcode", "git", "qwen", "yuan"]: model = model_class[0].from_pretrained( model_name, config=config, @@ -428,9 +429,13 @@ def write_checkpoints_json(): ipex_woq_enabled = args.ipex_weight_only_quantization if ipex_woq_enabled: from intel_extension_for_pytorch.quantization import WoqWeightDtype - weight_dtype = ( - WoqWeightDtype.INT4 if args.weight_dtype == "INT4" else WoqWeightDtype.INT8 - ) + if args.weight_dtype == "INT8": + weight_dtype = WoqWeightDtype.INT8 + elif args.weight_dtype == "INT4": + weight_dtype = WoqWeightDtype.INT4 + else: + assert args.weight_dtype == "NF4" + weight_dtype = WoqWeightDtype.NF4 if args.lowp_mode == "INT8": lowp_mode = ipex.quantization.WoqLowpMode.INT8 elif args.lowp_mode == "FP32": diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 238a8c170..325182335 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -495,6 +495,7 @@ def main(args_in: Optional[List[str]] = None) -> None: "dolly": ("/dolly_local_shard"), "qwen": ("/qwen_local_shard"), "git": ("/git_local_shard"), + "yuan": ("/yuan_local_shard"), "llava": ("/llava_local_shard"), } model_type = next( diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 21b85c10c..490866cb4 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -34,6 +34,7 @@ "stablelm": (AutoModelForCausalLM, AutoTokenizer), "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), + "yuan": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index ec422f39a..626c65e7b 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -44,6 +44,7 @@ "stablelm": (AutoModelForCausalLM, AutoTokenizer), "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), + "yuan": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -204,7 +205,8 @@ def load_image(image_file): roles = ('user', 'assistant') else: roles = conv.roles - +if re.search("yuan", model.config.architectures[0], re.IGNORECASE): + model.config.batch_size = int(args.batch_size) * num_beams def trace_handler(prof): print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) # to ipex diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/inference/python/llm/utils/create_shard_model.py index 7b35c027b..e3f042cea 100644 --- a/examples/cpu/inference/python/llm/utils/create_shard_model.py +++ b/examples/cpu/inference/python/llm/utils/create_shard_model.py @@ -24,6 +24,7 @@ "stablelm": (AutoModelForCausalLM, AutoTokenizer), "qwen": 
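With NF4 added above, the --weight-dtype flag now maps to three WoqWeightDtype values that feed the usual weight-only-quantization qconfig. A hedged sketch of that mapping (the BF16 lowp mode below is an assumption chosen only for illustration):

import intel_extension_for_pytorch as ipex
from intel_extension_for_pytorch.quantization import WoqWeightDtype

dtype_map = {
    "INT8": WoqWeightDtype.INT8,
    "INT4": WoqWeightDtype.INT4,
    "NF4": WoqWeightDtype.NF4,  # newly accepted choice
}
qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping(
    weight_dtype=dtype_map["NF4"],
    lowp_mode=ipex.quantization.WoqLowpMode.BF16,  # assumed lowp mode, pick to match your setup
)
# the mapping is then handed to ipex.llm.optimize via its quantization_config argument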
(AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), + "yuan": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/intel_extension_for_pytorch/transformers/generation/beam_sample.py b/intel_extension_for_pytorch/transformers/generation/beam_sample.py index 0754bd273..544438d71 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_sample.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_sample.py @@ -188,6 +188,7 @@ def _beam_sample( "QWenLMHeadModel", "GitForCausalLM", "LlavaLlamaForCausalLM", + "YuanForCausalLM", ]: first_token = False if model_inputs["past_key_values"] is None: @@ -309,6 +310,8 @@ def _beam_sample( self, "prepare_inputs_labels_for_multimodal" ): model_inputs = self.prepare_inputs_labels_for_multimodal(**model_inputs) + if first_token and self.model_backbone == "YuanForCausalLM": + model_inputs.pop("past_key_values", None) if hasattr(self, "trace_graph"): if first_token and hasattr(self, "trace_graph_first"): outputs = self.trace_graph_first(**model_inputs) diff --git a/intel_extension_for_pytorch/transformers/generation/beam_search.py b/intel_extension_for_pytorch/transformers/generation/beam_search.py index 43d549004..ada0245dc 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_search.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_search.py @@ -190,6 +190,7 @@ def _beam_search( "QWenLMHeadModel", "GitForCausalLM", "LlavaLlamaForCausalLM", + "YuanForCausalLM", ]: first_token = False has_position_id = model_inputs.get("position_ids", None) is not None @@ -271,7 +272,7 @@ def _beam_search( for i in range(self.config.num_hidden_layers) ] ) - if first_token: + if first_token and self.model_backbone != "YuanForCausalLM": if hasattr(self.config, "n_layer"): num_hidden_layers = self.config.n_layer elif hasattr(self.config, "num_hidden_layers"): @@ -329,6 +330,8 @@ def _beam_search( self, "prepare_inputs_labels_for_multimodal" ): model_inputs = self.prepare_inputs_labels_for_multimodal(**model_inputs) + if first_token and self.model_backbone == "YuanForCausalLM": + model_inputs.pop("past_key_values", None) if hasattr(self, "trace_graph"): if first_token and hasattr(self, "trace_graph_first"): outputs = self.trace_graph_first(**model_inputs) @@ -341,7 +344,11 @@ def _beam_search( output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) - if first_token and len(model_inputs["past_key_values"][0]) == 4: + if ( + first_token + and self.model_backbone != "YuanForCausalLM" + and len(model_inputs["past_key_values"][0]) == 4 + ): if isinstance(outputs, dict): outputs.logits = outputs.logits.repeat_interleave(num_beams, dim=0) else: diff --git a/intel_extension_for_pytorch/transformers/generation/greedy_search.py b/intel_extension_for_pytorch/transformers/generation/greedy_search.py index 7f160272c..ba568839e 100644 --- a/intel_extension_for_pytorch/transformers/generation/greedy_search.py +++ b/intel_extension_for_pytorch/transformers/generation/greedy_search.py @@ -171,6 +171,7 @@ def _greedy_search( "QWenLMHeadModel", "GitForCausalLM", "LlavaLlamaForCausalLM", + "YuanForCausalLM", ]: first_token = False input_bs = input_ids.size()[0] @@ -276,6 +277,8 @@ def _greedy_search( self, "prepare_inputs_labels_for_multimodal" ): model_inputs = self.prepare_inputs_labels_for_multimodal(**model_inputs) + if first_token and self.model_backbone == "YuanForCausalLM": + model_inputs.pop("past_key_values", None) if 
hasattr(self, "trace_graph"): model_inputs.pop("use_cache", None) model_inputs.pop("token_type_ids", None) @@ -291,7 +294,10 @@ def _greedy_search( model_inputs["encoder_outputs"] = ( model_inputs["encoder_outputs"]["last_hidden_state"], ) - outputs = self.trace_graph(**model_inputs) + if first_token and hasattr(self, "trace_graph_first"): + outputs = self.trace_graph_first(**model_inputs) + else: + outputs = self.trace_graph(**model_inputs) else: outputs = self( **model_inputs, diff --git a/intel_extension_for_pytorch/transformers/generation/sample.py b/intel_extension_for_pytorch/transformers/generation/sample.py index 74ec3c51e..e52dfd1e8 100644 --- a/intel_extension_for_pytorch/transformers/generation/sample.py +++ b/intel_extension_for_pytorch/transformers/generation/sample.py @@ -177,6 +177,7 @@ def _sample( "QWenLMHeadModel", "GitForCausalLM", "LlavaLlamaForCausalLM", + "YuanForCausalLM", ]: first_token = False input_bs = input_ids.size()[0] @@ -282,6 +283,8 @@ def _sample( self, "prepare_inputs_labels_for_multimodal" ): model_inputs = self.prepare_inputs_labels_for_multimodal(**model_inputs) + if first_token and self.model_backbone == "YuanForCausalLM": + model_inputs.pop("past_key_values", None) if hasattr(self, "trace_graph"): model_inputs.pop("use_cache", None) model_inputs.pop("token_type_ids", None) @@ -297,7 +300,10 @@ def _sample( model_inputs["encoder_outputs"] = ( model_inputs["encoder_outputs"]["last_hidden_state"], ) - outputs = self.trace_graph(**model_inputs) + if first_token and hasattr(self, "trace_graph_first"): + outputs = self.trace_graph_first(**model_inputs) + else: + outputs = self.trace_graph(**model_inputs) else: outputs = self( **model_inputs, diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py index 15f2bdcfb..bb5bc0df0 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py @@ -32,6 +32,7 @@ def __init__(self, module, config, tpp=False, woq=False): "BaichuanForCausalLM", "MistralForCausalLM", "QWenLMHeadModel", + "YuanForCausalLM", ]: if not self.distributed: self.mha_linear_add = _IPEXlinearAddCPU( diff --git a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py index 0eb476177..409af7a14 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py @@ -154,6 +154,7 @@ def apply_ref_rope( "MistralForCausalLM", "MixtralForCausalLM", "LlavaLlamaForCausalLM", + "YuanForCausalLM", ]: x = x.transpose(1, 2) x = self.apply_rotary_pos_emb_llama(x, _cos, _sin, position_ids) @@ -294,6 +295,7 @@ def __init__(self, module, config): "MixtralForCausalLM", "StableLmForCausalLM", "LlavaLlamaForCausalLM", + "YuanForCausalLM", ]: self.num_key_value_groups = ( module.num_key_value_groups @@ -508,6 +510,7 @@ def forward( "MistralForCausalLM", "MixtralForCausalLM", "StableLmForCausalLM", + "YuanForCausalLM", ]: # repeat k/v heads if n_kv_heads < n_heads key = self._repeat_kv(key, self.num_key_value_groups) diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index 679bce1e9..5b200bd7d 100644 --- 
a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -2739,6 +2739,245 @@ def LlavaLlamaForCausalLM_forward( return (loss,) + output if loss is not None else output +def YuanForCausalLM_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = True, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=False, + ) + + hidden_states = outputs[0] + if ( + hasattr(self, "config") + and hasattr(self.config, "lm_head_generation") + and self.config.lm_head_generation + and hidden_states.size(1) != 1 + ): + hidden_states = hidden_states[:, -1:, :] + logits = self.lm_head(hidden_states) + loss = None + if labels is not None: + if self.use_loss_mask: + loss_mask = self.get_loss_mask( + input_ids, labels, self.eod_token, self.sep_token + ) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + if self.use_loss_mask: + loss_fct = CrossEntropyLoss(reduction="none") + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + loss = torch.sum(loss * loss_mask) / loss_mask.sum() + else: + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + +def YuanModel_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is 
not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + input_ids1 = input_ids.clone() + reset_mask_flag = False + if past_key_values: + input_ids = input_ids[:, -1:] + if use_cache: + reset_mask_flag = True + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + if self.training or self.reset_position_ids: + attention_mask, _ = self._prepare_decoder_attention_mask_training( + input_ids1, + inputs_embeds, + self.eod_token, + reset_mask_flag, + self.reset_attention_mask, + self.reset_position_ids, + ) + + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), + dtype=torch.bool, + device=inputs_embeds.device, + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def output_hook(module: torch.nn.Module, args, kwargs, outputs: Any): if module.config.use_return_dict or ( "return_dict" in kwargs and kwargs["return_dict"] diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 1ea329f13..9f42a488c 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -1746,6 +1746,138 @@ def _CLIPAttention_forward( return attn_output, attn_weights +def _YuanAttention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + before_hidden_states = None + if past_key_value is None: + inference_hidden_states_memory = torch.zeros( + bsz, 2, hidden_states.shape[2], dtype=hidden_states.dtype + ) + target = hidden_states[:, q_len - 2 :, :] + inference_hidden_states_memory[:, -target.shape[1] :, :] = target + else: + before_hidden_states = past_key_value[-1][0] + hidden_states_tmp = before_hidden_states[:, -1:, :] + inference_hidden_states_memory = torch.cat( + (hidden_states_tmp, hidden_states), dim=1 + ) + + value_states = self.v_proj(hidden_states).view( + bsz, q_len, self.num_heads, self.head_dim + ) + if self.use_shareqk: + qk_states = self.qk_proj(hidden_states).view( + bsz, q_len, self.num_heads * self.head_dim + ) + query_key = qk_states.unsqueeze(2) * self.qk_weight + self.qk_bias + 
query_states, key_states = torch.unbind(query_key, dim=2) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim) + else: + hidden_states = self.lf_gate(hidden_states, before_hidden_states) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + if self.distributed: + import torch.distributed as dist + + world_size = dist.get_world_size() + if world_size > 1: + query_gather_list = [ + torch.zeros_like(query_states) for _ in range(dist.get_world_size()) + ] + key_gather_list = [ + torch.zeros_like(key_states) for _ in range(dist.get_world_size()) + ] + dist.all_gather(query_gather_list, query_states) + dist.all_gather(key_gather_list, key_states) + query_states = torch.cat(query_gather_list, -1) + key_states = torch.cat(key_gather_list, -1) + qk_states = torch.cat([query_states, key_states], dim=-1) + qk_states = qk_states.view( + bsz, + q_len, + self.num_heads * world_size, + int(qk_states.shape[-1] // (self.num_heads * world_size)), + ) + qk_chunk = torch.chunk(qk_states, 2, dim=-1) + rank = dist.get_rank() + stride = 64 // world_size + start = rank * stride + end = (rank + 1) * stride + query_states = qk_chunk[0][:, :, start:end, :].transpose(1, 2) + key_states = qk_chunk[1][:, :, start:end, :].transpose(1, 2) + else: + qk_states = torch.cat([query_states, key_states], dim=-1) + qk_states = qk_states.view( + bsz, + q_len, + self.num_heads, + int(qk_states.shape[-1] // self.num_heads), + ) + (query_states, key_states) = torch.chunk(qk_states, 2, dim=-1) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + else: + qk_states = torch.cat([query_states, key_states], dim=-1) + qk_states = qk_states.view( + bsz, q_len, self.num_heads, int(qk_states.shape[-1] // self.num_heads) + ) + (query_states, key_states) = torch.chunk(qk_states, 2, dim=-1) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + kv_seq_len = ( + q_len + past_key_value[0].size(-2) if past_key_value is not None else q_len + ) + + key_states = self._IPEXROPE( + key_states, + position_ids, + self.num_key_value_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + query_states = self._IPEXROPE( + query_states, + position_ids, + self.num_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + (attn_output, attn_weights, present) = self._IPEXScaleDotProduct( + query_states, + key_states, + value_states, + math.sqrt(self.head_dim), + past_key_value, + None, + attention_mask, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + # attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + past_key_value = present + (inference_hidden_states_memory.unsqueeze(0),) + return attn_output, attn_weights, past_key_value + + def _create_attention_mask_for_git( self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None ): @@ -1816,6 +1948,7 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): setattr(self.__class__, k, getattr(module.__class__, k)) self.model_backbone = config.architectures[0] + self.distributed = distributed # common known as hidden_size if hasattr(module, "hidden_size"): @@ -2313,6 +2446,16 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "YuanForCausalLM": + return 
_YuanAttention_forward( + self, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + ) else: AssertionError(False, "Do not support the optimization of your model yet") @@ -2331,6 +2474,20 @@ def _reorder_cache( layer_past[3][layer_past[0].size(-2) - 1] = beam_idx layer_past[7][layer_past[0].size(-2) - 1] = beam_idx return past_key_values + elif len(past_key_values[0]) == 5: + for layer_past in past_key_values: + layer_past[3][layer_past[0].size(-2) - 1] = beam_idx + layer_past[-1][0] = layer_past[-1][0].index_select(0, beam_idx) + return past_key_values + elif len(past_key_values[0]) == 3: + return tuple( + ( + layer_past[0].index_select(0, beam_idx), + layer_past[1].index_select(0, beam_idx), + layer_past[2][0].index_select(0, beam_idx).unsqueeze(0), + ) + for layer_past in past_key_values + ) else: return tuple( tuple( diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index 1b04ccb97..d9ffccad8 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -1135,6 +1135,54 @@ def CLIPEncoderLayer_forward( return outputs +def YuanDecoderLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, +) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + if not self.distributed: + hidden_states = self.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.o_proj(hidden_states) + hidden_states = residual + hidden_states + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + # hidden_states = residual + hidden_states + mlp_gate = self.linear_silu_mul(hidden_states) + if not self.distributed: + hidden_states = self.mlp_linear_add(mlp_gate, residual) + else: + hidden_states = self.mlp.down_proj(mlp_gate) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + outputs += (present_key_value,) + + return outputs + + class _IPEXDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): super().__init__() @@ -1309,6 +1357,17 @@ def __init__(self, module, config, distributed=False): ) del self.__dict__["_modules"]["mlp"].gate_proj del self.__dict__["_modules"]["mlp"].up_proj + elif self.model_backbone == "YuanForCausalLM": + if not self.distributed: + self.mha_linear_add = _IPEXlinearAddRef(module.self_attn.o_proj) + self.mlp_linear_add = _IPEXlinearAddRef(module.mlp.down_proj) + del self.__dict__["_modules"]["self_attn"].o_proj + del self.__dict__["_modules"]["mlp"].down_proj + self.linear_silu_mul = _IPEXlinearSiluMulRef( + module.mlp.up_proj, module.mlp.gate_proj + ) + del 
self.__dict__["_modules"]["mlp"].gate_proj + del self.__dict__["_modules"]["mlp"].up_proj else: AssertionError(False, "Do not support the optimization of your model yet") @@ -1519,5 +1578,15 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "YuanForCausalLM": + return YuanDecoderLayer_forward( + self, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index bc40c22c5..293b1a0fd 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -182,6 +182,8 @@ def model_convert_reference(_model): GitModel_forward, CLIPEncoder_forward, LlavaLlamaForCausalLM_forward, + YuanForCausalLM_forward, + YuanModel_forward, prepare_inputs_for_generation, prepare_inputs_for_generation_gptbigcode, prepare_inputs_for_generation_llama, @@ -350,9 +352,7 @@ def model_convert_reference(_model): else: need_ipex_tp = True distributed = True - - # model-wise optimizations - MHA module - for supported_mha_class in [ + supported_mha_classes = [ transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention, transformers.models.llama.modeling_llama.LlamaAttention, transformers.models.gptj.modeling_gptj.GPTJAttention, @@ -361,17 +361,37 @@ def model_convert_reference(_model): transformers.models.codegen.modeling_codegen.CodeGenAttention, transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeAttention, transformers.models.t5.modeling_t5.T5Attention, - ]: - if need_ipex_tp and supported_mha_class in [ - transformers.models.llama.modeling_llama.LlamaAttention, - transformers.models.gptj.modeling_gptj.GPTJAttention, - ]: + ] + ipex_tp_supported_mha_classes = [ + transformers.models.llama.modeling_llama.LlamaAttention, + transformers.models.gptj.modeling_gptj.GPTJAttention, + ] + ipex_tp_supported_mlp_classes = [ + transformers.models.llama.modeling_llama.LlamaMLP, + transformers.models.gptj.modeling_gptj.GPTJMLP, + ] + ipex_tp_supported_model_classes = [ + transformers.models.llama.modeling_llama.LlamaForCausalLM, + transformers.models.gptj.modeling_gptj.GPTJForCausalLM, + ] + yuan_attention = None + if _model.config.architectures[0] == "YuanForCausalLM": + yuan_attention = type(_model.model.layers[0].self_attn) + supported_mha_classes.append(yuan_attention) + ipex_tp_supported_mha_classes.append(yuan_attention) + ipex_tp_supported_mlp_classes.append(type(_model.model.layers[0].mlp)) + ipex_tp_supported_model_classes.append(type(_model)) + # model-wise optimizations - MHA module + for supported_mha_class in supported_mha_classes: + if need_ipex_tp and supported_mha_class in ipex_tp_supported_mha_classes: num_heads = _model.config.num_attention_heads num_kv_heads = num_heads for name in ["num_key_value_heads"]: if hasattr(_model.config, name): num_kv_heads = getattr(_model.config, name) head_dim = _model.config.hidden_size // num_heads + value_with_share_qk = supported_mha_class == yuan_attention + shard_local_filtering = supported_mha_class == yuan_attention shard_mha_weights( _model, supported_mha_class, @@ -380,8 +400,9 @@ def model_convert_reference(_model): head_dim, rank, world_size, + value_with_share_qk, + shard_local_filtering, ) - convert_class( _model, supported_mha_class, @@ -390,10 +411,7 @@ def model_convert_reference(_model): distributed=distributed, ) if 
need_ipex_tp: - for supported_mlp_class in [ - transformers.models.llama.modeling_llama.LlamaMLP, - transformers.models.gptj.modeling_gptj.GPTJMLP, - ]: + for supported_mlp_class in ipex_tp_supported_mlp_classes: shard_mlp_weights( _model, supported_mlp_class, @@ -403,10 +421,7 @@ def model_convert_reference(_model): rank, world_size, ) - for supported_model_class in [ - transformers.models.llama.modeling_llama.LlamaForCausalLM, - transformers.models.gptj.modeling_gptj.GPTJForCausalLM, - ]: + for supported_model_class in ipex_tp_supported_model_classes: if isinstance(_model, supported_model_class): shard_lm_head_weights( _model, @@ -436,7 +451,6 @@ def model_convert_reference(_model): _model.config, distributed=distributed, ) - # special list that has not official transformers design if _model.config.architectures[0] == "BloomForCausalLM": convert_function( @@ -714,6 +728,23 @@ def model_convert_reference(_model): _model.config, distributed=distributed, ) + elif _model.config.architectures[0] == "YuanForCausalLM": + convert_function(_model, "forward", YuanForCausalLM_forward) + convert_function(_model.model, "forward", YuanModel_forward) + convert_class( + _model, + type(_model.model.layers[0].self_attn), + _IPEXAttentionRef, + _model.config, + distributed=distributed, + ) + convert_class( + _model, + type(_model.model.layers[0]), + _IPEXDecoderLayerRef, + _model.config, + distributed=distributed, + ) return _model @@ -874,7 +905,35 @@ def get_dummy_input(_model, return_dict=False): sample_inputs = ( torch.zeros(batch_size, 1, 4096).to(_model.dtype), ) + sample_inputs[1:] + if _model.config.architectures[0] == "YuanForCausalLM": + hidden_size = _model.config.hidden_size + if _model.device.type == "cpu": + from ..cpu import comm as ipex_comm + world_size = ipex_comm.get_world_size() + hidden_size = hidden_size * world_size + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 1, 2, hidden_size), + ) + for i in range(model_num_layers) + ] + ) + sample_inputs = ( + { + "input_ids": input_ids[:, -1:], + "attention_mask": attention_mask[:, -1:], + "past_key_values": past_key_values, + "position_ids": position_ids[:, -1:], + } + if return_dict + else (input_ids, attention_mask, position_ids, past_key_values) + ) if "return_last_logit" in model_inputs: if return_dict: sample_inputs["return_last_logit"] = torch.tensor(True) @@ -964,7 +1023,10 @@ def model_convert_lowering( supported_classes = [ transformers.models.llama.modeling_llama.LlamaRMSNorm, ] - if _model.config.architectures[0] == "BaichuanForCausalLM": + if _model.config.architectures[0] in [ + "BaichuanForCausalLM", + "YuanForCausalLM", + ]: supported_classes.append(type(_model.model.layers[0].input_layernorm)) if ( _model.config.architectures[0] == "ChatGLMModel" @@ -1036,9 +1098,38 @@ def model_convert_lowering( check_trace=False, ) trace_model = torch.jit.freeze(trace_model) - _model = _set_optimized_model_for_generation( - _model, optimized_model=trace_model - ) + if _model.config.architectures[0] == "YuanForCausalLM": + sample_inputs.pop("past_key_values", None) + batch_size = ( + _model.config.batch_size + if hasattr(_model.config, "batch_size") + else 1 + ) + sample_inputs["input_ids"] = sample_inputs["input_ids"].repeat( + batch_size, 1 + ) + sample_inputs["attention_mask"] = sample_inputs[ + "attention_mask" + ].repeat(batch_size, 1) + 
sample_inputs["position_ids"] = sample_inputs[ + "position_ids" + ].repeat(batch_size, 1) + trace_model_first = torch.jit.trace( + _model, + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + trace_model_first = torch.jit.freeze(trace_model_first) + _model = _set_optimized_model_for_generation( + _model, + optimized_model=trace_model, + first_token_optimized_model=trace_model_first, + ) + else: + _model = _set_optimized_model_for_generation( + _model, optimized_model=trace_model + ) return _model @@ -1080,7 +1171,7 @@ def optimize( Well supported model family with full functionalities: Llama, GPT-J, GPT-Neox, OPT, Falcon, Bloom, CodeGen, Baichuan, ChatGLM, GPTBigCode, - T5, Mistral, MPT, Mixtral, StableLM, QWen, Git, Llava. + T5, Mistral, MPT, Mixtral, StableLM, QWen, Git, Llava, Yuan. For the model that is not in the scope of supported model family above, will try to apply default ipex.optimize transparently to get benifits (not include quantizations, @@ -1167,6 +1258,7 @@ def optimize( "QWenLMHeadModel", "GitForCausalLM", "LlavaLlamaForCausalLM", + "YuanForCausalLM", ] if well_supported_model: @@ -1175,7 +1267,7 @@ def optimize( if quantization_config is not None: logger.warning( "ipex.llm.optimize supports quantizations on Llama, GPT-J, GPT-Neox, Falcon, OPT, Bloom, CodeGen," - + " Baichuan, ChatGLM, GPTBigCode, T5, Mistral, Mixtral, MPT, StableLM, QWen, Git, and Llava," + + " Baichuan, ChatGLM, GPTBigCode, T5, Mistral, Mixtral, MPT, StableLM, QWen, Git, Llava, and Yuan" + "fallback to origin model" ) return model @@ -1286,9 +1378,38 @@ def optimize( check_trace=False, ) trace_model = torch.jit.freeze(trace_model) - _model = _set_optimized_model_for_generation( - _model, optimized_model=trace_model - ) + if _model.config.architectures[0] == "YuanForCausalLM": + sample_inputs.pop("past_key_values", None) + batch_size = ( + _model.config.batch_size + if hasattr(_model.config, "batch_size") + else 1 + ) + sample_inputs["input_ids"] = sample_inputs[ + "input_ids" + ].repeat(batch_size, 1) + sample_inputs["attention_mask"] = sample_inputs[ + "attention_mask" + ].repeat(batch_size, 1) + sample_inputs["position_ids"] = sample_inputs[ + "position_ids" + ].repeat(batch_size, 1) + trace_model_first = torch.jit.trace( + _model, + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + trace_model_first = torch.jit.freeze(trace_model_first) + _model = _set_optimized_model_for_generation( + _model, + optimized_model=trace_model, + first_token_optimized_model=trace_model_first, + ) + else: + _model = _set_optimized_model_for_generation( + _model, optimized_model=trace_model + ) return _model else: print( diff --git a/intel_extension_for_pytorch/transformers/tensor_parallel.py b/intel_extension_for_pytorch/transformers/tensor_parallel.py index a38181521..f55ecabc8 100644 --- a/intel_extension_for_pytorch/transformers/tensor_parallel.py +++ b/intel_extension_for_pytorch/transformers/tensor_parallel.py @@ -4,6 +4,101 @@ import os +class TensorParallelConv2d(nn.Module): + def __init__(self, conv, rank, world_size, shard_by_oc): + super().__init__() + self.rank = rank + self.world_size = world_size + self.shard_by_oc = shard_by_oc + self.shard_weights(conv) + + def shard_weights(self, conv): + if self.world_size == 1: + return + if self.shard_by_oc: + total_size = conv.weight.shape[0] + else: + total_size = conv.weight.shape[1] + bias_data = None + cols_per_rank = [0] + for i in range(self.world_size - 1, -1, -1): + cols = total_size // self.world_size + 
if i < total_size % self.world_size: + cols += 1 + cols_per_rank.append(cols_per_rank[-1] + cols) + weight_data = conv.weight.data + if self.shard_by_oc: + weight_data = weight_data[ + cols_per_rank[self.rank] : cols_per_rank[self.rank + 1] + ] + if conv.bias is not None: + bias_data = conv.bias.data[ + cols_per_rank[self.rank] : cols_per_rank[self.rank + 1] + ] + else: + weight_data = weight_data[ + :, cols_per_rank[self.rank] : cols_per_rank[self.rank + 1] + ] + if conv.bias is not None: + bias_data = conv.bias.data / float(self.world_size) + self.conv = nn.Conv2d( + weight_data.shape[1], + weight_data.shape[0], + conv.kernel_size, + conv.stride, + conv.padding, + conv.dilation, + conv.groups, + conv.bias is not None, + conv.padding_mode, + ) + self.conv.weight = torch.nn.Parameter(weight_data) + if conv.bias is not None: + self.conv.bias = torch.nn.Parameter(bias_data) + del conv + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return self.conv(input) + + +class TensorParallelOcShardConv2d(TensorParallelConv2d): + def __init__(self, conv, rank, world_size): + super().__init__(conv, rank, world_size, True) + + +class TensorParallelIcShardConv2d(TensorParallelConv2d): + def __init__(self, conv, rank, world_size): + super().__init__(conv, rank, world_size, False) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + out = self.conv(input) + if self.world_size > 1: + ipex_comm.allreduce_add(out) + return out + + +def shard_local_filtering_Conv2d_weights(model, target_m, rank, world_size): + if world_size == 1: + return + for name, sub_m in model.named_children(): + for l_name, l_sub_m in sub_m.named_children(): + if l_name in ["conv1"]: + TPConv2d = TensorParallelOcShardConv2d( + l_sub_m, + rank, + world_size, + ) + setattr(sub_m, l_name, TPConv2d) + if l_name in ["conv2"]: + TPConv2d = TensorParallelIcShardConv2d( + l_sub_m, + rank, + world_size, + ) + setattr(sub_m, l_name, TPConv2d) + shard_local_filtering_Conv2d_weights(sub_m, target_m, rank, world_size) + + class TensorParallellLinear(nn.Module): def __init__( self, @@ -15,6 +110,7 @@ def __init__( world_size, shard_by_head, shard_by_col, + value_with_share_qk=False, ): super().__init__() self.num_kv_heads = num_kv_heads @@ -25,7 +121,7 @@ def __init__( self.shard_by_head = shard_by_head self.shard_by_col = shard_by_col self.cols_per_rank = None - self.shard_weights(linear) + self.shard_weights(linear, value_with_share_qk) def shard_weights_by_head( self, @@ -60,6 +156,15 @@ def shard_weights_by_head( if i < num_kv_heads % world_size: kv_head_this_rank += 1 kv_head_range.append(kv_head_range[-1] + kv_head_this_rank) + cols_per_rank = [0] + for i in range(world_size): + q_head_start = kv_head_range[i] * kv_group_size + q_head_end = ( + q_head_start + (kv_head_range[i + 1] - kv_head_range[i]) * kv_group_size + ) + cols_per_rank.append( + cols_per_rank[-1] + (q_head_end - q_head_start) * head_dim + ) weight_data = linear.weight.data q_head_start = kv_head_range[rank] * kv_group_size q_head_end = ( @@ -75,7 +180,7 @@ def shard_weights_by_head( else: q = weight_data[:, q_head_start * head_dim : q_head_end * head_dim] if not concat_qkv: - return torch.nn.Parameter(q), torch.nn.Parameter(q_bias) + return torch.nn.Parameter(q), torch.nn.Parameter(q_bias), cols_per_rank k_head_start = num_heads + kv_head_range[rank] k_head_end = k_head_start + (kv_head_range[rank + 1] - kv_head_range[rank]) @@ -98,7 +203,11 @@ def shard_weights_by_head( if linear.bias is not None: bias_data = linear.bias.data weight_data = torch.cat([q, k, 
v], dim=0) - return torch.nn.Parameter(weight_data), torch.nn.Parameter(bias_data) + return ( + torch.nn.Parameter(weight_data), + torch.nn.Parameter(bias_data), + None, + ) def shard_weights_by_block( self, linear, rank, world_size, shard_by_col=True, block_size=64 @@ -138,11 +247,118 @@ def shard_weights_by_block( cols_per_rank, ) - def shard_weights(self, linear): + def shard_value_with_share_qk( + self, + linear, + num_heads, + head_dim, + rank, + world_size, + # shard_by_col=True, + ): + + total_size = linear.weight.shape[0] + if world_size == 1: + return + assert num_heads % world_size == 0 + if world_size > num_heads // 2: + RuntimeError( + f"world_size {world_size} is larger than half of num_heads {num_heads}" + ) + head_per_rank = num_heads // world_size + q_head_start = rank * head_per_rank + # mapping q_head to v_head + v_head_ids = [] + i = 0 + # mapping neighbor q_head to v_head + while i < head_per_rank: + v_head_ids.append(q_head_start // 2) + q_head_start += 2 + i = i + 2 + + # mapping neighbor k_head to v_head + v_head_ids.extend([i + num_heads // 2 for i in v_head_ids]) + weight_data = linear.weight.data + sharded_weight = [] + sharded_bias = [] + for head_id in v_head_ids: + sharded_weight.append( + weight_data[head_id * head_dim : (head_id + 1) * head_dim] + ) + if linear.bias is not None: + sharded_bias.append( + linear.bias.data[head_id * head_dim : (head_id + 1) * head_dim] + ) + sharded_weight = torch.cat(sharded_weight, dim=0) + if linear.bias is not None: + sharded_bias = torch.cat(sharded_bias, dim=0) + else: + sharded_bias = None + return torch.nn.Parameter(sharded_weight), torch.nn.Parameter(sharded_bias) + + def shard_oproj_with_share_qk( + self, + linear, + num_heads, + head_dim, + rank, + world_size, + ): + + total_size = linear.weight.shape[1] + if world_size == 1: + return + assert num_heads % world_size == 0 + if world_size > num_heads // 2: + RuntimeError( + f"world_size {world_size} is larger than half of num_heads {num_heads}" + ) + head_per_rank = num_heads // world_size + q_head_start = rank * head_per_rank + # mapping q_head to v_head + v_head_ids = [] + i = 0 + # mapping neighbor q_head to v_head + while i < head_per_rank: + v_head_ids.append(q_head_start // 2) + q_head_start += 2 + i = i + 2 + + # mapping neighbor k_head to v_head + v_head_ids.extend([i + num_heads // 2 for i in v_head_ids]) + weight_data = linear.weight.data + sharded_weight = [] + for head_id in v_head_ids: + sharded_weight.append( + weight_data[:, head_id * head_dim : (head_id + 1) * head_dim] + ) + sharded_weight = torch.cat(sharded_weight, dim=1) + if linear.bias is not None: + linear.bias = linear.bias / float(world_size) + return torch.nn.Parameter(sharded_weight), torch.nn.Parameter(linear.bias) + + def shard_weights(self, linear, value_with_share_qk=False): if self.world_size == 1: return - if self.shard_by_head: - weight, bias = self.shard_weights_by_head( + if self.shard_by_head and value_with_share_qk: + if self.shard_by_col: + weight, bias = self.shard_value_with_share_qk( + linear, + self.num_heads, + self.head_dim, + self.rank, + self.world_size, + ) + else: + weight, bias = self.shard_oproj_with_share_qk( + linear, + self.num_heads, + self.head_dim, + self.rank, + self.world_size, + ) + elif self.shard_by_head: + weight, bias, self.cols_per_rank = self.shard_weights_by_head( linear, self.num_kv_heads, self.num_heads, @@ -181,6 +397,7 @@ def __init__( rank, world_size, shard_by_head=True, + value_with_share_qk=False, ): super().__init__( linear, @@ -191,6 +408,7 @@ 
def __init__( world_size, shard_by_head, shard_by_col=True, + value_with_share_qk=value_with_share_qk, ) @@ -204,6 +422,7 @@ def __init__( rank, world_size, shard_by_head=True, + value_with_share_qk=False, ): super().__init__( linear, @@ -214,6 +433,7 @@ def __init__( world_size, shard_by_head, shard_by_col=False, + value_with_share_qk=value_with_share_qk, ) def forward(self, input: torch.Tensor) -> torch.Tensor: @@ -264,8 +484,18 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: def shard_mha_weights( - model, target_m, num_heads, num_kv_heads, head_dim, rank, world_size + model, + target_m, + num_heads, + num_kv_heads, + head_dim, + rank, + world_size, + value_with_share_qk=False, + shard_local_filtering=False, ): + if shard_local_filtering: + shard_local_filtering_Conv2d_weights(model, target_m, rank, world_size) if world_size == 1: return for name, sub_m in model.named_children(): @@ -282,8 +512,8 @@ def shard_mha_weights( shard_by_head=True, ) # del sub_m.__dict__["_modules"][l_name] - setattr(sub_m, l_name, TPLinear.linear) - if l_name in ["k_proj", "v_proj"]: + setattr(sub_m, l_name, TPLinear) + if l_name in ["k_proj"]: TPLinear = TensorParallelColumnLinear( l_sub_m, num_kv_heads, @@ -294,8 +524,33 @@ def shard_mha_weights( shard_by_head=True, ) # del sub_m.__dict__["_modules"][l_name] - setattr(sub_m, l_name, TPLinear.linear) - if l_name in ["out_proj", "o_proj"]: + setattr(sub_m, l_name, TPLinear) + if l_name in ["v_proj"]: + TPLinear = TensorParallelColumnLinear( + l_sub_m, + num_kv_heads, + num_kv_heads, + head_dim, + rank, + world_size, + True, + value_with_share_qk, + ) + # del sub_m.__dict__["_modules"][l_name] + setattr(sub_m, l_name, TPLinear) + if l_name in ["out_proj"]: + TPLinear = TensorParallelRowLinear( + l_sub_m, + num_kv_heads, + num_heads, + head_dim, + rank, + world_size, + shard_by_head=True, + ) + # del sub_m.__dict__["_modules"][l_name] + setattr(sub_m, l_name, TPLinear) + if l_name in ["o_proj"]: TPLinear = TensorParallelRowLinear( l_sub_m, num_kv_heads, @@ -304,12 +559,20 @@ def shard_mha_weights( rank, world_size, shard_by_head=True, + value_with_share_qk=True, ) # del sub_m.__dict__["_modules"][l_name] setattr(sub_m, l_name, TPLinear) shard_mha_weights( - sub_m, target_m, num_heads, num_kv_heads, head_dim, rank, world_size + sub_m, + target_m, + num_heads, + num_kv_heads, + head_dim, + rank, + world_size, + value_with_share_qk, ) diff --git a/tests/cpu/hf_configs/yuan/config.json b/tests/cpu/hf_configs/yuan/config.json new file mode 100644 index 000000000..4c1ab754d --- /dev/null +++ b/tests/cpu/hf_configs/yuan/config.json @@ -0,0 +1,39 @@ +{ + "_from_model_config": true, + "architectures": [ + "YuanForCausalLM" + ], + "auto_map": { + "AutoConfig": "configuration_yuan.YuanConfig", + "AutoModelForCausalLM": "yuan_hf_model.YuanForCausalLM" + }, + "tokenizer_class": "YuanTokenizer", + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 8192, + "model_type": "yuan", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "rms_norm_eps": 1e-06, + "dropout": 0.1, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.30.0.dev0", + "use_cache": true, + "causal_mask": true, + "use_flash_attention": false, + "reset_attention_mask": true, + "reset_position_ids": true, + "use_loss_mask": false, + "eod_token": 77185, + "sep_token": 77187, + "eod_token_id": 77185, + "sep_token_id": 77185, + "pad_token_id": 77185, + "bos_token_id": 77185, + 
"eos_token_id": 77185, + "mask_token_id": 77185, + "vocab_size": 135040 +} \ No newline at end of file diff --git a/tests/cpu/hf_configs/yuan/configuration_yuan.py b/tests/cpu/hf_configs/yuan/configuration_yuan.py new file mode 100644 index 000000000..fea079faa --- /dev/null +++ b/tests/cpu/hf_configs/yuan/configuration_yuan.py @@ -0,0 +1,42 @@ +from transformers.configuration_utils import PretrainedConfig + + +class YuanConfig(PretrainedConfig): + model_type = "yuan" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=135040, + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=24, + num_attention_heads=32, + hidden_act="silu", + model_max_length=8192, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=77185, + bos_token_id=77185, + eos_token_id=77185, + tie_word_embeddings=True, + **kwargs, + ): + self.vocab_size = vocab_size + self.model_max_length = model_max_length + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/tests/cpu/hf_configs/yuan/yuan_hf_model.py b/tests/cpu/hf_configs/yuan/yuan_hf_model.py new file mode 100644 index 000000000..9533b3cae --- /dev/null +++ b/tests/cpu/hf_configs/yuan/yuan_hf_model.py @@ -0,0 +1,1389 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Yuan model.""" +import math +from typing import List, Optional, Tuple, Union +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_yuan import YuanConfig +from einops import rearrange + +# from flash_attn import flash_attn_varlen_func as flash_attn_unpadded_func +# from flash_attn import flash_attn_func + +import copy + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "YuanConfig" + + +class LocalizedFiltering(torch.nn.Module): + """ + Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of + variable names and moving away from the stateful representation of incremental decoding state. See + "https://arxiv.org/abs/2209.10655" for more details. + """ + + def __init__(self, hidden_size): + super().__init__() + + self.embed_dim = hidden_size + self.lf_conv2d_group = 1 + self.lf_conv2d_num_pad = 1 + + self.conv1 = torch.nn.Conv2d( + self.embed_dim, + self.embed_dim // 2, + (2, 1), + stride=(1, 1), + padding=(self.lf_conv2d_num_pad, 0), + groups=self.lf_conv2d_group, + ) + self.conv2 = torch.nn.Conv2d( + self.embed_dim // 2, + self.embed_dim, + (2, 1), + stride=(1, 1), + padding=(self.lf_conv2d_num_pad, 0), + groups=self.lf_conv2d_group, + ) + self.output_layernorm = YuanRMSNorm(self.embed_dim) + + def _train_forward(self, inputs): + inputs = inputs.transpose(0, 1) + seq_len, bsz, embed_dim = inputs.size() + if embed_dim != self.embed_dim: + raise ValueError( + f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}" + ) + residual = inputs + + inputs = inputs.view(seq_len, 1, bsz, embed_dim).permute(2, 3, 0, 1) + output1 = self.conv1(inputs) + output1 = output1[:, :, :seq_len, :] + + output2 = self.conv2(output1) + output2 = output2[:, :, :seq_len, :].permute(2, 3, 0, 1).contiguous() + output2 = output2.view(seq_len, bsz, embed_dim) + assert output2.shape == residual.shape + + lf_output = self.output_layernorm(output2 + residual) + lf_output = lf_output.transpose(0, 1) + return lf_output + + def _inference_forward(self, inputs, before_hidden_states): + if before_hidden_states is None: + inputs = inputs.transpose(0, 1) + seq_len, bsz, embed_dim = inputs.size() + if embed_dim != self.embed_dim: + raise ValueError( + f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}" + ) + residual = inputs + + inputs = inputs.view(seq_len, 1, bsz, embed_dim).permute(2, 3, 0, 1) + output1 = self.conv1(inputs) + output1 = output1[:, :, :seq_len, :] + + output2 = self.conv2(output1) + output2 = output2[:, :, :seq_len, :].permute(2, 3, 0, 1).contiguous() + output2 = output2.view(seq_len, bsz, embed_dim) + assert output2.shape == residual.shape + + lf_output = self.output_layernorm(output2 + residual) + lf_output = lf_output.transpose(0, 1) + return lf_output + else: + inputs = inputs.transpose(0, 1) + before_hidden_states = before_hidden_states.transpose(0, 1) + residual = inputs + + seq_len, bsz, embed_dim = inputs.size() + seq_len_before, _, _ = before_hidden_states.size() + + 
assert seq_len == 1 and seq_len_before == 2 + + inputs = torch.cat((before_hidden_states, inputs), dim=0) + inputs = inputs.view(3, 1, bsz, embed_dim).permute(2, 3, 0, 1) + + output1 = self.conv1(inputs) + output2 = self.conv2(output1[:, :, 1:-1, :]) + output2 = output2[:, :, 1:-1, :] + output2 = output2.view(1, bsz, embed_dim) + assert output2.shape == residual.shape + + lf_output = self.output_layernorm(output2 + residual) + lf_output = lf_output.transpose(0, 1) + + return lf_output + + def forward(self, inputs, before_hidden_states) -> torch.Tensor: + assert self.lf_conv2d_num_pad == 1 + if self.training: + lf_output = self._train_forward(inputs) + else: + lf_output = self._inference_forward(inputs, before_hidden_states) + + return lf_output + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full( + (tgt_len, tgt_len), + torch.tensor(torch.finfo(dtype).min, device=device), + device=device, + ) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat( + [ + torch.zeros( + tgt_len, past_key_values_length, dtype=dtype, device=device + ), + mask, + ], + dim=-1, + ) + return mask[None, None, :, :].expand( + bsz, 1, tgt_len, tgt_len + past_key_values_length + ) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(dtype).min + ) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
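+    # rotate_half() maps the two halves of the head dimension (x1, x2) to (-x2, x1),
+    # so `q * cos + rotate_half(q) * sin` below realizes the RoPE rotation; the cos/sin
+    # rows for each token are gathered through `position_ids`.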
+ cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class YuanRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + YuanRMSNorm is equivalent to LlamaRMSNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +class YuanRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + """ + YuanRotaryEmbedding is equivalent to LlamaRotaryEmbedding in transformers v4.36 + """ + + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + seq_len=max_position_embeddings, + device=self.inv_freq.device, + dtype=torch.get_default_dtype(), + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer( + "cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False + ) + self.register_buffer( + "sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False + ) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), + ) + + +class YuanMLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + ): + super().__init__() + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + return self.down_proj(self.gate_proj(x) * self.act_fn(self.up_proj(x))) + + +class YuanAttention(nn.Module): + """Localized Filtering-based Attention 'YUAN 2.0: A Large Language Model with Localized Filtering-based Attention' paper""" + + def __init__(self, config: YuanConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + self.causal_mask = config.causal_mask + self.softmax_scale = 1.0 / 
math.sqrt(self.head_dim) + self.use_flash_attention = config.use_flash_attention + try: + self.use_shareqk = config.use_shareqk + except Exception as e: + self.use_shareqk = False + self.dropout = 0.0 + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + self.v_proj = nn.Linear( + self.hidden_size, self.num_heads * self.head_dim, bias=False + ) + self.o_proj = nn.Linear( + self.num_heads * self.head_dim, self.hidden_size, bias=False + ) + self.rotary_emb = YuanRotaryEmbedding( + self.head_dim, max_position_embeddings=self.max_position_embeddings + ) + if self.use_shareqk: + self.qk_proj = nn.Linear( + self.hidden_size, self.num_heads * self.head_dim, bias=False + ) + self.qk_weight = nn.Parameter(torch.Tensor(2, self.hidden_size)) + self.qk_bias = nn.Parameter(torch.Tensor(2, self.hidden_size)) + else: + self.lf_gate = LocalizedFiltering(self.hidden_size) + self.q_proj = nn.Linear( + self.hidden_size, self.num_heads * self.head_dim, bias=False + ) + self.k_proj = nn.Linear( + self.hidden_size, self.num_heads * self.head_dim, bias=False + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + before_hidden_states = None + is_first_step = False + if use_cache: + if past_key_value is None: + inference_hidden_states_memory = torch.empty( + bsz, 2, hidden_states.shape[2], dtype=hidden_states.dtype + ) + is_first_step = True + else: + before_hidden_states = past_key_value[2] + + if use_cache: + if is_first_step: + if q_len >= 2: + inference_hidden_states_memory = hidden_states[:, -2:, :] + else: + inference_hidden_states_memory[:, :, :] = 0 + inference_hidden_states_memory[:, -1:, :] = hidden_states[:, -1:, :] + else: + hidden_states_tmp = before_hidden_states[:, -1:, :] + inference_hidden_states_memory = copy.deepcopy( + torch.cat((hidden_states_tmp, hidden_states), dim=1) + ) + + value_states = ( + self.v_proj(hidden_states) + .view(bsz, q_len, self.num_heads, self.head_dim) + .transpose(1, 2) + ) + if self.use_shareqk: + qk_states = self.qk_proj(hidden_states).view( + bsz, q_len, self.num_heads * self.head_dim + ) + query_key = qk_states.unsqueeze(2) * self.qk_weight + self.qk_bias + query_states, key_states = torch.unbind(query_key, dim=2) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + else: + hidden_states = self.lf_gate(hidden_states, before_hidden_states) + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + qk_states = torch.cat([query_states, key_states], dim=-1) + qk_states = qk_states.view( + bsz, q_len, self.num_heads, int(qk_states.shape[-1] // self.num_heads) + ) + (query_states, key_states) = torch.chunk(qk_states, 2, dim=-1) + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + + kv_seq_len = 
key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = ( + (key_states, value_states, inference_hidden_states_memory) + if use_cache + else None + ) + + if self.use_flash_attention: + attn_weights = None + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + batch_size, seqlen_q = query_states.shape[0], query_states.shape[1] + seqlen_k = key_states.shape[1] + + q, k, v = [ + rearrange(x, "b s ... -> (b s) ...") + for x in [query_states, key_states, value_states] + ] + + cu_seqlens_q = torch.arange( + 0, + (batch_size + 1) * seqlen_q, + step=seqlen_q, + dtype=torch.int, + device=q.device, + ) + + if self.training: + assert seqlen_k == seqlen_q + cu_seqlens_k = cu_seqlens_q + is_causal = self.causal_mask + else: + is_causal = seqlen_q == seqlen_k + cu_seqlens_k = torch.arange( + 0, + (batch_size + 1) * seqlen_k, + step=seqlen_k, + dtype=torch.int, + device=q.device, + ) + self.dropout = 0 + + output = flash_attn_unpadded_func( + q, + k, + v, + cu_seqlens_q, + cu_seqlens_k, + seqlen_q, + seqlen_k, + self.dropout, + causal=is_causal, + ) + + attn_output = rearrange(output, "(b s) ... -> b s ...", b=batch_size) + else: + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) + ) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + return attn_output, attn_weights, past_key_value + + +class YuanDecoderLayer(nn.Module): + def __init__(self, config: YuanConfig): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = YuanAttention(config=config) + self.mlp = YuanMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + ) + self.input_layernorm = YuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = YuanRMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + hidden_states: 
torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +YUAN_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`YuanConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Yuan Model outputting raw hidden-states without any specific head on top.", + YUAN_START_DOCSTRING, +) +class YuanPreTrainedModel(PreTrainedModel): + config_class = YuanConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["YuanDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, YuanModel): + module.gradient_checkpointing = value + + +YUAN_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed + or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. 
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Yuan Model outputting raw hidden-states without any specific head on top.", + YUAN_START_DOCSTRING, +) +class YuanModel(YuanPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`YuanDecoderLayer`] + + Args: + config: YuanConfig + """ + + def __init__(self, config: YuanConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + # TODO: control it by config + self.eod_token = config.eod_token + self.reset_attention_mask = config.reset_attention_mask + self.reset_position_ids = config.reset_position_ids + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [YuanDecoderLayer(config) for _ in range(config.num_hidden_layers)] + ) + self.norm = YuanRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask( + self, attention_mask, input_shape, inputs_embeds, past_key_values_length + ): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask( + attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ).to(inputs_embeds.device) + combined_attention_mask = ( + expanded_attn_mask + if combined_attention_mask is None + else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def _prepare_decoder_attention_mask_training( + self, + input_id, + inputs_embeds, + eod_token, + reset_mask_flag, + reset_attention_mask=True, + reset_position_ids=True, + ): + micro_batch_size, seq_length = input_id.size() + + attention_mask = torch.tril( + torch.ones( + (micro_batch_size, seq_length, seq_length), device=inputs_embeds.device + ) + ).view(micro_batch_size, 1, seq_length, seq_length) + + position_ids = torch.arange( + 
seq_length, dtype=torch.long, device=inputs_embeds.device + ) + position_ids = position_ids.unsqueeze(0).expand_as(input_id) + + if reset_position_ids: + position_ids = position_ids.clone() + + if reset_position_ids or reset_attention_mask: + # Loop through the batches: + for b in range(micro_batch_size): + # Find indecies where EOD token is. + eod_index = position_ids[b, input_id[b] == eod_token] + + # Detach indecies from positions if going to modify positions. + if reset_position_ids: + eod_index = eod_index.clone() + # Loop through EOD indecies: + prev_index = 0 + for j in range(eod_index.size()[0]): + i = eod_index[j] + # Mask attention loss. + if reset_attention_mask: + attention_mask[b, 0, (i + 1) :, : (i + 1)] = 0 + # Reset positions. + if reset_position_ids: + position_ids[b, (i + 1) :] -= i + 1 - prev_index + prev_index = i + 1 + + inverted_mask = 1 - attention_mask + output_attn_mask = inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min + ) + if reset_mask_flag: + output_attn_mask = output_attn_mask[:, :, -1:, :] + return output_attn_mask, position_ids + + @add_start_docstrings_to_model_forward(YUAN_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + input_ids1 = copy.deepcopy(input_ids) + reset_mask_flag = False + if past_key_values: + input_ids = input_ids[:, -1:] + if use_cache: + reset_mask_flag = True + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + if self.training or self.reset_position_ids: + attention_mask, _ = self._prepare_decoder_attention_mask_training( + input_ids1, + 
inputs_embeds, + self.eod_token, + reset_mask_flag, + self.reset_attention_mask, + self.reset_position_ids, + ) + + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), + dtype=torch.bool, + device=inputs_embeds.device, + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = ( + past_key_values[idx] if past_key_values is not None else None + ) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class YuanForCausalLM(YuanPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.eod_token = config.eod_token + self.sep_token = config.sep_token + self.use_loss_mask = config.use_loss_mask + self.model = YuanModel(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def get_loss_mask(self, input_ids, labels, eod_token, sep_token): + micro_batch_size, seq_length = input_ids.size() + loss_mask = torch.ones( + input_ids.size(), dtype=torch.float, device=input_ids.device + ) + + position_ids = torch.arange( + seq_length, dtype=torch.long, device=input_ids.device + ) + position_ids = 
position_ids.unsqueeze(0).expand_as(input_ids) + + """modify loss_mask to only calculate the loss of the answer (separated with [SEP])""" + + for b in range(micro_batch_size): + eod_indexs = position_ids[b, input_ids[b] == eod_token] + sep_indexs = position_ids[b, input_ids[b] == sep_token] + + if len(eod_indexs) == 0 or len(sep_indexs) == 0: + loss_mask[b] = 1.0 + else: + if eod_indexs[0] > sep_indexs[0]: + loss_mask[b, 0 : sep_indexs[0]] = 0 + + if len(eod_indexs) == len(sep_indexs): + for ii, eod_index in enumerate(eod_indexs): + start_index = eod_index + if ii == (len(sep_indexs) - 1): + stop_index = seq_length + else: + stop_index = sep_indexs[ii + 1] + loss_mask[b, start_index:stop_index] = 0.0 + else: + if len(eod_indexs) > len(sep_indexs): + loss_mask[b, :] = 1.0 + else: + for ii, eod_index in enumerate(eod_indexs): + start_index = eod_index + stop_index = sep_indexs[ii + 1] + + loss_mask[b, start_index:stop_index] = 0.0 + + elif eod_indexs[0] < sep_indexs[0]: + if len(eod_indexs) == len(sep_indexs): + for ii, eod_index in enumerate(eod_indexs): + start_index = eod_index + stop_index = sep_indexs[ii] + loss_mask[b, start_index:stop_index] = 0.0 + + else: + if len(eod_indexs) < len(sep_indexs): + loss_mask[b, :] = 1.0 + else: + for ii, eod_index in enumerate(eod_indexs): + start_index = eod_index + if ii >= len(sep_indexs): + stop_index = seq_length + else: + stop_index = sep_indexs[ii] + loss_mask[b, start_index:stop_index] = 0.0 + + loss_mask[input_ids == eod_token] = 1.0 + return loss_mask + + @add_start_docstrings_to_model_forward(YUAN_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, YuanForCausalLM + + >>> model = YuanForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you consciours? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." 
+ ```""" + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + loss = None + if labels is not None: + if self.use_loss_mask: + loss_mask = self.get_loss_mask( + input_ids, labels, self.eod_token, self.sep_token + ) + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + if self.use_loss_mask: + loss_fct = CrossEntropyLoss(reduction="none") + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + loss = torch.sum(loss * loss_mask) / loss_mask.sum() + else: + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past + + +@add_start_docstrings( + """ + The Yuan Model transformer with a sequence classification head on top (linear layer). + + [`YuanForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. 
If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + YUAN_START_DOCSTRING, +) +class YuanForSequenceClassification(YuanPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = YuanModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(YUAN_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError( + "Cannot handle batch sizes > 1 if no padding token is defined." 
+ ) + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = ( + torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 + ).to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[ + torch.arange(batch_size, device=logits.device), sequence_lengths + ] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and ( + labels.dtype == torch.long or labels.dtype == torch.int + ): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct( + pooled_logits.view(-1, self.num_labels), labels.view(-1) + ) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/tests/cpu/test_ipex_optimize_transformers_nightly.py b/tests/cpu/test_ipex_optimize_transformers_nightly.py index 2b0ff17d7..7118e6326 100644 --- a/tests/cpu/test_ipex_optimize_transformers_nightly.py +++ b/tests/cpu/test_ipex_optimize_transformers_nightly.py @@ -13,6 +13,7 @@ from hf_configs.chatglm.modeling_chatglm import ChatGLMForConditionalGeneration from hf_configs.qwen.modeling_qwen import QWenLMHeadModel from hf_configs.llava.modeling_llavallama import LlavaLlamaForCausalLM +from hf_configs.yuan.yuan_hf_model import YuanForCausalLM from intel_extension_for_pytorch.cpu._auto_kernel_selection import _disable_tpp try: @@ -148,6 +149,13 @@ lambda m: m.model.layers[0].self_attn.__class__, lambda m: m.model.layers[0].__class__, ), + model_info( + "yuan", + YuanForCausalLM, + False, + lambda m: m.model.layers[0].self_attn.__class__, + lambda m: m.model.layers[0].__class__, + ), ] From b54af7773b5ccc0540fe25637fe2ad26d3371977 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Wed, 17 Apr 2024 13:11:52 +0800 Subject: [PATCH 021/199] rename indirect_access_kv_cache to indirect_access_kv_cache_attention (#2767) --- examples/cpu/inference/python/llm-modeling/README.md | 2 +- intel_extension_for_pytorch/llm/functional/__init__.py | 2 +- intel_extension_for_pytorch/llm/functional/fusions.py | 2 +- tests/cpu/test_ipex_llm_module.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/cpu/inference/python/llm-modeling/README.md b/examples/cpu/inference/python/llm-modeling/README.md index 15d3e2762..2587a188c 100644 --- a/examples/cpu/inference/python/llm-modeling/README.md +++ b/examples/cpu/inference/python/llm-modeling/README.md @@ -35,7 +35,7 @@ ipex.llm.modules.IndirectAccessKVCacheAttention ipex.llm.functional.rotary_embedding ipex.llm.functional.rms_norm ipex.llm.functional.fast_layer_norm 
-ipex.llm.functional.indirect_access_kv_cache +ipex.llm.functional.indirect_access_kv_cache_attention ipex.llm.functional.varlen_attention ``` diff --git a/intel_extension_for_pytorch/llm/functional/__init__.py b/intel_extension_for_pytorch/llm/functional/__init__.py index 429679f49..82f0b5e45 100644 --- a/intel_extension_for_pytorch/llm/functional/__init__.py +++ b/intel_extension_for_pytorch/llm/functional/__init__.py @@ -2,6 +2,6 @@ rotary_embedding, rms_norm, fast_layer_norm, - indirect_access_kv_cache, + indirect_access_kv_cache_attention, varlen_attention, ) diff --git a/intel_extension_for_pytorch/llm/functional/fusions.py b/intel_extension_for_pytorch/llm/functional/fusions.py index 59ecf6b7c..7251bc525 100644 --- a/intel_extension_for_pytorch/llm/functional/fusions.py +++ b/intel_extension_for_pytorch/llm/functional/fusions.py @@ -81,7 +81,7 @@ def fast_layer_norm( ) -def indirect_access_kv_cache( +def indirect_access_kv_cache_attention( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, diff --git a/tests/cpu/test_ipex_llm_module.py b/tests/cpu/test_ipex_llm_module.py index 3ad9cee44..f3ee00552 100644 --- a/tests/cpu/test_ipex_llm_module.py +++ b/tests/cpu/test_ipex_llm_module.py @@ -288,7 +288,7 @@ def test_modules_naming(self): assert ipex.llm.functional.rotary_embedding is not None assert ipex.llm.functional.rms_norm is not None assert ipex.llm.functional.fast_layer_norm is not None - assert ipex.llm.functional.indirect_access_kv_cache is not None + assert ipex.llm.functional.indirect_access_kv_cache_attention is not None assert ipex.llm.functional.varlen_attention is not None def test_rotary_embedding_tgi(self): From c1dc7ae7ec1ead2c9c53f068b0ac81ce91bb1f51 Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Thu, 18 Apr 2024 11:55:19 +0800 Subject: [PATCH 022/199] update deepspeed to the latest version (#2768) * update deepspeed to the latest version 1. Update Deepspeed version to fix compatibility issue with latest torch. 2. update torch related packages to the latest as well. * Update dependency_version.yml * Update dependency_version.yml --- dependency_version.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 66616e1b9..a6995187d 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -8,9 +8,9 @@ # branch: Branch name of the Github repository. For human understanding only. 
deepspeed: - commit: v0.14.0 + commit: v0.14.1 repo: https://github.com/microsoft/DeepSpeed.git - version: 0.14.0 + version: 0.14.1 gcc: max-version: null min-version: 12.3.0 @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240328+cpu + version: 2.4.0.dev20240401+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240328+cpu + version: 2.2.0.dev20240401+cpu torchvision: - version: 0.19.0.dev20240328+cpu + version: 0.19.0.dev20240401+cpu transformers: version: 4.38.1 From 6d71c53e0195af39a16b60cf430b41a163173cb1 Mon Sep 17 00:00:00 2001 From: zhuhaozhe Date: Thu, 18 Apr 2024 16:56:53 +0800 Subject: [PATCH 023/199] Fix sgd (#2777) * Only update momentum buffers for SGD if momentum is enabled * update stock PT version --- dependency_version.yml | 6 +++--- intel_extension_for_pytorch/optim/_functional.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index a6995187d..1cadbfa26 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240401+cpu + version: 2.4.0.dev20240417+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240401+cpu + version: 2.2.0.dev20240417+cpu torchvision: - version: 0.19.0.dev20240401+cpu + version: 0.19.0.dev20240417+cpu transformers: version: 4.38.1 diff --git a/intel_extension_for_pytorch/optim/_functional.py b/intel_extension_for_pytorch/optim/_functional.py index 9655da958..a68f3a660 100644 --- a/intel_extension_for_pytorch/optim/_functional.py +++ b/intel_extension_for_pytorch/optim/_functional.py @@ -526,10 +526,11 @@ def sgd_step(self, closure=None): fused=self.fused, ) - # update momentum_buffers in state - for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): - state = self.state[p] - state["momentum_buffer"] = momentum_buffer + if group["momentum"] != 0: + # update momentum_buffers in state + for p, momentum_buffer in zip(params_with_grad, momentum_buffer_list): + state = self.state[p] + state["momentum_buffer"] = momentum_buffer return loss From 3f394a3d4f5d6bb21a3086cb48bd60c351bd814b Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Thu, 18 Apr 2024 21:00:33 +0900 Subject: [PATCH 024/199] fine tune env setup scripts (#2776) --- examples/cpu/inference/python/llm/tools/env_setup.sh | 5 ++--- scripts/compile_bundle.sh | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/cpu/inference/python/llm/tools/env_setup.sh b/examples/cpu/inference/python/llm/tools/env_setup.sh index 314adaefe..aa08caebb 100644 --- a/examples/cpu/inference/python/llm/tools/env_setup.sh +++ b/examples/cpu/inference/python/llm/tools/env_setup.sh @@ -81,7 +81,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then fi # Install deps - conda install -y cmake ninja unzip + python -m pip install cmake==3.28.4 ninja unzip echo "#!/bin/bash" > ${AUX_INSTALL_SCRIPT} if [ $((${MODE} & 0x04)) -ne 0 ]; then @@ -158,7 +158,7 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then rm -rf compile_bundle.sh llvm-project llvm-release torch-ccl fi - echo "python -m pip install cpuid accelerate datasets sentencepiece protobuf==${VER_PROTOBUF} transformers==${VER_TRANSFORMERS} neural-compressor==${VER_INC} transformers_stream_generator tiktoken" >> ${AUX_INSTALL_SCRIPT} + echo "python -m pip install cpuid accelerate 
datasets sentencepiece mkl protobuf==${VER_PROTOBUF} transformers==${VER_TRANSFORMERS} neural-compressor==${VER_INC} transformers_stream_generator tiktoken" >> ${AUX_INSTALL_SCRIPT} # Used for accuracy test only if [ -d lm-evaluation-harness ]; then @@ -206,7 +206,6 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then cd intel-extension-for-pytorch/examples/cpu/inference/python/llm fi if [ $((${MODE} & 0x01)) -ne 0 ]; then - conda install -y mkl conda install -y gperftools -c conda-forge bash ${AUX_INSTALL_SCRIPT} python -m pip install ${WHEELFOLDER}/*.whl diff --git a/scripts/compile_bundle.sh b/scripts/compile_bundle.sh index fb3594a67..b507ad683 100644 --- a/scripts/compile_bundle.sh +++ b/scripts/compile_bundle.sh @@ -133,7 +133,7 @@ if [ ! -z "${MAX_JOBS}" ]; then fi # Install dependencies -python -m pip install cmake +python -m pip install cmake==3.28.4 # Compare the torch torchvision and torchaudio version function ver_compare_eq() { From ac70766919ef81667ed7955f127e505bdab2b228 Mon Sep 17 00:00:00 2001 From: blzheng Date: Fri, 19 Apr 2024 09:59:15 +0800 Subject: [PATCH 025/199] Fix accuracy issue of Mixtral with deepspeed (#2774) --- csrc/cpu/CMakeLists.txt | 7 ++ csrc/cpu/aten/MoE.cpp | 24 ++-- csrc/cpu/aten/MoE.h | 18 ++- csrc/cpu/aten/kernels/MoEKrnl.cpp | 105 +++++++++++------- .../models/reference/modules/decoder.py | 3 + tests/cpu/cpp/CMakeLists.txt | 8 ++ 6 files changed, 107 insertions(+), 58 deletions(-) diff --git a/csrc/cpu/CMakeLists.txt b/csrc/cpu/CMakeLists.txt index 5f60d7cf4..d14ff3243 100644 --- a/csrc/cpu/CMakeLists.txt +++ b/csrc/cpu/CMakeLists.txt @@ -246,6 +246,13 @@ if(BUILD_STRIPPED_BIN) set_target_properties(${PLUGIN_NAME_CPU} PROPERTIES LINK_FLAGS_RELEASE -s) endif() +find_package(PythonLibs) +if(${PYTHONLIBS_FOUND}) + target_link_libraries(${PLUGIN_NAME_CPU} PUBLIC ${PYTHON_LIBRARIES}) +endif() + +find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib") +target_link_libraries(${PLUGIN_NAME_CPU} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) install(TARGETS ${PLUGIN_NAME_CPU} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/csrc/cpu/aten/MoE.cpp b/csrc/cpu/aten/MoE.cpp index c25d6acec..02ba9bc68 100644 --- a/csrc/cpu/aten/MoE.cpp +++ b/csrc/cpu/aten/MoE.cpp @@ -18,7 +18,8 @@ at::Tensor mixtral_moe_tpp( const at::Tensor& down_wei, bool tpp_fallback, const at::Tensor& routing_weights, - at::Tensor& output) { + at::Tensor& output, + bool is_distributed) { RECORD_FUNCTION("ipex::mixtral_moe_tpp", c10::ArrayRef({})); if (top_x.sizes()[0] == 0) @@ -33,7 +34,8 @@ at::Tensor mixtral_moe_tpp( down_wei, tpp_fallback, routing_weights, - output); + output, + is_distributed); } at::Tensor mixtral_moe( @@ -48,7 +50,8 @@ at::Tensor mixtral_moe( const at::Tensor& down_op_ctx, bool use_dnnl, const at::Tensor& routing_weights, - at::Tensor& output) { + at::Tensor& output, + bool is_distributed) { RECORD_FUNCTION("ipex::mixtral_moe", c10::ArrayRef({})); if (top_x.sizes()[0] == 0) @@ -66,7 +69,8 @@ at::Tensor mixtral_moe( down_op_ctx, use_dnnl, routing_weights, - output); + output, + is_distributed); } at::Tensor mixtral_moe_woq( const at::Tensor& hidden_states, @@ -76,7 +80,8 @@ at::Tensor mixtral_moe_woq( const at::Tensor& up_wei, const at::Tensor& down_wei, const at::Tensor& routing_weights, - at::Tensor& output) { + at::Tensor& output, + bool is_distributed) { RECORD_FUNCTION("ipex::mixtral_moe_woq", c10::ArrayRef({})); if (top_x.sizes()[0] == 0) @@ -90,7 +95,8 @@ at::Tensor mixtral_moe_woq( up_wei, down_wei, routing_weights, - output); + 
output, + is_distributed); } } // namespace cpu } // namespace torch_ipex @@ -101,7 +107,7 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { m.def( "mixtral_moe_tpp(Tensor hidden_states, Tensor top_x, Tensor idx, Tensor gate_wei, \ Tensor up_wei, Tensor down_wei, bool tpp_fallback, Tensor routing_weights, \ - Tensor output) -> Tensor"); + Tensor output, bool is_distributed) -> Tensor"); m.impl( "mixtral_moe_tpp", c10::DispatchKey::CPU, @@ -109,11 +115,11 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { m.def( "mixtral_moe(Tensor hidden_states, Tensor top_x, Tensor idx, Tensor gate_wei, \ Tensor gate_op_ctx, Tensor up_wei, Tensor up_op_ctx, Tensor down_wei, \ - Tensor down_op_ctx, bool use_dnnl, Tensor routing_weights, Tensor output) -> Tensor"); + Tensor down_op_ctx, bool use_dnnl, Tensor routing_weights, Tensor output, bool is_distributed) -> Tensor"); m.impl("mixtral_moe", c10::DispatchKey::CPU, torch_ipex::cpu::mixtral_moe); m.def( "mixtral_moe_woq(Tensor hidden_states, Tensor top_x, Tensor idx, Tensor gate_wei, \ - Tensor up_wei, Tensor down_wei, Tensor routing_weights, Tensor output) -> Tensor"); + Tensor up_wei, Tensor down_wei, Tensor routing_weights, Tensor output, bool is_distributed) -> Tensor"); m.impl( "mixtral_moe_woq", c10::DispatchKey::CPU, diff --git a/csrc/cpu/aten/MoE.h b/csrc/cpu/aten/MoE.h index 81b8a20a8..291eacb12 100644 --- a/csrc/cpu/aten/MoE.h +++ b/csrc/cpu/aten/MoE.h @@ -14,7 +14,8 @@ at::Tensor mixtral_moe_tpp( const at::Tensor&, bool, const at::Tensor&, - at::Tensor&); + at::Tensor&, + bool); at::Tensor mixtral_moe_woq( const at::Tensor&, const at::Tensor&, @@ -23,7 +24,8 @@ at::Tensor mixtral_moe_woq( const at::Tensor&, const at::Tensor&, const at::Tensor&, - at::Tensor&); + at::Tensor&, + bool); at::Tensor mixtral_moe( const at::Tensor&, const at::Tensor&, @@ -36,7 +38,8 @@ at::Tensor mixtral_moe( const at::Tensor&, bool, const at::Tensor&, - at::Tensor&); + at::Tensor&, + bool); using mixtral_moe_tpp_kernel_fn = at::Tensor (*)( const at::Tensor& hidden_states, const at::Tensor& top_x, @@ -46,7 +49,8 @@ using mixtral_moe_tpp_kernel_fn = at::Tensor (*)( const at::Tensor& down_wei, bool tpp_fallback, const at::Tensor& routing_weights, - at::Tensor& output); + at::Tensor& output, + bool is_distributed); using mixtral_moe_woq_kernel_fn = at::Tensor (*)( const at::Tensor& hidden_states, const at::Tensor& top_x, @@ -55,7 +59,8 @@ using mixtral_moe_woq_kernel_fn = at::Tensor (*)( const at::Tensor& up_wei, const at::Tensor& down_wei, const at::Tensor& routing_weights, - at::Tensor& output); + at::Tensor& output, + bool is_distributed); using mixtral_moe_kernel_fn = at::Tensor (*)( const at::Tensor& hidden_states, const at::Tensor& top_x, @@ -68,7 +73,8 @@ using mixtral_moe_kernel_fn = at::Tensor (*)( const at::Tensor& down_op_ctx, bool use_dnnl, const at::Tensor& routing_weights, - at::Tensor& output); + at::Tensor& output, + bool is_distributed); IPEX_DECLARE_DISPATCH(mixtral_moe_tpp_kernel_fn, mixtral_moe_tpp_kernel_stub); IPEX_DECLARE_DISPATCH(mixtral_moe_woq_kernel_fn, mixtral_moe_woq_kernel_stub); IPEX_DECLARE_DISPATCH(mixtral_moe_kernel_fn, mixtral_moe_kernel_stub); diff --git a/csrc/cpu/aten/kernels/MoEKrnl.cpp b/csrc/cpu/aten/kernels/MoEKrnl.cpp index 50ebbb938..ea982318e 100644 --- a/csrc/cpu/aten/kernels/MoEKrnl.cpp +++ b/csrc/cpu/aten/kernels/MoEKrnl.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "tpp/kernels/TPPGEMMKrnl.h" @@ -24,15 +25,15 @@ at::Tensor mixtral_moe_tpp_kernl_impl( const at::Tensor& down_wei, bool tpp_fallback, const 
at::Tensor& routing_weights, - at::Tensor& output) { + at::Tensor& output, + bool is_distributed) { auto curr_state = hidden_states.index({top_x}).unsqueeze(0); auto routing_w = routing_weights.index({top_x, idx}).unsqueeze(-1); if (tpp_fallback) { curr_state = at::linear( - at::silu(at::linear(curr_state, gate_wei)) * - at::linear(curr_state, up_wei), - down_wei) * - routing_w; + at::silu(at::linear(curr_state, gate_wei)) * + at::linear(curr_state, up_wei), + down_wei); } else { curr_state = tpp_fused_gate_up_proj_forward_cpu( curr_state, @@ -42,9 +43,18 @@ at::Tensor mixtral_moe_tpp_kernl_impl( at::empty(0, curr_state.options()), c10::nullopt); curr_state = - tpp_linear_nobias_forward_cpu(curr_state, down_wei, c10::nullopt) * - routing_w; + tpp_linear_nobias_forward_cpu(curr_state, down_wei, c10::nullopt); } + if (is_distributed) { + py::gil_scoped_acquire acquire; + py::function allreduce = py::module_::import("torch") + .attr("ops") + .attr("deepspeed_comm") + .attr("all_reduce"); + allreduce(curr_state); + py::gil_scoped_release release; + } + curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); return output; @@ -62,42 +72,41 @@ at::Tensor mixtral_moe_kernl_impl( const at::Tensor& down_op_ctx, bool use_dnnl, const at::Tensor& routing_weights, - at::Tensor& output) { + at::Tensor& output, + bool is_distributed) { auto curr_state = hidden_states.index({top_x}).unsqueeze(0); auto routing_w = routing_weights.index({top_x, idx}).unsqueeze(-1); if (use_dnnl) { - curr_state = - ipex_linear( - at::silu(ipex_linear( - curr_state, - gate_wei, - c10::nullopt, - gate_op_ctx, - c10::nullopt)) * - ipex_linear( - curr_state, up_wei, c10::nullopt, up_op_ctx, c10::nullopt), - down_wei, - c10::nullopt, - down_op_ctx, - c10::nullopt) * - routing_w; + curr_state = ipex_linear( + at::silu(ipex_linear( + curr_state, gate_wei, c10::nullopt, gate_op_ctx, c10::nullopt)) * + ipex_linear( + curr_state, up_wei, c10::nullopt, up_op_ctx, c10::nullopt), + down_wei, + c10::nullopt, + down_op_ctx, + c10::nullopt); } else { - curr_state = - mkl_sgemm_forward( - at::silu(mkl_sgemm_forward( - curr_state, - gate_wei, - c10::nullopt, - gate_op_ctx, - c10::nullopt)) * - mkl_sgemm_forward( - curr_state, up_wei, c10::nullopt, up_op_ctx, c10::nullopt), - down_wei, - c10::nullopt, - down_op_ctx, - c10::nullopt) * - routing_w; + curr_state = mkl_sgemm_forward( + at::silu(mkl_sgemm_forward( + curr_state, gate_wei, c10::nullopt, gate_op_ctx, c10::nullopt)) * + mkl_sgemm_forward( + curr_state, up_wei, c10::nullopt, up_op_ctx, c10::nullopt), + down_wei, + c10::nullopt, + down_op_ctx, + c10::nullopt); + } + if (is_distributed) { + py::gil_scoped_acquire acquire; + py::function allreduce = py::module_::import("torch") + .attr("ops") + .attr("deepspeed_comm") + .attr("all_reduce"); + allreduce(curr_state); + py::gil_scoped_release release; } + curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); return output; @@ -111,15 +120,25 @@ at::Tensor mixtral_moe_woq_kernl_impl( const at::Tensor& up_wei, const at::Tensor& down_wei, const at::Tensor& routing_weights, - at::Tensor& output) { + at::Tensor& output, + bool is_distributed) { auto curr_state = hidden_states.index({top_x}).unsqueeze(0); auto routing_w = routing_weights.index({top_x, idx}).unsqueeze(-1); curr_state = woq_linear_forward( - at::silu(woq_linear_forward(curr_state, gate_wei)) * - woq_linear_forward(curr_state, up_wei), - down_wei) * - routing_w; + 
at::silu(woq_linear_forward(curr_state, gate_wei)) * + woq_linear_forward(curr_state, up_wei), + down_wei); + if (is_distributed) { + py::gil_scoped_acquire acquire; + py::function allreduce = py::module_::import("torch") + .attr("ops") + .attr("deepspeed_comm") + .attr("all_reduce"); + allreduce(curr_state); + py::gil_scoped_release release; + } + curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); return output; diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index d9ffccad8..ac39b277d 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -789,6 +789,7 @@ def MixtralDecoderLayer_forward( expert_layer.w2._op_context.get_data_handle(), routing_weights, final_hidden_states, + self.distributed, ) elif hasattr(expert_layer.w1, "use_dnnl") and expert_layer.w1.use_dnnl: final_hidden_states = torch.ops.torch_ipex.mixtral_moe( @@ -804,6 +805,7 @@ def MixtralDecoderLayer_forward( hasattr(expert_layer.w1, "use_dnnl") and expert_layer.w1.use_dnnl, routing_weights, final_hidden_states, + self.distributed, ) else: final_hidden_states = torch.ops.torch_ipex.mixtral_moe_tpp( @@ -820,6 +822,7 @@ def MixtralDecoderLayer_forward( ), routing_weights, final_hidden_states, + self.distributed, ) final_hidden_states = final_hidden_states.reshape( batch_size, sequence_length, hidden_dim diff --git a/tests/cpu/cpp/CMakeLists.txt b/tests/cpu/cpp/CMakeLists.txt index cc299d0a5..fc5dff343 100644 --- a/tests/cpu/cpp/CMakeLists.txt +++ b/tests/cpu/cpp/CMakeLists.txt @@ -69,5 +69,13 @@ target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC c10) # Link IPEX target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC intel-ext-pt-cpu) +find_package(PythonLibs) +if(${PYTHONLIBS_FOUND}) + target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC ${PYTHON_LIBRARIES}) +endif() + +find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib") +target_link_libraries(${CPU_CPP_TEST_NAME} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) + install(TARGETS ${CPU_CPP_TEST_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) From 4a8a6fbed611cf46321b5157840d58b4bb1460f4 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Fri, 19 Apr 2024 13:07:37 +0800 Subject: [PATCH 026/199] update oneDNN to b9feccb98d on main (#2784) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 8eb97f88f..26bb35808 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 8eb97f88f6cd4290e93dbead94fb22c68b9d1d5a +Subproject commit 26bb358089b2dc9fcfe8aaf1181ee9775c100a46 From c8f1243cb26cba193df90928e415fe4cd64d1ff6 Mon Sep 17 00:00:00 2001 From: Xuan Liao Date: Fri, 19 Apr 2024 18:15:05 +0800 Subject: [PATCH 027/199] fallback ipex sdpa when stride=0 (#2790) * fallback ipex sdpa when stride=0 * add sdpa stride0 ut --- csrc/cpu/aten/FlashAttention.cpp | 20 ++++++++++++++++++-- tests/cpu/test_cpu_ops.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/csrc/cpu/aten/FlashAttention.cpp b/csrc/cpu/aten/FlashAttention.cpp index e9a60327a..78a0b2c10 100644 --- a/csrc/cpu/aten/FlashAttention.cpp +++ b/csrc/cpu/aten/FlashAttention.cpp @@ -7,6 +7,18 @@ namespace cpu { 
IPEX_DEFINE_DISPATCH(flash_attention_kernel_stub); +// When stride=0, MKL gemm causes error. +// Fallback to flash attention in PT. +bool use_ipex_flash_attention( + const at::Tensor& query, + const at::Tensor& key, + const at::Tensor& value) { + int64_t qStrideM = query.stride(2); + int64_t kStrideN = key.stride(2); + int64_t vStrideN = value.stride(2); + return ((qStrideM >= 1) && (kStrideN >= 1) && (vStrideN >= 1)); +} + /* *Caculate the flash attention SDPA with attention mask. */ @@ -18,8 +30,12 @@ std::tuple flash_attention_forward_cpu( bool is_causal, const c10::optional& attention_mask, c10::optional scale) { - return flash_attention_kernel_stub( - kCPU, query, key, value, dropout_p, is_causal, attention_mask, scale); + if (use_ipex_flash_attention(query, key, value)) { + return flash_attention_kernel_stub( + kCPU, query, key, value, dropout_p, is_causal, attention_mask, scale); + } + return at::native::_scaled_dot_product_flash_attention_cpu( + query, key, value, dropout_p, is_causal, attention_mask, scale); } /* diff --git a/tests/cpu/test_cpu_ops.py b/tests/cpu/test_cpu_ops.py index 26b7eeca8..ea0c8d8a2 100644 --- a/tests/cpu/test_cpu_ops.py +++ b/tests/cpu/test_cpu_ops.py @@ -1483,6 +1483,37 @@ def test_flash_attention(self): math_ref = math_ref.to(dtype) torch.testing.assert_close(actual, math_ref, atol=atol, rtol=rtol) + def test_flash_attention_stride0(self): + input_shape = ( + 1, + 16, + 1, + 48, + ) + input_stride = ( + 0, + 48, + 0, + 1, + ) + q = torch.randn( + input_shape, device="cpu", dtype=torch.float32, requires_grad=False + ).as_strided(input_shape, input_stride) + k = torch.randn( + input_shape, device="cpu", dtype=torch.float32, requires_grad=False + ).as_strided(input_shape, input_stride) + v = torch.randn( + input_shape, device="cpu", dtype=torch.float32, requires_grad=False + ).as_strided(input_shape, input_stride) + atol = 1e-5 + rtol = 5e-6 + q2 = q.clone() + k2 = k.clone() + v2 = v.clone() + actual = torch.ops.torch_ipex.flash_attention(q, k, v)[0] + math_ref = torch._scaled_dot_product_attention_math(q2, k2, v2)[0] + torch.testing.assert_close(actual, math_ref, atol=1e-5, rtol=5e-6) + if __name__ == "__main__": test = unittest.main() From d60d6342030e9e85073e1eb731318bb8827d0a26 Mon Sep 17 00:00:00 2001 From: Xu Han Date: Fri, 19 Apr 2024 20:37:21 +0800 Subject: [PATCH 028/199] copy openmp if build by icx. (#2735) * copy openmp if build by icx. * Install icx runtime libs. --- cmake/Modules/FindIcxCpuRt.cmake | 40 ++++++++++++++++++++++++++++++++ csrc/cpu/CMakeLists.txt | 14 +++++++++++ 2 files changed, 54 insertions(+) create mode 100644 cmake/Modules/FindIcxCpuRt.cmake diff --git a/cmake/Modules/FindIcxCpuRt.cmake b/cmake/Modules/FindIcxCpuRt.cmake new file mode 100644 index 000000000..30028b5f4 --- /dev/null +++ b/cmake/Modules/FindIcxCpuRt.cmake @@ -0,0 +1,40 @@ +if(ICX_CPU_RT_FOUND) + return() +endif() + +set(ICX_CPU_RT_FOUND OFF) +set(INTEL_ICX_RT_LIBS "") + +function(get_intel_compiler_rt_list libpath_list) + if(MSVC) + message( FATAL_ERROR "Not support Windows now." 
) + else() + set(intel_rt_list "libiomp5.so" "libintlc.so" "libintlc.so.5" "libimf.so" "libsvml.so" "libirng.so") + set(libimf_name "libimf.so") + endif() + + set(intel_rt_path_list "") + execute_process( + COMMAND bash "-c" "${CMAKE_CXX_COMPILER} --print-file-name=${libimf_name}" + OUTPUT_VARIABLE intel_imf_path + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + get_filename_component(intel_compiler_rt_install_dir "${intel_imf_path}" DIRECTORY) + foreach(lib ${intel_rt_list}) + list(APPEND intel_rt_path_list ${intel_compiler_rt_install_dir}/${lib}) + endforeach() + set(${libpath_list} "${intel_rt_path_list}" PARENT_SCOPE) +endfunction() + +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM") + get_intel_compiler_rt_list(INTEL_ICX_RT_LIBS) + foreach (intel_lib_item ${INTEL_ICX_RT_LIBS}) + message("Found Intel icx runtime lib: ${intel_lib_item}") + endforeach() + + string(COMPARE EQUAL "${INTEL_ICX_RT_LIBS}" "" result_empty) + if(NOT result_empty) + set(ICX_CPU_RT_FOUND ON) + message("Intel icx cpu runtime found.") + endif() +endif() \ No newline at end of file diff --git a/csrc/cpu/CMakeLists.txt b/csrc/cpu/CMakeLists.txt index d14ff3243..a09380761 100644 --- a/csrc/cpu/CMakeLists.txt +++ b/csrc/cpu/CMakeLists.txt @@ -258,3 +258,17 @@ install(TARGETS ${PLUGIN_NAME_CPU} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM") + if(${OpenMP_FOUND}) + install(FILES ${OpenMP_CXX_LIBRARIES} + DESTINATION ${CMAKE_INSTALL_LIBDIR}) + endif() + + find_package(IcxCpuRt) + if(${ICX_CPU_RT_FOUND}) + install(FILES ${INTEL_ICX_RT_LIBS} + DESTINATION ${CMAKE_INSTALL_LIBDIR}) + endif() + +endif() \ No newline at end of file From b3857a399954c21a9e201b7ed01fa7a608f03df5 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Sun, 21 Apr 2024 19:59:51 -0700 Subject: [PATCH 029/199] [release/2.3] Fix concat linear for deepspeed and other improvements (#2792) (#2793) * Fix concat linear for deepspeed; Improve run_accuracy_with_deepspeed.py; Remove unnecessary RECORD_FUNCTION from woq linear ops * Fix typo --- csrc/cpu/aten/Linear.cpp | 9 ------ .../run_accuracy_with_deepspeed.py | 14 +++++++--- .../models/reference/modules/attentions.py | 28 +++++++++++-------- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/csrc/cpu/aten/Linear.cpp b/csrc/cpu/aten/Linear.cpp index c29f4de53..3958369a3 100644 --- a/csrc/cpu/aten/Linear.cpp +++ b/csrc/cpu/aten/Linear.cpp @@ -448,8 +448,6 @@ at::Tensor woq_linear_kernel( at::Tensor woq_linear_forward( const at::Tensor& input, const at::Tensor& op_context) { - RECORD_FUNCTION( - "torch_ipex::ipex_woq_linear", c10::ArrayRef({})); return reinterpret_cast( op_context.data_ptr()[0]) ->run(input); @@ -496,8 +494,6 @@ at::Tensor woq_linear_eltwise_kernel( at::Tensor woq_linear_gelu_forward( const at::Tensor& input, const at::Tensor& op_context) { - RECORD_FUNCTION( - "torch_ipex::woq_linear_gelu", c10::ArrayRef({})); return reinterpret_cast( op_context.data_ptr()[0]) ->run_eltwise( @@ -507,8 +503,6 @@ at::Tensor woq_linear_gelu_forward( at::Tensor woq_linear_new_gelu_forward( const at::Tensor& input, const at::Tensor& op_context) { - RECORD_FUNCTION( - "torch_ipex::woq_linear_new_gelu", c10::ArrayRef({})); return reinterpret_cast( op_context.data_ptr()[0]) ->run_eltwise( @@ -575,7 +569,6 @@ at::Tensor woq_linear_add_forward( const at::Tensor& input, const at::Tensor& op_context, const std::vector& others) { - RECORD_FUNCTION("torch_ipex::woq_linear_add", 
c10::ArrayRef({})); return reinterpret_cast( op_context.data_ptr()[0]) ->run_add(input, others); @@ -585,8 +578,6 @@ at::Tensor woq_linear_add_add_forward( const at::Tensor& input, const at::Tensor& op_context, const std::vector& others) { - RECORD_FUNCTION( - "torch_ipex::woq_linear_add_add", c10::ArrayRef({})); return reinterpret_cast( op_context.data_ptr()[0]) ->run_add_add(input, others); diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index e2eaebf50..05e8c6c30 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -130,7 +130,7 @@ def decorator(func): ) parser.add_argument( "--weight-dtype", - choices=["INT8", "INT4"], + choices=["INT8", "INT4", "NF4"], default="INT8", type=str, help="weight data type for weight only quantization. Unrelated to activation data type or lowp-mode.", @@ -291,6 +291,8 @@ def print_rank0(*msg): def get_repo_root(model_name_or_path): if os.path.exists(model_name_or_path): # local path + # use absolute path here to avoid path error in deepspeed + model_name_or_path = os.path.abspath(model_name_or_path) return model_name_or_path # checks if online or not if is_offline_mode(): @@ -356,9 +358,13 @@ def write_checkpoints_json(): ipex_woq_enabled = args.ipex_weight_only_quantization if ipex_woq_enabled: from intel_extension_for_pytorch.quantization import WoqWeightDtype - weight_dtype = ( - WoqWeightDtype.INT4 if args.weight_dtype == "INT4" else WoqWeightDtype.INT8 - ) + if args.weight_dtype == "INT8": + weight_dtype = WoqWeightDtype.INT8 + elif args.weight_dtype == "INT4": + weight_dtype = WoqWeightDtype.INT4 + else: + assert args.weight_dtype == "NF4" + weight_dtype = WoqWeightDtype.NF4 if args.lowp_mode == "INT8": lowp_mode = ipex.quantization.WoqLowpMode.INT8 diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 9f42a488c..afce80710 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -2052,22 +2052,26 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): "StableLmForCausalLM", "LlavaLlamaForCausalLM", ]: + supported_linear_types = [ + torch.nn.Linear, + WeightOnlyQuantizedLinear, + ] + try: + import deepspeed + + supported_linear_types.append( + deepspeed.module_inject.layers.LinearLayer + ) + except ImportError: + pass + supported_linear_types = tuple(supported_linear_types) if ( hasattr(module, "q_proj") and hasattr(module, "k_proj") and hasattr(module, "v_proj") - and ( - isinstance(module.q_proj, torch.nn.Linear) - or isinstance(module.q_proj, WeightOnlyQuantizedLinear) - ) - and ( - isinstance(module.k_proj, torch.nn.Linear) - or isinstance(module.k_proj, WeightOnlyQuantizedLinear) - ) - and ( - isinstance(module.v_proj, torch.nn.Linear) - or isinstance(module.v_proj, WeightOnlyQuantizedLinear) - ) + and (isinstance(module.q_proj, supported_linear_types)) + and (isinstance(module.k_proj, supported_linear_types)) + and (isinstance(module.v_proj, supported_linear_types)) ) and not (hasattr(self, "use_qk_layernorm") and self.use_qk_layernorm): # we support MHA, GQA, MQA for concat linear self.concat_qkv = _IPEXConcatLinearRef( From 
dbe3235097610860b8556f930b9ab7166b0ebeb0 Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Mon, 22 Apr 2024 15:17:13 +0800 Subject: [PATCH 030/199] LLM example: add finetune and change tokenizer (#2794) --- .../run_accuracy_with_deepspeed.py | 3 +- .../run_generation_with_deepspeed.py | 5 +- .../llm/single_instance/run_accuracy.py | 3 +- .../llm/single_instance/run_generation.py | 3 +- .../python/llm/utils/create_shard_model.py | 4 +- .../python/llm/utils/model_class/llama.py | 4 +- .../inference/python/llm/utils/run_gptq.py | 4 +- examples/cpu/training/llm/README.md | 47 +++ examples/cpu/training/llm/finetune.py | 278 ++++++++++++++++++ examples/cpu/training/llm/requirements.txt | 11 + .../cpu/training/llm/run_lora_finetune_ddp.sh | 126 ++++++++ .../cpu/training/llm/templates/alpaca.json | 6 + examples/cpu/training/llm/utils/README.md | 7 + examples/cpu/training/llm/utils/__init__.py | 0 examples/cpu/training/llm/utils/prompter.py | 69 +++++ 15 files changed, 555 insertions(+), 15 deletions(-) create mode 100644 examples/cpu/training/llm/README.md create mode 100644 examples/cpu/training/llm/finetune.py create mode 100644 examples/cpu/training/llm/requirements.txt create mode 100644 examples/cpu/training/llm/run_lora_finetune_ddp.sh create mode 100644 examples/cpu/training/llm/templates/alpaca.json create mode 100644 examples/cpu/training/llm/utils/README.md create mode 100644 examples/cpu/training/llm/utils/__init__.py create mode 100644 examples/cpu/training/llm/utils/prompter.py diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index 05e8c6c30..e3b9ad3ab 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -18,7 +18,6 @@ AutoConfig, AutoModelForCausalLM, AutoTokenizer, - LlamaTokenizer, T5ForConditionalGeneration, AutoProcessor, ) @@ -54,7 +53,7 @@ def decorator(func): "gpt-j": (AutoModelForCausalLM, AutoTokenizer), "gpt-neox": (AutoModelForCausalLM, AutoTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), - "llama": (AutoModelForCausalLM, LlamaTokenizer), + "llama": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), "bloom": (AutoModelForCausalLM, AutoTokenizer), "codegen": (AutoModelForCausalLM, AutoTokenizer), diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 1acec3412..ea01ce922 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -17,7 +17,6 @@ AutoConfig, AutoModelForCausalLM, AutoTokenizer, - LlamaTokenizer, T5ForConditionalGeneration, AutoProcessor, TextStreamer @@ -38,7 +37,7 @@ "gptj": (AutoModelForCausalLM, AutoTokenizer), "gpt-neox": (AutoModelForCausalLM, AutoTokenizer), "gptneox": (AutoModelForCausalLM, AutoTokenizer), - "llama": (AutoModelForCausalLM, LlamaTokenizer), + "llama": (AutoModelForCausalLM, AutoTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), "chatglm": (AutoModelForCausalLM, AutoTokenizer), @@ -583,7 +582,7 @@ def generate(): image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(infer_dtype) for img in image] input_tokens = {"input_ids": 
input_ids, "images": image_tensor} else: - input_tokens = tokenizer.batch_encode_plus(inputs, return_tensors="pt") + input_tokens = tokenizer.batch_encode_plus(inputs, return_token_type_ids=False, return_tensors="pt") input_ids = input_tokens.input_ids for t in input_tokens: if torch.is_tensor(input_tokens[t]): diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 490866cb4..bfca6db71 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -11,7 +11,6 @@ AutoConfig, AutoModelForCausalLM, AutoTokenizer, - LlamaTokenizer, T5ForConditionalGeneration, AutoProcessor, ) @@ -20,7 +19,7 @@ "gpt-j": (AutoModelForCausalLM, AutoTokenizer), "gpt-neox": (AutoModelForCausalLM, AutoTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), - "llama": (AutoModelForCausalLM, LlamaTokenizer), + "llama": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), "bloom": (AutoModelForCausalLM, AutoTokenizer), "codegen": (AutoModelForCausalLM, AutoTokenizer), diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index 626c65e7b..6b5a60504 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -9,7 +9,6 @@ AutoConfig, AutoModelForCausalLM, AutoTokenizer, - LlamaTokenizer, T5ForConditionalGeneration, AutoProcessor, ) @@ -28,7 +27,7 @@ MODEL_CLASSES = { "gpt-j": (AutoModelForCausalLM, AutoTokenizer), "gpt-neox": (AutoModelForCausalLM, AutoTokenizer), - "llama": (AutoModelForCausalLM, LlamaTokenizer), + "llama": (AutoModelForCausalLM, AutoTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), "bloom": (AutoModelForCausalLM, AutoTokenizer), diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/inference/python/llm/utils/create_shard_model.py index e3f042cea..7926c3950 100644 --- a/examples/cpu/inference/python/llm/utils/create_shard_model.py +++ b/examples/cpu/inference/python/llm/utils/create_shard_model.py @@ -1,7 +1,7 @@ import torch import argparse -from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer, T5ForConditionalGeneration, AutoProcessor +from transformers import AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, AutoProcessor # Here import ipex for Baichuan loading compatibility, for other models we can ignore this import import intel_extension_for_pytorch @@ -9,7 +9,7 @@ MODEL_CLASSES = { "gpt-j": (AutoModelForCausalLM, AutoTokenizer), "gpt-neox": (AutoModelForCausalLM, AutoTokenizer), - "llama": (AutoModelForCausalLM, LlamaTokenizer), + "llama": (AutoModelForCausalLM, AutoTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), "falcon": (AutoModelForCausalLM, AutoTokenizer), "bloom": (AutoModelForCausalLM, AutoTokenizer), diff --git a/examples/cpu/inference/python/llm/utils/model_class/llama.py b/examples/cpu/inference/python/llm/utils/model_class/llama.py index 3cc68ea3d..31e75fb83 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/llama.py +++ b/examples/cpu/inference/python/llm/utils/model_class/llama.py @@ -1,7 +1,7 @@ import torch from .llm import LLMConfig, EXAMPLE_INPUTS_MODE -from transformers import LlamaForCausalLM, LlamaTokenizer 
+from transformers import LlamaForCausalLM, AutoTokenizer
 
 import intel_extension_for_pytorch as ipex
 
@@ -33,4 +33,4 @@ def get_user_model(self, config, benchmark):
         return self.model
 
     def get_tokenizer(self):
-        return LlamaTokenizer.from_pretrained(self.model_id)
+        return AutoTokenizer.from_pretrained(self.model_id)
diff --git a/examples/cpu/inference/python/llm/utils/run_gptq.py b/examples/cpu/inference/python/llm/utils/run_gptq.py
index 99cb4d60e..68826b8e5 100644
--- a/examples/cpu/inference/python/llm/utils/run_gptq.py
+++ b/examples/cpu/inference/python/llm/utils/run_gptq.py
@@ -103,12 +103,12 @@ def get_user_model():
     from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer
     torchscript = False
     if re.search("llama", args.model.lower()):
-        from transformers import LlamaForCausalLM, LlamaTokenizer
+        from transformers import LlamaForCausalLM, AutoTokenizer
         user_model = LlamaForCausalLM.from_pretrained(
             args.model,
             torchscript=torchscript,  # torchscript will force `return_dict=False` to avoid jit errors
         )
-        tokenizer = LlamaTokenizer.from_pretrained(args.model)
+        tokenizer = AutoTokenizer.from_pretrained(args.model)
     elif re.search("mpt-7b-chat", args.model.lower()):
         from mpt_7b.modeling_mpt import MPTForCausalLM
         user_model = MPTForCausalLM.from_pretrained(
diff --git a/examples/cpu/training/llm/README.md b/examples/cpu/training/llm/README.md
new file mode 100644
index 000000000..50dad1f90
--- /dev/null
+++ b/examples/cpu/training/llm/README.md
@@ -0,0 +1,47 @@
+# IPEX LLaMA2 7B LoRA alpaca finetuning on CPUs (distributed)
+
+## Description
+
+This document has instructions for running [LLaMA2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) LoRA alpaca finetuning using Intel-optimized PyTorch (enabling the recipes from [alpaca-lora](https://github.com/tloen/alpaca-lora/tree/main) on CPUs).
+
+## Bare Metal
+### General setup
+
+Follow this [link](https://github.com/intel/intel-extension-for-pytorch/tree/release/2.3/examples/cpu/inference/python/llm#3-environment-setup) to set up PyTorch/IPEX and the other dependencies.
+
+### Prepare dependencies
+```
+pip install -r requirements.txt
+```
+### Specific Setup
+
+* Set ENV to use multi-node distributed training (not needed for single-node multi-socket runs)
+
+In this case, we use data-parallel distributed training and every rank holds the same model replica. NNODES is the number of IPs in the HOSTFILE. To use multi-node distributed training, you should first set up passwordless SSH login between these nodes (you can refer to this [link](https://linuxize.com/post/how-to-setup-passwordless-ssh-login/)).
+
+```
+export NNODES=#your_node_number (defaults to 1 node)
+# create your_ip_list_file, one IP per line, e.g. (or edit it manually):
+scontrol show hostname > ./hostfile
+
+export HOSTFILE=hostfile
+
+```
+# Quick Start Scripts
+## Run the model
+```
+# Get the dataset here: https://github.com/tloen/alpaca-lora/blob/main/alpaca_data.json
+export DATASET="./alpaca_data.json"
+# Env vars
+export LOCAL_BATCH_SIZE=32 # 32 is the default; adjust as needed
+export MODEL_NAME_OR_PATH="YOUR LOCAL PATH or MODEL_ID (HF)"
+export MAXSTEP=-1 # default is -1, which runs all steps
+
+# [optional] you may need to get access to the Llama 2 weights on HF:
+# apply for access on the [LLaMA2 7B](https://huggingface.co/meta-llama/Llama-2-7b-hf) page with your huggingface account
+huggingface-cli login
+{your huggingface token}
+```
+## Launch command
+| DataType | Throughput |
+| ----------- | ----------- |
+| BF16 | bash run_lora_finetune_ddp.sh bf16 |
diff --git a/examples/cpu/training/llm/finetune.py b/examples/cpu/training/llm/finetune.py
new file mode 100644
index 000000000..230f941af
--- /dev/null
+++ b/examples/cpu/training/llm/finetune.py
@@ -0,0 +1,278 @@
+"""
+This script is adapted from the following official alpaca-lora fine-tuning code with minimal code changes:
+https://github.com/tloen/alpaca-lora/blob/main/finetune.py
+"""
+
+import os
+from typing import List
+
+import fire
+import torch
+import transformers
+from datasets import load_dataset
+from datasets.utils.logging import disable_progress_bar
+from transformers.utils import logging as hf_logging
+
+
+from peft import (
+    LoraConfig,
+    get_peft_model,
+    get_peft_model_state_dict,
+    prepare_model_for_kbit_training,
+    set_peft_model_state_dict,
+)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from utils.prompter import Prompter
+
+
+def train(
+    # model/data params
+    base_model: str = "",  # the only required argument
+    data_path: str = "yahma/alpaca-cleaned",
+    output_dir: str = "./lora-alpaca",
+    bf16: bool = False,
+    attn_implementation: str = "",
+    # training hyperparams
+    max_steps: int = -1,
+    micro_batch_size: int = 16,
+    num_epochs: int = 3,
+    learning_rate: float = 3e-4,
+    cutoff_len: int = 256,
+    val_set_size: int = 2000,
+    # lora hyperparams
+    lora_r: int = 8,
+    lora_alpha: int = 16,
+    lora_dropout: float = 0.05,
+    lora_target_modules: List[str] = [
+        "q_proj",
+        "v_proj",
+    ],
+    # llm hyperparams
+    train_on_inputs: bool = True,  # if False, masks out inputs in loss
+    group_by_length: bool = False,  # faster, but produces an odd training loss curve
+    # wandb params
+    wandb_project: str = "",
+    wandb_run_name: str = "",
+    wandb_watch: str = "",  # options: false | gradients | all
+    wandb_log_model: str = "",  # options: false | true
+    resume_from_checkpoint: str = None,  # either training checkpoint or final adapter
+    prompt_template_name: str = "alpaca",  # The prompt template to use, will default to alpaca.
+    disable_tqdm: bool = False,  # disable tqdm if needed to avoid split log failure when ddp training outputs multiple ranks.
+): + if int(os.environ.get("LOCAL_RANK", 0)) == 0: + print( + f"Training Alpaca-LoRA model with params:\n" + f"base_model: {base_model}\n" + f"data_path: {data_path}\n" + f"output_dir: {output_dir}\n" + f"bf16: {bf16}\n" + f"attn_implementation: {attn_implementation}\n" + f"max_steps: {max_steps}\n" + f"micro_batch_size: {micro_batch_size}\n" + f"num_epochs: {num_epochs}\n" + f"learning_rate: {learning_rate}\n" + f"cutoff_len: {cutoff_len}\n" + f"val_set_size: {val_set_size}\n" + f"lora_r: {lora_r}\n" + f"lora_alpha: {lora_alpha}\n" + f"lora_dropout: {lora_dropout}\n" + f"lora_target_modules: {lora_target_modules}\n" + f"train_on_inputs: {train_on_inputs}\n" + f"group_by_length: {group_by_length}\n" + f"wandb_project: {wandb_project}\n" + f"wandb_run_name: {wandb_run_name}\n" + f"wandb_watch: {wandb_watch}\n" + f"wandb_log_model: {wandb_log_model}\n" + f"resume_from_checkpoint: {resume_from_checkpoint or False}\n" + f"prompt template: {prompt_template_name}\n" + f"disable tqdm: {disable_tqdm}\n" + ) + assert ( + base_model + ), "Please specify a --base_model, e.g. --base_model='decapoda-research/llama-7b-hf'" + gradient_accumulation_steps = 8 + + if disable_tqdm: + disable_progress_bar() + hf_logging.disable_progress_bar() + + prompter = Prompter(prompt_template_name) + + world_size = int(os.environ.get("WORLD_SIZE", 1)) + + ddp = world_size != 1 + + if ddp: + gradient_accumulation_steps = gradient_accumulation_steps // world_size + + # Check if parameter passed or if set within environ + use_wandb = len(wandb_project) > 0 or ( + "WANDB_PROJECT" in os.environ and len(os.environ["WANDB_PROJECT"]) > 0 + ) + # Only overwrite environ if wandb param passed + if len(wandb_project) > 0: + os.environ["WANDB_PROJECT"] = wandb_project + if len(wandb_watch) > 0: + os.environ["WANDB_WATCH"] = wandb_watch + if len(wandb_log_model) > 0: + os.environ["WANDB_LOG_MODEL"] = wandb_log_model + + if attn_implementation == "eager": + model = AutoModelForCausalLM.from_pretrained( + base_model, attn_implementation="eager" + ) + else: + model = AutoModelForCausalLM.from_pretrained( + base_model + ) + + tokenizer = AutoTokenizer.from_pretrained(base_model) + + tokenizer.pad_token_id = 0 # unk. 
we want this to be different from the eos token + tokenizer.padding_side = "left" # Allow batched inference + + def tokenize(prompt, add_eos_token=True): + # there's probably a way to do this with the tokenizer settings + # but again, gotta move fast + result = tokenizer( + prompt, + truncation=True, + max_length=cutoff_len, + padding=False, + return_tensors=None, + ) + if ( + result["input_ids"][-1] != tokenizer.eos_token_id + and len(result["input_ids"]) < cutoff_len + and add_eos_token + ): + result["input_ids"].append(tokenizer.eos_token_id) + result["attention_mask"].append(1) + + result["labels"] = result["input_ids"].copy() + + return result + + def generate_and_tokenize_prompt(data_point): + full_prompt = prompter.generate_prompt( + data_point["instruction"], + data_point["input"], + data_point["output"], + ) + tokenized_full_prompt = tokenize(full_prompt) + if not train_on_inputs: + user_prompt = prompter.generate_prompt( + data_point["instruction"], data_point["input"] + ) + tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False) + user_prompt_len = len(tokenized_user_prompt["input_ids"]) + + tokenized_full_prompt["labels"] = [ + -100 + ] * user_prompt_len + tokenized_full_prompt["labels"][ + user_prompt_len: + ] # could be sped up, probably + return tokenized_full_prompt + + model = prepare_model_for_kbit_training(model) + config = LoraConfig( + r=lora_r, + lora_alpha=lora_alpha, + target_modules=lora_target_modules, + lora_dropout=lora_dropout, + bias="none", + task_type="CAUSAL_LM", + ) + model = get_peft_model(model, config) + + if data_path.endswith(".json") or data_path.endswith(".jsonl"): + data = load_dataset("json", data_files=data_path) + else: + data = load_dataset(data_path) + + if resume_from_checkpoint: + # Check the available weights and load them + checkpoint_name = os.path.join( + resume_from_checkpoint, "pytorch_model.bin" + ) # Full checkpoint + if not os.path.exists(checkpoint_name): + checkpoint_name = os.path.join( + resume_from_checkpoint, "adapter_model.bin" + ) # only LoRA model - LoRA config above has to fit + resume_from_checkpoint = False # So the trainer won't try loading its state + # The two files above have a different name depending on how they were saved, but are actually the same. + if os.path.exists(checkpoint_name): + print(f"Restarting from {checkpoint_name}") + adapters_weights = torch.load(checkpoint_name) + model = set_peft_model_state_dict(model, adapters_weights) + else: + print(f"Checkpoint {checkpoint_name} not found") + + model.print_trainable_parameters() # Be more transparent about the % of trainable params. 
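+
+    # Note: generate_and_tokenize_prompt defined above is applied to the dataset via
+    # datasets.map() below. With train_on_inputs=False, the prompt portion of "labels"
+    # is set to -100, the ignore_index of CrossEntropyLoss, so the loss is computed
+    # only on the response tokens.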
+ + if val_set_size > 0: + train_val = data["train"].train_test_split( + test_size=val_set_size, shuffle=True, seed=42 + ) + train_data = train_val["train"].shuffle().map(generate_and_tokenize_prompt) + val_data = train_val["test"].shuffle().map(generate_and_tokenize_prompt) + else: + train_data = data["train"].shuffle().map(generate_and_tokenize_prompt) + val_data = None + + if not ddp and torch.cuda.device_count() > 1: + # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available + model.is_parallelizable = True + model.model_parallel = True + + trainer = transformers.Trainer( + model=model, + train_dataset=train_data, + eval_dataset=val_data, + args=transformers.TrainingArguments( + per_device_train_batch_size=micro_batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + warmup_steps=10, + num_train_epochs=num_epochs, + learning_rate=learning_rate, + bf16=bf16, + logging_steps=10, + optim="adamw_torch", + evaluation_strategy="steps" if val_set_size > 0 else "no", + save_strategy="steps", + eval_steps=200 if val_set_size > 0 else None, + save_steps=200, + output_dir=output_dir, + save_total_limit=3, + load_best_model_at_end=True if val_set_size > 0 else False, + ddp_find_unused_parameters=False if ddp else None, + group_by_length=group_by_length, + report_to="wandb" if use_wandb else None, + run_name=wandb_run_name if use_wandb else None, + use_cpu=True, + use_ipex=True, + max_steps=max_steps, + ddp_backend="ccl", + disable_tqdm=disable_tqdm, + ), + data_collator=transformers.DataCollatorForSeq2Seq( + tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True + ), + ) + model.config.use_cache = False + + old_state_dict = model.state_dict + model.state_dict = ( + lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict()) + ).__get__(model, type(model)) + print("Start Training") + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + print("Finish Training") + model.save_pretrained(output_dir) + + print("\n If there's a warning about missing keys above, please disregard :)") + + +if __name__ == "__main__": + fire.Fire(train) diff --git a/examples/cpu/training/llm/requirements.txt b/examples/cpu/training/llm/requirements.txt new file mode 100644 index 000000000..d4d4149cb --- /dev/null +++ b/examples/cpu/training/llm/requirements.txt @@ -0,0 +1,11 @@ +accelerate==0.28.0 +appdirs +bitsandbytes +black +black[jupyter] +datasets +fire +peft +transformers==4.38.1 +gradio +sentencepiece diff --git a/examples/cpu/training/llm/run_lora_finetune_ddp.sh b/examples/cpu/training/llm/run_lora_finetune_ddp.sh new file mode 100644 index 000000000..8ac366c33 --- /dev/null +++ b/examples/cpu/training/llm/run_lora_finetune_ddp.sh @@ -0,0 +1,126 @@ + +#!/bin/bash + +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + + + + +ARGS="" + +MAXSTEP=${MAXSTEP:--1} + +if [ -z "${OUTPUT_DIR}" ]; then + echo "The required environment variable OUTPUT_DIR has not been set, please create the output path and set it to OUTPUT_DIR" + exit 1 +fi + +if [[ "$1" == "bf16" ]] +then + precision="bf16" + ARGS="$ARGS --bf16 " + echo "### running bf16 mode" +elif [[ "$1" == "fp32" ]] +then + echo "### running fp32 mode" +else + echo "The specified precision '$1' is unsupported." + echo "Supported precisions are: fp32, bf16" + exit 1 +fi + + +MODEL_NAME_OR_PATH=${MODEL_NAME_OR_PATH:-"meta-llama/Llama-2-7b-hf"} +LOCAL_BATCH_SIZE=${LOCAL_BATCH_SIZE:-32} +ATTN_IMPLEMENTATION=${ATTN_IMPLEMENTATION:-""} + +#DDP settings +export TORCH_CPP_LOG_LEVEL=INFO +export TORCH_DISTRIBUTED_DEBUG=INFO +export MASTER_ADDR=`head -1 hostfile` +CORES=`lscpu | grep Core | awk '{print $4}'` +SOCKETS=`lscpu | grep Socket | awk '{print $2}'` +TOTAL_CORES=`expr $CORES \* $SOCKETS` +NNODES=${NNODES:-1} +HOSTFILE=${HOSTFILE:-./hostfile} +DATASET=${DATASET:-"./alpaca_data.json"} +export DNNL_PRIMITIVE_CACHE_CAPACITY=1024 +export KMP_BLOCKTIME=1 +export KMP_AFFINITY=granularity=fine,compact,1,0 + +<< EOF +# specific oneCCL settings depending on any CPU cluster +export CCL_WORKER_COUNT=8 +export CCL_LOG_LEVEL=info +export CCL_BF16=avx512bf +export CCL_ATL_TRANSPORT=ofi +export CCL_MNIC_COUNT=2 +export CCL_MNIC=local +export CCL_MNIC_NAME=irdma1,irdma5 +export CCL_ALLREDUCE=ring +export CCL_WORKER_COUNT=8 + +for (( i = $SOCKETS; i < 2*$SOCKETS; i++ )); do # pin CCL workers to HT + START_CORE=$(( i * CORES )) + for (( j = 0; j < $CCL_WORKER_COUNT; j++)); do + CCL_WORKER_AFFINITY="${CCL_WORKER_AFFINITY} $((START_CORE + j))" + done +done + +export CCL_WORKER_AFFINITY=`echo ${CCL_WORKER_AFFINITY} | tr " " ","` +EOF + +# specific Fabric settings depending on your network hardware status +# export FI_PROVIDER=psm3 +# export PSM3_IDENTIFY=1 +# export PSM3_ALLOW_ROUTERS=1 +# export PSM3_RDMA=1 +# export PSM3_PRINT_STATS=0 +# export PSM3_RV_MR_CACHE_SIZE=8192 +# export PSM3_KASSIST_MODE=none +# export FI_PSM3_CONN_TIMEOUT=100 +# export PSM3_HAL=sockets + + +oneccl_bindings_for_pytorch_path=$(python -c "import torch; import oneccl_bindings_for_pytorch; import os; print(os.path.abspath(os.path.dirname(oneccl_bindings_for_pytorch.__file__)))") +source $oneccl_bindings_for_pytorch_path/env/setvars.sh + +python -m intel_extension_for_pytorch.cpu.launch \ + --memory-allocator tcmalloc \ + --distributed \ + --nnodes ${NNODES} \ + --hostfile ${HOSTFILE} \ + --logical-cores-for-ccl --ccl_worker_count 2 \ + ./finetune.py $ARGS \ + --base_model ${MODEL_NAME_OR_PATH} \ + --attn_implementation ${ATTN_IMPLEMENTATION} \ + --data_path ${DATASET} \ + --output_dir ${OUTPUT_DIR} \ + --micro_batch_size ${LOCAL_BATCH_SIZE} \ + --num_epochs 3 \ + --learning_rate 1e-4 \ + --cutoff_len 512 \ + --val_set_size 2000 \ + --lora_r 8 \ + --lora_alpha 16 \ + --lora_dropout 0.05 \ + --lora_target_modules '[q_proj,v_proj]' \ + --train_on_inputs \ + --group_by_length \ + --max_steps ${MAXSTEP} + + diff --git a/examples/cpu/training/llm/templates/alpaca.json b/examples/cpu/training/llm/templates/alpaca.json new file mode 100644 index 000000000..e486439c4 --- /dev/null +++ b/examples/cpu/training/llm/templates/alpaca.json @@ -0,0 +1,6 @@ +{ + "description": "Template used by Alpaca-LoRA.", + "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", + "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", + "response_split": "### Response:" +} diff --git a/examples/cpu/training/llm/utils/README.md b/examples/cpu/training/llm/utils/README.md new file mode 100644 index 000000000..ee32d9871 --- /dev/null +++ b/examples/cpu/training/llm/utils/README.md @@ -0,0 +1,7 @@ +# Directory for helpers modules + +## prompter.py + +Prompter class, a template manager. + +`from utils.prompter import Prompter` diff --git a/examples/cpu/training/llm/utils/__init__.py b/examples/cpu/training/llm/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cpu/training/llm/utils/prompter.py b/examples/cpu/training/llm/utils/prompter.py new file mode 100644 index 000000000..0915f2aa9 --- /dev/null +++ b/examples/cpu/training/llm/utils/prompter.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +A dedicated helper to manage templates and prompt building. +""" + +import json +import os.path as osp +import os +from typing import Union + + +class Prompter(object): + __slots__ = ("template", "_verbose") + + def __init__(self, template_name: str = "", verbose: bool = False): + self._verbose = verbose + if not template_name: + # Enforce the default here, so the constructor can be called with '' and will not break. + template_name = "alpaca" + curpath = os.path.abspath(os.path.dirname(__file__)) + file_name = osp.join(curpath + "/../templates", f"{template_name}.json") + if not osp.exists(file_name): + raise ValueError(f"Can't read {file_name}") + with open(file_name) as fp: + self.template = json.load(fp) + if self._verbose: + print( + f"Using prompt template {template_name}: {self.template['description']}" + ) + + def generate_prompt( + self, + instruction: str, + input: Union[None, str] = None, + label: Union[None, str] = None, + ) -> str: + # returns the full prompt from instruction and optional input + # if a label (=response, =output) is provided, it's also appended. 
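+        # The bundled alpaca template (templates/alpaca.json) provides two variants:
+        # "prompt_input" with {instruction} and {input} placeholders, and
+        # "prompt_no_input" with only {instruction}.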
+ if input: + res = self.template["prompt_input"].format( + instruction=instruction, input=input + ) + else: + res = self.template["prompt_no_input"].format( + instruction=instruction + ) + if label: + res = f"{res}{label}" + if self._verbose: + print(res) + return res + + def get_response(self, output: str) -> str: + return output.split(self.template["response_split"])[1].strip() From b291a8150a5bb77c327767b568af3ccff97e8748 Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 23 Apr 2024 09:07:08 +0800 Subject: [PATCH 031/199] Enable optimized Phi (#2785) --- .../run_accuracy_with_deepspeed.py | 1 + .../llm/distributed/run_generation_tp.py | 467 +++++ .../run_generation_with_deepspeed.py | 1 + examples/cpu/inference/python/llm/run.py | 5 +- .../llm/single_instance/run_accuracy.py | 1 + .../llm/single_instance/run_generation.py | 1 + .../llm/single_instance/run_quantization.py | 3 + .../python/llm/utils/create_shard_model.py | 1 + .../python/llm/utils/model_class/phi.py | 13 + .../transformers/__init__.py | 1 + .../transformers/generation/beam_sample.py | 1 + .../transformers/generation/beam_search.py | 1 + .../transformers/generation/greedy_search.py | 1 + .../transformers/generation/sample.py | 1 + .../models/cpu/modules/attentions.py | 1 + .../models/cpu/modules/decoder.py | 2 +- .../models/reference/fusions/mha_fusion.py | 4 +- .../transformers/models/reference/models.py | 213 +++ .../models/reference/modules/attentions.py | 101 +- .../models/reference/modules/decoder.py | 54 + .../transformers/optimize.py | 32 +- .../transformers/tensor_parallel.py | 8 +- tests/cpu/hf_configs/phi/config.json | 34 + tests/cpu/hf_configs/phi/configuration_phi.py | 199 +++ tests/cpu/hf_configs/phi/modeling_phi.py | 1564 +++++++++++++++++ tests/cpu/hf_configs/yuan/yuan_hf_model.py | 10 +- ...test_ipex_optimize_transformers_nightly.py | 8 + tests/cpu/test_ipex_tensor_parallel.py | 88 +- 28 files changed, 2787 insertions(+), 29 deletions(-) create mode 100644 examples/cpu/inference/python/llm/distributed/run_generation_tp.py create mode 100644 examples/cpu/inference/python/llm/utils/model_class/phi.py create mode 100644 tests/cpu/hf_configs/phi/config.json create mode 100644 tests/cpu/hf_configs/phi/configuration_phi.py create mode 100644 tests/cpu/hf_configs/phi/modeling_phi.py diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index e3b9ad3ab..04f27c1a1 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -68,6 +68,7 @@ def decorator(func): "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py new file mode 100644 index 000000000..fceafd391 --- /dev/null +++ b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py @@ -0,0 +1,467 @@ +import torch +import time +import json +import pathlib +import argparse +import re + +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + LlamaTokenizer, + T5ForConditionalGeneration, + AutoProcessor, +) + +from transformers import TextStreamer + +import sys + 
+sys.path.append(sys.path[0] + '/../../') + +import logging + +logger = logging.getLogger(__name__) + +# supported models +MODEL_CLASSES = { + "gpt-j": (AutoModelForCausalLM, AutoTokenizer), + "gpt-neox": (AutoModelForCausalLM, AutoTokenizer), + "llama": (AutoModelForCausalLM, LlamaTokenizer), + "opt": (AutoModelForCausalLM, AutoTokenizer), + "falcon": (AutoModelForCausalLM, AutoTokenizer), + "bloom": (AutoModelForCausalLM, AutoTokenizer), + "codegen": (AutoModelForCausalLM, AutoTokenizer), + "baichuan2": (AutoModelForCausalLM, AutoTokenizer), + "baichuan": (AutoModelForCausalLM, AutoTokenizer), + "chatglm": (AutoModelForCausalLM, AutoTokenizer), + "gptbigcode": (AutoModelForCausalLM, AutoTokenizer), + "t5": (T5ForConditionalGeneration, AutoTokenizer), + "mistral": (AutoModelForCausalLM, AutoTokenizer), + "mixtral": (AutoModelForCausalLM, AutoTokenizer), + "mpt": (AutoModelForCausalLM, AutoTokenizer), + "stablelm": (AutoModelForCausalLM, AutoTokenizer), + "qwen": (AutoModelForCausalLM, AutoTokenizer), + "git": (AutoModelForCausalLM, AutoProcessor), + "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi": (AutoModelForCausalLM, AutoTokenizer), + "auto": (AutoModelForCausalLM, AutoTokenizer), +} + +try: + from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM + from llava.model.builder import load_pretrained_model + from llava.conversation import conv_templates + from llava.mm_utils import get_model_name_from_path, tokenizer_image_token + from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + MODEL_CLASSES["llava"] = (LlavaLlamaForCausalLM, AutoTokenizer) +except ImportError: + pass + +# args +parser = argparse.ArgumentParser("Generation script (fp32/bf16 path)", add_help=False) +parser.add_argument( + "-m", + "--model-id", + type=str, + default="EleutherAI/gpt-j-6B", + help="the huggingface mdoel id", +) +parser.add_argument( + "--dtype", + type=str, + choices=["float32", "bfloat16"], + default="bfloat16", + help="bfloat16, float32", +) +parser.add_argument( + "--input-tokens", + default="32", + type=str, + help="input tokens length if needed from prompt.json", +) +parser.add_argument( + "--max-new-tokens", default=32, type=int, help="output max new tokens" +) +parser.add_argument( + "--prompt", default=None, type=str, help="input prompt for self-defined if needed" +) +parser.add_argument( + "--streaming", action="store_true", help="enable streaming mode for generation output (greedy search only)" +) +parser.add_argument( + "--image-url", default="http://images.cocodataset.org/val2017/000000039769.jpg", type=str, help="image url for image-to-text task" +) +parser.add_argument( + "--config-file", default=None, type=str, help="specific configuration file" +) +parser.add_argument("--greedy", action="store_true") +parser.add_argument("--ipex", action="store_true") +parser.add_argument( + "--ipex-weight-only-quantization", + action="store_true", + help="use ipex weight-only quantization", +) +parser.add_argument( + "--lowp-mode", + choices=["AUTO", "BF16", "FP32", "INT8", "FP16"], + default="AUTO", + type=str, + help="low precision mode for weight only quantization. " + "It indicates data type for computation for speedup at the cost " + "of accuracy. Unrelated to activation or weight data type." + "It is not supported yet to use lowp_mode=INT8 for INT8 weight, " + "falling back to lowp_mode=BF16 implicitly in this case." 
+ "If set to AUTO, lowp_mode is determined by weight data type: " + "lowp_mode=BF16 is used for INT8 weight " + "and lowp_mode=INT8 used for INT4 weight", +) +parser.add_argument( + "--group-size", + default=-1, + type=int, + help="For weight-only quantization only. Specifies the group size along" + " input channel for block-wise quantization of weight. It must be a" + " positive power of 2 or -1. If it is -1, weight is quantized per" + " output channel. Otherwise, weight is quantized per block with block size" + " = [1, group_size]. If `--low-precision-checkpoint` is given, group" + " size is determined automatically and this argument has no effect.", +) +parser.add_argument( + "--quant-with-amp", + action="store_true", + help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) +parser.add_argument( + "--weight-dtype", + choices=["INT8", "INT4", "NF4"], + default="INT8", + type=str, + help="weight data type for weight only quantization. Unrelated to activation" + " data type or lowp-mode. If `--low-precision-checkpoint` is given, weight" + " data type is always INT4 and this argument is not needed.", +) +parser.add_argument("--deployment-mode", action="store_true") +parser.add_argument("--torch-compile", action="store_true") +parser.add_argument("--backend", default="ipex", type=str, help="backend of torch.compile") +parser.add_argument("--profile", action="store_true") +parser.add_argument("--benchmark", action="store_true") +parser.add_argument("--num-iter", default=100, type=int, help="num iter") +parser.add_argument("--num-warmup", default=10, type=int, help="num warmup") +parser.add_argument("--batch-size", default=1, type=int, help="batch size") +parser.add_argument( + "--token-latency", action="store_true", help="get token latency breakdown" +) +parser.add_argument( + "--low-precision-checkpoint", + default="", + type=str, + help="Low precision checkpoint file generated by calibration, such as GPTQ. It contains" + " modified weights, scales, zero points, etc. For better accuracy of weight only" + " quantization with INT4 weight.", +) +parser.add_argument( + "--act-quant-mode", + choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"], + default="PER_IC_BLOCK", + type=str, + help="Quantization mode for activation with different granularity. " + "For lowp-mode=INT8 only. For other cases, it has no effect. " + "Assume the activation tensor has shape batch_size x input_channel. " + "PER_TENSOR(0): quantize per tensor; " + "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; " + "PER_BATCH(2): quantize per batch; " + "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. " + "IC_BLOCK is determined by IC automatically.", +) +parser.add_argument( + "--gptq-legacy-format", + action="store_true", + help="Indicate that the low-precision checkpoint is in the legacy format rather than the" + " HuggingFace Optimum format for backward compatibility. It must be used with" + " --low-precision-checkpoint. Otherwise, it has no effect." 
+) +args = parser.parse_args() +print(args) + +# import ipex +if args.ipex or args.ipex_weight_only_quantization: + import intel_extension_for_pytorch as ipex + + torch._C._jit_set_texpr_fuser_enabled(False) + try: + ipex._C.disable_jit_linear_repack() + except Exception: + pass + +# dtype +amp_enabled = False if args.dtype == "float32" or not args.quant_with_amp else True +amp_dtype = getattr(torch, args.dtype) + +# load model +model_type = next( + (x for x in MODEL_CLASSES.keys() if x in args.model_id.lower()), "auto" +) +model_class = MODEL_CLASSES[model_type] +if args.config_file is None: + config = AutoConfig.from_pretrained( + args.model_id, torchscript=args.deployment_mode, trust_remote_code=True + ) +else: + config = AutoConfig.from_pretrained( + args.config_file, torchscript=args.deployment_mode, trust_remote_code=True + ) +if not hasattr(config, "text_max_length") and args.prompt is None: + config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens) +if model_type == "mpt" and args.prompt is None: + config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) +if model_type == "llava": + config.use_cache=True + +if not hasattr(config, "lm_head_generation"): + config.lm_head_generation = True + +if model_type != "llava": + model = model_class[0].from_pretrained( + args.model_id, + torch_dtype=amp_dtype, + config=config, + low_cpu_mem_usage=True, + trust_remote_code=True + ) + tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True) +else: + tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) +model = model.eval() +model = model.to(memory_format=torch.channels_last) + +num_beams = 1 if args.greedy else 4 +# generate args +if args.streaming: + streamer = TextStreamer(tokenizer) +else: + streamer = None +generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=num_beams, max_new_tokens=args.max_new_tokens, min_new_tokens=args.max_new_tokens, streamer=streamer) + +if re.search("gptbigcode", model.config.architectures[0], re.IGNORECASE): + model_type = "gptbigcode" +if re.search("gptneox", model.config.architectures[0], re.IGNORECASE): + model_type = "gpt-neox" +elif re.search("t5", model.config.architectures[0], re.IGNORECASE): + generate_kwargs["max_length"] = generate_kwargs["max_new_tokens"] + generate_kwargs.pop("max_new_tokens") +elif re.search("git", model.config.architectures[0], re.IGNORECASE) or re.search("llava", model.config.architectures[0], re.IGNORECASE): + from PIL import Image + import requests + from io import BytesIO + model.config.batch_size = int(args.batch_size) * num_beams + + def load_image(image_file): + if image_file.startswith('http://') or image_file.startswith('https://'): + response = requests.get(image_file) + image = Image.open(BytesIO(response.content)).convert('RGB') + else: + image = Image.open(image_file).convert('RGB') + return image +if re.search("llava", model.config.architectures[0], re.IGNORECASE): + model_name = get_model_name_from_path(args.model_id) + if 'llama-2' in model_name.lower(): + conv_mode = "llava_llama_2" + elif "v1" in model_name.lower(): + conv_mode = "llava_v1" + elif "mpt" in model_name.lower(): + conv_mode = "mpt" + else: + conv_mode = "llava_v0" + conv = conv_templates[conv_mode].copy() + if "mpt" in model_name.lower(): + roles = ('user', 'assistant') + else: + roles = conv.roles +if re.search("yuan", model.config.architectures[0], re.IGNORECASE): + model.config.batch_size = int(args.batch_size) * num_beams +def trace_handler(prof): 
+ print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) +# to ipex +if args.ipex: + model = ipex.llm.optimize( + model.eval(), + dtype=amp_dtype, + inplace=True, + deployment_mode=args.deployment_mode, + ) +elif args.ipex_weight_only_quantization: + from intel_extension_for_pytorch.quantization import WoqWeightDtype + if args.weight_dtype == "INT8": + weight_dtype = WoqWeightDtype.INT8 + elif args.weight_dtype == "INT4": + weight_dtype = WoqWeightDtype.INT4 + else: + assert args.weight_dtype == "NF4" + weight_dtype = WoqWeightDtype.NF4 + + if args.lowp_mode == "INT8": + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + elif args.lowp_mode == "FP32": + lowp_mode = ipex.quantization.WoqLowpMode.NONE + elif args.lowp_mode == "FP16": + lowp_mode = ipex.quantization.WoqLowpMode.FP16 + elif args.lowp_mode == "BF16": + lowp_mode = ipex.quantization.WoqLowpMode.BF16 + else: # AUTO + if args.low_precision_checkpoint != "" or weight_dtype == WoqWeightDtype.INT4: + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + else: + lowp_mode = ipex.quantization.WoqLowpMode.BF16 + + act_quant_mode_dict = { + "PER_TENSOR": ipex.quantization.WoqActQuantMode.PER_TENSOR, + "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, + "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + } + qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + act_quant_mode=act_quant_mode_dict[args.act_quant_mode], + group_size=args.group_size, + ) + if args.low_precision_checkpoint != "": + low_precision_checkpoint = torch.load(args.low_precision_checkpoint) + if args.gptq_legacy_format: + config_dict = ( + ipex.utils.weight_only_quantization._legacy_lowp_checkpoint_config() + ) + low_precision_checkpoint = (low_precision_checkpoint, config_dict) + else: + low_precision_checkpoint = None + + model = ipex.llm.optimize( + model.eval(), + dtype=amp_dtype, + quantization_config=qconfig, + inplace=True, + ) + +if args.torch_compile: + if args.deployment_mode: + raise SystemExit("[ERROR] deployment_mode cannot co-work with torch.compile, please set deployment_mode to False if want to use torch.compile.") + model.forward = torch.compile(model.forward, dynamic=True, backend=args.backend) + + +if args.benchmark: + if args.token_latency: + if not hasattr(model.config, "token_latency"): + model.config.token_latency = True + if model_type == "git": + prompt = Image.open(requests.get(args.image_url, stream=True).raw) + generate_kwargs.pop("min_new_tokens", None) + elif model_type == "llava": + if args.prompt is not None: + prompt = args.prompt + image = load_image(args.image_url) + image = [image] * args.batch_size + if model.config.mm_use_im_start_end: + prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt + else: + prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt + conv.append_message(conv.roles[0], prompt) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + else: + # input prompt + current_path = pathlib.Path(__file__).parent.resolve() + with open(str(current_path) + "/prompt.json") as f: + prompt_pool = json.load(f) + if args.prompt is not None: + prompt = args.prompt + elif model_type == "auto": + raise SystemExit( + "[ERROR] model prompt is not supported, please use --prompt for this model: " + + args.model_id + ) + elif int(args.input_tokens) > 8192: + prompt = prompt_pool[model_type]["8192"] * 
int(int(args.input_tokens) / 8192) + elif args.input_tokens in prompt_pool[model_type]: + prompt = prompt_pool[model_type][args.input_tokens] + else: + raise SystemExit("[ERROR] Plese use --prompt if want to use custom input.") + + input_size = tokenizer(prompt, return_tensors="pt").input_ids.size(dim=1) + print("---- Prompt size:", input_size) + + # start + total_time = 0.0 + num_iter = args.num_iter + num_warmup = args.num_warmup + prompt = [prompt] * args.batch_size + total_list = [] + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=amp_enabled + ): + if args.profile: + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), + on_trace_ready=trace_handler, + ) as prof: + for i in range(5): + if model_type == "llava": + input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) + image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] + output = model.generate(input_ids, images=image_tensor, **generate_kwargs) + elif model_type == "git": + input_ids=tokenizer(images=prompt, return_tensors="pt").pixel_values + output = model.generate(pixel_values=input_ids, **generate_kwargs) + else: + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + output = model.generate(input_ids, **generate_kwargs) + prof.step() + for i in range(num_iter): + tic = time.time() + if model_type == "llava": + input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) + image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] + output = model.generate(input_ids, images=image_tensor, **generate_kwargs) + elif model_type == "git": + input_ids=tokenizer(images=prompt, return_tensors="pt").pixel_values + output = model.generate(pixel_values=input_ids, **generate_kwargs) + else: + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + output = model.generate(input_ids, **generate_kwargs) + gen_ids = output[0] if args.token_latency else output + gen_text = tokenizer.batch_decode(gen_ids[:, input_ids.shape[1]:] if model_type=="llava" else gen_ids, skip_special_tokens=True) + + toc = time.time() + input_tokens_lengths = [x.shape[0] for x in input_ids] + output_tokens_lengths = [x.shape[0] for x in gen_ids] + total_new_tokens = [ + o - i if model.config.model_type != "t5" else o + for i, o in zip(input_tokens_lengths, output_tokens_lengths) + ] + print(gen_text, total_new_tokens, flush=True) + print("Iteration: %d, Time: %.6f sec" % (i, toc - tic), flush=True) + if i >= num_warmup: + total_time += toc - tic + if args.token_latency: + total_list.append(output[1]) + + print("\n", "-" * 10, "Summary:", "-" * 10) + latency = total_time / (num_iter - num_warmup) + print("Inference latency: %.3f sec." % latency) + + if args.token_latency: + import numpy as np + from itertools import chain + + first_latency = np.mean([x[0] for x in total_list]) + average_2n = list(chain(*[x[1:] for x in total_list])) + average_2n.sort() + average_2n_latency = np.mean(average_2n) + p90_latency = average_2n[int(len(average_2n) * 0.9)] + p99_latency = average_2n[int(len(average_2n) * 0.99)] + print("First token average latency: %.3f sec." % first_latency) + print("Average 2... latency: %.3f sec." % average_2n_latency) + print("P90 2... latency: %.3f sec." 
% p90_latency) + print("P99 2... latency: %.3f sec." % p99_latency) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index ea01ce922..47a61032d 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -54,6 +54,7 @@ "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 325182335..8fc1eaff0 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -479,6 +479,7 @@ def main(args_in: Optional[List[str]] = None) -> None: MODEL_CLASSES = { "gpt-j": ("/gptj_local_shard"), "gpt-neox": ("/gptneox_local_shard"), + "llava": ("/llava_local_shard"), "llama": ("/llama_local_shard"), "opt": ("/opt_local_shard"), "falcon": ("/falcon_local_shard"), @@ -488,15 +489,15 @@ def main(args_in: Optional[List[str]] = None) -> None: "chatglm": ("/chatglm_local_shard"), "starcoder": ("/starcoder_local_shard"), "t5": ("/t5_local_shard"), - "mistral": ("/mistral_local_shard"), "mixtral": ("/mixtral_local_shard"), + "mistral": ("/mistral_local_shard"), "mpt": ("/mpt_local_shard"), "stablelm": ("/stablelm_local_shard"), "dolly": ("/dolly_local_shard"), "qwen": ("/qwen_local_shard"), "git": ("/git_local_shard"), "yuan": ("/yuan_local_shard"), - "llava": ("/llava_local_shard"), + "phi": ("/phi_local_shard"), } model_type = next( (x for x in MODEL_CLASSES.keys() if x in args.model_name_or_path.lower()), "auto" diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index bfca6db71..079d76c82 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -34,6 +34,7 @@ "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index 6b5a60504..22bb5a431 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -44,6 +44,7 @@ "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 583ae0d74..485052c4b 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -36,6 +36,7 @@ from llm.utils.model_class.qwen import QwenConfig from llm.utils.model_class.git import GitConfig from llm.utils.model_class.llava import LlavaConfig 
+from llm.utils.model_class.phi import PhiConfig parser = argparse.ArgumentParser("LLM generation script (int8 path)", add_help=False) parser.add_argument( @@ -278,6 +279,8 @@ def load_image(image_file): roles = ('user', 'assistant') else: roles = conv.roles +elif re.search("phi", config.architectures[0], re.IGNORECASE): + model = PhiConfig(args.model_id) else: raise AssertionError("Not support %s." % (args.model_id)) diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/inference/python/llm/utils/create_shard_model.py index 7926c3950..9ed7bab52 100644 --- a/examples/cpu/inference/python/llm/utils/create_shard_model.py +++ b/examples/cpu/inference/python/llm/utils/create_shard_model.py @@ -25,6 +25,7 @@ "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/utils/model_class/phi.py b/examples/cpu/inference/python/llm/utils/model_class/phi.py new file mode 100644 index 000000000..1bee2e476 --- /dev/null +++ b/examples/cpu/inference/python/llm/utils/model_class/phi.py @@ -0,0 +1,13 @@ +from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + +class PhiConfig(LLMConfig): + def __init__(self, model_id): + self.name = "phi" + self.model_id = model_id + self.to_channels_last = True + self.example_inputs_mode = EXAMPLE_INPUTS_MODE.MASK_KV_POS + + # for smooth quant + self.default_dataset = "NeelNanda/pile-10k" + self.use_global_past_key_value = True + self.use_ipex_autotune = True diff --git a/intel_extension_for_pytorch/transformers/__init__.py b/intel_extension_for_pytorch/transformers/__init__.py index 1e17b4597..0638f0df6 100644 --- a/intel_extension_for_pytorch/transformers/__init__.py +++ b/intel_extension_for_pytorch/transformers/__init__.py @@ -10,4 +10,5 @@ TensorParallelColumnLinear, TensorParallelRowLinear, TensorParallelLMhead, + TensorParallelConv2d, ) diff --git a/intel_extension_for_pytorch/transformers/generation/beam_sample.py b/intel_extension_for_pytorch/transformers/generation/beam_sample.py index 544438d71..b468941ea 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_sample.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_sample.py @@ -189,6 +189,7 @@ def _beam_sample( "GitForCausalLM", "LlavaLlamaForCausalLM", "YuanForCausalLM", + "PhiForCausalLM", ]: first_token = False if model_inputs["past_key_values"] is None: diff --git a/intel_extension_for_pytorch/transformers/generation/beam_search.py b/intel_extension_for_pytorch/transformers/generation/beam_search.py index ada0245dc..6c096fa02 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_search.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_search.py @@ -191,6 +191,7 @@ def _beam_search( "GitForCausalLM", "LlavaLlamaForCausalLM", "YuanForCausalLM", + "PhiForCausalLM", ]: first_token = False has_position_id = model_inputs.get("position_ids", None) is not None diff --git a/intel_extension_for_pytorch/transformers/generation/greedy_search.py b/intel_extension_for_pytorch/transformers/generation/greedy_search.py index ba568839e..c205df94d 100644 --- a/intel_extension_for_pytorch/transformers/generation/greedy_search.py +++ b/intel_extension_for_pytorch/transformers/generation/greedy_search.py @@ -172,6 +172,7 @@ def _greedy_search( "GitForCausalLM", "LlavaLlamaForCausalLM", "YuanForCausalLM", + 
"PhiForCausalLM", ]: first_token = False input_bs = input_ids.size()[0] diff --git a/intel_extension_for_pytorch/transformers/generation/sample.py b/intel_extension_for_pytorch/transformers/generation/sample.py index e52dfd1e8..c8df63620 100644 --- a/intel_extension_for_pytorch/transformers/generation/sample.py +++ b/intel_extension_for_pytorch/transformers/generation/sample.py @@ -178,6 +178,7 @@ def _sample( "GitForCausalLM", "LlavaLlamaForCausalLM", "YuanForCausalLM", + "PhiForCausalLM", ]: first_token = False input_bs = input_ids.size()[0] diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py index 3fbc8eaca..1d9d1610f 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py @@ -41,6 +41,7 @@ def __init__(self, module, config, tpp=False, woq=False): "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM", + "PhiForCausalLM", ]: if hasattr(module, "concat_qkv"): self.concat_qkv = _IPEXConcatLinearCPU( diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py index bb5bc0df0..1560bfbf5 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py @@ -19,7 +19,7 @@ def __init__(self, module, config, tpp=False, woq=False): if k.startswith("__"): continue setattr(self.__class__, k, getattr(module.__class__, k)) - if self.model_backbone == "GPTJForCausalLM": + if self.model_backbone in ["GPTJForCausalLM", "PhiForCausalLM"]: if not self.distributed: self.linear_add_add = _IPEXlinearAddAddCPU( module.linear_add_add.linear, tpp=tpp, woq=woq diff --git a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py index 409af7a14..c48677b81 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py @@ -205,7 +205,7 @@ def apply_ref_rope( .transpose(1, 2) ) x = torch.cat([x_rot, x_pass], dim=-1) - elif self.model_backbone == "StableLmForCausalLM": + elif self.model_backbone in ["StableLmForCausalLM", "PhiForCausalLM"]: x = x.transpose(1, 2) x_rot = x[..., :rotary_ndims] x_pass = x[..., rotary_ndims:] @@ -296,6 +296,7 @@ def __init__(self, module, config): "StableLmForCausalLM", "LlavaLlamaForCausalLM", "YuanForCausalLM", + "PhiForCausalLM", ]: self.num_key_value_groups = ( module.num_key_value_groups @@ -511,6 +512,7 @@ def forward( "MixtralForCausalLM", "StableLmForCausalLM", "YuanForCausalLM", + "PhiForCausalLM", ]: # repeat k/v heads if n_kv_heads < n_heads key = self._repeat_kv(key, self.num_key_value_groups) diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index 5b200bd7d..700dc4e3f 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -2978,6 +2978,219 @@ def custom_forward(*inputs): ) +def PhiForCausalLM_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: 
Optional[List[torch.FloatTensor]] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if ( + hasattr(self, "config") + and hasattr(self.config, "lm_head_generation") + and self.config.lm_head_generation + and hidden_states.size(1) != 1 + ): + hidden_states = hidden_states[:, -1:, :] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + +def PhiModel_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`..." + ) + use_cache = False + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + inputs_embeds = self.embed_dropout(inputs_embeds) + + # Attention mask. + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = ( + attention_mask + if (attention_mask is not None and 0 in attention_mask) + else None + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def output_hook(module: torch.nn.Module, args, kwargs, outputs: Any): if module.config.use_return_dict or ( "return_dict" in kwargs and kwargs["return_dict"] diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index afce80710..8cd1d28c2 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -1878,6 +1878,94 @@ def _YuanAttention_forward( return attn_output, attn_weights, past_key_value +def _PhiAttention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, 
Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + concat_qkv = None + if hasattr(self, "concat_qkv"): + concat_qkv = self.concat_qkv(hidden_states) + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + + kv_seq_len = ( + q_len + past_key_value[0].size(-2) if past_key_value is not None else q_len + ) + if concat_qkv is not None and type(concat_qkv) is not tuple: + query_states, key_states, value_states = self._IPEXROPE( + concat_qkv, + position_ids, + self.num_heads, + self.head_dim, + self.pos_embd_dim // 2, + self.pos_embd_dim, + kv_seq_len, + self.concat_qkv.num_concat, + ) + else: + if concat_qkv is not None: + query_states, key_states, value_states = concat_qkv + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + key_states = self._IPEXROPE( + key_states, + position_ids, + self.num_key_value_heads, + self.head_dim, + self.pos_embd_dim // 2, + self.pos_embd_dim, + kv_seq_len, + ) + query_states = self._IPEXROPE( + query_states, + position_ids, + self.num_attention_heads, + self.head_dim, + self.pos_embd_dim // 2, + self.pos_embd_dim, + kv_seq_len, + ) + + key_states = _repeat_kv(key_states, self.num_key_value_groups) + value_states = _repeat_kv(value_states, self.num_key_value_groups) + + (attn_output, attn_weights, past_key_value) = self._IPEXScaleDotProduct( + query_states, + key_states, + value_states, + math.sqrt(self.head_dim), + past_key_value, + None, + attention_mask, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _create_attention_mask_for_git( self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None ): @@ -2028,7 +2116,7 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): else config.kv_channels ) self.pos_embd_dim = rotary_dim // 2 - elif self.model_backbone == "StableLmForCausalLM": + elif self.model_backbone in ["StableLmForCausalLM", "PhiForCausalLM"]: self.pos_embd_dim = self.rotary_emb.dim else: self.pos_embd_dim = self.head_dim @@ -2051,6 +2139,7 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): "MixtralForCausalLM", "StableLmForCausalLM", "LlavaLlamaForCausalLM", + "PhiForCausalLM", ]: supported_linear_types = [ torch.nn.Linear, @@ -2460,6 +2549,16 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "PhiForCausalLM": + return _PhiAttention_forward( + self, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index ac39b277d..454457a9e 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ 
b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -1186,6 +1186,44 @@ def YuanDecoderLayer_forward( return outputs +def PhiDecoderLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, +) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + # feed_forward_hidden_states = self.mlp(hidden_states) + feed_forward_hidden_states = self.linear_gelu(hidden_states) + if not self.distributed: + hidden_states = self.linear_add_add( + feed_forward_hidden_states, attn_outputs, residual + ) + else: + feed_forward_hidden_states = self.mlp.fc2(feed_forward_hidden_states) + hidden_states = attn_outputs + feed_forward_hidden_states + residual + outputs = (hidden_states,) + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + return outputs + + class _IPEXDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): super().__init__() @@ -1371,6 +1409,12 @@ def __init__(self, module, config, distributed=False): ) del self.__dict__["_modules"]["mlp"].gate_proj del self.__dict__["_modules"]["mlp"].up_proj + elif self.model_backbone == "PhiForCausalLM": + if not self.distributed: + self.linear_add_add = _IPEXlinearAddAddRef(module.mlp.fc2) + del self.__dict__["_modules"]["mlp"].fc2 + self.linear_gelu = _IPEXlinearNewGeluRef(module.mlp.fc1) + del self.__dict__["_modules"]["mlp"].fc1 else: AssertionError(False, "Do not support the optimization of your model yet") @@ -1591,5 +1635,15 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "PhiForCausalLM": + return PhiDecoderLayer_forward( + self, + hidden_states, + attention_mask, + position_ids, + output_attentions, + use_cache, + past_key_value, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 293b1a0fd..4f87d7f30 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -184,6 +184,8 @@ def model_convert_reference(_model): LlavaLlamaForCausalLM_forward, YuanForCausalLM_forward, YuanModel_forward, + PhiForCausalLM_forward, + PhiModel_forward, prepare_inputs_for_generation, prepare_inputs_for_generation_gptbigcode, prepare_inputs_for_generation_llama, @@ -377,8 +379,9 @@ def model_convert_reference(_model): yuan_attention = None if _model.config.architectures[0] == "YuanForCausalLM": yuan_attention = type(_model.model.layers[0].self_attn) - supported_mha_classes.append(yuan_attention) - ipex_tp_supported_mha_classes.append(yuan_attention) + if _model.config.architectures[0] in ["YuanForCausalLM", "PhiForCausalLM"]: + supported_mha_classes.append(type(_model.model.layers[0].self_attn)) + ipex_tp_supported_mha_classes.append(type(_model.model.layers[0].self_attn)) 
ipex_tp_supported_mlp_classes.append(type(_model.model.layers[0].mlp)) ipex_tp_supported_model_classes.append(type(_model)) # model-wise optimizations - MHA module @@ -745,7 +748,23 @@ def model_convert_reference(_model): _model.config, distributed=distributed, ) - + elif _model.config.architectures[0] == "PhiForCausalLM": + convert_function(_model, "forward", PhiForCausalLM_forward) + convert_function(_model.model, "forward", PhiModel_forward) + convert_class( + _model, + type(_model.model.layers[0].self_attn), + _IPEXAttentionRef, + _model.config, + distributed=distributed, + ) + convert_class( + _model, + type(_model.model.layers[0]), + _IPEXDecoderLayerRef, + _model.config, + distributed=distributed, + ) return _model @@ -1171,7 +1190,7 @@ def optimize( Well supported model family with full functionalities: Llama, GPT-J, GPT-Neox, OPT, Falcon, Bloom, CodeGen, Baichuan, ChatGLM, GPTBigCode, - T5, Mistral, MPT, Mixtral, StableLM, QWen, Git, Llava, Yuan. + T5, Mistral, MPT, Mixtral, StableLM, QWen, Git, Llava, Yuan, Phi. For the model that is not in the scope of supported model family above, will try to apply default ipex.optimize transparently to get benifits (not include quantizations, @@ -1259,6 +1278,7 @@ def optimize( "GitForCausalLM", "LlavaLlamaForCausalLM", "YuanForCausalLM", + "PhiForCausalLM", ] if well_supported_model: @@ -1267,8 +1287,8 @@ def optimize( if quantization_config is not None: logger.warning( "ipex.llm.optimize supports quantizations on Llama, GPT-J, GPT-Neox, Falcon, OPT, Bloom, CodeGen," - + " Baichuan, ChatGLM, GPTBigCode, T5, Mistral, Mixtral, MPT, StableLM, QWen, Git, Llava, and Yuan" - + "fallback to origin model" + + " Baichuan, ChatGLM, GPTBigCode, T5, Mistral, Mixtral, MPT, StableLM, QWen, Git, Llava, Yuan, " + + "and Phi, fallback to origin model" ) return model diff --git a/intel_extension_for_pytorch/transformers/tensor_parallel.py b/intel_extension_for_pytorch/transformers/tensor_parallel.py index f55ecabc8..080bda40a 100644 --- a/intel_extension_for_pytorch/transformers/tensor_parallel.py +++ b/intel_extension_for_pytorch/transformers/tensor_parallel.py @@ -179,6 +179,8 @@ def shard_weights_by_head( ] else: q = weight_data[:, q_head_start * head_dim : q_head_end * head_dim] + if linear.bias is not None: + q_bias = linear.bias.data / float(world_size) if not concat_qkv: return torch.nn.Parameter(q), torch.nn.Parameter(q_bias), cols_per_rank @@ -538,7 +540,7 @@ def shard_mha_weights( ) # del sub_m.__dict__["_modules"][l_name] setattr(sub_m, l_name, TPLinear) - if l_name in ["out_proj"]: + if l_name in ["out_proj", "dense"]: TPLinear = TensorParallelRowLinear( l_sub_m, num_kv_heads, @@ -584,7 +586,7 @@ def shard_mlp_weights( for _, sub_m in model.named_children(): if isinstance(sub_m, target_m): for l_name, l_sub_m in sub_m.named_children(): - if l_name in ["gate_proj", "up_proj", "fc_in"]: + if l_name in ["gate_proj", "up_proj", "fc_in", "fc1"]: TPLinear = TensorParallelColumnLinear( l_sub_m, num_kv_heads, @@ -595,7 +597,7 @@ def shard_mlp_weights( shard_by_head=False, ) setattr(sub_m, l_name, TPLinear.linear) - if l_name in ["down_proj", "fc_out"]: + if l_name in ["down_proj", "fc_out", "fc2"]: TPLinear = TensorParallelRowLinear( l_sub_m, num_kv_heads, diff --git a/tests/cpu/hf_configs/phi/config.json b/tests/cpu/hf_configs/phi/config.json new file mode 100644 index 000000000..3c0bdeb55 --- /dev/null +++ b/tests/cpu/hf_configs/phi/config.json @@ -0,0 +1,34 @@ +{ + "_name_or_path": "microsoft/phi-2", + "architectures": [ + "PhiForCausalLM" + ], + 
"auto_map": { + "AutoConfig": "configuration_phi.PhiConfig", + "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM" + }, + "attention_dropout": 0.0, + "bos_token_id": 50256, + "embd_pdrop": 0.0, + "eos_token_id": 50256, + "hidden_act": "gelu_new", + "hidden_size": 2560, + "initializer_range": 0.02, + "intermediate_size": 10240, + "layer_norm_eps": 1e-05, + "max_position_embeddings": 2048, + "model_type": "phi", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads": 32, + "partial_rotary_factor": 0.4, + "qk_layernorm": false, + "resid_pdrop": 0.1, + "rope_scaling": null, + "rope_theta": 10000.0, + "tie_word_embeddings": false, + "torch_dtype": "float16", + "transformers_version": "4.37.0", + "use_cache": true, + "vocab_size": 51200 +} \ No newline at end of file diff --git a/tests/cpu/hf_configs/phi/configuration_phi.py b/tests/cpu/hf_configs/phi/configuration_phi.py new file mode 100644 index 000000000..672bb18d3 --- /dev/null +++ b/tests/cpu/hf_configs/phi/configuration_phi.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Phi model configuration""" + + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +PHI_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/phi-2": "https://huggingface.co/microsoft/phi-2/resolve/main/config.json", +} + + +class PhiConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`PhiModel`]. It is used to instantiate an Phi + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Phi + [microsoft/phi-1](https://huggingface.co/microsoft/phi-1). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 51200): + Vocabulary size of the Phi model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`PhiModel`]. + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + resid_pdrop (`float`, *optional*, defaults to 0.0): + Dropout probability for mlp outputs. + embd_pdrop (`int`, *optional*, defaults to 0.0): + The dropout ratio for the embeddings. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio after computing the attention scores. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Phi-1 and Phi-1.5 supports up to 2048 + tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be an float greater than 1. The expected format + is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalPersimmon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This + is an experimental feature, subject to breaking API changes in future versions. + partial_rotary_factor (`float`, *optional*, defaults to 0.5): + Percentage of the query and keys which will have rotary embedding. + qk_layernorm (`bool`, *optional*, defaults to `False`): + Whether or not to normalize the Queries and Keys after projecting the hidden states. + bos_token_id (`int`, *optional*, defaults to 1): + Denotes beginning of sequences token id. + eos_token_id (`int`, *optional*, defaults to 2): + Denotes end of sequences token id. 
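+
+    For example, `rope_scaling={"type": "linear", "factor": 2.0}` passes the `_rope_scaling_validation`
+    check below, while an unknown type or a factor that is not a float greater than 1 raises a `ValueError`.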
+ + Example: + + ```python + >>> from transformers import PhiModel, PhiConfig + + >>> # Initializing a Phi-1 style configuration + >>> configuration = PhiConfig.from_pretrained("microsoft/phi-1") + + >>> # Initializing a model from the configuration + >>> model = PhiModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "phi" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=51200, + hidden_size=2048, + intermediate_size=8192, + num_hidden_layers=24, + num_attention_heads=32, + num_key_value_heads=None, + resid_pdrop=0.0, + embd_pdrop=0.0, + attention_dropout=0.0, + hidden_act="gelu_new", + max_position_embeddings=2048, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + partial_rotary_factor=0.5, + qk_layernorm=False, + bos_token_id=1, + eos_token_id=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.partial_rotary_factor = partial_rotary_factor + self.qk_layernorm = qk_layernorm + self._rope_scaling_validation() + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if ( + rope_scaling_factor is None + or not isinstance(rope_scaling_factor, float) + or rope_scaling_factor <= 1.0 + ): + raise ValueError( + f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}" + ) diff --git a/tests/cpu/hf_configs/phi/modeling_phi.py b/tests/cpu/hf_configs/phi/modeling_phi.py new file mode 100644 index 000000000..d9a75c846 --- /dev/null +++ b/tests/cpu/hf_configs/phi/modeling_phi.py @@ -0,0 +1,1564 @@ +# coding=utf-8 +# Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" PyTorch Phi model.""" + + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_phi import PhiConfig + + +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +except ImportError: + pass + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "microsoft/phi-2" +_CONFIG_FOR_DOC = "PhiConfig" + +PHI_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/phi-2", + # See all Phi models at https://huggingface.co/models?filter=phi +] + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad( + torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0) + ) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Phi +class PhiRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
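+        # (Precomputing up to `max_position_embeddings` means the traced graph only reads the registered
+        # `cos_cached`/`sin_cached` buffers; `forward` rebuilds the cache when a longer `seq_len` is seen.)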
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, + device=self.inv_freq.device, + dtype=torch.get_default_dtype(), + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Phi +class PhiLinearScalingRotaryEmbedding(PhiRotaryEmbedding): + """PhiRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + ): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Phi +class PhiDynamicNTKScalingRotaryEmbedding(PhiRotaryEmbedding): + """PhiRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + ): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
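+
+    Example (illustrative shapes): with `q`/`k` of shape `[batch, num_heads, seq_len, rot_dim]`,
+    `cos`/`sin` of shape `[seq_len, rot_dim]` and `position_ids` of shape `[batch, seq_len]`, the
+    default `unsqueeze_dim=1` broadcasts `cos[position_ids]` to `[batch, 1, seq_len, rot_dim]`.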
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Phi +class PhiMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class PhiAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.partial_rotary_factor = config.partial_rotary_factor + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." 
+ ) + + self.q_proj = nn.Linear( + self.hidden_size, self.num_heads * self.head_dim, bias=True + ) + self.k_proj = nn.Linear( + self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True + ) + self.v_proj = nn.Linear( + self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True + ) + self.dense = nn.Linear( + self.num_heads * self.head_dim, self.hidden_size, bias=True + ) + + self.qk_layernorm = config.qk_layernorm + if self.qk_layernorm: + self.q_layernorm = nn.LayerNorm( + config.hidden_size // self.num_heads, + eps=config.layer_norm_eps, + elementwise_affine=True, + ) + self.k_layernorm = nn.LayerNorm( + config.hidden_size // self.num_heads, + eps=config.layer_norm_eps, + elementwise_affine=True, + ) + + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = PhiRotaryEmbedding( + int(self.partial_rotary_factor * self.head_dim), + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = PhiLinearScalingRotaryEmbedding( + int(self.partial_rotary_factor * self.head_dim), + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = PhiDynamicNTKScalingRotaryEmbedding( + int(self.partial_rotary_factor * self.head_dim), + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + # Phi-2 has an attention overflow issue (with FP16) and requires autocast to be disabled + @torch.autocast("cpu", enabled=False) + @torch.autocast("cuda", enabled=False) + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + # Partial rotary embedding + query_rot, query_pass = ( + query_states[..., : self.rotary_emb.dim], + query_states[..., self.rotary_emb.dim :], + ) + key_rot, key_pass = ( + key_states[..., : self.rotary_emb.dim], + key_states[..., self.rotary_emb.dim :], + ) + # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] + query_rot, key_rot = apply_rotary_pos_emb( + query_rot, key_rot, cos, sin, position_ids + ) + + # [batch_size, seq_length, num_heads, head_dim] + query_states = torch.cat((query_rot, query_pass), dim=-1) + key_states = torch.cat((key_rot, key_pass), dim=-1) + + if past_key_value is not None: + cache_kwargs = { + "sin": sin, + "cos": cos, + "partial_rotation_size": self.rotary_emb.dim, + } + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow + attn_weights = torch.matmul( + query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(value_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training + ) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class PhiFlashAttention2(PhiAttention): + """ + Phi flash attention module. This module inherits from `PhiAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, + # that was made default for flash_attn>=2.1. This attribute is used to handle this difference. + # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. 
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) + # produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # PhiFlashAttention2 attention does not support output_attentions + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + # Partial rotary embedding + query_rot, query_pass = ( + query_states[..., : self.rotary_emb.dim], + query_states[..., self.rotary_emb.dim :], + ) + key_rot, key_pass = ( + key_states[..., : self.rotary_emb.dim], + key_states[..., self.rotary_emb.dim :], + ) + # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] + query_rot, key_rot = apply_rotary_pos_emb( + query_rot, key_rot, cos, sin, position_ids + ) + + # [batch_size, seq_length, num_heads, head_dim] + query_states = torch.cat((query_rot, query_pass), dim=-1) + key_states = torch.cat((key_rot, key_pass), dim=-1) + + if past_key_value is not None: + cache_kwargs = { + "sin": sin, + "cos": cos, + "partial_rotation_size": self.rotary_emb.dim, + } + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout + # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_dropout = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
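+        # The cascade below chooses the cast-back dtype: the active autocast dtype when autocast is
+        # enabled, otherwise any `_pre_quantization_dtype` recorded on the config, otherwise the dtype
+        # of the `q_proj` weights.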
+ + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=attn_dropout, + softmax_scale=None, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.dense(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. + # For details, please see the comment in LlamaFlashAttention2 __init__. 
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + ( + query_states, + key_states, + value_states, + indices_q, + cu_seq_lens, + max_seq_lens, + ) = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input( + attn_output_unpad, indices_q, batch_size, query_length + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + return attn_output + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input + def _upad_input( + self, query_layer, key_layer, value_layer, attention_mask, query_length + ): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), + indices_k, + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
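+            # (With left padding the real tokens occupy the last positions, so keeping only the last
+            # `query_length` columns of the mask keeps it aligned with the query tokens.)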
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( + query_layer, attention_mask + ) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +PHI_ATTENTION_CLASSES = { + "eager": PhiAttention, + "flash_attention_2": PhiFlashAttention2, +} + + +class PhiDecoderLayer(nn.Module): + def __init__(self, config: PhiConfig, layer_idx: int): + super().__init__() + self.self_attn = PHI_ATTENTION_CLASSES[config._attn_implementation]( + config, layer_idx=layer_idx + ) + self.mlp = PhiMLP(config) + self.input_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps + ) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + attn_outputs = self.resid_dropout(attn_outputs) + + feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states)) + hidden_states = attn_outputs + feed_forward_hidden_states + residual + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +PHI_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`PhiConfig`]): + Model configuration class with all the parameters of the model. 
Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Phi Model outputting raw hidden-states without any specific head on top.", + PHI_START_DOCSTRING, +) +class PhiPreTrainedModel(PreTrainedModel): + config_class = PhiConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["PhiDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +PHI_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. 
+ + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Phi Model outputting raw hidden-states without any specific head on top.", + PHI_START_DOCSTRING, +) +class PhiModel(PhiPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiDecoderLayer`] + + Args: + config: PhiConfig + """ + + def __init__(self, config: PhiConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.embed_dropout = nn.Dropout(config.embd_pdrop) + self.layers = nn.ModuleList( + [ + PhiDecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.final_layernorm = nn.LayerNorm( + config.hidden_size, eps=config.layer_norm_eps + ) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot 
specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + inputs_embeds = self.embed_dropout(inputs_embeds) + + # Attention mask. + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = ( + attention_mask + if (attention_mask is not None and 0 in attention_mask) + else None + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() + if use_legacy_cache + else next_decoder_cache + ) + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class PhiForCausalLM(PhiPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi,bias=False->bias=True + def __init__(self, config): + super().__init__(config) + self.model = PhiModel(config) + self.vocab_size = config.vocab_size + self.lm_head = 
nn.Linear(config.hidden_size, config.vocab_size, bias=True) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings + def get_input_embeddings(self): + return self.model.embed_tokens + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings + def get_output_embeddings(self): + return self.lm_head + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder + def set_decoder(self, decoder): + self.model = decoder + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, PhiForCausalLM + + >>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1") + + >>> prompt = "This is an example script ." 
+ >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'This is an example script .\n\n\n\nfrom typing import List\n\ndef find_most_common_letter(words: List[str' + ```""" + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if ( + attention_mask is not None + and attention_mask.shape[1] > input_ids.shape[1] + ): + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ), + ) + return reordered_past + + +@add_start_docstrings( + """ + The PhiModel with a sequence classification head on top (linear layer). + + [`PhiForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + PHI_START_DOCSTRING, +) +# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->PHI, +# Llama->Phi with self.transformer->self.model, transformer_outputs->model_outputs +class PhiForSequenceClassification(PhiPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = PhiModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. 
Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + model_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = model_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError( + "Cannot handle batch sizes > 1 if no padding token is defined." + ) + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = ( + torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + ) + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[ + torch.arange(batch_size, device=logits.device), sequence_lengths + ] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and ( + labels.dtype == torch.long or labels.dtype == torch.int + ): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct( + pooled_logits.view(-1, self.num_labels), labels.view(-1) + ) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + model_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=model_outputs.past_key_values, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) + + +@add_start_docstrings( + """ + PhiModel with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + PHI_START_DOCSTRING, +) +# Copied from transformers.models.mpt.modeling_mpt.MptForTokenClassification with MPT->PHI,Mpt->Phi, +# self.transformer->self.model,transformer_outputs->model_outputs +class PhiForTokenClassification(PhiPreTrainedModel): + def __init__(self, config: PhiConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.model = PhiModel(config) + if ( + hasattr(config, "classifier_dropout") + and config.classifier_dropout is not None + ): + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + model_outputs = self.model( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = model_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + batch_size, seq_length = labels.shape + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), + labels.view(batch_size * seq_length), + ) + + if not return_dict: + output = (logits,) + model_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) diff --git a/tests/cpu/hf_configs/yuan/yuan_hf_model.py b/tests/cpu/hf_configs/yuan/yuan_hf_model.py index 9533b3cae..1c444b515 100644 --- a/tests/cpu/hf_configs/yuan/yuan_hf_model.py +++ b/tests/cpu/hf_configs/yuan/yuan_hf_model.py @@ -38,7 +38,8 @@ replace_return_docstrings, ) from .configuration_yuan import YuanConfig -from einops import rearrange + +# from einops import rearrange # from flash_attn import flash_attn_varlen_func as flash_attn_unpadded_func # from flash_attn import flash_attn_func @@ -461,7 +462,8 @@ def forward( seqlen_k = key_states.shape[1] q, k, v = [ - rearrange(x, "b s ... -> (b s) ...") + # rearrange(x, "b s ... -> (b s) ...") + x.view(-1, *x.shape[2:]) for x in [query_states, key_states, value_states] ] @@ -500,7 +502,9 @@ def forward( causal=is_causal, ) - attn_output = rearrange(output, "(b s) ... -> b s ...", b=batch_size) + # attn_output = rearrange(output, "(b s) ... 
-> b s ...", b=batch_size) + shapes = (batch_size, -1, *output.shape[1:]) + attn_output = output.view(shapes) else: attn_weights = torch.matmul( query_states, key_states.transpose(2, 3) diff --git a/tests/cpu/test_ipex_optimize_transformers_nightly.py b/tests/cpu/test_ipex_optimize_transformers_nightly.py index 7118e6326..95fbeb780 100644 --- a/tests/cpu/test_ipex_optimize_transformers_nightly.py +++ b/tests/cpu/test_ipex_optimize_transformers_nightly.py @@ -14,6 +14,7 @@ from hf_configs.qwen.modeling_qwen import QWenLMHeadModel from hf_configs.llava.modeling_llavallama import LlavaLlamaForCausalLM from hf_configs.yuan.yuan_hf_model import YuanForCausalLM +from hf_configs.phi.modeling_phi import PhiForCausalLM from intel_extension_for_pytorch.cpu._auto_kernel_selection import _disable_tpp try: @@ -156,6 +157,13 @@ lambda m: m.model.layers[0].self_attn.__class__, lambda m: m.model.layers[0].__class__, ), + model_info( + "phi", + PhiForCausalLM, + True, + lambda m: m.model.layers[0].self_attn.__class__, + lambda m: m.model.layers[0].__class__, + ), ] diff --git a/tests/cpu/test_ipex_tensor_parallel.py b/tests/cpu/test_ipex_tensor_parallel.py index 5be9d19f1..cae69fcc0 100644 --- a/tests/cpu/test_ipex_tensor_parallel.py +++ b/tests/cpu/test_ipex_tensor_parallel.py @@ -12,6 +12,7 @@ update_heads_info, TensorParallelRowLinear, TensorParallelLMhead, + TensorParallelConv2d, ) from intel_extension_for_pytorch.cpu import comm as ipex_comm @@ -31,21 +32,42 @@ curpath = os.path.abspath(os.path.dirname(__file__)) +from hf_configs.yuan.yuan_hf_model import YuanForCausalLM +from hf_configs.phi.modeling_phi import PhiForCausalLM + class TensorParallelTester(TestCase): def _shard_model(self, model): rank = ipex_comm.get_rank() world_size = ipex_comm.get_world_size() - for supported_mha_class in [ + supported_mha_classes = [ transformers.models.llama.modeling_llama.LlamaAttention, transformers.models.gptj.modeling_gptj.GPTJAttention, - ]: + ] + supported_mlp_classes = [ + transformers.models.llama.modeling_llama.LlamaMLP, + transformers.models.gptj.modeling_gptj.GPTJMLP, + ] + supported_model_classes = [ + transformers.models.llama.modeling_llama.LlamaForCausalLM, + transformers.models.gptj.modeling_gptj.GPTJForCausalLM, + ] + yuan_attention = None + if isinstance(model, YuanForCausalLM): + yuan_attention = type(model.model.layers[0].self_attn) + if isinstance(model, YuanForCausalLM) or isinstance(model, PhiForCausalLM): + supported_mha_classes.append(type(model.model.layers[0].self_attn)) + supported_mlp_classes.append(type(model.model.layers[0].mlp)) + supported_model_classes.append(type(model)) + for supported_mha_class in supported_mha_classes: num_heads = model.config.num_attention_heads num_kv_heads = num_heads for name in ["num_key_value_heads"]: if hasattr(model.config, name): num_kv_heads = getattr(model.config, name) head_dim = model.config.hidden_size // num_heads + value_with_share_qk = supported_mha_class == yuan_attention + shard_local_filtering = supported_mha_class == yuan_attention shard_mha_weights( model, supported_mha_class, @@ -54,11 +76,10 @@ def _shard_model(self, model): head_dim, rank, world_size, + value_with_share_qk, + shard_local_filtering, ) - for supported_mlp_class in [ - transformers.models.llama.modeling_llama.LlamaMLP, - transformers.models.gptj.modeling_gptj.GPTJMLP, - ]: + for supported_mlp_class in supported_mlp_classes: shard_mlp_weights( model, supported_mlp_class, @@ -68,14 +89,11 @@ def _shard_model(self, model): rank, world_size, ) - for supported_model_calss in [ 
- transformers.models.llama.modeling_llama.LlamaForCausalLM, - transformers.models.gptj.modeling_gptj.GPTJForCausalLM, - ]: - if isinstance(model, supported_model_calss): + for supported_model_class in supported_model_classes: + if isinstance(model, supported_model_class): shard_lm_head_weights( model, - supported_model_calss, + supported_model_class, num_heads, num_kv_heads, head_dim, @@ -139,6 +157,52 @@ def test_tensor_parallel_replace_check_llama(self): self.assertTrue(tp_model.lm_head, TensorParallelLMhead) self.tensor_parallel_with_optimize_transformers(model) + def test_tensor_parallel_replace_check_yuan(self): + config = AutoConfig.from_pretrained( + f"{curpath}/hf_configs/yuan", return_dict=False, trust_remote_code=True + ) + model = YuanForCausalLM(config).eval() + tp_model = self._shard_model(copy.deepcopy(model)) + self.assertTrue( + isinstance( + tp_model.model.layers[0].self_attn.o_proj, TensorParallelRowLinear + ) + ) + self.assertTrue( + isinstance(tp_model.model.layers[0].mlp.down_proj, TensorParallelRowLinear) + ) + self.assertTrue( + isinstance( + tp_model.model.layers[0].self_attn.lf_gate.conv1, TensorParallelConv2d + ) + ) + self.assertTrue( + isinstance( + tp_model.model.layers[0].self_attn.lf_gate.conv2, TensorParallelConv2d + ) + ) + self.assertTrue(isinstance(tp_model.lm_head, TensorParallelLMhead)) + self.assertTrue(tp_model.lm_head, TensorParallelLMhead) + self.tensor_parallel_with_optimize_transformers(model) + + def test_tensor_parallel_replace_check_phi(self): + config = AutoConfig.from_pretrained( + f"{curpath}/hf_configs/phi", return_dict=False, trust_remote_code=True + ) + model = PhiForCausalLM(config).eval() + tp_model = self._shard_model(copy.deepcopy(model)) + self.assertTrue( + isinstance( + tp_model.model.layers[0].self_attn.dense, TensorParallelRowLinear + ) + ) + self.assertTrue( + isinstance(tp_model.model.layers[0].mlp.fc2, TensorParallelRowLinear) + ) + self.assertTrue(isinstance(tp_model.lm_head, TensorParallelLMhead)) + self.assertTrue(tp_model.lm_head, TensorParallelLMhead) + self.tensor_parallel_with_optimize_transformers(model) + if __name__ == "__main__": test = unittest.main() From ebac4700532872f923eaf6d26a2ee363361f1f95 Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 23 Apr 2024 13:05:55 +0800 Subject: [PATCH 032/199] Fix dummy input for llava (#2799) --- .../inference/python/llm/single_instance/run_generation.py | 2 -- .../python/llm/single_instance/run_quantization.py | 2 +- .../transformers/models/reference/models.py | 2 +- intel_extension_for_pytorch/transformers/optimize.py | 6 +++++- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index 22bb5a431..5387738bb 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -142,8 +142,6 @@ config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens) if model_type == "mpt" and args.prompt is None: config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) -if model_type == "llava": - config.use_cache=True if not hasattr(config, "lm_head_generation"): config.lm_head_generation = True diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 485052c4b..7c6a2e3ad 100644 --- 
a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -432,7 +432,7 @@ def get_example_inputs(model): input_embeds = torch.zeros(batch_size, 1, 4096).to(amp_dtype) example_inputs = ( input_embeds, - attention_mask.unsqueeze(0).repeat(batch_size,1), + torch.ones((batch_size, 1), dtype=torch.long), tuple(past_key_value), ) else: diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index 700dc4e3f..79a89ba34 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -2685,7 +2685,7 @@ def LlavaLlamaForCausalLM_forward( images: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, input_ids: torch.LongTensor = None, - use_cache: Optional[bool] = None, + use_cache: Optional[bool] = True, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 4f87d7f30..9e44b28ef 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -917,13 +917,17 @@ def get_dummy_input(_model, return_dict=False): ) if return_dict: sample_inputs.pop("input_ids", None) + sample_inputs["attention_mask"] = torch.ones( + (batch_size, 1), dtype=torch.long + ) sample_inputs["inputs_embeds"] = torch.zeros(batch_size, 1, 4096).to( _model.dtype ) else: sample_inputs = ( torch.zeros(batch_size, 1, 4096).to(_model.dtype), - ) + sample_inputs[1:] + torch.ones((batch_size, 1), dtype=torch.long), + ) + sample_inputs[2:] if _model.config.architectures[0] == "YuanForCausalLM": hidden_size = _model.config.hidden_size if _model.device.type == "cpu": From 291e7dfb664610161ddb485f704e3864ba7fae68 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Tue, 23 Apr 2024 22:04:53 -0700 Subject: [PATCH 033/199] Set parameters to get correct max length for MHA for quantizaiton (#2804) --- examples/cpu/inference/python/llm/run.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 8fc1eaff0..a7ad2fa38 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -349,6 +349,8 @@ def main(args_in: Optional[List[str]] = None) -> None: quant_cmd = ["python", qpath] quant_cmd.extend(["-m", str(args.model_name_or_path)]) quant_cmd.extend(["--output-dir", str(args.output_dir)]) + quant_cmd.extend(["--input-tokens", str(args.input_tokens)]) + quant_cmd.extend(["--max-new-tokens", str(args.max_new_tokens)]) if args.config_file is not None: quant_cmd.extend(["--config-file", str(args.config_file)]) if args.quant_with_amp: From 6fce07e852f3bedae2cd64e0092b330d540740b1 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 25 Apr 2024 18:41:39 +0800 Subject: [PATCH 034/199] update oneDNN to 5476baec67 on main (#2811) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 26bb35808..240d32a66 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 26bb358089b2dc9fcfe8aaf1181ee9775c100a46 +Subproject commit 240d32a66a0ce709017708483a93ef64c7a70161 From 
b1bde77672c1cd159e92a0caeb4b7dedea3cdfa7 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Thu, 25 Apr 2024 05:22:23 -0700 Subject: [PATCH 035/199] Move profiling code after benchmark to improve UX (#2812) --- .../run_generation_with_deepspeed.py | 19 +++++----- .../llm/single_instance/run_generation.py | 37 ++++++++++--------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 47a61032d..55efdc706 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -634,15 +634,6 @@ def trace_handler(prof): cycles = args.num_iter warmup = args.num_warmup total_list = [] - if args.profile: - with torch.profiler.profile( - activities=[torch.profiler.ProfilerActivity.CPU], - schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), - on_trace_ready=trace_handler, - ) as prof: - for i in range(5): - gen_ids, outputs = generate() - prof.step() # latency for i in range(cycles): t0 = time.time() @@ -656,6 +647,16 @@ def trace_handler(prof): if args.token_latency: total_list.append(outputs[1]) + if args.profile: + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), + on_trace_ready=trace_handler, + ) as prof: + for i in range(5): + gen_ids, outputs = generate() + prof.step() + latency = total_time / (cycles - warmup) print_rank0("\n", "-" * 10, "Summary:", "-" * 10) print_rank0("Inference latency: %.3f sec." % latency) diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index 5387738bb..a023367f9 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -275,24 +275,6 @@ def trace_handler(prof): with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( enabled=amp_enabled ): - if args.profile: - with torch.profiler.profile( - activities=[torch.profiler.ProfilerActivity.CPU], - schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), - on_trace_ready=trace_handler, - ) as prof: - for i in range(5): - if model_type == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) - image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] - output = model.generate(input_ids, images=image_tensor, **generate_kwargs) - elif model_type == "git": - input_ids=tokenizer(images=prompt, return_tensors="pt").pixel_values - output = model.generate(pixel_values=input_ids, **generate_kwargs) - else: - input_ids = tokenizer(prompt, return_tensors="pt").input_ids - output = model.generate(input_ids, **generate_kwargs) - prof.step() for i in range(num_iter): tic = time.time() if model_type == "llava": @@ -322,6 +304,25 @@ def trace_handler(prof): if args.token_latency: total_list.append(output[1]) + if args.profile: + with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU], + schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), + on_trace_ready=trace_handler, + ) as prof: + for i in range(5): + if model_type == "llava": + input_ids = 
torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) + image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] + output = model.generate(input_ids, images=image_tensor, **generate_kwargs) + elif model_type == "git": + input_ids=tokenizer(images=prompt, return_tensors="pt").pixel_values + output = model.generate(pixel_values=input_ids, **generate_kwargs) + else: + input_ids = tokenizer(prompt, return_tensors="pt").input_ids + output = model.generate(input_ids, **generate_kwargs) + prof.step() + print("\n", "-" * 10, "Summary:", "-" * 10) latency = total_time / (num_iter - num_warmup) print("Inference latency: %.3f sec." % latency) From 389408e9464b0edc85964571e29fd2079de5bb5a Mon Sep 17 00:00:00 2001 From: blzheng Date: Fri, 26 Apr 2024 09:39:12 +0800 Subject: [PATCH 036/199] Enable optimized Phi3 (#2813) --- .../run_accuracy_with_deepspeed.py | 1 + .../llm/distributed/run_generation_tp.py | 1 + .../run_generation_with_deepspeed.py | 1 + .../llm/single_instance/run_accuracy.py | 1 + .../llm/single_instance/run_generation.py | 3 +- .../llm/single_instance/run_quantization.py | 63 +- .../python/llm/utils/create_shard_model.py | 1 + .../python/llm/utils/model_class/phi.py | 12 + .../transformers/generation/beam_sample.py | 1 + .../transformers/generation/beam_search.py | 1 + .../transformers/generation/greedy_search.py | 1 + .../transformers/generation/sample.py | 1 + .../models/cpu/fusions/mha_fusion.py | 3 +- .../models/cpu/modules/attentions.py | 33 +- .../models/cpu/modules/decoder.py | 8 + .../models/reference/fusions/mha_fusion.py | 67 +- .../transformers/models/reference/models.py | 145 ++ .../models/reference/modules/attentions.py | 90 +- .../models/reference/modules/decoder.py | 65 + .../transformers/optimize.py | 26 +- tests/cpu/hf_configs/phi3/config.json | 35 + .../cpu/hf_configs/phi3/configuration_phi3.py | 223 ++ tests/cpu/hf_configs/phi3/modeling_phi3.py | 1819 +++++++++++++++++ ...test_ipex_optimize_transformers_nightly.py | 8 + 24 files changed, 2577 insertions(+), 32 deletions(-) create mode 100644 tests/cpu/hf_configs/phi3/config.json create mode 100644 tests/cpu/hf_configs/phi3/configuration_phi3.py create mode 100644 tests/cpu/hf_configs/phi3/modeling_phi3.py diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index 04f27c1a1..9e5402e85 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -68,6 +68,7 @@ def decorator(func): "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py index fceafd391..4bc7dce44 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py @@ -45,6 +45,7 @@ "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi-3": (AutoModelForCausalLM, 
AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 55efdc706..4e4c226ca 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -54,6 +54,7 @@ "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 079d76c82..e80f368b0 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -34,6 +34,7 @@ "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index a023367f9..a631ffde0 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -37,13 +37,14 @@ "chatglm": (AutoModelForCausalLM, AutoTokenizer), "gptbigcode": (AutoModelForCausalLM, AutoTokenizer), "t5": (T5ForConditionalGeneration, AutoTokenizer), - "mistral": (AutoModelForCausalLM, AutoTokenizer), "mixtral": (AutoModelForCausalLM, AutoTokenizer), + "mistral": (AutoModelForCausalLM, AutoTokenizer), "mpt": (AutoModelForCausalLM, AutoTokenizer), "stablelm": (AutoModelForCausalLM, AutoTokenizer), "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 7c6a2e3ad..1b602a9c3 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -37,6 +37,8 @@ from llm.utils.model_class.git import GitConfig from llm.utils.model_class.llava import LlavaConfig from llm.utils.model_class.phi import PhiConfig +from llm.utils.model_class.phi import Phi3Config +from llm.utils.model_class.yuan import YuanConfig parser = argparse.ArgumentParser("LLM generation script (int8 path)", add_help=False) parser.add_argument( @@ -279,8 +281,12 @@ def load_image(image_file): roles = ('user', 'assistant') else: roles = conv.roles +elif re.search("phi3", config.architectures[0], re.IGNORECASE): + model = Phi3Config(args.model_id) elif re.search("phi", config.architectures[0], re.IGNORECASE): model = PhiConfig(args.model_id) +elif re.search("yuan", config.architectures[0], re.IGNORECASE): + model = 
YuanConfig(args.model_id) else: raise AssertionError("Not support %s." % (args.model_id)) @@ -344,6 +350,19 @@ def _get_target_nums(names): ) for i in range(n_layers) ] +if model.name == "yuan": + global_past_key_value = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 1, 2, hidden_size), + ) + for i in range(n_layers) + ] + ) def get_example_inputs(model): if model.use_global_past_key_value: @@ -353,12 +372,20 @@ def get_example_inputs(model): attention_mask = torch.ones(len(input_ids)) if model.example_inputs_mode == EXAMPLE_INPUTS_MODE.MASK_POS_KV: position_ids = torch.arange(len(input_ids)) - example_inputs = ( - input_ids.unsqueeze(0), - attention_mask.unsqueeze(0), - position_ids.unsqueeze(0), - tuple(global_past_key_value), - ) + if model.name == "yuan": + example_inputs = ( + input_ids.unsqueeze(0)[:, -1:], + attention_mask.unsqueeze(0)[:, -1:], + position_ids.unsqueeze(0)[:, -1:], + tuple(global_past_key_value), + ) + else: + example_inputs = ( + input_ids.unsqueeze(0), + attention_mask.unsqueeze(0), + position_ids.unsqueeze(0), + tuple(global_past_key_value), + ) elif model.example_inputs_mode == EXAMPLE_INPUTS_MODE.MASK_KV_POS: position_ids = torch.arange(len(input_ids)) example_inputs = ( @@ -688,6 +715,14 @@ def calib_func(prepared_model): pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True) self_jit.save(args.output_dir + "/" + args.quant_model_name) quant_model = self_jit + if model.name == "yuan": + input_bs = int(args.batch_size * num_beams) + example_inputs = (example_inputs[0].repeat(input_bs, 1), example_inputs[1].repeat(input_bs, 1), example_inputs[2].repeat(input_bs, 1)) + self_jit_first = torch.jit.trace( + convert_model.eval(), example_inputs, strict=False, check_trace=False + ) + self_jit_first = torch.jit.freeze(self_jit_first.eval()) + self_jit_first.save(args.output_dir + "/" + args.quant_model_name + "2") elif args.ipex_weight_only_quantization: from intel_extension_for_pytorch.quantization import WoqWeightDtype @@ -753,6 +788,14 @@ def calib_func(prepared_model): pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True) self_jit.save(args.output_dir + "/" + args.quant_model_name) quant_model = self_jit + if model.name == "yuan": + input_bs = int(args.batch_size * num_beams) + example_inputs = (example_inputs[0].repeat(input_bs, 1), example_inputs[1].repeat(input_bs, 1), example_inputs[2].repeat(input_bs, 1)) + self_jit_first = torch.jit.trace( + user_model.eval(), example_inputs, strict=False, check_trace=False + ) + self_jit_first = torch.jit.freeze(self_jit_first.eval()) + self_jit_first.save(args.output_dir + "/" + args.quant_model_name + "2") if args.benchmark: @@ -770,10 +813,16 @@ def calib_func(prepared_model): try: self_jit = torch.jit.load(args.quantized_model_path) self_jit = torch.jit.freeze(self_jit.eval()) + if model.name == "yuan": + self_jit_first = torch.jit.load(args.quantized_model_path + "2") + self_jit_first = torch.jit.freeze(self_jit_first.eval()) except Exception as e: print("warning: loading failed.", e) self_jit = quant_model - ipex._set_optimized_model_for_generation(user_model, optimized_model=self_jit) + if model.name == "yuan": + ipex._set_optimized_model_for_generation(user_model, optimized_model=self_jit, first_token_optimized_model=self_jit_first) + else: + ipex._set_optimized_model_for_generation(user_model, optimized_model=self_jit) if model.name 
== "git": prompt = Image.open(requests.get(args.image_url, stream=True).raw) diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/inference/python/llm/utils/create_shard_model.py index 9ed7bab52..9daac65a2 100644 --- a/examples/cpu/inference/python/llm/utils/create_shard_model.py +++ b/examples/cpu/inference/python/llm/utils/create_shard_model.py @@ -25,6 +25,7 @@ "qwen": (AutoModelForCausalLM, AutoTokenizer), "git": (AutoModelForCausalLM, AutoProcessor), "yuan": (AutoModelForCausalLM, AutoTokenizer), + "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/utils/model_class/phi.py b/examples/cpu/inference/python/llm/utils/model_class/phi.py index 1bee2e476..bab8494c3 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/phi.py +++ b/examples/cpu/inference/python/llm/utils/model_class/phi.py @@ -11,3 +11,15 @@ def __init__(self, model_id): self.default_dataset = "NeelNanda/pile-10k" self.use_global_past_key_value = True self.use_ipex_autotune = True + +class Phi3Config(LLMConfig): + def __init__(self, model_id): + self.name = "phi-3" + self.model_id = model_id + self.to_channels_last = True + self.example_inputs_mode = EXAMPLE_INPUTS_MODE.MASK_KV_POS + + # for smooth quant + self.default_dataset = "NeelNanda/pile-10k" + self.use_global_past_key_value = True + self.use_ipex_autotune = True \ No newline at end of file diff --git a/intel_extension_for_pytorch/transformers/generation/beam_sample.py b/intel_extension_for_pytorch/transformers/generation/beam_sample.py index b468941ea..a97fe98fc 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_sample.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_sample.py @@ -190,6 +190,7 @@ def _beam_sample( "LlavaLlamaForCausalLM", "YuanForCausalLM", "PhiForCausalLM", + "Phi3ForCausalLM", ]: first_token = False if model_inputs["past_key_values"] is None: diff --git a/intel_extension_for_pytorch/transformers/generation/beam_search.py b/intel_extension_for_pytorch/transformers/generation/beam_search.py index 6c096fa02..9ce5b276e 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_search.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_search.py @@ -192,6 +192,7 @@ def _beam_search( "LlavaLlamaForCausalLM", "YuanForCausalLM", "PhiForCausalLM", + "Phi3ForCausalLM", ]: first_token = False has_position_id = model_inputs.get("position_ids", None) is not None diff --git a/intel_extension_for_pytorch/transformers/generation/greedy_search.py b/intel_extension_for_pytorch/transformers/generation/greedy_search.py index c205df94d..85d8f2ab6 100644 --- a/intel_extension_for_pytorch/transformers/generation/greedy_search.py +++ b/intel_extension_for_pytorch/transformers/generation/greedy_search.py @@ -173,6 +173,7 @@ def _greedy_search( "LlavaLlamaForCausalLM", "YuanForCausalLM", "PhiForCausalLM", + "Phi3ForCausalLM", ]: first_token = False input_bs = input_ids.size()[0] diff --git a/intel_extension_for_pytorch/transformers/generation/sample.py b/intel_extension_for_pytorch/transformers/generation/sample.py index c8df63620..f446b412c 100644 --- a/intel_extension_for_pytorch/transformers/generation/sample.py +++ b/intel_extension_for_pytorch/transformers/generation/sample.py @@ -179,6 +179,7 @@ def _sample( "LlavaLlamaForCausalLM", "YuanForCausalLM", "PhiForCausalLM", + "Phi3ForCausalLM", ]: first_token = False input_bs = 
input_ids.size()[0] diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py index ede6ba186..629b32cc5 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py @@ -11,10 +11,11 @@ def __init__( pos_embd_dim, base=10000, backbone=None, + kwargs=None, ): super().__init__() self.embed_positions = RotaryEmbedding( - max_position_embeddings, pos_embd_dim, backbone, base + max_position_embeddings, pos_embd_dim, backbone, base, kwargs ) def forward( diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py index 1d9d1610f..7d893b648 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py @@ -30,12 +30,33 @@ def __init__(self, module, config, tpp=False, woq=False): or self.model_backbone == "BaichuanForCausalLM" and hasattr(module, "rotary_emb") ): - self._IPEXROPE = _IPEXRopeCPU( - self.max_position_embeddings, - self.pos_embd_dim, - self.rope_base, - self.model_backbone, - ) + if self.model_backbone in ["Phi3ForCausalLM"]: + extra_inputs = {} + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + if "short_factor" in config.rope_scaling: + extra_inputs["short_factor"] = config.rope_scaling[ + "short_factor" + ] + if "long_factor" in config.rope_scaling: + extra_inputs["long_factor"] = config.rope_scaling["long_factor"] + if hasattr(config, "original_max_position_embeddings"): + extra_inputs["original_max_position_embeddings"] = ( + config.original_max_position_embeddings + ) + self._IPEXROPE = _IPEXRopeCPU( + self.max_position_embeddings, + self.pos_embd_dim, + self.rope_base, + self.model_backbone, + extra_inputs, + ) + else: + self._IPEXROPE = _IPEXRopeCPU( + self.max_position_embeddings, + self.pos_embd_dim, + self.rope_base, + self.model_backbone, + ) if self.model_backbone in [ "GPTJForCausalLM", "LlamaForCausalLM", diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py index 1560bfbf5..7d578cffb 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py @@ -206,5 +206,13 @@ def __init__(self, module, config, tpp=False, woq=False): tpp=tpp, woq=woq, ) + elif self.model_backbone == "Phi3ForCausalLM": + if not self.distributed: + self.mlp_linear_add = _IPEXlinearAddCPU( + module.mlp_linear_add.linear, tpp=tpp, woq=woq + ) + self.mha_linear_add = _IPEXlinearAddCPU( + module.mha_linear_add.linear, tpp=tpp, woq=woq + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py index c48677b81..d3d077894 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py @@ -6,9 +6,34 @@ class RotaryEmbedding(torch.nn.Module): - def __init__(self, max_position_embeddings, dim, backbone, base=10000): + def __init__(self, max_position_embeddings, dim, 
backbone, base=10000, kwargs=None): super().__init__() - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + self.scaling_factor = 1.0 + if kwargs is not None and "short_factor" in kwargs: + self.short_factor = kwargs["short_factor"] + ext_factors = torch.tensor(self.short_factor, dtype=torch.float32) + inv_freq = 1.0 / ( + ext_factors * base ** (torch.arange(0, dim, 2).float() / dim) + ) + else: + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) + if kwargs is not None and "long_factor" in kwargs: + self.long_factor = kwargs["long_factor"] + new_ext_factors = torch.tensor(self.long_factor, dtype=torch.float32) + new_inv_freq = 1.0 / ( + new_ext_factors * base ** (torch.arange(0, dim, 2).float() / dim) + ) + self.new_inv_freq = new_inv_freq + if kwargs is not None and "original_max_position_embeddings" in kwargs: + self.original_max_position_embeddings = kwargs[ + "original_max_position_embeddings" + ] + scale = max_position_embeddings / self.original_max_position_embeddings + if scale > 1.0: + self.scaling_factor = math.sqrt( + 1 + + math.log(scale) / math.log(self.original_max_position_embeddings) + ) self.register_buffer("inv_freq", inv_freq, persistent=False) self.max_seq_len_cached = max_position_embeddings t = torch.arange( @@ -28,20 +53,30 @@ def __init__(self, max_position_embeddings, dim, backbone, base=10000): self.cos_cached = self.emb.cos()[None, :, :] self.sin_cached = self.emb.sin()[None, :, :] else: - self.sin_cos = torch.cat((torch.sin(freqs), torch.cos(freqs)), dim=1) + self.sin_cos = ( + torch.cat((torch.sin(freqs), torch.cos(freqs)), dim=1) + * self.scaling_factor + ) self.emb = torch.cat((freqs, freqs), dim=-1) self.register_buffer( - "cos_cached", self.emb.cos()[None, None, :, :], persistent=False + "cos_cached", + self.emb.cos()[None, None, :, :] * self.scaling_factor, + persistent=False, ) self.register_buffer( - "sin_cached", self.emb.sin()[None, None, :, :], persistent=False + "sin_cached", + self.emb.sin()[None, None, :, :] * self.scaling_factor, + persistent=False, ) def forward(self, seq_len=None): if seq_len is not None and seq_len > self.max_seq_len_cached: self.max_seq_len_cached = seq_len t = torch.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) + if hasattr(self, "long_factor"): + freqs = torch.einsum("i,j->ij", t, self.new_inv_freq) + else: + freqs = torch.einsum("i,j->ij", t, self.inv_freq) if ( self.model_backbone == "FalconForCausalLM" or self.model_backbone == "RWForCausalLM" @@ -53,10 +88,13 @@ def forward(self, seq_len=None): self.cos_cached = self.emb.cos()[None, :, :] self.sin_cached = self.emb.sin()[None, :, :] else: - self.sin_cos = torch.cat((torch.sin(freqs), torch.cos(freqs)), dim=1) + self.sin_cos = ( + torch.cat((torch.sin(freqs), torch.cos(freqs)), dim=1) + * self.scaling_factor + ) self.emb = torch.cat((freqs, freqs), dim=-1) - self.cos_cached = self.emb.cos()[None, None, :, :] - self.sin_cached = self.emb.sin()[None, None, :, :] + self.cos_cached = self.emb.cos()[None, None, :, :] * self.scaling_factor + self.sin_cached = self.emb.sin()[None, None, :, :] * self.scaling_factor self.cos_cached[:, :, :seq_len, ...] self.sin_cached[:, :, :seq_len, ...] 
return self.sin_cos, self.sin_cached, self.cos_cached @@ -69,11 +107,12 @@ def __init__( pos_embd_dim, base=10000, backbone=None, + kwargs=None, ): super().__init__() self.model_backbone = backbone self.embed_positions = RotaryEmbedding( - max_position_embeddings, pos_embd_dim, backbone, base + max_position_embeddings, pos_embd_dim, backbone, base, kwargs ) def rotate_every_two(self, x: torch.Tensor) -> torch.Tensor: @@ -216,6 +255,12 @@ def apply_ref_rope( ), dim=-1, ) + elif self.model_backbone == "Phi3ForCausalLM": + x = x.view(x.shape[0], -1, num_head, head_dim) + x = x.transpose(1, 2) + cos = _cos[..., seq_len - x.shape[2] : seq_len, :] + sin = _sin[..., seq_len - x.shape[2] : seq_len, :] + x = (x * cos) + (self.rotate_half(x) * sin) elif self.model_backbone == "QWenLMHeadModel": x = x.view(x.size(0), x.size(1), num_head, head_dim) b, sq, np, hn = x.size(0), x.size(1), x.size(2), x.size(3) @@ -297,6 +342,7 @@ def __init__(self, module, config): "LlavaLlamaForCausalLM", "YuanForCausalLM", "PhiForCausalLM", + "Phi3ForCausalLM", ]: self.num_key_value_groups = ( module.num_key_value_groups @@ -513,6 +559,7 @@ def forward( "StableLmForCausalLM", "YuanForCausalLM", "PhiForCausalLM", + "Phi3ForCausalLM", ]: # repeat k/v heads if n_kv_heads < n_heads key = self._repeat_kv(key, self.num_key_value_groups) diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index 79a89ba34..a9dd50b55 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -3191,6 +3191,151 @@ def PhiModel_forward( ) +def Phi3Model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = ( + attention_mask + if (attention_mask is not None and 0 in attention_mask) + else None + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def output_hook(module: torch.nn.Module, args, kwargs, outputs: Any): if module.config.use_return_dict or ( "return_dict" in kwargs and kwargs["return_dict"] diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 8cd1d28c2..84472c564 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -1966,6 +1966,53 @@ def _PhiAttention_forward( return attn_output, attn_weights, past_key_value +def _Phi3Attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> 
Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + qkv = self.qkv_proj(hidden_states) + kv_seq_len = ( + q_len + past_key_value[0].size(-2) if past_key_value is not None else q_len + ) + query_states, key_states, value_states = self._IPEXROPE( + qkv, + position_ids, + self.num_heads, + self.head_dim, + self.pos_embd_dim // 2, + self.pos_embd_dim, + kv_seq_len, + 3, + ) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ) + key_states = _repeat_kv(key_states, self.num_key_value_groups) + value_states = _repeat_kv(value_states, self.num_key_value_groups) + (attn_output, attn_weights, past_key_value) = self._IPEXScaleDotProduct( + query_states, + key_states, + value_states, + math.sqrt(self.head_dim), + past_key_value, + None, + attention_mask, + ) + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + # attn_output = self.o_proj(attn_output) + if not output_attentions: + attn_weights = None + return attn_output, attn_weights, past_key_value + + def _create_attention_mask_for_git( self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None ): @@ -2125,12 +2172,33 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): self.rope_base = config.rotary_emb_base elif hasattr(config, "rope_theta"): self.rope_base = config.rope_theta - self._IPEXROPE = _IPEXRopeRef( - self.max_position_embeddings, - self.pos_embd_dim, - self.rope_base, - self.model_backbone, - ) + if self.model_backbone in ["Phi3ForCausalLM"]: + extra_inputs = {} + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + if "short_factor" in config.rope_scaling: + extra_inputs["short_factor"] = config.rope_scaling[ + "short_factor" + ] + if "long_factor" in config.rope_scaling: + extra_inputs["long_factor"] = config.rope_scaling["long_factor"] + if hasattr(config, "original_max_position_embeddings"): + extra_inputs["original_max_position_embeddings"] = ( + config.original_max_position_embeddings + ) + self._IPEXROPE = _IPEXRopeRef( + self.max_position_embeddings, + self.pos_embd_dim, + self.rope_base, + self.model_backbone, + extra_inputs, + ) + else: + self._IPEXROPE = _IPEXRopeRef( + self.max_position_embeddings, + self.pos_embd_dim, + self.rope_base, + self.model_backbone, + ) if self.model_backbone in [ "GPTJForCausalLM", @@ -2559,6 +2627,16 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "Phi3ForCausalLM": + return _Phi3Attention_forward( + self, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index 454457a9e..9e3a29c6e 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -1224,6 +1224,55 @@ def PhiDecoderLayer_forward( return outputs +def Phi3DecoderLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + past_key_value: Optional[Tuple[torch.Tensor]] = None, +) -> 
Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + if not self.distributed: + hidden_states = self.mha_linear_add(attn_outputs, residual) + else: + attn_outputs = self.self_attn.o_proj(attn_outputs) + hidden_states = residual + attn_outputs + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + # hidden_states = self.mlp(hidden_states) + up_states = self.mlp.gate_up_proj(hidden_states) + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.mlp.activation_fn(gate) + if not self.distributed: + hidden_states = self.mlp_linear_add(up_states, residual) + else: + hidden_states = self.mlp.down_proj(up_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + class _IPEXDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): super().__init__() @@ -1415,6 +1464,12 @@ def __init__(self, module, config, distributed=False): del self.__dict__["_modules"]["mlp"].fc2 self.linear_gelu = _IPEXlinearNewGeluRef(module.mlp.fc1) del self.__dict__["_modules"]["mlp"].fc1 + elif self.model_backbone == "Phi3ForCausalLM": + if not self.distributed: + self.mlp_linear_add = _IPEXlinearAddRef(module.mlp.down_proj) + del self.__dict__["_modules"]["mlp"].down_proj + self.mha_linear_add = _IPEXlinearAddRef(module.self_attn.o_proj) + del self.__dict__["_modules"]["self_attn"].o_proj else: AssertionError(False, "Do not support the optimization of your model yet") @@ -1645,5 +1700,15 @@ def forward( use_cache, past_key_value, ) + elif self.model_backbone == "Phi3ForCausalLM": + return Phi3DecoderLayer_forward( + self, + hidden_states, + attention_mask, + position_ids, + output_attentions, + use_cache, + past_key_value, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 9e44b28ef..661daae5a 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -186,6 +186,7 @@ def model_convert_reference(_model): YuanModel_forward, PhiForCausalLM_forward, PhiModel_forward, + Phi3Model_forward, prepare_inputs_for_generation, prepare_inputs_for_generation_gptbigcode, prepare_inputs_for_generation_llama, @@ -379,7 +380,10 @@ def model_convert_reference(_model): yuan_attention = None if _model.config.architectures[0] == "YuanForCausalLM": yuan_attention = type(_model.model.layers[0].self_attn) - if _model.config.architectures[0] in ["YuanForCausalLM", "PhiForCausalLM"]: + if _model.config.architectures[0] in [ + "YuanForCausalLM", + "PhiForCausalLM", + ]: supported_mha_classes.append(type(_model.model.layers[0].self_attn)) ipex_tp_supported_mha_classes.append(type(_model.model.layers[0].self_attn)) ipex_tp_supported_mlp_classes.append(type(_model.model.layers[0].mlp)) @@ -765,6 +769,24 @@ def model_convert_reference(_model): _model.config, distributed=distributed, ) + elif 
_model.config.architectures[0] == "Phi3ForCausalLM": + convert_function(_model, "forward", PhiForCausalLM_forward) + convert_function(_model.model, "forward", Phi3Model_forward) + convert_class( + _model, + type(_model.model.layers[0].self_attn), + _IPEXAttentionRef, + _model.config, + distributed=distributed, + ) + convert_class( + _model, + type(_model.model.layers[0]), + _IPEXDecoderLayerRef, + _model.config, + distributed=distributed, + ) + return _model @@ -1049,6 +1071,7 @@ def model_convert_lowering( if _model.config.architectures[0] in [ "BaichuanForCausalLM", "YuanForCausalLM", + "Phi3ForCausalLM", ]: supported_classes.append(type(_model.model.layers[0].input_layernorm)) if ( @@ -1283,6 +1306,7 @@ def optimize( "LlavaLlamaForCausalLM", "YuanForCausalLM", "PhiForCausalLM", + "Phi3ForCausalLM", ] if well_supported_model: diff --git a/tests/cpu/hf_configs/phi3/config.json b/tests/cpu/hf_configs/phi3/config.json new file mode 100644 index 000000000..e9f5ac404 --- /dev/null +++ b/tests/cpu/hf_configs/phi3/config.json @@ -0,0 +1,35 @@ +{ + "_name_or_path": "Phi-3-mini-4k-instruct", + "architectures": [ + "Phi3ForCausalLM" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "bos_token_id": 1, + "embd_pdrop": 0.0, + "eos_token_id": 32000, + "hidden_act": "silu", + "hidden_size": 3072, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 4096, + "model_type": "phi3", + "num_attention_heads": 32, + "num_hidden_layers": 1, + "num_key_value_heads": 32, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000.0, + "sliding_window": 2048, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.39.3", + "use_cache": true, + "vocab_size": 32064 +} \ No newline at end of file diff --git a/tests/cpu/hf_configs/phi3/configuration_phi3.py b/tests/cpu/hf_configs/phi3/configuration_phi3.py new file mode 100644 index 000000000..1755e6129 --- /dev/null +++ b/tests/cpu/hf_configs/phi3/configuration_phi3.py @@ -0,0 +1,223 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Phi-3 model configuration""" + + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +PHI3_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "microsoft/Phi-3-mini-4k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/resolve/main/config.json", + "microsoft/Phi-3-mini-128k-instruct": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/resolve/main/config.json", +} + + +class Phi3Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Phi3Model`]. 
It is used to instantiate a Phi-3 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the + [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 32064): + Vocabulary size of the Phi-3 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Phi3Model`]. + hidden_size (`int`, *optional*, defaults to 3072): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 8192): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + resid_pdrop (`float`, *optional*, defaults to 0.0): + Dropout probability for mlp outputs. + embd_pdrop (`int`, *optional*, defaults to 0.0): + The dropout ratio for the embeddings. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio after computing the attention scores. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. + original_max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model was trained with. This is used to determine the size of the + original RoPE embeddings when using long scaling. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon value used for the RMSNorm. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`dict`, *optional*): + The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must + contain the following keys: `type`, `short_factor` and `long_factor`. 
The `type` must be either `su` or `yarn` and + the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size + divided by the number of attention heads divided by 2. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 32000): + The id of the "end-of-sequence" token. + pad_token_id (`int`, *optional*, defaults to 32000): + The id of the padding token. + sliding_window (`int`, *optional*): + Sliding window attention window size. If `None`, no sliding window is applied. + + Example: + + ```python + >>> from transformers import Phi3Model, Phi3Config + + >>> # Initializing a Phi-3 style configuration + >>> configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + + >>> # Initializing a model from the configuration + >>> model = Phi3Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "phi3" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32064, + hidden_size=3072, + intermediate_size=8192, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + resid_pdrop=0.0, + embd_pdrop=0.0, + attention_dropout=0.0, + hidden_act="silu", + max_position_embeddings=4096, + original_max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + bos_token_id=1, + eos_token_id=32000, + pad_token_id=32000, + sliding_window=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attention_dropout = attention_dropout + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.original_max_position_embeddings = original_max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.sliding_window = sliding_window + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3: + raise ValueError( + "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) + rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}" + ) + if not ( + isinstance(rope_scaling_short_factor, list) + and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) + ): + raise ValueError( + f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}" + ) + if ( + not len(rope_scaling_short_factor) + == self.hidden_size // self.num_attention_heads // 2 + ): + raise ValueError( + "`rope_scaling`'s short_factor field must have length" + + f" {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}" + ) + if not ( + isinstance(rope_scaling_long_factor, list) + and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor) + ): + raise ValueError( + f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}" + ) + if ( + not len(rope_scaling_long_factor) + == self.hidden_size // self.num_attention_heads // 2 + ): + raise ValueError( + "`rope_scaling`'s long_factor field must have length" + + f" {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}" + ) diff --git a/tests/cpu/hf_configs/phi3/modeling_phi3.py b/tests/cpu/hf_configs/phi3/modeling_phi3.py new file mode 100644 index 000000000..078639f21 --- /dev/null +++ b/tests/cpu/hf_configs/phi3/modeling_phi3.py @@ -0,0 +1,1819 @@ +# coding=utf-8 +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" PyTorch Phi-3 model.""" + +import inspect +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from .configuration_phi3 import Phi3Config + + +logger = logging.get_logger(__name__) + +# Transformers scans dependencies in the modeling file, causing issues on conditional loading. +# The regex only ignores try/catch blocks, but not if statements +# if is_flash_attn_2_available(): +_flash_supports_window_size = False +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list( + inspect.signature(flash_attn_func).parameters + ) +except ImportError as error: + logger.warning( + f"`flash-attention` package not found, consider installing for better performance: {error}." + ) + if not _flash_supports_window_size: + logger.warning( + "Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`." 
+ ) + +_CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct" +_CONFIG_FOR_DOC = "Phi3Config" + +PHI3_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "microsoft/Phi-3-mini-4k-instruct", + "microsoft/Phi-3-mini-128k-instruct", + # See all Phi-3 models at https://huggingface.co/models?filter=Phi-3 +] + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Phi3 +class Phi3RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Phi3RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3 +class Phi3RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + self.register_buffer("inv_freq", None, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if self.inv_freq is None: + self.inv_freq = 1.0 / ( + self.base + ** ( + torch.arange( + 0, self.dim, 2, dtype=torch.int64, device=x.device + ).float() + / self.dim + ) + ) + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = ( + device_type + if isinstance(device_type, str) and device_type != "mps" + else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): + freqs = ( + inv_freq_expanded.float() @ position_ids_expanded.float() + ).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding): + def __init__(self, dim, config, device=None): + super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) + + self.short_factor = config.rope_scaling["short_factor"] + self.long_factor = config.rope_scaling["long_factor"] + self.original_max_position_embeddings = config.original_max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + seq_len = torch.max(position_ids) + 1 + if seq_len > self.original_max_position_embeddings: + ext_factors = torch.tensor( + self.long_factor, dtype=torch.float32, device=x.device + ) + else: + ext_factors = torch.tensor( + self.short_factor, dtype=torch.float32, 
device=x.device + ) + + inv_freq_shape = ( + torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() + / self.dim + ) + self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) + + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) + position_ids_expanded = position_ids[:, None, :].float() + + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = ( + device_type + if isinstance(device_type, str) and device_type != "mps" + else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): + freqs = ( + inv_freq_expanded.float() @ position_ids_expanded.float() + ).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = math.sqrt( + 1 + + math.log(scale) / math.log(self.original_max_position_embeddings) + ) + + cos = emb.cos() * scaling_factor + sin = emb.sin() * scaling_factor + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding): + def __init__(self, dim, config, device=None): + super().__init__(dim, config.max_position_embeddings, config.rope_theta, device) + + self.short_factor = config.rope_scaling["short_factor"] + self.long_factor = config.rope_scaling["long_factor"] + self.original_max_position_embeddings = config.original_max_position_embeddings + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + seq_len = torch.max(position_ids) + 1 + if seq_len > self.original_max_position_embeddings: + ext_factors = torch.tensor( + self.long_factor, dtype=torch.float32, device=x.device + ) + else: + ext_factors = torch.tensor( + self.short_factor, dtype=torch.float32, device=x.device + ) + + inv_freq_shape = ( + torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() + / self.dim + ) + self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape) + + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) + position_ids_expanded = position_ids[:, None, :].float() + + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = ( + device_type + if isinstance(device_type, str) and device_type != "mps" + else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): + freqs = ( + inv_freq_expanded.float() @ position_ids_expanded.float() + ).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + + scale = self.max_position_embeddings / self.original_max_position_embeddings + if scale <= 1.0: + scaling_factor = 1.0 + else: + scaling_factor = 0.1 * math.log(scale) + 1.0 + + cos = emb.cos() * scaling_factor + sin = emb.sin() * scaling_factor + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. 
+ + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class Phi3MLP(nn.Module): + def __init__(self, config): + super().__init__() + + self.config = config + self.gate_up_proj = nn.Linear( + config.hidden_size, 2 * config.intermediate_size, bias=False + ) + self.down_proj = nn.Linear( + config.intermediate_size, config.hidden_size, bias=False + ) + + self.activation_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + up_states = self.gate_up_proj(hidden_states) + + gate, up_states = up_states.chunk(2, dim=-1) + up_states = up_states * self.activation_fn(gate) + + return self.down_proj(up_states) + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class Phi3Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Phi3Config, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.original_max_position_embeddings = config.original_max_position_embeddings + self.rope_theta = config.rope_theta + self.rope_scaling = config.rope_scaling + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + op_size = self.num_heads * self.head_dim + 2 * ( + self.num_key_value_heads * self.head_dim + ) + self.o_proj = nn.Linear( + self.num_heads * self.head_dim, self.hidden_size, bias=False + ) + self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False) + self._init_rope() + + def _init_rope(self): + if self.rope_scaling is None: + self.rotary_emb = Phi3RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + if scaling_type == "su": + self.rotary_emb = Phi3SuScaledRotaryEmbedding( + self.head_dim, self.config + ) + elif scaling_type == "yarn": + self.rotary_emb = Phi3YarnScaledRotaryEmbedding( + self.head_dim, self.config + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + logger.warning_once( + "You are not running the flash-attention implementation, expect numerical differences." + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[ + ..., query_pos : query_pos + self.num_key_value_heads * self.head_dim + ] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." 
+ ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(value_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training + ) + + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class Phi3FlashAttention2(Phi3Attention): + """ + Phi-3 flash attention module. This module inherits from `Phi3Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, + # that was made default for flash_attn>=2.1. This attribute is used to handle this difference. + # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) + # produces a wrong mask (top-left). 
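+        # When this flag is set (flash-attn < 2.1), _flash_attention_forward below drops the
+        # causal flag for single-token queries to compensate for the top-left mask alignment.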
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # Phi3FlashAttention2 attention does not support output_attentions + + if not _flash_supports_window_size: + logger.warning_once( + "The current flash attention version does not support sliding window attention." + + " Please use `attn_implementation='eager'` or upgrade flash-attn library." + ) + raise ValueError( + "The current flash attention version does not support sliding window attention." + ) + + output_attentions = False + + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[ + ..., query_pos : query_pos + self.num_key_value_heads * self.head_dim + ] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + + # Because the input can be padded, the absolute sequence length depends on the max position id. 
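+        # Take the larger of the cached-plus-current length and the highest position id (+1)
+        # so the rotary embedding covers every position present in this batch.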
+ rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1 + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len) + + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + use_sliding_windows = ( + _flash_supports_window_size + and getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + ) + + if past_key_value is not None: + # Activate slicing cache only if the config has a value `sliding_windows` attribute + cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0 + if ( + getattr(self.config, "sliding_window", None) is not None + and kv_seq_len > self.config.sliding_window + and cache_has_contents + ): + slicing_tokens = 1 - self.config.sliding_window + + past_key = past_key_value[self.layer_idx][0] + past_value = past_key_value[self.layer_idx][1] + + past_key = past_key[:, :, slicing_tokens:, :].contiguous() + past_value = past_value[:, :, slicing_tokens:, :].contiguous() + + if past_key.shape[-2] != self.config.sliding_window - 1: + raise ValueError( + f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got" + f" {past_key.shape}" + ) + + if attention_mask is not None: + attention_mask = attention_mask[:, slicing_tokens:] + attention_mask = torch.cat( + [attention_mask, torch.ones_like(attention_mask[:, -1:])], + dim=-1, + ) + + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_dropout = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. + + if query_states.dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.qkv_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + # Reashape to the expected shape for Flash Attention + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=attn_dropout, + use_sliding_windows=use_sliding_windows, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._flash_attention_forward + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + use_sliding_windows=False, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_sliding_windows (`bool`, *optional*): + Whether to activate sliding window attention. + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. + # For details, please see the comment in LlamaFlashAttention2 __init__. 
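+            # With the older top-left-aligned mask, a single decoded token would be masked
+            # incorrectly, so causal attention is disabled when query_length == 1.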
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + ( + query_states, + key_states, + value_states, + indices_q, + cu_seq_lens, + max_seq_lens, + ) = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=( + self.config.sliding_window, + self.config.sliding_window, + ), + ) + + attn_output = pad_input( + attn_output_unpad, indices_q, batch_size, query_length + ) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=( + self.config.sliding_window, + self.config.sliding_window, + ), + ) + + return attn_output + + # Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2._upad_input + def _upad_input( + self, query_layer, key_layer, value_layer, attention_mask, query_length + ): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), + indices_k, + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
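+            # Keep only the mask columns that correspond to the current query tokens
+            # before unpadding them below.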
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( + query_layer, attention_mask + ) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->Phi3 +# TODO @Arthur no longer copied from LLama after static cache +class Phi3SdpaAttention(Phi3Attention): + """ + Phi3 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `Phi3Attention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from Phi3Attention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + qkv = self.qkv_proj(hidden_states) + query_pos = self.num_heads * self.head_dim + query_states = qkv[..., :query_pos] + key_states = qkv[ + ..., query_pos : query_pos + self.num_key_value_heads * self.head_dim + ] + value_states = qkv[..., query_pos + self.num_key_value_heads * self.head_dim :] + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. 
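+        # Making query/key/value contiguous on CUDA works around that issue; the CPU path
+        # is unaffected and skips the extra copies.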
+ if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +PHI3_ATTENTION_CLASSES = { + "eager": Phi3Attention, + "flash_attention_2": Phi3FlashAttention2, + "sdpa": Phi3SdpaAttention, +} + + +class Phi3DecoderLayer(nn.Module): + def __init__(self, config: Phi3Config, layer_idx: int): + super().__init__() + + self.config = config + self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation]( + config, layer_idx=layer_idx + ) + + self.mlp = Phi3MLP(config) + self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.resid_attn_dropout = nn.Dropout(config.resid_pdrop) + self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop) + self.post_attention_layernorm = Phi3RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + """ + Args: + hidden_states (`torch.FloatTensor`): + input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range + `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). 
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + attn_outputs, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = residual + self.resid_attn_dropout(attn_outputs) + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + self.resid_mlp_dropout(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +PHI3_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Phi3Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Phi-3 model outputting raw hidden-states without any specific head on top.", + PHI3_START_DOCSTRING, +) +class Phi3PreTrainedModel(PreTrainedModel): + config_class = Phi3Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Phi3DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = False + _supports_cache_class = True + + _version = "0.0.5" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +PHI3_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. 
+ + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Phi-3 model outputting raw hidden-states without any specific head on top.", + PHI3_START_DOCSTRING, +) +class Phi3Model(Phi3PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`Phi3DecoderLayer`] + + Args: + config: Phi3Config + """ + + def __init__(self, config: Phi3Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.embed_dropout = nn.Dropout(config.embd_pdrop) + self.layers = nn.ModuleList( + [ + Phi3DecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self._attn_implementation = config._attn_implementation + self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if ( + attention_mask is not None + and self._attn_implementation == "flash_attention_2" + and use_cache + ): + is_padding_right = attention_mask[:, -1].sum().item() != batch_size + if is_padding_right: + raise ValueError( + "You are attempting to perform batched generation with padding_side='right'" + " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to " + " call `tokenizer.padding_side = 'left'` before tokenizing the input. " + ) + + if self._attn_implementation == "flash_attention_2": + # 2d mask is passed through the layers + attention_mask = ( + attention_mask + if (attention_mask is not None and 0 in attention_mask) + else None + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + sliding_window=self.config.sliding_window, + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() + if use_legacy_cache + else next_decoder_cache + ) + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class Phi3ForCausalLM(Phi3PreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.__init__ with Llama->Phi3 + def __init__(self, config): + super().__init__(config) + self.model = Phi3Model(config) + self.vocab_size = config.vocab_size + self.lm_head = 
nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings + def get_input_embeddings(self): + return self.model.embed_tokens + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings + def get_output_embeddings(self): + return self.lm_head + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder + def set_decoder(self, decoder): + self.model = decoder + + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder + def get_decoder(self): + return self.model + + # Ignore copy + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, Phi3ForCausalLM + + >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct") + + >>> prompt = "This is an example script ." + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'This is an example script . + Certainly! 
Below is a sample script that demonstrates a simple task, such as calculating the sum' + ```""" + + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + # Copied from transformers.models.persimmon.modeling_persimmon.PersimmonForCausalLM.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if ( + attention_mask is not None + and attention_mask.shape[1] > input_ids.shape[1] + ): + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
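+            # Illustrative example (hypothetical numbers): with max_cache_length=4096,
+            # cache_length=4090 and 8 newly generated tokens, the condition below holds
+            # (4090 + 8 > 4096), so attention_mask keeps only its last 4096 columns.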
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ), + ) + return reordered_past + + +@add_start_docstrings( + """ + The [`Phi3Model`] with a sequence classification head on top (linear layer). + + [`Phi3ForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + PHI3_START_DOCSTRING, +) +class Phi3ForSequenceClassification(Phi3PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = Phi3Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. 
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + model_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = model_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError( + "Cannot handle batch sizes > 1 if no padding token is defined." + ) + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility + sequence_lengths = ( + torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + ) + sequence_lengths = sequence_lengths % input_ids.shape[-1] + sequence_lengths = sequence_lengths.to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[ + torch.arange(batch_size, device=logits.device), sequence_lengths + ] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and ( + labels.dtype == torch.long or labels.dtype == torch.int + ): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct( + pooled_logits.view(-1, self.num_labels), labels.view(-1) + ) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + model_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=model_outputs.past_key_values, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) + + +@add_start_docstrings( + """ + [`Phi3Model`] with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + PHI3_START_DOCSTRING, +) +class Phi3ForTokenClassification(Phi3PreTrainedModel): + def __init__(self, config: Phi3Config): + super().__init__(config) + self.num_labels = config.num_labels + + self.model = Phi3Model(config) + if ( + hasattr(config, "classifier_dropout") + and config.classifier_dropout is not None + ): + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + model_outputs = self.model( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = model_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + batch_size, seq_length = labels.shape + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), + labels.view(batch_size * seq_length), + ) + + if not return_dict: + output = (logits,) + model_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=model_outputs.hidden_states, + attentions=model_outputs.attentions, + ) diff --git a/tests/cpu/test_ipex_optimize_transformers_nightly.py b/tests/cpu/test_ipex_optimize_transformers_nightly.py index 95fbeb780..0f697d4be 100644 --- a/tests/cpu/test_ipex_optimize_transformers_nightly.py +++ b/tests/cpu/test_ipex_optimize_transformers_nightly.py @@ -15,6 +15,7 @@ from hf_configs.llava.modeling_llavallama import LlavaLlamaForCausalLM from hf_configs.yuan.yuan_hf_model import YuanForCausalLM from hf_configs.phi.modeling_phi import PhiForCausalLM +from hf_configs.phi3.modeling_phi3 import Phi3ForCausalLM from intel_extension_for_pytorch.cpu._auto_kernel_selection import _disable_tpp try: @@ -164,6 +165,13 @@ lambda m: m.model.layers[0].self_attn.__class__, lambda m: m.model.layers[0].__class__, ), + model_info( + "phi3", + Phi3ForCausalLM, + True, + lambda m: m.model.layers[0].self_attn.__class__, + lambda m: m.model.layers[0].__class__, + ), ] From a199ddf036962e53fbdb9b1fd0e610419f344138 Mon Sep 17 00:00:00 2001 From: blzheng Date: Fri, 26 Apr 2024 13:46:32 +0800 Subject: [PATCH 037/199] add yuan.py (#2815) --- .../inference/python/llm/utils/model_class/yuan.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 examples/cpu/inference/python/llm/utils/model_class/yuan.py diff --git a/examples/cpu/inference/python/llm/utils/model_class/yuan.py b/examples/cpu/inference/python/llm/utils/model_class/yuan.py new file mode 100644 index 000000000..54ddf85ab --- /dev/null +++ b/examples/cpu/inference/python/llm/utils/model_class/yuan.py @@ -0,0 +1,13 @@ +from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + +class YuanConfig(LLMConfig): + def __init__(self, model_id): + self.name = "yuan" + self.model_id = model_id + self.to_channels_last = True + self.example_inputs_mode = EXAMPLE_INPUTS_MODE.MASK_POS_KV + + # for smooth quant + self.default_dataset = "NeelNanda/pile-10k" + self.use_global_past_key_value = True + self.use_ipex_autotune = True From d32b5003d97bb18d5dfe1385bb36d4ce00c652e2 Mon Sep 17 00:00:00 2001 From: blzheng Date: Fri, 26 Apr 2024 18:09:38 +0800 Subject: [PATCH 038/199] support accuracy test for yuan (#2820) --- .../run_accuracy_with_deepspeed.py | 65 ++++++++++++++++++- .../llm/single_instance/run_accuracy.py | 63 ++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py 
b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index 9e5402e85..046fa6cb8 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -252,7 +252,7 @@ def __init__( # For now, Falcon, baichuan and gptbigcode have accuracy issue with from_config with deepspeed meta device load. # TODO: we will change the scope once deepspeed providing the support - if world_size == 1 or model_type in ["falcon", "baichuan", "gptbigcode", "qwen"]: + if world_size == 1 or model_type in ["falcon", "baichuan", "gptbigcode", "qwen", "yuan"]: self.model = model_class[0].from_pretrained( model_id, config=self.config, @@ -578,6 +578,8 @@ def max_length(self): @property def max_gen_toks(self): + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + return 1024 return 256 @property @@ -604,6 +606,67 @@ def _model_generate(self, context, max_length, eos_token_id): generation_kwargs["pad_token_id"] = eos_token_id return self.model.generate(context, **generation_kwargs) + def greedy_until(self, requests): + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + warn_stop_seq = False + for context, request_args in tqdm(re_ord.get_reordered()): + until = request_args["until"] + if isinstance(until, str): + until = [until] + + if until: + try: + (primary_until,) = self.tok_encode(until[0]) + except ValueError: + if not warn_stop_seq: + print( + "Warning: a primary stop sequence is multi-token! Will default to EOS token for this tokenizer. Consider using `hf-causal-experimental` for multi-token stop sequence support for the time being." + ) + warn_stop_seq = True + primary_until = self.eot_token_id + else: + primary_until = None + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + context = "่ฏฆ็ป†ๅˆ†ๆžๅนถๆฑ‚่งฃไปฅไธ‹ๆ•ฐๅญฆ้—ฎ้ข˜ใ€‚\n" + context.replace("้—ฎ้ข˜: ", "").replace("\n้€ๆญฅ่งฃ็ญ”:", "") + context_enc = torch.tensor( + [self.tok_encode(context)[self.max_gen_toks - self.max_length :]] + ).to(self.device) + + max_gen_tokens = min( + self.max_gen_toks, request_args.get("max_length", self.max_gen_toks) + ) + cont = self._model_generate( + context_enc, context_enc.shape[1] + max_gen_tokens, primary_until + ) + + s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :]) + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + s = s.replace("\n", "").split("")[0] + + for term in until: + s = s.split(term)[0] + + # partial caching + self.cache_hook.add_partial("greedy_until", (context, until), s) + + res.append(s) + + return re_ord.get_original(res) + class HuggingFaceSeq2SeqModel(HuggingFaceModel): """Seq2Seq language modeling. 
You can find a set of supported models in the following documentation: diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index e80f368b0..d1d7c8855 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -368,6 +368,8 @@ def max_length(self): @property def max_gen_toks(self): + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + return 1024 return 256 @property @@ -394,6 +396,67 @@ def _model_generate(self, context, max_length, eos_token_id): generation_kwargs["pad_token_id"] = eos_token_id return self.model.generate(context, **generation_kwargs) + def greedy_until(self, requests): + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + re_ord = utils.Reorderer(requests, _collate) + + warn_stop_seq = False + for context, request_args in tqdm(re_ord.get_reordered()): + until = request_args["until"] + if isinstance(until, str): + until = [until] + + if until: + try: + (primary_until,) = self.tok_encode(until[0]) + except ValueError: + if not warn_stop_seq: + print( + "Warning: a primary stop sequence is multi-token! Will default to EOS token for this tokenizer. Consider using `hf-causal-experimental` for multi-token stop sequence support for the time being." + ) + warn_stop_seq = True + primary_until = self.eot_token_id + else: + primary_until = None + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + context = "่ฏฆ็ป†ๅˆ†ๆžๅนถๆฑ‚่งฃไปฅไธ‹ๆ•ฐๅญฆ้—ฎ้ข˜ใ€‚\n" + context.replace("้—ฎ้ข˜: ", "").replace("\n้€ๆญฅ่งฃ็ญ”:", "") + context_enc = torch.tensor( + [self.tok_encode(context)[self.max_gen_toks - self.max_length :]] + ).to(self.device) + + max_gen_tokens = min( + self.max_gen_toks, request_args.get("max_length", self.max_gen_toks) + ) + cont = self._model_generate( + context_enc, context_enc.shape[1] + max_gen_tokens, primary_until + ) + + s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :]) + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + s = s.replace("\n", "").split("")[0] + + for term in until: + s = s.split(term)[0] + + # partial caching + self.cache_hook.add_partial("greedy_until", (context, until), s) + + res.append(s) + + return re_ord.get_original(res) + class HuggingFaceSeq2SeqModel(HuggingFaceModel): """Seq2Seq language modeling. You can find a set of supported models in the following documentation: From 7265163f1d0e2b14059e61b415fef95ebdaeb67b Mon Sep 17 00:00:00 2001 From: sanchitintel Date: Sun, 28 Apr 2024 01:51:06 +0000 Subject: [PATCH 039/199] Map standalone oneDNN Graph Select op (#2818) * Map standalone LLGA Select op oneDNN Graph added support for a standalone Select op on April 10. 
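With the backend able to execute Select directly, the `mayRemoveLLGASelect` revert pass that rewrote `llga::Select` back into `aten::expand_as` + `aten::masked_fill` is no longer needed, and the select patterns produced by `replaceWithSelectOp` can stay inside the LLGA fusion group (see the updated `test_map_select`).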
* Remove mayRemoveSelectOp pass * Update dependency_version.yml --- .../cpu/jit/codegen/onednn/prepare_binary.cpp | 38 ------------------- dependency_version.yml | 6 +-- tests/cpu/test_jit_llga_fuser.py | 5 ++- 3 files changed, 6 insertions(+), 43 deletions(-) diff --git a/csrc/cpu/jit/codegen/onednn/prepare_binary.cpp b/csrc/cpu/jit/codegen/onednn/prepare_binary.cpp index bab011004..cf1e65bcf 100644 --- a/csrc/cpu/jit/codegen/onednn/prepare_binary.cpp +++ b/csrc/cpu/jit/codegen/onednn/prepare_binary.cpp @@ -223,43 +223,6 @@ static void replaceWithSelectOp(Block* block) { } } -void removeSelectOpNode(Node* node) { - WithInsertPoint guard(node); - auto g = node->owningGraph(); - auto dtype = node->output()->type()->cast()->scalarType().value(); - // The sequence of ops in the graph is like this - - // if_tensor = aten::as_tensor(%if_value, %, %) - // if_tensor = aten::unsqueeze(%if_tensor, %57) - // llga::Select(mask, if_tensor, then_tensor) - auto as_tensor_node = node->input(1)->node()->input(0)->node(); - auto expand_as_output = - g->insert(aten::expand_as, {node->input(0), node->input(2)}); - expand_as_output->setType(node->input(2)->type()); - auto masked_fill_output = g->insert( - aten::masked_fill, - {node->input(2), expand_as_output, as_tensor_node->input(0)}); - masked_fill_output->setType(node->input(2)->type()); - node->output()->replaceAllUsesWith(masked_fill_output); -} - -static void mayRemoveLLGASelect(Block* block) { - for (auto nodeIterator = block->nodes().begin(); - nodeIterator != block->nodes().end(); - ++nodeIterator) { - Node* node = *nodeIterator; - for (auto blockIterator = node->blocks().begin(); - blockIterator != node->blocks().end(); - ++blockIterator) { - Block* body_block = *blockIterator; - mayRemoveLLGASelect(body_block); - } - if (node->kind().toQualString() == std::string("llga::Select")) { - removeSelectOpNode(node); - nodeIterator.destroyCurrent(); - } - } -} - static void EliminateIdentityMulAddDiv(Block* block) { for (auto node : block->nodes()) { for (auto sub : node->blocks()) { @@ -292,7 +255,6 @@ void PrepareBinaryForLLGA(const std::shared_ptr& graph) { void RevertPrepareBinaryForLLGA(const std::shared_ptr& graph) { ConvertTensorToScalar(graph->block()); mayRevertDtypeAttributeInsertion(graph->block()); - mayRemoveLLGASelect(graph->block()); EliminateDeadCode(graph); } diff --git a/dependency_version.yml b/dependency_version.yml index 1cadbfa26..8827e6615 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240417+cpu + version: 2.4.0.dev20240424+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240417+cpu + version: 2.2.0.dev20240424+cpu torchvision: - version: 0.19.0.dev20240417+cpu + version: 0.19.0.dev20240424+cpu transformers: version: 4.38.1 diff --git a/tests/cpu/test_jit_llga_fuser.py b/tests/cpu/test_jit_llga_fuser.py index 00ff9eac6..2025345db 100644 --- a/tests/cpu/test_jit_llga_fuser.py +++ b/tests/cpu/test_jit_llga_fuser.py @@ -834,8 +834,9 @@ def forward(self, x): graph, _ = self.checkTrace(m, [x]) self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + # oneDNN Graph DNNL backend now supports standalone Select op @llga_fp32_bf16_test_env - def test_do_not_map_select(self): + def test_map_select(self): class M(nn.Module): def __init__( self, @@ -851,7 +852,7 @@ def forward(self, x, y): x = torch.randn(3, 32, 32, 32) y = torch.randn(3, 32, 32, 
1).to(torch.bool) graph, _ = self.checkTrace(m, [x, y]) - self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) @llga_fp32_bf16_test_env def test_avg_pool2d_add(self): From 27c13c40d1f154f26cf413c89fab6155cd1db6e3 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Sun, 28 Apr 2024 15:07:53 +0800 Subject: [PATCH 040/199] Update dependency_version.yml 20240428 (#2826) --- dependency_version.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 8827e6615..cf2f96d9c 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -8,9 +8,9 @@ # branch: Branch name of the Github repository. For human understanding only. deepspeed: - commit: v0.14.1 + commit: v0.14.2 repo: https://github.com/microsoft/DeepSpeed.git - version: 0.14.1 + version: 0.14.2 gcc: max-version: null min-version: 12.3.0 @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240424+cpu + version: 2.4.0.dev20240427+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240424+cpu + version: 2.2.0.dev20240427+cpu torchvision: - version: 0.19.0.dev20240424+cpu + version: 0.19.0.dev20240427+cpu transformers: version: 4.38.1 From 9342b35e98e332636b71f13a4b61890edb4a1147 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Sun, 28 Apr 2024 18:49:17 +0800 Subject: [PATCH 041/199] Improve UX of run_generation_with_deepspeed.py (#2824) --- .../llm/distributed/run_generation_with_deepspeed.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 4e4c226ca..b2a9f2f98 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -482,9 +482,9 @@ def write_checkpoints_json(): generate_kwargs = dict(do_sample=False, num_beams=num_beams, max_new_tokens=args.max_new_tokens, min_new_tokens=args.max_new_tokens, streamer=streamer) -if args.token_latency and not args.ipex: +if args.token_latency and not use_ipex: args.token_latency = False - logger.warning("--token-latency requires --ipex. Disabling --token-latency.") + logger.warning("--token-latency requires using ipex (--ipex or --ipex-weight-only-quantization). 
Disabling --token-latency.") if args.token_latency: if not hasattr(model.config, "token_latency"): model.config.token_latency = True @@ -649,6 +649,8 @@ def trace_handler(prof): total_list.append(outputs[1]) if args.profile: + # Wait for all ranks to finish before move on + deepspeed.comm.barrier() with torch.profiler.profile( activities=[torch.profiler.ProfilerActivity.CPU], schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), @@ -657,6 +659,8 @@ def trace_handler(prof): for i in range(5): gen_ids, outputs = generate() prof.step() + # Wait for all ranks to finish before move on + deepspeed.comm.barrier() latency = total_time / (cycles - warmup) print_rank0("\n", "-" * 10, "Summary:", "-" * 10) From 8b37f88a1ad001ee0d6db509134ff31ba1c54bfc Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Sun, 28 Apr 2024 20:46:02 +0800 Subject: [PATCH 042/199] Add get_weight_only_quant_qconfig_mapping to doc (#2823) Co-authored-by: WeizhuoZhang-intel --- docs/tutorials/api_doc.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorials/api_doc.rst b/docs/tutorials/api_doc.rst index 8dac4c1f1..7080252eb 100644 --- a/docs/tutorials/api_doc.rst +++ b/docs/tutorials/api_doc.rst @@ -37,6 +37,7 @@ Quantization .. automodule:: intel_extension_for_pytorch.quantization .. autofunction:: get_smooth_quant_qconfig_mapping +.. autofunction:: get_weight_only_quant_qconfig_mapping .. autofunction:: prepare .. autofunction:: convert From 4e6d4c0dcab57988d74ce94b435b8b73a72fe028 Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Mon, 29 Apr 2024 10:51:26 +0800 Subject: [PATCH 043/199] Refine batch scaling with kv block. (#2821) * Reduce write cache polution to improve IAKV performance * Refine batch scaling for large batch bs=64/128 * Fix clang-format --- .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 286 ++++++++++-------- 1 file changed, 154 insertions(+), 132 deletions(-) diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index ac0923cc4..f479c27f7 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -555,6 +555,15 @@ scale_dot_product_for_indirect_access_kv_cache( auto attn_w_ptr = attn_weights.data_ptr(); long new_beam_idx[beam_batch][offset + query.size(1) + 1]; auto b_ptr = beam_idx.data_ptr(); + + auto thread_numbers = omp_get_max_threads(); + auto max_parallel_parts = thread_numbers * 4; + + auto kv_block_size = bs * head_num >= max_parallel_parts + ? 
seq_len + : std::max(seq_len / max_parallel_parts, 1L); + kv_block_size = std::min(kv_block_size, 32L); + auto kv_block_count = (seq_len + kv_block_size - 1) / kv_block_size; if (offset > 0) { // according to the last decoded token to get the target beam for the past // token @@ -570,76 +579,81 @@ scale_dot_product_for_indirect_access_kv_cache( RECORD_FUNCTION( "ipex::iakv_sdp::matmul(query, key)", c10::ArrayRef({})); #pragma omp parallel for collapse(3) - for (auto ti = 0; ti < seq_len; ti++) { + for (auto block_id = 0; block_id < kv_block_count; block_id++) { for (auto bi = 0; bi < bs; bi++) { for (auto hi = 0; hi < head_num; hi++) { - for (auto query_ti = 0; query_ti < cur_len; query_ti++) { - auto kv_hi = hi / group_size; // maping the query head to key/value - // head to support MGA/MQA - auto q_ptr_start = q_ptr + - (bi * cur_len + query_ti) * head_num * head_size + - hi * head_size; - auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; - auto attn_w_pos = - attn_w_ptr + attn_w_stride + query_ti * seq_len + ti; - attn_w_pos[0] = 0.0f; - auto kc_token_start = ti * kc_token_stride; - auto kc_t_beam_start = kc_token_start; - if (ti > query_ti + offset) { // only caculate the innerproduct for - // the past token and current token - attn_w_pos[0] = -10000.0f; - } else if (ti == query_ti + offset) { // caculate the innerproduct - // for the current token and - // store the key - if (cur_len > 1) { // this may occur for processing the promt - auto beam_size = beam_batch / bs; - // need to store key accross beam - kc_t_beam_start = - kc_t_beam_start + bi * beam_size * kv_head * head_size; - } else { - kc_t_beam_start = kc_t_beam_start + bi * kv_head * head_size; - } - auto kc_head_start = - k_cache_ptr + kc_t_beam_start + kv_hi * head_size; - auto k_ptr_start = k_ptr + - (bi * cur_len + ti - offset) * kv_head * head_size + - kv_hi * head_size; - reduce_head( - q_ptr_start, - k_ptr_start, - attn_w_pos, - head_size, - true, - kc_head_start); - } else { // caculate the innerproduct for the past token - if (ti >= offset) { - auto k_ptr_start = k_ptr + - (bi * cur_len + ti - offset) * kv_head * head_size + - kv_hi * head_size; - reduce_head( - q_ptr_start, - k_ptr_start, - attn_w_pos, - head_size, - false, - nullptr); - } else { - kc_t_beam_start = kc_t_beam_start + - new_beam_idx[bi][ti] * kv_head * head_size; - if (cur_len > 1) { + auto k_start = block_id * kv_block_size; + auto block_size = std::min(kv_block_size, seq_len - k_start); + for (auto ti = k_start; ti < k_start + block_size; ti++) { + for (auto query_ti = 0; query_ti < cur_len; query_ti++) { + auto kv_hi = hi / group_size; // maping the query head to + // key/value head to support MGA/MQA + auto q_ptr_start = q_ptr + + (bi * cur_len + query_ti) * head_num * head_size + + hi * head_size; + auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; + auto attn_w_pos = + attn_w_ptr + attn_w_stride + query_ti * seq_len + ti; + attn_w_pos[0] = 0.0f; + auto kc_token_start = ti * kc_token_stride; + auto kc_t_beam_start = kc_token_start; + if (ti > + query_ti + offset) { // only caculate the innerproduct for + // the past token and current token + attn_w_pos[0] = -10000.0f; + } else if (ti == query_ti + offset) { // caculate the innerproduct + // for the current token and + // store the key + if (cur_len > 1) { // this may occur for processing the promt auto beam_size = beam_batch / bs; + // need to store key accross beam kc_t_beam_start = kc_t_beam_start + bi * beam_size * kv_head * head_size; + } else { + kc_t_beam_start = 
kc_t_beam_start + bi * kv_head * head_size; } auto kc_head_start = k_cache_ptr + kc_t_beam_start + kv_hi * head_size; + auto k_ptr_start = k_ptr + + (bi * cur_len + ti - offset) * kv_head * head_size + + kv_hi * head_size; reduce_head( q_ptr_start, - kc_head_start, + k_ptr_start, attn_w_pos, head_size, - false, - nullptr); + true, + kc_head_start); + } else { // caculate the innerproduct for the past token + if (ti >= offset) { + auto k_ptr_start = k_ptr + + (bi * cur_len + ti - offset) * kv_head * head_size + + kv_hi * head_size; + reduce_head( + q_ptr_start, + k_ptr_start, + attn_w_pos, + head_size, + false, + nullptr); + } else { + kc_t_beam_start = kc_t_beam_start + + new_beam_idx[bi][ti] * kv_head * head_size; + if (cur_len > 1) { + auto beam_size = beam_batch / bs; + kc_t_beam_start = + kc_t_beam_start + bi * beam_size * kv_head * head_size; + } + auto kc_head_start = + k_cache_ptr + kc_t_beam_start + kv_hi * head_size; + reduce_head( + q_ptr_start, + kc_head_start, + attn_w_pos, + head_size, + false, + nullptr); + } } } } @@ -705,7 +719,6 @@ scale_dot_product_for_indirect_access_kv_cache( } } } - auto thread_numbers = omp_get_max_threads(); auto private_attn_outs = at::empty({thread_numbers, bs, head_num, cur_len, head_size}, at::kFloat); auto private_attn_out_flag = @@ -719,87 +732,93 @@ scale_dot_product_for_indirect_access_kv_cache( "ipex::iakv_sdp::matmul(attn_w, value)", c10::ArrayRef({})); #pragma omp parallel for collapse(3) - for (auto vi = 0; vi < seq_len; vi++) { + for (auto block_id = 0; block_id < kv_block_count; block_id++) { for (auto bi = 0; bi < bs; bi++) { for (auto hi = 0; hi < head_num; hi++) { - auto thread_id = omp_get_thread_num(); - for (auto query_ti = 0; query_ti < cur_len; query_ti++) { - auto kv_hi = hi / group_size; // maping the query head to key/value - // head to support MGA/MQA - auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; - auto attn_w_query_start = - attn_w_ptr + attn_w_stride + query_ti * seq_len; - // calculate weighted value and store the result to attn_outs[bs, - // head_num, cur_len, head_size] - auto attn_out_head_stride = thread_id * attn_outs_stride_priv + - (bi * head_num + hi) * cur_len * head_size; - auto attn_out_start = private_attn_out_ptr + attn_out_head_stride + - query_ti * head_size; + auto thread_id = 0; + if (kv_block_size < seq_len) + thread_id = omp_get_thread_num(); + auto v_start = block_id * kv_block_size; + auto block_size = std::min(kv_block_size, seq_len - v_start); + for (auto vi = v_start; vi < v_start + block_size; vi++) { + for (auto query_ti = 0; query_ti < cur_len; query_ti++) { + auto kv_hi = hi / group_size; // maping the query head to + // key/value head to support MGA/MQA + auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; + auto attn_w_query_start = + attn_w_ptr + attn_w_stride + query_ti * seq_len; + // calculate weighted value and store the result to attn_outs[bs, + // head_num, cur_len, head_size] + auto attn_out_head_stride = thread_id * attn_outs_stride_priv + + (bi * head_num + hi) * cur_len * head_size; + auto attn_out_start = private_attn_out_ptr + + attn_out_head_stride + query_ti * head_size; - auto vc_token_start = vi * kc_token_stride; - if (vi == query_ti + offset) { // caculate the attention values - // for the current token - auto vc_t_beam_start = vc_token_start; - if (cur_len > 1) { // this may occur for processing the promt - auto beam_size = beam_batch / bs; - // removed the redundant computation, need to store key accross - // beam - vc_t_beam_start = - 
vc_t_beam_start + bi * beam_size * kv_head * head_size; - } else { - vc_t_beam_start = vc_t_beam_start + bi * kv_head * head_size; - } - auto v_cache_head_start = - v_cache_ptr + vc_t_beam_start + kv_hi * head_size; - auto v_ptr_start = v_ptr + - (bi * cur_len + vi - offset) * kv_head * head_size + - kv_hi * head_size; - mul_attenion_weights_and_value_of_head( - attn_w_query_start[vi], - v_ptr_start, - attn_out_start, - head_size, - true, - v_cache_head_start, - flag_access[thread_id][bi][hi]); - } else if (vi < query_ti + offset) { // caculate attention - // values for the past - // token - if (vi >= offset) { - auto v_ptr_start = v_ptr + - (bi * cur_len + vi - offset) * kv_head * head_size + - kv_hi * head_size; - mul_attenion_weights_and_value_of_head( - attn_w_query_start[vi], - v_ptr_start, - attn_out_start, - head_size, - false, - nullptr, - flag_access[thread_id][bi][hi]); - } else { - auto vc_t_beam_start = - vc_token_start + new_beam_idx[bi][vi] * kv_head * head_size; - if (cur_len > 1) { + auto vc_token_start = vi * kc_token_stride; + if (vi == query_ti + offset) { // caculate the attention values + // for the current token + auto vc_t_beam_start = vc_token_start; + if (cur_len > 1) { // this may occur for processing the promt auto beam_size = beam_batch / bs; + // removed the redundant computation, need to store key + // accross beam vc_t_beam_start = vc_t_beam_start + bi * beam_size * kv_head * head_size; + } else { + vc_t_beam_start = vc_t_beam_start + bi * kv_head * head_size; } auto v_cache_head_start = v_cache_ptr + vc_t_beam_start + kv_hi * head_size; + auto v_ptr_start = v_ptr + + (bi * cur_len + vi - offset) * kv_head * head_size + + kv_hi * head_size; mul_attenion_weights_and_value_of_head( attn_w_query_start[vi], - v_cache_head_start, + v_ptr_start, attn_out_start, head_size, - false, - nullptr, + true, + v_cache_head_start, flag_access[thread_id][bi][hi]); + } else if (vi < query_ti + offset) { // caculate attention + // values for the past + // token + if (vi >= offset) { + auto v_ptr_start = v_ptr + + (bi * cur_len + vi - offset) * kv_head * head_size + + kv_hi * head_size; + mul_attenion_weights_and_value_of_head( + attn_w_query_start[vi], + v_ptr_start, + attn_out_start, + head_size, + false, + nullptr, + flag_access[thread_id][bi][hi]); + } else { + auto vc_t_beam_start = vc_token_start + + new_beam_idx[bi][vi] * kv_head * head_size; + if (cur_len > 1) { + auto beam_size = beam_batch / bs; + vc_t_beam_start = + vc_t_beam_start + bi * beam_size * kv_head * head_size; + } + auto v_cache_head_start = + v_cache_ptr + vc_t_beam_start + kv_hi * head_size; + mul_attenion_weights_and_value_of_head( + attn_w_query_start[vi], + v_cache_head_start, + attn_out_start, + head_size, + false, + nullptr, + flag_access[thread_id][bi][hi]); + } } + if (flag_access[thread_id][bi][hi] == 0) + flag_access[thread_id][bi][hi] = 1; } } - if (flag_access[thread_id][bi][hi] == 0) - flag_access[thread_id][bi][hi] = 1; } } } @@ -817,17 +836,20 @@ scale_dot_product_for_indirect_access_kv_cache( if (flag_access[0][bi][hi] == 0) { torch_ipex::cpu::kernel::zero_ker(thr0_head_start, head_size); } - for (auto thread_id = 1; thread_id < thread_numbers; thread_id++) { - if (flag_access[thread_id][bi][hi] == 0) { - continue; + if (kv_block_size < seq_len) { + for (auto thread_id = 1; thread_id < thread_numbers; thread_id++) { + if (flag_access[thread_id][bi][hi] == 0) { + continue; + } + auto attn_out_head_stride = thread_id * attn_outs_stride_priv + + (bi * head_num + hi) * cur_len * head_size; + 
auto private_attn_out_start = + private_attn_out_ptr + attn_out_head_stride + qi * head_size; + torch_ipex::cpu::kernel::add_ker( + thr0_head_start, private_attn_out_start, head_size); } - auto attn_out_head_stride = thread_id * attn_outs_stride_priv + - (bi * head_num + hi) * cur_len * head_size; - auto private_attn_out_start = - private_attn_out_ptr + attn_out_head_stride + qi * head_size; - torch_ipex::cpu::kernel::add_ker( - thr0_head_start, private_attn_out_start, head_size); } + auto attn_outs_start = attn_out_ptr + (bi * head_num + hi) * cur_len * head_size + qi * head_size; torch_ipex::cpu::kernel::move_ker( From e406c352a5375943962d3cec8693d90c770529af Mon Sep 17 00:00:00 2001 From: ZhaoqiongZ <106125927+ZhaoqiongZ@users.noreply.github.com> Date: Tue, 30 Apr 2024 09:55:43 +0800 Subject: [PATCH 044/199] Fix version/zzq (#2830) * Update llm script/zzq (#2724) * add transformers_stream_generator package for Qwen * add tiktoken * add intermediate step for compile script, compare torch version and decide continue or quit * fix interactive step log * add variable in {} * adjust warning format * update log * rename compare function * compare torch version with patch version and platform suffix * change version to main * fix xpu installation link version * Update compile_bundle.sh remove blank line * Update env_setup.sh remove blank line --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 93fa2e794..992b97429 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Intelยฎ Extension for PyTorch\* **CPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.2.0%2Bcpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
    -**GPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)   |   [๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu&version=v2.1.10%2Bxpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/inference/python/llm)
    +**GPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)   |   [๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/inference/python/llm)
    Intelยฎ Extension for PyTorch\* extends PyTorch\* with up-to-date features optimizations for an extra performance boost on Intel hardware. Optimizations take advantage of Intelยฎ Advanced Vector Extensions 512 (Intelยฎ AVX-512) Vector Neural Network Instructions (VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) on Intel CPUs as well as Intel Xe Matrix Extensions (XMX) AI engines on Intel discrete GPUs. Moreover, Intelยฎ Extension for PyTorch* provides easy GPU acceleration for Intel discrete GPUs through the PyTorch* xpu device. From d2970f250abb03aeebc7327ff495087e47be13d3 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Mon, 6 May 2024 09:14:40 +0800 Subject: [PATCH 045/199] Update dependency_version.yml 20240506 (#2842) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index cf2f96d9c..77fb7f143 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240427+cpu + version: 2.4.0.dev20240505+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240427+cpu + version: 2.2.0.dev20240505+cpu torchvision: - version: 0.19.0.dev20240427+cpu + version: 0.19.0.dev20240505+cpu transformers: version: 4.38.1 From 26a0d5afbc89931f408ccc578f69b6a6d0d873fc Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 7 May 2024 13:24:12 +0800 Subject: [PATCH 046/199] LLM script: enable jit logic for yuan accuracy test (#2832) --- .../run_accuracy_with_deepspeed.py | 67 +++++++++++++++++- .../llm/single_instance/run_accuracy.py | 69 +++++++++++++++++-- 2 files changed, 129 insertions(+), 7 deletions(-) diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index 046fa6cb8..8b8eef8e6 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -464,6 +464,19 @@ def _get_past_key_values(self, input_bs, last_hidden_state=None): for i in range(num_hidden_layers) ] ) + if re.search("yuan", self.config.architectures[0], re.IGNORECASE): + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 1, 2, hidden_size), + ) + for i in range(num_hidden_layers) + ] + ) return past_key_values def _model_call( @@ -649,9 +662,57 @@ def _collate(x): max_gen_tokens = min( self.max_gen_toks, request_args.get("max_length", self.max_gen_toks) ) - cont = self._model_generate( - context_enc, context_enc.shape[1] + max_gen_tokens, primary_until - ) + + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True + if args.quant_with_amp or self._dtype == "bfloat16" + else False, + ): + if self._with_jit and self.iter == 0: + if self._dtype not in ["int8", "int4", "nf4"]: + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + input_bs = context_enc.shape[0] * self.num_beams + attention_mask = torch.ones(len(context_enc[0])) + position_ids = torch.arange(len(context_enc[0])) + example_dict = { + "input_ids": context_enc[:, -1:], + "attention_mask": attention_mask.unsqueeze(0)[:, -1:], + "position_ids": 
position_ids.unsqueeze(0)[:, -1:], + "past_key_values": self._get_past_key_values(input_bs), + } + model = torch.jit.trace( + self.model.eval(), + example_kwarg_inputs=example_dict, + strict=False, + check_trace=False, + ) + model = torch.jit.freeze(model.eval()) + example_dict = { + "input_ids": example_dict["input_ids"].repeat(input_bs, 1), + "attention_mask": example_dict["attention_mask"].repeat(input_bs, 1), + "position_ids": example_dict["position_ids"].repeat(input_bs, 1) + } + first_token_model = torch.jit.trace( + self.model.eval(), + example_kwarg_inputs=example_dict, + strict=False, + check_trace=False, + ) + first_token_model = torch.jit.freeze(first_token_model.eval()) + else: + model = torch.jit.load(args.quantized_model_path) + model = torch.jit.freeze(model.eval()) + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + first_token_model = torch.jit.load(args.quantized_model_path+"2") + first_token_model = torch.jit.freeze(first_token_model.eval()) + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + ipex._set_optimized_model_for_generation(self.model, optimized_model=model, first_token_optimized_model=first_token_model) + else: + ipex._set_optimized_model_for_generation(self.model, optimized_model=model) + self.iter = self.iter + 1 + cont = self._model_generate( + context_enc, context_enc.shape[1] + max_gen_tokens, primary_until + ) s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :]) if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index d1d7c8855..7bccfe43d 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -162,7 +162,7 @@ def __init__( model_id if config is None else config, torchscript=with_jit, trust_remote_code=True ) - if self._dtype in ("int8", "int4", "nf4"): + if self._dtype in ("int8", "int4", "nf4") and not re.search("yuan", self.config.architectures[0], re.IGNORECASE): try: with ipex.OnDevice(dtype=torch.float, device="meta"): self.model = model_class[0].from_config(self.config, trust_remote_code=True) @@ -253,6 +253,19 @@ def _get_past_key_values(self, input_bs, last_hidden_state=None): for i in range(num_hidden_layers) ] ) + if re.search("yuan", self.config.architectures[0], re.IGNORECASE): + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 1, 2, hidden_size), + ) + for i in range(num_hidden_layers) + ] + ) return past_key_values def _model_call( @@ -439,9 +452,57 @@ def _collate(x): max_gen_tokens = min( self.max_gen_toks, request_args.get("max_length", self.max_gen_toks) ) - cont = self._model_generate( - context_enc, context_enc.shape[1] + max_gen_tokens, primary_until - ) + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True + if args.quant_with_amp or self._dtype == "bfloat16" + else False, + ): + if self._with_jit and self.iter == 0: + if self._dtype not in ["int8", "int4", "nf4"]: + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + input_bs = context_enc.shape[0] * self.num_beams + attention_mask = torch.ones(len(context_enc[0])) + position_ids = 
torch.arange(len(context_enc[0])) + example_dict = { + "input_ids": context_enc[:, -1:], + "attention_mask": attention_mask.unsqueeze(0)[:, -1:], + "position_ids": position_ids.unsqueeze(0)[:, -1:], + "past_key_values": self._get_past_key_values(input_bs), + } + model = torch.jit.trace( + self.model.eval(), + example_kwarg_inputs=example_dict, + strict=False, + check_trace=False, + ) + model = torch.jit.freeze(model.eval()) + example_dict = { + "input_ids": example_dict["input_ids"].repeat(input_bs, 1), + "attention_mask": example_dict["attention_mask"].repeat(input_bs, 1), + "position_ids": example_dict["position_ids"].repeat(input_bs, 1) + } + first_token_model = torch.jit.trace( + self.model.eval(), + example_kwarg_inputs=example_dict, + strict=False, + check_trace=False, + ) + first_token_model = torch.jit.freeze(first_token_model.eval()) + else: + model = torch.jit.load(args.quantized_model_path) + model = torch.jit.freeze(model.eval()) + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + first_token_model = torch.jit.load(args.quantized_model_path+"2") + first_token_model = torch.jit.freeze(first_token_model.eval()) + if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + ipex._set_optimized_model_for_generation(self.model, optimized_model=model, first_token_optimized_model=first_token_model) + else: + ipex._set_optimized_model_for_generation(self.model, optimized_model=model) + + self.iter = self.iter + 1 + cont = self._model_generate( + context_enc, context_enc.shape[1] + max_gen_tokens, primary_until + ) s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :]) if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): From 850737b2d296c46577659575ae883a67e930545c Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 7 May 2024 15:43:17 +0800 Subject: [PATCH 047/199] vit: disable int8 flatten (#2829) --- .../quantization/_recipe.py | 10 +++++++++ tests/cpu/test_ao_jit_ipex_quantization.py | 22 +++++++++++++++++++ tests/cpu/test_quantization_default_recipe.py | 2 ++ 3 files changed, 34 insertions(+) diff --git a/intel_extension_for_pytorch/quantization/_recipe.py b/intel_extension_for_pytorch/quantization/_recipe.py index dbff3b10d..eb94e65ec 100644 --- a/intel_extension_for_pytorch/quantization/_recipe.py +++ b/intel_extension_for_pytorch/quantization/_recipe.py @@ -569,6 +569,16 @@ def get_default_recipe(nodes): node.input_tensor_force_inf_dtype[0] = node.input_tensor_infos[ 0 ].inf_dtype + elif node.type == str( + torch.Tensor.flatten + ) and not _check_has_quantizable_node_after_node(node): + # If the post op of flatten is not a quantizable node, force reset input's inf_dtype to orig_dtype + node.input_tensor_infos[0].inf_dtype = node.input_tensor_infos[ + 0 + ].orig_dtype + node.input_tensor_force_inf_dtype[0] = node.input_tensor_infos[ + 0 + ].inf_dtype else: # For other quantizable node, we don't need add fake quant before it if it's pre node is one none-quantizable op. 
# Now all other quantizable node only have one input info, so we can check the one pre input node info to check diff --git a/tests/cpu/test_ao_jit_ipex_quantization.py b/tests/cpu/test_ao_jit_ipex_quantization.py index f10f81842..ce248c22a 100644 --- a/tests/cpu/test_ao_jit_ipex_quantization.py +++ b/tests/cpu/test_ao_jit_ipex_quantization.py @@ -164,6 +164,22 @@ def forward(self, x): x = self.flatten(x) return x + class M2(nn.Module): + def __init__( + self, + ): + super(M2, self).__init__() + self.projection = torch.nn.Conv2d( + 3, 768, kernel_size=(16, 16), stride=(16, 16) + ) + self.cls_token = torch.rand(1, 1, 768) + + def forward(self, pixel_values): + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + cls_tokens = self.cls_token.expand(224, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + return embeddings + m = M() x = torch.rand(1, 3, 14, 14) for qconfig in static_qconfig: @@ -172,6 +188,12 @@ def forward(self, x): FileCheck().check_not("aten::quantize_per_tensor").check_not( "at::dequantize" ).check("aten::flatten").run(graph) + m2 = M2() + x = torch.rand(224, 3, 224, 224) + for qconfig in static_qconfig: + graph = self.checkQuantizeTrace(m2, [x], atol=2e-1, qconfig=qconfig) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1) + self.assertGraphContainsExactly(graph, "aten::dequantize", 0) def test_embeddingbag_int8(self): class M(nn.Module): diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py index 9b867235c..5f0158f35 100644 --- a/tests/cpu/test_quantization_default_recipe.py +++ b/tests/cpu/test_quantization_default_recipe.py @@ -73,6 +73,8 @@ def __init__(self, quantized_module): def forward(self, x): x = self.quantized_module(x) x = x.flatten(1) + # add a quantifiable op after flatten + x = x.flatten(1) return x class conv_swish(nn.Module): From ab47edcf7f0cb22187c168c803f2776c3a9c1c33 Mon Sep 17 00:00:00 2001 From: Cao E Date: Thu, 9 May 2024 13:27:54 +0800 Subject: [PATCH 048/199] Refine user experience for chatglm model (#2806) (#2856) Co-authored-by: jianan-gu --- csrc/cpu/aten/kernels/RMSNormKrnl.cpp | 27 +++++++- examples/cpu/inference/python/llm/README.md | 2 - .../run_accuracy_with_deepspeed.py | 13 ++-- .../run_generation_with_deepspeed.py | 11 +++- .../llm/single_instance/run_accuracy.py | 13 ++-- .../llm/single_instance/run_generation.py | 16 +++-- .../llm/single_instance/run_quantization.py | 12 +++- tests/cpu/test_rmsnorm.py | 62 +++++++++++++------ 8 files changed, 115 insertions(+), 41 deletions(-) diff --git a/csrc/cpu/aten/kernels/RMSNormKrnl.cpp b/csrc/cpu/aten/kernels/RMSNormKrnl.cpp index 79681e719..8475d46dd 100644 --- a/csrc/cpu/aten/kernels/RMSNormKrnl.cpp +++ b/csrc/cpu/aten/kernels/RMSNormKrnl.cpp @@ -52,8 +52,17 @@ at::Tensor rmsnorm_kernel_impl( c10::nullopt /* device */, c10::nullopt /* pin_memory */, at::MemoryFormat::Contiguous); - if (input.scalar_type() == at::ScalarType::Float) { + if (input.scalar_type() == at::ScalarType::Float && + b.scalar_type() == at::ScalarType::Float) { RMSNormKernelImpl(X, b, M, N, eps, Y); + } else if ( + input.scalar_type() == at::ScalarType::Float && + b.scalar_type() == at::ScalarType::BFloat16) { + RMSNormKernelImpl(X, b, M, N, eps, Y); + } else if ( + input.scalar_type() == at::ScalarType::Float && + b.scalar_type() == at::ScalarType::Half) { + RMSNormKernelImpl(X, b, M, N, eps, Y); } else if ( input.scalar_type() == at::ScalarType::BFloat16 && b.scalar_type() == at::ScalarType::Float) { @@ -62,6 +71,22 @@ 
at::Tensor rmsnorm_kernel_impl( input.scalar_type() == at::ScalarType::BFloat16 && b.scalar_type() == at::ScalarType::BFloat16) { RMSNormKernelImpl(X, b, M, N, eps, Y); + } else if ( + input.scalar_type() == at::ScalarType::BFloat16 && + b.scalar_type() == at::ScalarType::Half) { + RMSNormKernelImpl(X, b, M, N, eps, Y); + } else if ( + input.scalar_type() == at::ScalarType::Half && + b.scalar_type() == at::ScalarType::Half) { + RMSNormKernelImpl(X, b, M, N, eps, Y); + } else if ( + input.scalar_type() == at::ScalarType::Half && + b.scalar_type() == at::ScalarType::BFloat16) { + RMSNormKernelImpl(X, b, M, N, eps, Y); + } else if ( + input.scalar_type() == at::ScalarType::Half && + b.scalar_type() == at::ScalarType::Float) { + RMSNormKernelImpl(X, b, M, N, eps, Y); } else { TORCH_CHECK(false, "Unsupported input type"); } diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index 1e0c03d06..c6ac2beb7 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -414,8 +414,6 @@ deepspeed --bind_cores_to_rank run.py --benchmark -m meta-llama/Llama-2-7b-hf - There are some model-specific requirements to be aware of, as follows: -- For ChatGLM models, the default torch_dtype is float16 in config.json. We need to replace the "float16" with "float32" in config.json. - - For MPT models from the remote hub, we need to modify the config.json to use the modeling_mpt.py in transformers. Therefore, in the following scripts, we need to pass an extra configuration file like "--config-file=model_config/mosaicml_mpt-7b_config.json". - For Falcon models from remote hub, we need to modify the config.json to use the modeling_falcon.py in transformers. Therefore, in the following scripts, we need to pass an extra configuration file like "--config-file=model_config/tiiuae_falcon-40b_config.json". This is optional for FP32/BF16 but needed for quantizations. 
diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index 8b8eef8e6..b659efa90 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -243,10 +243,15 @@ def __init__( self.tokenizer = model_class[1].from_pretrained( model_id, trust_remote_code=True ) - self.config = AutoConfig.from_pretrained( - model_id if config is None else config, torchscript=with_jit, trust_remote_code=True - ) - + if model_type == "chatglm": + # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided + self.config = AutoConfig.from_pretrained( + model_id if config is None else config, torchscript=with_jit, trust_remote_code=True, torch_dtype=load_dtype, + ) + else: + self.config = AutoConfig.from_pretrained( + model_id if config is None else config, torchscript=with_jit, trust_remote_code=True + ) if re.search("gptbigcode", self.config.architectures[0], re.IGNORECASE): model_type = "gptbigcode" diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index b2a9f2f98..d768a21a9 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -306,9 +306,14 @@ def get_checkpoint_files(model_name_or_path): tokenizer.model_input_names = model_input_names if args.config_file is None: - config = AutoConfig.from_pretrained( - args.model_id, torchscript=True, trust_remote_code=True - ) + if model_type == "chatglm": + config = AutoConfig.from_pretrained( + args.model_id, torchscript=True, trust_remote_code=True, torch_dtype=load_dtype, + ) + else: + config = AutoConfig.from_pretrained( + args.model_id, torchscript=True, trust_remote_code=True + ) else: config = AutoConfig.from_pretrained( args.config_file, torchscript=True, trust_remote_code=True diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 7bccfe43d..4a253faa9 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -158,10 +158,15 @@ def __init__( self.tokenizer = model_class[1].from_pretrained( model_id, trust_remote_code=True ) - self.config = AutoConfig.from_pretrained( - model_id if config is None else config, torchscript=with_jit, trust_remote_code=True - ) - + if model_type == "chatglm": + # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided + self.config = AutoConfig.from_pretrained( + model_id if config is None else config, torchscript=with_jit, trust_remote_code=True, torch_dtype=load_dtype, + ) + else: + self.config = AutoConfig.from_pretrained( + model_id if config is None else config, torchscript=with_jit, trust_remote_code=True + ) if self._dtype in ("int8", "int4", "nf4") and not re.search("yuan", self.config.architectures[0], re.IGNORECASE): try: with ipex.OnDevice(dtype=torch.float, device="meta"): diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index a631ffde0..dd5f3cb96 100644 --- 
a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -132,12 +132,18 @@ ) model_class = MODEL_CLASSES[model_type] if args.config_file is None: - config = AutoConfig.from_pretrained( - args.model_id, torchscript=args.deployment_mode, trust_remote_code=True - ) + if model_type == "chatglm": + # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided + config = AutoConfig.from_pretrained( + args.model_id, torchscript=args.deployment_mode, trust_remote_code=True, torch_dtype=amp_dtype, + ) + else: + config = AutoConfig.from_pretrained( + args.model_id, torchscript=args.deployment_mode, trust_remote_code=True, + ) else: config = AutoConfig.from_pretrained( - args.config_file, torchscript=args.deployment_mode, trust_remote_code=True + args.config_file, torchscript=args.deployment_mode, trust_remote_code=True, torch_dtype=amp_dtype, ) if not hasattr(config, "text_max_length") and args.prompt is None: config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens) @@ -160,7 +166,6 @@ tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) model = model.eval() model = model.to(memory_format=torch.channels_last) - num_beams = 1 if args.greedy else 4 # generate args if args.streaming: @@ -216,7 +221,6 @@ def trace_handler(prof): inplace=True, deployment_mode=args.deployment_mode, ) - if args.torch_compile: if args.deployment_mode: raise SystemExit("[ERROR] deployment_mode cannot co-work with torch.compile, please set deployment_mode to False if want to use torch.compile.") diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 1b602a9c3..c36e9e522 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -204,9 +204,15 @@ if args.config_file is None: - config = AutoConfig.from_pretrained( - args.model_id, torchscript=True, trust_remote_code=True - ) + if "chatglm" in args.model_id.lower(): + # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided + config = AutoConfig.from_pretrained( + args.model_id, torchscript=True, trust_remote_code=True, torch_dtype=torch.float + ) + else: + config = AutoConfig.from_pretrained( + args.model_id, torchscript=True, trust_remote_code=True, + ) else: config = AutoConfig.from_pretrained( args.config_file, torchscript=True, trust_remote_code=True diff --git a/tests/cpu/test_rmsnorm.py b/tests/cpu/test_rmsnorm.py index 4c666ba33..3bb6adb8c 100644 --- a/tests/cpu/test_rmsnorm.py +++ b/tests/cpu/test_rmsnorm.py @@ -5,9 +5,9 @@ class RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): + def __init__(self, hidden_size, eps=1e-6, dtype=torch.float): super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) + self.weight = nn.Parameter(torch.ones(hidden_size, dtype=dtype)) self.variance_epsilon = eps def forward(self, hidden_states, fused_rmsnorm=False): @@ -29,22 +29,48 @@ def forward(self, hidden_states, fused_rmsnorm=False): class RMSNormTester(TestCase): def test_RMSNorm(self): for dim in [2, 3, 4, 5]: - with torch.cpu.amp.autocast(), torch.no_grad(): - input_size = [ - 3, - ] - for _ in range(dim - 1): - input_size.append(10) - x = torch.randn(input_size) - # RMSNorm input is fp32 - model = RMSNorm(input_size).eval() - y1_fp32 = 
model(x) - fused_y1_fp32 = model(x, fused_rmsnorm=True) - self.assertEqual(y1_fp32, fused_y1_fp32) - x_bf16 = x.to(torch.bfloat16) - y1_bf16 = model(x_bf16) - fused_y1_bf16 = model(x_bf16, fused_rmsnorm=True) - self.assertEqual(y1_bf16, fused_y1_bf16, prec=1e-2) + # RMSNorm input is fp32 + for weight_dtype in [torch.float32, torch.half, torch.bfloat16]: + with torch.no_grad(): + input_size = [ + 3, + ] + for _ in range(dim - 1): + input_size.append(10) + x = torch.randn(input_size) + model = RMSNorm(input_size, dtype=weight_dtype).eval() + y1_fp32 = model(x) + fused_y1_fp32 = model(x, fused_rmsnorm=True) + self.assertEqual(y1_fp32, fused_y1_fp32) + # RMSNorm input is bf16 + for weight_dtype in [torch.float32, torch.half, torch.bfloat16]: + with torch.no_grad(): + input_size = [ + 3, + ] + for _ in range(dim - 1): + input_size.append(10) + x = torch.randn(input_size) + + model = RMSNorm(input_size, dtype=weight_dtype).eval() + x_bf16 = x.to(torch.bfloat16) + y1_bf16 = model(x_bf16) + fused_y1_bf16 = model(x_bf16, fused_rmsnorm=True) + self.assertEqual(y1_bf16, fused_y1_bf16, prec=1e-2) + # RMSNorm input is fp16 + for weight_dtype in [torch.float32, torch.half, torch.bfloat16]: + with torch.no_grad(): + input_size = [ + 3, + ] + for _ in range(dim - 1): + input_size.append(10) + x = torch.randn(input_size) + model = RMSNorm(input_size, dtype=weight_dtype).eval() + x_fp16 = x.to(torch.half) + y1_fp16 = model(x_fp16) + fused_y1_fp16 = model(x_fp16, fused_rmsnorm=True) + self.assertEqual(y1_fp16, fused_y1_fp16, prec=1e-2) if __name__ == "__main__": From baaacf49ce74393c51fbc4332b9691f30e9e1997 Mon Sep 17 00:00:00 2001 From: Ryan Tao <65508217+RanTao123@users.noreply.github.com> Date: Thu, 9 May 2024 15:24:38 +0800 Subject: [PATCH 049/199] set brgemm count and unroll_hint for woq kernel weight dtype = int8 (#2749) * add unroll_hint. * add brgemm count = Kcb * improve the performance by changing the dequant array. 
* solve bug * modify variable name * improve the performance by change gemm_loop * open lm_head_generation * delete unused code * commit according to comment * add quant_offset * clang-format * fix quant_mode !=0 and add UT * flake8 format * clang-format * modify according comment * fp16 set brcount = 1 * modify Kcb for small M * rename the KCB_BLOCK_SIZE --------- Co-authored-by: Xia Weiwen --- csrc/cpu/aten/kernels/WoqTppKrnl.cpp | 163 ++++++++++++++---- tests/cpu/test_quantization_default_recipe.py | 32 ++-- 2 files changed, 144 insertions(+), 51 deletions(-) diff --git a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp index 30403b90b..e8675a7fa 100644 --- a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp @@ -35,6 +35,8 @@ using TensorList = std::vector; #define QINT4 2 #define NF4 3 +static int IPEX_KCB_BLOCK_SIZE = env2int("IPEX_KCB_BLOCK_SIZE", 64); + constexpr bool is_4bit(const int qw_type) { return qw_type == QINT4 || qw_type == NF4; } @@ -1259,10 +1261,17 @@ template < bool ACC, int qw_type, int quant_a_mode, + int quant_w_mode, long PREFETCH_K_DIST = 0> class DequantGemmTPP { public: - DequantGemmTPP(long M, long K, long lda, long ldc) { + DequantGemmTPP( + long M, + long K, + long lda, + long ldc, + int unroll_hint = 1, + long str_a = 1) { TLA_ASSERT(false, "not implemented"); } @@ -1275,7 +1284,10 @@ class DequantGemmTPP { bool no_tile_cfg = true, float* scale_a = nullptr, int32_t* zp_a = nullptr, - int32_t k_groups = -1) { + int32_t k_groups = -1, + int32_t count = 64, + int kc_start = 0, + int quant_block_multiple = 1) { TLA_ASSERT(false, "not implemented"); } @@ -1298,6 +1310,7 @@ template < bool ACC, int qw_type, int quant_a_mode, + int quant_w_mode, long PREFETCH_K_DIST> class DequantGemmTPP< Tin, @@ -1311,9 +1324,16 @@ class DequantGemmTPP< ACC, qw_type, quant_a_mode, + quant_w_mode, PREFETCH_K_DIST> { public: - DequantGemmTPP(long M, long K, long lda, long ldc) + DequantGemmTPP( + long M, + long K, + long lda, + long ldc, + int unroll_hint = 1, + long str_a = 1) : M(M), K(K), lda(lda), ldc(ldc) { static_assert(N % 16 == 0, "N must be a multiple of 16"); if (std::is_same()) @@ -1322,14 +1342,14 @@ class DequantGemmTPP< M, N, K, - 1, - 1, + str_a, + N * K, lda, ldb, ldc, ACC ? 1 : 0, transA, - 1, + unroll_hint, /*b_vnni*/ std::is_same()); } @@ -1346,7 +1366,10 @@ class DequantGemmTPP< bool no_tile_cfg = true, float* scale_a = nullptr, int32_t* zp_a = nullptr, - int32_t k_groups = -1) { + int32_t k_groups = -1, + int32_t count = 64, + int kc_start = 0, + int quant_block_multiple = 1) { if (M < SMALL_BATCH_THRESHOLD && ((std::is_same() && std::is_same()) || (std::is_same() && std::is_same()))) { @@ -1408,11 +1431,22 @@ class DequantGemmTPP< } } else { constexpr const int N_GROUP_SIZE = get_n_group_size(N); - Tin B[K][N]; + Tin B[count][K][N]; // TODO(jgong5): add prefetch - Dequantize::call( - qB, K, N, scales, zps, B[0]); - (*pgemm)(A, B[0], C, 1, no_tile_cfg); + for (int cnt = 0; cnt < count; cnt++) { + int32_t quant_offset = quant_w_mode == 0 + ? 
0 + : (kc_start + cnt) / quant_block_multiple - + kc_start / quant_block_multiple; + Dequantize::call( + qB + K * N * cnt, + K, + N, + scales + N * quant_offset, + zps + N * quant_offset, + B[cnt][0]); + } + (*pgemm)(A, B[0][0], C, count, no_tile_cfg); } } @@ -1443,6 +1477,7 @@ template < bool transA, bool ACC, int quant_a_mode, + int quant_w_mode, long PREFETCH_K_DIST> class DequantGemmTPP< /*Tin*/ uint8_t, @@ -1456,11 +1491,18 @@ class DequantGemmTPP< ACC, /*qw_type*/ QINT4, quant_a_mode, + quant_w_mode, PREFETCH_K_DIST> { using TBrgemmTPP = BrgemmTPP; public: - DequantGemmTPP(long M, long K, long lda, long ldc) + DequantGemmTPP( + long M, + long K, + long lda, + long ldc, + int unroll_hint = 1, + long str_a = 1) : M(M), K(K), lda(lda), ldc(ldc) { static_assert(N % 16 == 0, "N must be a multiple of 16"); TLA_ASSERT(K % 4 == 0, "Kb must be a multiple of 4 for int8 VNNI"); @@ -1469,14 +1511,14 @@ class DequantGemmTPP< M, N, K, - 1, - 1, + K, + K * N, lda, N, N, /*ACC*/ 0, /*transA*/ false, - 1, + unroll_hint, /*b_vnni*/ true); } @@ -1493,7 +1535,10 @@ class DequantGemmTPP< bool no_tile_cfg = true, float* scale_a = nullptr, int32_t* zp_a = nullptr, - int32_t k_groups = -1) { + int32_t k_groups = -1, + int32_t count = 1, + int kc_start = 0, + int quant_block_multiple = 1) { auto qA = GetVLAPtr(A, {lda}); #ifdef __AVX512VNNI__ if (M < SMALL_BATCH_THRESHOLD) { @@ -1705,7 +1750,15 @@ void qlinear_woq_affine_impl( auto lda = no_x_buf ? K : Kb; auto ldy = N; auto ldc = (no_y_buf || k_splits > 1) ? ldy : Nb; - + auto str_a = no_x_buf == true ? Kb : BLOCK_M * Kb; + auto Kcb = Kc; + if (M < PARALLEL_M_THRESHOLD) { + Kcb = 1; + } else if (is_4bit_flag || !std::is_same()) { + Kcb = 1; + } else if (M >= PARALLEL_M_THRESHOLD) { + Kcb = IPEX_KCB_BLOCK_SIZE; + } auto px = GetVLAPtr(x, {Kc, Kb}); auto pw = GetVLAPtr( (uint8_t*)qw_packed.data_ptr(), {Kc, Kb * (is_4bit_flag ? Nb / 2 : Nb)}); @@ -1795,11 +1848,14 @@ void qlinear_woq_affine_impl( /*ACC*/ true, qw_type, quant_a_mode, + quant_w_mode, PREFETCH_K_DIST>( /*M*/ BLOCK_M, /*K*/ Kb, /*lda*/ lda, - /*ldc*/ ldc); + /*ldc*/ ldc, + /*unroll_hint*/ IPEX_KCB_BLOCK_SIZE, + str_a); auto dequant_gemm_no_prefetch_tpp = DequantGemmTPP< TComp, TGemmOut, @@ -1812,11 +1868,14 @@ void qlinear_woq_affine_impl( /*ACC*/ true, qw_type, quant_a_mode, + quant_w_mode, 0>( /*M*/ BLOCK_M, /*K*/ Kb, /*lda*/ lda, - /*ldc*/ ldc); + /*ldc*/ ldc, + IPEX_KCB_BLOCK_SIZE, + str_a); auto dequant_gemm_rem_tpp = DequantGemmTPP< TComp, TGemmOut, @@ -1829,11 +1888,14 @@ void qlinear_woq_affine_impl( /*ACC*/ true, qw_type, quant_a_mode, + quant_w_mode, PREFETCH_K_DIST>( /*M*/ BLOCK_M_rem, /*K*/ Kb, /*lda*/ lda, - /*ldc*/ ldc); + /*ldc*/ ldc, + IPEX_KCB_BLOCK_SIZE, + str_a); auto dequant_gemm_no_prefetch_rem_tpp = DequantGemmTPP< TComp, TGemmOut, @@ -1846,11 +1908,14 @@ void qlinear_woq_affine_impl( /*ACC*/ true, qw_type, quant_a_mode, + quant_w_mode, 0>( /*M*/ BLOCK_M_rem, /*K*/ Kb, /*lda*/ lda, - /*ldc*/ ldc); + /*ldc*/ ldc, + IPEX_KCB_BLOCK_SIZE, + str_a); auto pcvt_x_tpp = std::is_same() ? nullptr @@ -1881,12 +1946,14 @@ void qlinear_woq_affine_impl( if (no_y_buf) { auto loop_scheme = M >= PARALLEL_M_THRESHOLD ? "ACb" : "aCb"; auto gemm_loop = ThreadedLoop<3>( - {{0, M, BLOCK_M, false}, {Kc}, {Nc}}, loop_scheme); + {{0, M, BLOCK_M, false}, {0, Kc, Kcb, false}, {Nc}}, + loop_scheme); gemm_loop( [&](int* idx) { int m = idx[0]; int kc = idx[1]; int nc = idx[2]; + auto count = kc + Kcb < Kc ? 
Kcb : Kc - kc; float* scale_a = nullptr; int32_t* zp_a = nullptr; int32_t k_groups = -1; @@ -1947,7 +2014,8 @@ void qlinear_woq_affine_impl( true, scale_a, zp_a, - k_groups); + k_groups, + count); } else { dequant_gemm_no_prefetch_tpp( x_ptr, @@ -1958,7 +2026,8 @@ void qlinear_woq_affine_impl( true, scale_a, zp_a, - k_groups); + k_groups, + count); if (fusion_type > 0) { post_ops_fn(m, nc); } @@ -1982,7 +2051,8 @@ void qlinear_woq_affine_impl( false, scale_a, zp_a, - k_groups); + k_groups, + count); dequant_gemm_tpp.config(); } else { dequant_gemm_no_prefetch_rem_tpp( @@ -1994,7 +2064,8 @@ void qlinear_woq_affine_impl( false, scale_a, zp_a, - k_groups); + k_groups, + count); dequant_gemm_no_prefetch_tpp.config(); if (fusion_type > 0) { post_ops_rem_fn(m, nc); @@ -2062,7 +2133,8 @@ void qlinear_woq_affine_impl( } } } - for (int kc = kc_start; kc < kc_end; kc++) { + for (int kc = kc_start; kc < kc_end; kc += Kcb) { + auto count = kc + Kcb < Kc ? Kcb : Kc - kc; TComp* x_ptr = (TComp*)px[m][kc]; float* scale_a = nullptr; int32_t* zp_a = nullptr; @@ -2104,11 +2176,14 @@ void qlinear_woq_affine_impl( } } if (!is_rem) { - alignas(64) TComp x_buf[BLOCK_M][Kb]; + alignas(64) TComp x_buf[count][BLOCK_M][Kb]; if (!no_x_buf) { - (*pcvt_x_tpp)(px[m][kc], x_buf[0]); - x_ptr = x_buf[0]; + for (int cnt = 0; cnt < count; cnt++) { + (*pcvt_x_tpp)(px[m][kc + cnt], x_buf[cnt][0]); + } + x_ptr = x_buf[0][0]; } + if (kc < Kc - 1) { dequant_gemm_tpp( x_ptr, @@ -2119,7 +2194,10 @@ void qlinear_woq_affine_impl( true, scale_a, zp_a, - k_groups); + k_groups, + count, + kc, + quant_block_multiple); } else { dequant_gemm_no_prefetch_tpp( x_ptr, @@ -2130,13 +2208,18 @@ void qlinear_woq_affine_impl( true, scale_a, zp_a, - k_groups); + k_groups, + count, + kc, + quant_block_multiple); } } else { - alignas(64) TComp x_buf[BLOCK_M][Kb]; + alignas(64) TComp x_buf[count][BLOCK_M][Kb]; if (!no_x_buf) { - (*pcvt_x_rem_tpp)(px[m][kc], x_buf[0]); - x_ptr = x_buf[0]; + for (int cnt = 0; cnt < count; cnt++) { + (*pcvt_x_rem_tpp)(px[m][kc + cnt], x_buf[cnt][0]); + } + x_ptr = x_buf[0][0]; } if (kc < Kc - 1) { dequant_gemm_rem_tpp( @@ -2148,7 +2231,10 @@ void qlinear_woq_affine_impl( false, scale_a, zp_a, - k_groups); + k_groups, + count, + kc, + quant_block_multiple); dequant_gemm_tpp.config(); } else { dequant_gemm_no_prefetch_rem_tpp( @@ -2160,7 +2246,10 @@ void qlinear_woq_affine_impl( false, scale_a, zp_a, - k_groups); + k_groups, + count, + kc, + quant_block_multiple); dequant_gemm_no_prefetch_tpp.config(); } } diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py index 5f0158f35..9d613f16e 100644 --- a/tests/cpu/test_quantization_default_recipe.py +++ b/tests/cpu/test_quantization_default_recipe.py @@ -1364,19 +1364,22 @@ def __init__(self, ic, oc, has_bias): def forward(self, x): return self.linear(x) - def test(shape, has_bias, act_quant_mode, group_size): - M, N, K = shape + def test(shape, has_bias, act_quant_mode, group_size, w_dtype): dtype = torch.bfloat16 - model = Mod(K, N, has_bias) + model = Mod(shape[1], shape[2], has_bias) m = model.eval() m2 = copy.deepcopy(m) - data = torch.rand(M, K) * 0.5 + data = torch.rand(shape[0], shape[1]) + if w_dtype == WoqWeightDtype.INT4: + lowp_mode = WoqLowpMode.INT8 + else: + lowp_mode = WoqLowpMode.BF16 if group_size == -1 and act_quant_mode != 0: # these cases are covered by another test case for act_quant_mode return qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( - weight_dtype=WoqWeightDtype.INT4, 
- lowp_mode=WoqLowpMode.INT8, + weight_dtype=w_dtype, + lowp_mode=lowp_mode, act_quant_mode=act_quant_mode, group_size=group_size, ) @@ -1393,20 +1396,20 @@ def test(shape, has_bias, act_quant_mode, group_size): w = copy.deepcopy(m.linear.weight.data) if group_size == -1: qw, w_scales, w_zero_points = quantize_per_channel( - w, WoqWeightDtype.INT4, None, None + w, w_dtype, None, None ) fake_quant_w = dequantize_per_channel( - qw, w_scales, w_zero_points.int(), WoqWeightDtype.INT4, w.shape + qw, w_scales, w_zero_points.int(), w_dtype, w.shape ) else: qw, w_scales, w_zero_points = quantize_per_block( - w, WoqWeightDtype.INT4, group_size, None, None + w, w_dtype, group_size, None, None ) fake_quant_w = dequantize_per_block( qw, w_scales, w_zero_points, - WoqWeightDtype.INT4, + w_dtype, group_size, weight_shape=w.shape, ) @@ -1423,15 +1426,16 @@ def test(shape, has_bias, act_quant_mode, group_size): y_ref = y_ref.to(dtype) torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-1) - MNK_list = [(4, 64, 128), (4, 32, 127), (9, 31, 256)] + MNK_list = [(4, 64, 128), (4, 32, 127), (9, 31, 256), (1024, 4096, 4096)] has_bias_list = [False, True] quant_mode_list = [0, 1, 2, 3] group_size_list = [-1, 32, 64, 128] + weight_dtype = [WoqWeightDtype.INT8, WoqWeightDtype.INT4] cases = itertools.product( - MNK_list, has_bias_list, quant_mode_list, group_size_list + MNK_list, has_bias_list, quant_mode_list, group_size_list, weight_dtype ) - for shape, has_bias, act_quant_mode, group_size in cases: - test(shape, has_bias, act_quant_mode, group_size) + for shape, has_bias, act_quant_mode, group_size, w_dtype in cases: + test(shape, has_bias, act_quant_mode, group_size, w_dtype) def test_compute_with_g_idx(self): class Mod(nn.Module): From d0fe4b8fca9b384a871d94fed222c1a028c394b6 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Fri, 10 May 2024 09:29:53 +0800 Subject: [PATCH 050/199] Update dependency_version.yml 20240510 (#2862) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 77fb7f143..6e06969f5 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240505+cpu + version: 2.4.0.dev20240508+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240505+cpu + version: 2.2.0.dev20240508+cpu torchvision: - version: 0.19.0.dev20240505+cpu + version: 0.19.0.dev20240508+cpu transformers: version: 4.38.1 From acb6e0b570aabe78cb193773e416ce6be8d61a60 Mon Sep 17 00:00:00 2001 From: Cao E Date: Fri, 10 May 2024 11:12:48 +0800 Subject: [PATCH 051/199] add fallback implementations for fp16 flash attention (#2861) --- csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp | 163 +++++++++++-------- tests/cpu/test_cpu_ops.py | 5 +- 2 files changed, 95 insertions(+), 73 deletions(-) diff --git a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp index 16b6a179d..f4d8529bc 100644 --- a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp @@ -659,9 +659,8 @@ cpu_flash_attention( const auto dtype = query.scalar_type(); const auto accumulate_dtype = at::toOpMathType(dtype); const bool is_fp16 = dtype == at::kHalf; - TORCH_CHECK( - !is_fp16 || (is_fp16 && utils::isa_has_amx_fp16_support()), - "scaled_dot_product_attention_flash_attention does not support FP16 on the 
platforms without amx_fp16 support"); + const int vnni_pack = + (!is_fp16 || (is_fp16 && utils::isa_has_amx_fp16_support())) ? 1 : 0; bool is_bool_mask = attention_mask.has_value() && attention_mask.value().scalar_type() == ScalarType::Bool; @@ -759,7 +758,8 @@ cpu_flash_attention( /*ldc*/ kvSplitSize, /*beta*/ 0.0, /*a_trans*/ 0, - /*unroll_hint*/ 1))); + /*unroll_hint*/ 1, + /*b_vnni*/ vnni_pack))); auto qk_gemm_ktail = SCOPEITGEMM((BrgemmTPP( /*M*/ qSplitSize, /*N*/ kvTail, @@ -771,7 +771,8 @@ cpu_flash_attention( /*ldc*/ kvTail, /*beta*/ 0.0, /*a_trans*/ 0, - /*unroll_hint*/ 1))); + /*unroll_hint*/ 1, + /*b_vnni*/ vnni_pack))); auto qk_gemm_qtail = SCOPEITGEMM((BrgemmTPP( /*M*/ qTail, /*N*/ kvSplitSize, @@ -783,7 +784,8 @@ cpu_flash_attention( /*ldc*/ kvSplitSize, /*beta*/ 0.0, /*a_trans*/ 0, - /*unroll_hint*/ 1))); + /*unroll_hint*/ 1, + /*b_vnni*/ vnni_pack))); auto qk_gemm_qktail = SCOPEITGEMM((BrgemmTPP( /*M*/ qTail, /*N*/ kvTail, @@ -795,7 +797,8 @@ cpu_flash_attention( /*ldc*/ kvTail, /*beta*/ 0.0, /*a_trans*/ 0, - /*unroll_hint*/ 1))); + /*unroll_hint*/ 1, + /*b_vnni*/ vnni_pack))); // Create tpp kernels for Attention @ Value bool av_gemm_K_even = kvSplitSize % 2 == 0; @@ -809,51 +812,55 @@ cpu_flash_attention( auto av_gemm = SCOPEITGEMM((BrgemmTPP( /*M*/ qSplitSize, /*N*/ headSize, - /*K*/ av_gemm_K, + /*K*/ vnni_pack ? av_gemm_K : kvSplitSize, /*str_a*/ 1, /*str_b*/ 1, /*lda*/ av_gemm_K, - /*ldb*/ headSize, + /*ldb*/ vnni_pack ? headSize : vStrideN, /*ldc*/ headSize, /*beta*/ 0.0, /*a_trans*/ 0, - /*unroll_hint*/ 1))); + /*unroll_hint*/ 1, + /*b_vnni*/ vnni_pack))); auto av_gemm_tail = SCOPEITGEMM((BrgemmTPP( /*M*/ qSplitSize, /*N*/ headSize, - /*K*/ av_gemm_K_tail, + /*K*/ vnni_pack ? av_gemm_K_tail : kvTail, /*str_a*/ 1, /*str_b*/ 1, /*lda*/ av_gemm_K_tail, - /*ldb*/ headSize, + /*ldb*/ vnni_pack ? headSize : vStrideN, /*ldc*/ headSize, /*beta*/ 0.0, /*a_trans*/ 0, - /*unroll_hint*/ 1))); + /*unroll_hint*/ 1, + /*b_vnni*/ vnni_pack))); auto av_gemm_bias = SCOPEITGEMM((BrgemmTPP( /*M*/ qSplitSize, /*N*/ headSize, - /*K*/ av_gemm_K, + /*K*/ vnni_pack ? av_gemm_K : kvSplitSize, /*str_a*/ 1, /*str_b*/ 1, /*lda*/ av_gemm_K, - /*ldb*/ headSize, + /*ldb*/ vnni_pack ? headSize : vStrideN, /*ldc*/ headSize, /*beta*/ 1.0, /*a_trans*/ 0, - /*unroll_hint*/ 1))); + /*unroll_hint*/ 1, + /*b_vnni*/ vnni_pack))); auto av_gemm_bias_tail = SCOPEITGEMM((BrgemmTPP( /*M*/ qSplitSize, /*N*/ headSize, - /*K*/ av_gemm_K_tail, + /*K*/ vnni_pack ? av_gemm_K_tail : kvTail, /*str_a*/ 1, /*str_b*/ 1, /*lda*/ av_gemm_K_tail, - /*ldb*/ headSize, + /*ldb*/ vnni_pack ? headSize : vStrideN, /*ldc*/ headSize, /*beta*/ 1.0, /*a_trans*/ 0, - /*unroll_hint*/ 1))); + /*unroll_hint*/ 1, + /*b_vnni*/ vnni_pack))); // Buffer to store Key and Value after transforms at::Tensor key_t_reorder = at::empty( @@ -900,7 +907,8 @@ cpu_flash_attention( /*out_cols*/ kvSplitSize, /*ldi*/ headSize_even ? kStrideN : qk_gemm_K, /*ldo*/ kvSplitSize, - /*xtype*/ XformTPP::XFORM_XPOSE_N2V_TPP, + /*xtype*/ + vnni_pack ? XformTPP::XFORM_XPOSE_N2V_TPP : XformTPP::XFORM_XPOSE_TPP, /*ignore_vnni_for_fp32*/ true), XPOSE); auto k_xform_tail = SCOPEIT( @@ -911,7 +919,8 @@ cpu_flash_attention( /*out_cols*/ kvTail, /*ldi*/ headSize_even ? kStrideN : qk_gemm_K, /*ldo*/ kvTail, - /*xtype*/ XformTPP::XFORM_XPOSE_N2V_TPP, + /*xtype*/ + vnni_pack ? 
XformTPP::XFORM_XPOSE_N2V_TPP : XformTPP::XFORM_XPOSE_TPP, /*ignore_vnni_for_fp32*/ true), XPOSE); // Create tpp transforms for Value @@ -972,24 +981,26 @@ cpu_flash_attention( j * qk_gemm_K * kvSize + n * qk_gemm_K); } if (!av_gemm_K_even) { - // padding - // [kvSplitSize, headSize] -> [kvSplitSize + 1, headSize] - pad_row_zero( - v_data + i * vStrideB + j * vStrideH + n * vStrideN, - value_padding_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize, - av_gemm_K, - headSize, - vStrideN); - v_xform( - value_padding_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize, - value_reorder_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize); - } else { + if (is_fp16 && vnni_pack) { + // padding + // [kvSplitSize, headSize] -> [kvSplitSize + 1, headSize] + pad_row_zero( + v_data + i * vStrideB + j * vStrideH + n * vStrideN, + value_padding_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize, + av_gemm_K, + headSize, + vStrideN); + v_xform( + value_padding_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize, + value_reorder_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize); + } + } else if (vnni_pack) { v_xform( v_data + i * vStrideB + j * vStrideH + n * vStrideN, value_reorder_ptr + @@ -1020,24 +1031,26 @@ cpu_flash_attention( j * qk_gemm_K * kvSize + n * qk_gemm_K); } if (!av_gemm_K_tail_even) { - // padding - // [kvtail, headSize] -> [kvtail + 1, headSize] - pad_row_zero( - v_data + i * vStrideB + j * vStrideH + n * vStrideN, - value_padding_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize, - av_gemm_K_tail, - headSize, - vStrideN); - v_xform_tail( - value_padding_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize, - value_reorder_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize); - } else { + if (is_fp16 && vnni_pack) { + // padding + // [kvtail, headSize] -> [kvtail + 1, headSize] + pad_row_zero( + v_data + i * vStrideB + j * vStrideH + n * vStrideN, + value_padding_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize, + av_gemm_K_tail, + headSize, + vStrideN); + v_xform_tail( + value_padding_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize, + value_reorder_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize); + } + } else if (vnni_pack) { v_xform_tail( v_data + i * vStrideB + j * vStrideH + n * vStrideN, value_reorder_ptr + @@ -1260,18 +1273,24 @@ cpu_flash_attention( if (n == 0) { av_gemm( qk_reduced_data, - value_reorder_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize, + vnni_pack + ? (value_reorder_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize) + : (v_data + i * vStrideB + j * vStrideH + + n * vStrideN), dst_data, 1); } else { // bias av_gemm_bias( qk_reduced_data, - value_reorder_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize, + vnni_pack + ? 
(value_reorder_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize) + : (v_data + i * vStrideB + j * vStrideH + + n * vStrideN), dst_data, 1); } @@ -1280,18 +1299,24 @@ cpu_flash_attention( if (n == 0) { av_gemm_tail( qk_reduced_data, - value_reorder_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize, + vnni_pack + ? (value_reorder_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize) + : (v_data + i * vStrideB + j * vStrideH + + n * vStrideN), dst_data, 1); } else { // bias av_gemm_bias_tail( qk_reduced_data, - value_reorder_ptr + - i * num_head * kv_padding_size * headSize + - j * kv_padding_size * headSize + psize * headSize, + vnni_pack + ? (value_reorder_ptr + + i * num_head * kv_padding_size * headSize + + j * kv_padding_size * headSize + psize * headSize) + : (v_data + i * vStrideB + j * vStrideH + + n * vStrideN), dst_data, 1); } diff --git a/tests/cpu/test_cpu_ops.py b/tests/cpu/test_cpu_ops.py index ea0c8d8a2..63c519a1f 100644 --- a/tests/cpu/test_cpu_ops.py +++ b/tests/cpu/test_cpu_ops.py @@ -6,7 +6,6 @@ import random import itertools import intel_extension_for_pytorch as ipex -import intel_extension_for_pytorch._C as core from common_utils import TestCase import torch.autograd.functional as autogradF from copy import deepcopy @@ -1404,9 +1403,7 @@ def test_cat(self): self.assertTrue(y7.dtype == datatype) def test_flash_attention(self): - dtypes = [torch.float, torch.double, torch.bfloat16] - if core.isa_has_amx_fp16_support(): - dtypes.append(torch.float16) + dtypes = [torch.float, torch.double, torch.bfloat16, torch.float16] for dtype in dtypes: for causal, has_attention_mask in [ [False, False], From a178f9965d393e3a67806455915e0dd87740eb65 Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Fri, 10 May 2024 15:04:19 +0800 Subject: [PATCH 052/199] Fix ipex.comm import issue (#2854) --- intel_extension_for_pytorch/cpu/comm/__init__.py | 16 +++++++++++----- tests/cpu/test_ccl_primitive.py | 5 ++++- tests/cpu/test_ipex_tensor_parallel.py | 4 ++++ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/intel_extension_for_pytorch/cpu/comm/__init__.py b/intel_extension_for_pytorch/cpu/comm/__init__.py index fc7d249c2..087eeb069 100644 --- a/intel_extension_for_pytorch/cpu/comm/__init__.py +++ b/intel_extension_for_pytorch/cpu/comm/__init__.py @@ -1,8 +1,14 @@ import torch import intel_extension_for_pytorch._C as torch_ipex_cpp -get_world_size = torch_ipex_cpp.get_world_size -get_rank = torch_ipex_cpp.get_rank -barrier = torch_ipex_cpp.barrier -allreduce_add = torch.ops.torch_ipex.all_reduce_add -allgather = torch.ops.torch_ipex.allgather + +def has_ccl(): + return hasattr(torch.ops.torch_ipex, "all_reduce_add") + + +if has_ccl(): + get_world_size = torch_ipex_cpp.get_world_size + get_rank = torch_ipex_cpp.get_rank + barrier = torch_ipex_cpp.barrier + allreduce_add = torch.ops.torch_ipex.all_reduce_add + allgather = torch.ops.torch_ipex.allgather diff --git a/tests/cpu/test_ccl_primitive.py b/tests/cpu/test_ccl_primitive.py index 432e1521d..1b4ba4160 100644 --- a/tests/cpu/test_ccl_primitive.py +++ b/tests/cpu/test_ccl_primitive.py @@ -3,8 +3,11 @@ import torch import intel_extension_for_pytorch as ipex +has_ccl = ipex.cpu.comm.has_ccl() +world_size = 0 if not has_ccl else ipex.cpu.comm.get_world_size() -@unittest.skip("oneccl can't works in docker") + +@unittest.skipIf(not (has_ccl and world_size > 1), "oneccl is not 
built") class CCLTester(unittest.TestCase): def test_all_reduce_add(self): mpi_world_size = int(os.environ.get("PMI_SIZE", -1)) diff --git a/tests/cpu/test_ipex_tensor_parallel.py b/tests/cpu/test_ipex_tensor_parallel.py index cae69fcc0..13d68a78e 100644 --- a/tests/cpu/test_ipex_tensor_parallel.py +++ b/tests/cpu/test_ipex_tensor_parallel.py @@ -35,7 +35,11 @@ from hf_configs.yuan.yuan_hf_model import YuanForCausalLM from hf_configs.phi.modeling_phi import PhiForCausalLM +has_ccl = ipex_comm.has_ccl() +world_size = 0 if not has_ccl else ipex_comm.get_world_size() + +@unittest.skipIf(not (has_ccl and world_size > 1), "oneccl is not built") class TensorParallelTester(TestCase): def _shard_model(self, model): rank = ipex_comm.get_rank() From 8d94b88070278bccbd16aadcbec8d9c3eff39cd3 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Fri, 10 May 2024 17:41:48 +0800 Subject: [PATCH 053/199] Revert "Update dependency_version.yml 20240510 (#2862)" (#2868) This reverts commit d0fe4b8fca9b384a871d94fed222c1a028c394b6. --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 6e06969f5..77fb7f143 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240508+cpu + version: 2.4.0.dev20240505+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240508+cpu + version: 2.2.0.dev20240505+cpu torchvision: - version: 0.19.0.dev20240508+cpu + version: 0.19.0.dev20240505+cpu transformers: version: 4.38.1 From 2ee5dba25a3c67d288ab3d8e7589a9dbe509cb2d Mon Sep 17 00:00:00 2001 From: Xu Han Date: Sat, 11 May 2024 01:47:33 +0800 Subject: [PATCH 054/199] code format for example. (#2865) * code format for example 1st round. 
* Reformat llm-related example script files --------- Co-authored-by: Xia, Weiwen --- .../fast_bert/fast_bert_inference_bf16.py | 1 + .../fast_bert/fast_bert_training_bf16.py | 15 +- examples/cpu/features/graph_capture.py | 3 +- .../features/graph_optimization/folding.py | 2 +- .../features/graph_optimization/fp32_bf16.py | 2 +- .../cpu/features/graph_optimization/int8.py | 24 +- examples/cpu/features/hypertune/resnet50.py | 25 +- .../int8_recipe_tuning/imagenet_autotune.py | 142 +++-- .../int8_recipe_tuning/int8_autotune.py | 18 +- examples/cpu/features/llm/llm_optimize.py | 19 +- .../features/llm/llm_optimize_smoothquant.py | 67 ++- examples/cpu/features/llm/llm_optimize_woq.py | 28 +- examples/cpu/inference/cpp/model_gen.py | 6 +- .../python/bert_eager_mode_inference_bf16.py | 1 + .../python/bert_eager_mode_inference_fp32.py | 1 + .../python/bert_general_inference_script.py | 17 +- .../bert_torchdynamo_mode_inference_bf16.py | 1 + .../bert_torchdynamo_mode_inference_fp32.py | 1 + .../bert_torchscript_mode_inference_bf16.py | 1 + .../bert_torchscript_mode_inference_fp32.py | 1 + .../cpu/inference/python/int8_deployment.py | 8 +- .../python/int8_quantization_dynamic.py | 6 +- .../python/int8_quantization_static.py | 27 +- .../python/llm-modeling/modeling_gptj.py | 6 +- .../python/llm-modeling/modeling_llama.py | 26 +- .../python/llm-modeling/modeling_opt.py | 8 +- .../run_accuracy_with_deepspeed.py | 516 +++++++++++++----- .../llm/distributed/run_generation_tp.py | 141 ++++- .../run_generation_with_deepspeed.py | 133 +++-- examples/cpu/inference/python/llm/run.py | 193 +++++-- .../llm/single_instance/run_accuracy.py | 502 ++++++++++++----- .../llm/single_instance/run_generation.py | 150 +++-- .../run_int4_gpt-j_on_cnndailymail.py | 120 ++-- .../llm/single_instance/run_quantization.py | 326 ++++++++--- .../python/llm/utils/create_shard_model.py | 27 +- .../python/llm/utils/model_class/baichuan.py | 7 +- .../python/llm/utils/model_class/bloom.py | 11 +- .../python/llm/utils/model_class/chatglm.py | 3 +- .../python/llm/utils/model_class/codegen.py | 3 +- .../python/llm/utils/model_class/falcon.py | 5 +- .../python/llm/utils/model_class/git.py | 5 +- .../llm/utils/model_class/gptbigcode.py | 3 +- .../python/llm/utils/model_class/gptj.py | 1 + .../python/llm/utils/model_class/gptneox.py | 1 + .../python/llm/utils/model_class/llama.py | 13 +- .../python/llm/utils/model_class/llava.py | 10 +- .../python/llm/utils/model_class/llm.py | 52 +- .../python/llm/utils/model_class/mistral.py | 2 +- .../python/llm/utils/model_class/mixtral.py | 1 + .../python/llm/utils/model_class/mpt.py | 1 + .../python/llm/utils/model_class/opt.py | 1 + .../python/llm/utils/model_class/phi.py | 4 +- .../python/llm/utils/model_class/qwen.py | 1 + .../python/llm/utils/model_class/stablelm.py | 1 + .../python/llm/utils/model_class/t5.py | 5 +- .../python/llm/utils/model_class/yuan.py | 1 + .../inference/python/llm/utils/run_gptq.py | 50 +- .../resnet50_eager_mode_inference_bf16.py | 3 +- .../resnet50_eager_mode_inference_fp32.py | 3 +- .../resnet50_general_inference_script.py | 21 +- ...esnet50_torchdynamo_mode_inference_bf16.py | 3 +- ...esnet50_torchdynamo_mode_inference_fp32.py | 1 + ...esnet50_torchscript_mode_inference_bf16.py | 3 +- ...esnet50_torchscript_mode_inference_fp32.py | 3 +- .../cpu/serving/torchserve/quantize_model.py | 4 +- .../cpu/serving/triton/bert_base/1/model.py | 117 ++-- .../cpu/serving/triton/bert_large/1/model.py | 117 ++-- .../distributed_data_parallel_training.py | 62 +-- 
examples/cpu/training/llm/finetune.py | 12 +- examples/cpu/training/llm/utils/prompter.py | 4 +- .../training/single_instance_training_bf16.py | 30 +- .../training/single_instance_training_fp32.py | 30 +- scripts/tools/setup/flake8.py | 41 +- 73 files changed, 2229 insertions(+), 970 deletions(-) diff --git a/examples/cpu/features/fast_bert/fast_bert_inference_bf16.py b/examples/cpu/features/fast_bert/fast_bert_inference_bf16.py index 7a164b1d5..17b519a8b 100644 --- a/examples/cpu/features/fast_bert/fast_bert_inference_bf16.py +++ b/examples/cpu/features/fast_bert/fast_bert_inference_bf16.py @@ -12,6 +12,7 @@ #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.fast_bert(model, dtype=torch.bfloat16) ###################################################### # noqa F401 diff --git a/examples/cpu/features/fast_bert/fast_bert_training_bf16.py b/examples/cpu/features/fast_bert/fast_bert_training_bf16.py index 64a39203b..701d095fb 100644 --- a/examples/cpu/features/fast_bert/fast_bert_training_bf16.py +++ b/examples/cpu/features/fast_bert/fast_bert_training_bf16.py @@ -1,7 +1,9 @@ import torch from transformers import BertForSequenceClassification -model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True) +model = BertForSequenceClassification.from_pretrained( + "bert-base-uncased", return_dict=True +) model.train() optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) @@ -13,14 +15,15 @@ #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model, optimizer = ipex.fast_bert(model, optimizer=optimizer, dtype=torch.bfloat16) ###################################################### # noqa F401 with torch.cpu.amp.autocast(dtype=torch.bfloat16): - labels = torch.tensor(1) - outputs = model(data, labels=labels) - loss = outputs.loss - loss.backward() - optimizer.step() + labels = torch.tensor(1) + outputs = model(data, labels=labels) + loss = outputs.loss + loss.backward() + optimizer.step() print("Execution finished") diff --git a/examples/cpu/features/graph_capture.py b/examples/cpu/features/graph_capture.py index f22dcf275..0a5b54379 100644 --- a/examples/cpu/features/graph_capture.py +++ b/examples/cpu/features/graph_capture.py @@ -1,12 +1,13 @@ import torch import torchvision.models as models -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() data = torch.rand(1, 3, 224, 224) #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model, graph_mode=True) ###################################################### # noqa F401 diff --git a/examples/cpu/features/graph_optimization/folding.py b/examples/cpu/features/graph_optimization/folding.py index ea9bf2332..5a61aeca7 100644 --- a/examples/cpu/features/graph_optimization/folding.py +++ b/examples/cpu/features/graph_optimization/folding.py @@ -1,7 +1,7 @@ import torch import torchvision.models as models -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() x = torch.randn(4, 3, 224, 224) diff --git a/examples/cpu/features/graph_optimization/fp32_bf16.py b/examples/cpu/features/graph_optimization/fp32_bf16.py index 3454e646d..8d0aaded2 100644 --- a/examples/cpu/features/graph_optimization/fp32_bf16.py +++ 
b/examples/cpu/features/graph_optimization/fp32_bf16.py @@ -4,7 +4,7 @@ # Import the Intel Extension for PyTorch import intel_extension_for_pytorch as ipex -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() # Apply some fusions at the front end diff --git a/examples/cpu/features/graph_optimization/int8.py b/examples/cpu/features/graph_optimization/int8.py index d9dcfa280..5fe41533a 100644 --- a/examples/cpu/features/graph_optimization/int8.py +++ b/examples/cpu/features/graph_optimization/int8.py @@ -4,7 +4,7 @@ from intel_extension_for_pytorch.quantization import prepare, convert # construct the model -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") qconfig = ipex.quantization.default_static_qconfig model.eval() example_inputs = torch.rand(1, 3, 224, 224) @@ -12,14 +12,17 @@ ##### Example Dataloader ##### # noqa F401 import torchvision -DOWNLOAD = True -DATA = 'datasets/cifar10/' -transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) -]) +DOWNLOAD = True +DATA = "datasets/cifar10/" + +transform = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize((224, 224)), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] +) train_dataset = torchvision.datasets.CIFAR10( root=DATA, train=True, @@ -27,13 +30,12 @@ download=DOWNLOAD, ) calibration_data_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 + dataset=train_dataset, batch_size=128 ) with torch.no_grad(): for batch_idx, (d, target) in enumerate(calibration_data_loader): - print(f'calibrated on batch {batch_idx} out of {len(calibration_data_loader)}') + print(f"calibrated on batch {batch_idx} out of {len(calibration_data_loader)}") prepared_model(d) ############################## # noqa F401 diff --git a/examples/cpu/features/hypertune/resnet50.py b/examples/cpu/features/hypertune/resnet50.py index bbf4fbd2d..3e0071870 100644 --- a/examples/cpu/features/hypertune/resnet50.py +++ b/examples/cpu/features/hypertune/resnet50.py @@ -1,6 +1,7 @@ import torch import torchvision.models as models + def inference(model, data): with torch.no_grad(): # warm up @@ -9,6 +10,7 @@ def inference(model, data): # measure import time + measure_iter = 100 start = time.time() for _ in range(measure_iter): @@ -19,8 +21,13 @@ def inference(model, data): latency = duration / measure_iter throughput = measure_iter / duration - print("@hypertune {'name': 'latency (ms)'}") # Add print statement of the form @hypertune {'name': str, 'higher_is_better': bool, 'target_val': int or float}` - print(latency) # Print the objective(s) you want to optimize. Make sure this is just an int or float to be minimzied or maximized. + print( + "@hypertune {'name': 'latency (ms)'}" + ) # Add print statement of the form @hypertune {'name': str, 'higher_is_better': bool, 'target_val': int or float}` + print( + latency + ) # Print the objective(s) you want to optimize. Make sure this is just an int or float to be minimzied or maximized. 
+ def main(args): model = models.resnet50(pretrained=False) @@ -30,9 +37,9 @@ def main(args): import intel_extension_for_pytorch as ipex - if args.dtype == 'float32': + if args.dtype == "float32": model = ipex.optimize(model, dtype=torch.float32) - elif args.dtype == 'bfloat16': + elif args.dtype == "bfloat16": model = ipex.optimize(model, dtype=torch.bfloat16) else: # int8 from intel_extension_for_pytorch.quantization import prepare, convert @@ -47,7 +54,7 @@ def main(args): model = convert(model) - with torch.cpu.amp.autocast(enabled=args.dtype == 'bfloat16'): + with torch.cpu.amp.autocast(enabled=args.dtype == "bfloat16"): if args.torchscript: with torch.no_grad(): model = torch.jit.trace(model, data) @@ -55,10 +62,14 @@ def main(args): inference(model, data) -if __name__ == '__main__': + +if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser() - parser.add_argument('--dtype', default='float32', choices=['float32', 'bfloat16', 'int8']) + parser.add_argument( + "--dtype", default="float32", choices=["float32", "bfloat16", "int8"] + ) parser.add_argument("--torchscript", default=False, action="store_true") main(parser.parse_args()) diff --git a/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py b/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py index c2cab8ed4..7d741e495 100644 --- a/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py +++ b/examples/cpu/features/int8_recipe_tuning/imagenet_autotune.py @@ -5,13 +5,17 @@ import torchvision.datasets as datasets import intel_extension_for_pytorch as ipex -model_names = sorted(name for name in models.__dict__ - if name.islower() and not name.startswith("__") - and callable(models.__dict__[name])) +model_names = sorted( + name + for name in models.__dict__ + if name.islower() and not name.startswith("__") and callable(models.__dict__[name]) +) + class AverageMeter(object): """Computes and stores the average and current value""" - def __init__(self, name, fmt=':f'): + + def __init__(self, name, fmt=":f"): self.name = name self.fmt = fmt self.reset() @@ -29,9 +33,10 @@ def update(self, val, n=1): self.avg = self.sum / self.count def __str__(self): - fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + fmtstr = "{name} {val" + self.fmt + "} ({avg" + self.fmt + "})" return fmtstr.format(**self.__dict__) + class ProgressMeter(object): def __init__(self, num_batches, meters, prefix=""): self.batch_fmtstr = self._get_batch_fmtstr(num_batches) @@ -41,12 +46,13 @@ def __init__(self, num_batches, meters, prefix=""): def display(self, batch): entries = [self.prefix + self.batch_fmtstr.format(batch)] entries += [str(meter) for meter in self.meters] - print('\t'.join(entries)) + print("\t".join(entries)) def _get_batch_fmtstr(self, num_batches): num_digits = len(str(num_batches // 1)) - fmt = '{:' + str(num_digits) + 'd}' - return '[' + fmt + '/' + fmt.format(num_batches) + ']' + fmt = "{:" + str(num_digits) + "d}" + return "[" + fmt + "/" + fmt.format(num_batches) + "]" + def accuracy(output, target, topk=(1,)): """Computes the accuracy over the k top predictions for the specified values of k""" @@ -64,23 +70,23 @@ def accuracy(output, target, topk=(1,)): res.append(correct_k.mul_(100.0 / batch_size)) return res + def validate(val_loader, model, criterion, args): # switch to evaluate mode model.eval() def eval_func(model): - batch_time = AverageMeter('Time', ':6.3f') - losses = AverageMeter('Loss', ':.4e') - top1 = AverageMeter('Acc@1', ':6.2f') - top5 = AverageMeter('Acc@5', ':6.2f') + 
batch_time = AverageMeter("Time", ":6.3f") + losses = AverageMeter("Loss", ":.4e") + top1 = AverageMeter("Acc@1", ":6.2f") + top5 = AverageMeter("Acc@5", ":6.2f") number_iter = len(val_loader) progress = ProgressMeter( - number_iter, - [batch_time, losses, top1, top5], - prefix='Test: ') - print('Evaluating RESNET: total Steps: {}'.format(number_iter)) + number_iter, [batch_time, losses, top1, top5], prefix="Test: " + ) + print("Evaluating RESNET: total Steps: {}".format(number_iter)) with torch.no_grad(): for i, (images, target) in enumerate(val_loader): images = images.contiguous(memory_format=torch.channels_last) @@ -95,12 +101,16 @@ def eval_func(model): progress.display(i) # TODO: this should also be done with the ProgressMeter - print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5)) + print( + " * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}".format(top1=top1, top5=top5) + ) return top1.avg.item() print(".........runing autotuning step.........") - tuned_model = ipex.quantization.autotune(model, val_loader, eval_func=eval_func, sampling_sizes=[300]) + tuned_model = ipex.quantization.autotune( + model, val_loader, eval_func=eval_func, sampling_sizes=[300] + ) print(".........autotuning step done.........") print(".........runing int8 inference.........") @@ -118,46 +128,86 @@ def eval_func(model): return + def main(args): print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) - assert args.data is not None, "please set dataset path if you want to using real data" - valdir = os.path.join(args.data, 'val') - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + assert ( + args.data is not None + ), "please set dataset path if you want to using real data" + valdir = os.path.join(args.data, "val") + normalize = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) criterion = torch.nn.CrossEntropyLoss() val_loader = torch.utils.data.DataLoader( - datasets.ImageFolder(valdir, transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])), - batch_size=args.batch_size, shuffle=False, - num_workers=args.workers, pin_memory=True) + datasets.ImageFolder( + valdir, + transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ] + ), + ), + batch_size=args.batch_size, + shuffle=False, + num_workers=args.workers, + pin_memory=True, + ) validate(val_loader, model, criterion, args) -if __name__ == '__main__': + +if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser() - parser.add_argument('-data', metavar='DIR', nargs='?', default='imagenet', - help='path to dataset (default: imagenet)') - parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', - choices=model_names, - help='model architecture: ' + - ' | '.join(model_names) + - ' (default: resnet18)') - parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', - help='number of data loading workers (default: 4)') - parser.add_argument('-b', '--batch-size', default=256, type=int, - metavar='N', - help='mini-batch size (default: 256), this is the total ' - 'batch size of all GPUs on the current node when ' - 'using Data Parallel or Distributed Data Parallel') - parser.add_argument('-p', '--print-freq', default=10, type=int, - metavar='N', help='print frequency (default: 10)') + parser.add_argument( + "-data", + metavar="DIR", + nargs="?", 
+ default="imagenet", + help="path to dataset (default: imagenet)", + ) + parser.add_argument( + "-a", + "--arch", + metavar="ARCH", + default="resnet18", + choices=model_names, + help="model architecture: " + " | ".join(model_names) + " (default: resnet18)", + ) + parser.add_argument( + "-j", + "--workers", + default=4, + type=int, + metavar="N", + help="number of data loading workers (default: 4)", + ) + parser.add_argument( + "-b", + "--batch-size", + default=256, + type=int, + metavar="N", + help="mini-batch size (default: 256), this is the total " + "batch size of all GPUs on the current node when " + "using Data Parallel or Distributed Data Parallel", + ) + parser.add_argument( + "-p", + "--print-freq", + default=10, + type=int, + metavar="N", + help="print frequency (default: 10)", + ) main(parser.parse_args()) print("Execution finished") diff --git a/examples/cpu/features/int8_recipe_tuning/int8_autotune.py b/examples/cpu/features/int8_recipe_tuning/int8_autotune.py index 8b851b539..b70155ca4 100644 --- a/examples/cpu/features/int8_recipe_tuning/int8_autotune.py +++ b/examples/cpu/features/int8_recipe_tuning/int8_autotune.py @@ -36,6 +36,7 @@ print(f"Shape of y: {y.shape} {y.dtype}") break + # Define model class NeuralNetwork(nn.Module): def __init__(self): @@ -46,7 +47,7 @@ def __init__(self): nn.ReLU(), nn.Linear(512, 512), nn.ReLU(), - nn.Linear(512, 10) + nn.Linear(512, 10), ) def forward(self, x): @@ -54,10 +55,12 @@ def forward(self, x): logits = self.linear_relu_stack(x) return logits + model = NeuralNetwork() loss_fn = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) + def train(dataloader, model, loss_fn, optimizer): size = len(dataloader.dataset) model.train() @@ -88,6 +91,7 @@ def train(dataloader, model, loss_fn, optimizer): ################################ QUANTIZE ############################## # noqa F401 model.eval() + def evaluate(dataloader, model): size = len(dataloader.dataset) model.eval() @@ -100,13 +104,21 @@ def evaluate(dataloader, model): accuracy /= size return accuracy + ######################## recipe tuning with INC ######################## # noqa F401 def eval(prepared_model): accu = evaluate(test_dataloader, prepared_model) return float(accu) -tuned_model = ipex.quantization.autotune(model, test_dataloader, eval_func=eval, sampling_sizes=[100], - accuracy_criterion={'relative': .01}, tuning_time=0) + +tuned_model = ipex.quantization.autotune( + model, + test_dataloader, + eval_func=eval, + sampling_sizes=[100], + accuracy_criterion={"relative": 0.01}, + tuning_time=0, +) ######################################################################## # noqa F401 # run tuned model diff --git a/examples/cpu/features/llm/llm_optimize.py b/examples/cpu/features/llm/llm_optimize.py index c78666938..fdd876273 100644 --- a/examples/cpu/features/llm/llm_optimize.py +++ b/examples/cpu/features/llm/llm_optimize.py @@ -1,6 +1,8 @@ import torch + #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + ###################################################### # noqa F401 import argparse from transformers import ( @@ -35,9 +37,7 @@ # load model model_id = "facebook/opt-125m" -config = AutoConfig.from_pretrained( - model_id, torchscript=True, trust_remote_code=True -) +config = AutoConfig.from_pretrained(model_id, torchscript=True, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=amp_dtype, @@ -45,10 +45,7 @@ low_cpu_mem_usage=True, 
trust_remote_code=True, ) -tokenizer = AutoTokenizer.from_pretrained( - model_id, - trust_remote_code=True -) +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) model = model.eval() model = model.to(memory_format=torch.channels_last) @@ -73,12 +70,12 @@ prompt = [prompt] * args.batch_size # inference -with torch.no_grad(), torch.inference_mode(), torch.cpu.amp.autocast(enabled=amp_enabled): +with torch.no_grad(), torch.inference_mode(), torch.cpu.amp.autocast( + enabled=amp_enabled +): input_ids = tokenizer(prompt, return_tensors="pt").input_ids gen_ids = model.generate( - input_ids, - max_new_tokens=args.max_new_tokens, - **generate_kwargs + input_ids, max_new_tokens=args.max_new_tokens, **generate_kwargs ) gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True) input_tokens_lengths = [x.shape[0] for x in input_ids] diff --git a/examples/cpu/features/llm/llm_optimize_smoothquant.py b/examples/cpu/features/llm/llm_optimize_smoothquant.py index 5529ee662..8b342aaef 100644 --- a/examples/cpu/features/llm/llm_optimize_smoothquant.py +++ b/examples/cpu/features/llm/llm_optimize_smoothquant.py @@ -1,6 +1,8 @@ import torch + #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + ###################################################### # noqa F401 import argparse from transformers import ( @@ -10,7 +12,9 @@ ) # args -parser = argparse.ArgumentParser("Generation script (static quantization path)", add_help=False) +parser = argparse.ArgumentParser( + "Generation script (static quantization path)", add_help=False +) parser.add_argument( "--dtype", type=str, @@ -27,10 +31,22 @@ parser.add_argument("--greedy", action="store_true") parser.add_argument("--batch-size", default=1, type=int, help="batch size") parser.add_argument("--calibration", action="store_true") -parser.add_argument("--calibration-samples", default=512, type=int, help="total number of calibration samples") -parser.add_argument("--int8-qconfig", nargs="?", default="./qconfig.json", help="static quantization factors summary files generated by calibration") +parser.add_argument( + "--calibration-samples", + default=512, + type=int, + help="total number of calibration samples", +) +parser.add_argument( + "--int8-qconfig", + nargs="?", + default="./qconfig.json", + help="static quantization factors summary files generated by calibration", +) parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k") -parser.add_argument("--alpha", default=0.5, type=float, help="alpha value for smoothquant") +parser.add_argument( + "--alpha", default=0.5, type=float, help="alpha value for smoothquant" +) args = parser.parse_args() print(args) @@ -41,9 +57,7 @@ # load model model_id = "meta-llama/Llama-2-7b-hf" -config = AutoConfig.from_pretrained( - model_id, torchscript=True, trust_remote_code=True -) +config = AutoConfig.from_pretrained(model_id, torchscript=True, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=amp_dtype, @@ -51,10 +65,7 @@ low_cpu_mem_usage=True, trust_remote_code=True, ) -tokenizer = AutoTokenizer.from_pretrained( - model_id, - trust_remote_code=True -) +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) model = model.eval() model = model.to(memory_format=torch.channels_last) @@ -70,10 +81,7 @@ 1, model.config.num_attention_heads, 1, - int( - model.config.hidden_size - / model.config.num_attention_heads - ), + int(model.config.hidden_size / 
model.config.num_attention_heads), ] ).contiguous(), torch.zeros( @@ -81,10 +89,7 @@ 1, user_model.config.num_attention_heads, 1, - int( - model.config.hidden_size - / model.config.num_attention_heads - ), + int(model.config.hidden_size / model.config.num_attention_heads), ] ).contiguous(), beam_idx_tmp, @@ -92,6 +97,7 @@ for i in range(model.config.num_hidden_layers) ] + # Intel(R) Extension for PyTorch* #################### code changes #################### # noqa F401 class Calibration: @@ -145,6 +151,7 @@ def collate_batch(self, batch): torch.tensor(last_ind), ) + calib_dataset = load_dataset(args.dataset, split="train") calib_evaluator = Calibration(calib_dataset, tokenizer, args.batch_size) calib_dataloader = DataLoader( @@ -162,10 +169,10 @@ def collate_batch(self, batch): (input_ids, attention_mask, position_ids, past_key_values), last_ind, ) in enumerate(calib_dataloader): - example_inputs = - (input_ids, attention_mask, position_ids, past_key_values) + example_inputs = (input_ids, attention_mask, position_ids, past_key_values) break - from intel_extension_for_pytorch.quantization import prepare, convert + from intel_extension_for_pytorch.quantization import prepare + model = ipex.llm.optimize( model.eval(), dtype=amp_dtype, @@ -173,9 +180,7 @@ def collate_batch(self, batch): inplace=True, deployment_mode=False, ) - prepared_model = prepare( - model.eval(), qconfig, example_inputs=example_inputs - ) + prepared_model = prepare(model.eval(), qconfig, example_inputs=example_inputs) with torch.no_grad(): for i, ( (input_ids, attention_mask, position_ids, past_key_values), @@ -191,7 +196,9 @@ def collate_batch(self, batch): ) prepared_model.save_qconf_summary(qconf_summary=args.int8_qconfig) - print("calibration Done! Will exit and please launch model quantization and benchmark") + print( + "calibration Done! 
Will exit and please launch model quantization and benchmark" + ) exit(0) else: model = ipex.llm.optimize( @@ -217,12 +224,12 @@ def collate_batch(self, batch): prompt = [prompt] * args.batch_size # inference -with torch.no_grad(), torch.inference_mode(), torch.cpu.amp.autocast(enabled=amp_enabled): +with torch.no_grad(), torch.inference_mode(), torch.cpu.amp.autocast( + enabled=amp_enabled +): input_ids = tokenizer(prompt, return_tensors="pt").input_ids gen_ids = model.generate( - input_ids, - max_new_tokens=args.max_new_tokens, - **generate_kwargs + input_ids, max_new_tokens=args.max_new_tokens, **generate_kwargs ) gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True) input_tokens_lengths = [x.shape[0] for x in input_ids] diff --git a/examples/cpu/features/llm/llm_optimize_woq.py b/examples/cpu/features/llm/llm_optimize_woq.py index 7b3c3bd79..baabd187f 100644 --- a/examples/cpu/features/llm/llm_optimize_woq.py +++ b/examples/cpu/features/llm/llm_optimize_woq.py @@ -1,6 +1,8 @@ import torch + #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + ###################################################### # noqa F401 import argparse from transformers import ( @@ -10,7 +12,9 @@ ) # args -parser = argparse.ArgumentParser("Generation script (weight only quantization path)", add_help=False) +parser = argparse.ArgumentParser( + "Generation script (weight only quantization path)", add_help=False +) parser.add_argument( "--dtype", type=str, @@ -69,9 +73,7 @@ # load model model_id = "facebook/opt-125m" -config = AutoConfig.from_pretrained( - model_id, torchscript=True, trust_remote_code=True -) +config = AutoConfig.from_pretrained(model_id, torchscript=True, trust_remote_code=True) model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=amp_dtype, @@ -79,17 +81,17 @@ low_cpu_mem_usage=True, trust_remote_code=True, ) -tokenizer = AutoTokenizer.from_pretrained( - model_id, - trust_remote_code=True -) +tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) model = model.eval() model = model.to(memory_format=torch.channels_last) # Intel(R) Extension for PyTorch* #################### code changes #################### # noqa F401 from intel_extension_for_pytorch.quantization import WoqWeightDtype -weight_dtype = WoqWeightDtype.INT4 if args.weight_dtype == "INT4" else WoqWeightDtype.INT8 + +weight_dtype = ( + WoqWeightDtype.INT4 if args.weight_dtype == "INT4" else WoqWeightDtype.INT8 +) if args.lowp_mode == "INT8": lowp_mode = ipex.quantization.WoqLowpMode.INT8 @@ -134,12 +136,12 @@ prompt = [prompt] * args.batch_size # inference -with torch.no_grad(), torch.inference_mode(), torch.cpu.amp.autocast(enabled=amp_enabled): +with torch.no_grad(), torch.inference_mode(), torch.cpu.amp.autocast( + enabled=amp_enabled +): input_ids = tokenizer(prompt, return_tensors="pt").input_ids gen_ids = model.generate( - input_ids, - max_new_tokens=args.max_new_tokens, - **generate_kwargs + input_ids, max_new_tokens=args.max_new_tokens, **generate_kwargs ) gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True) input_tokens_lengths = [x.shape[0] for x in input_ids] diff --git a/examples/cpu/inference/cpp/model_gen.py b/examples/cpu/inference/cpp/model_gen.py index 4fd805cce..3083ffdd6 100644 --- a/examples/cpu/inference/cpp/model_gen.py +++ b/examples/cpu/inference/cpp/model_gen.py @@ -4,11 +4,11 @@ import torch import torchvision -model = torchvision.models.resnet50(weights='ResNet50_Weights.DEFAULT') +model 
= torchvision.models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() input = torch.rand(1, 3, 224, 224) model = torch.jit.trace(model, input, check_trace=False) -model.save('resnet50.pt') -print("Saved model to: resnet50.pt") \ No newline at end of file +model.save("resnet50.pt") +print("Saved model to: resnet50.pt") diff --git a/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py b/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py index 50a639560..07210cb72 100644 --- a/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py +++ b/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py @@ -11,6 +11,7 @@ #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model, dtype=torch.bfloat16) ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/bert_eager_mode_inference_fp32.py b/examples/cpu/inference/python/bert_eager_mode_inference_fp32.py index 06ca62969..21f975886 100644 --- a/examples/cpu/inference/python/bert_eager_mode_inference_fp32.py +++ b/examples/cpu/inference/python/bert_eager_mode_inference_fp32.py @@ -11,6 +11,7 @@ #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model) ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/bert_general_inference_script.py b/examples/cpu/inference/python/bert_general_inference_script.py index 0e0d3be89..91308a89b 100644 --- a/examples/cpu/inference/python/bert_general_inference_script.py +++ b/examples/cpu/inference/python/bert_general_inference_script.py @@ -1,6 +1,7 @@ import torch from transformers import BertModel + def inference(model, data): with torch.no_grad(): # warm up @@ -9,11 +10,13 @@ def inference(model, data): # measure import time + start = time.time() for _ in range(10): model(data) end = time.time() - print('Inference took {:.2f} ms in average'.format((end - start) / 10 * 1000)) + print("Inference took {:.2f} ms in average".format((end - start) / 10 * 1000)) + def main(args): model = BertModel.from_pretrained(args.model_name) @@ -26,23 +29,25 @@ def main(args): import intel_extension_for_pytorch as ipex - if args.dtype == 'float32': + if args.dtype == "float32": model = ipex.optimize(model, dtype=torch.float32) - elif args.dtype == 'bfloat16': + elif args.dtype == "bfloat16": model = ipex.optimize(model, dtype=torch.bfloat16) - with torch.cpu.amp.autocast(enabled=args.dtype == 'bfloat16'): + with torch.cpu.amp.autocast(enabled=args.dtype == "bfloat16"): with torch.no_grad(): model = torch.jit.trace(model, data, check_trace=False, strict=False) model = torch.jit.freeze(model) inference(model, data) -if __name__ == '__main__': + +if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser() parser.add_argument("--model_name", default="bert-base-multilingual-cased") - parser.add_argument('--dtype', default='float32', choices=['float32', 'bfloat16']) + parser.add_argument("--dtype", default="float32", choices=["float32", "bfloat16"]) main(parser.parse_args()) diff --git a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py index 860e18b25..eb9e52536 100644 --- a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py +++ b/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py 
@@ -12,6 +12,7 @@ # Beta Feature #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model, dtype=torch.bfloat16, weights_prepack=False) model = torch.compile(model, backend="ipex") ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py b/examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py index 26e722e14..011ed83e6 100644 --- a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py +++ b/examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py @@ -12,6 +12,7 @@ # Beta Feature #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model, weights_prepack=False) model = torch.compile(model, backend="ipex") ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py index 3c5d3bc05..71864eac0 100644 --- a/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py +++ b/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py @@ -11,6 +11,7 @@ #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model, dtype=torch.bfloat16) ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py b/examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py index 98b64dc9b..98c5b3ea4 100644 --- a/examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py +++ b/examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py @@ -11,6 +11,7 @@ #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model) ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/int8_deployment.py b/examples/cpu/inference/python/int8_deployment.py index e0ba48049..9c5f0698e 100644 --- a/examples/cpu/inference/python/int8_deployment.py +++ b/examples/cpu/inference/python/int8_deployment.py @@ -1,9 +1,11 @@ import torch + #################### code changes #################### # noqa F401 -import intel_extension_for_pytorch as ipex # noqa F401 +import intel_extension_for_pytorch as ipex # noqa F401 + ###################################################### # noqa F401 -model = torch.jit.load('static_quantized_model.pt') +model = torch.jit.load("static_quantized_model.pt") model.eval() model = torch.jit.freeze(model) data = torch.rand(128, 3, 224, 224) @@ -11,4 +13,4 @@ with torch.no_grad(): model(data) -print("Execution finished") \ No newline at end of file +print("Execution finished") diff --git a/examples/cpu/inference/python/int8_quantization_dynamic.py b/examples/cpu/inference/python/int8_quantization_dynamic.py index 5d588ae7e..864ba597e 100644 --- a/examples/cpu/inference/python/int8_quantization_dynamic.py +++ b/examples/cpu/inference/python/int8_quantization_dynamic.py @@ -1,7 +1,9 @@ import torch + #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex from intel_extension_for_pytorch.quantization import prepare, convert + 
###################################################### # noqa F401 ##### Example Model ##### # noqa F401 @@ -27,7 +29,9 @@ converted_model = convert(prepared_model) with torch.no_grad(): - traced_model = torch.jit.trace(converted_model, (data,), check_trace=False, strict=False) + traced_model = torch.jit.trace( + converted_model, (data,), check_trace=False, strict=False + ) traced_model = torch.jit.freeze(traced_model) traced_model.save("dynamic_quantized_model.pt") diff --git a/examples/cpu/inference/python/int8_quantization_static.py b/examples/cpu/inference/python/int8_quantization_static.py index e5b82b6bd..98dc02d4b 100644 --- a/examples/cpu/inference/python/int8_quantization_static.py +++ b/examples/cpu/inference/python/int8_quantization_static.py @@ -1,12 +1,15 @@ import torch + #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex from intel_extension_for_pytorch.quantization import prepare, convert + ###################################################### # noqa F401 ##### Example Model ##### # noqa F401 import torchvision.models as models -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') + +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() data = torch.rand(128, 3, 224, 224) ######################### # noqa F401 @@ -22,14 +25,17 @@ ##### Example Dataloader ##### # noqa F401 import torchvision -DOWNLOAD = True -DATA = 'datasets/cifar10/' -transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) -]) +DOWNLOAD = True +DATA = "datasets/cifar10/" + +transform = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize((224, 224)), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] +) train_dataset = torchvision.datasets.CIFAR10( root=DATA, train=True, @@ -37,13 +43,12 @@ download=DOWNLOAD, ) calibration_data_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 + dataset=train_dataset, batch_size=128 ) with torch.no_grad(): for batch_idx, (d, target) in enumerate(calibration_data_loader): - print(f'calibrated on batch {batch_idx} out of {len(calibration_data_loader)}') + print(f"calibrated on batch {batch_idx} out of {len(calibration_data_loader)}") prepared_model(d) ############################## # noqa F401 diff --git a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py index a9c3dd034..70b3203a4 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_gptj.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_gptj.py @@ -37,8 +37,8 @@ def __init__(self, config): pos_embd_dim, backbone=config.architectures[0], ) - self._IPEXIndirectAccessKVCacheAttention = ipex.llm.modules.IndirectAccessKVCacheAttention( - max_positions + self._IPEXIndirectAccessKVCacheAttention = ( + ipex.llm.modules.IndirectAccessKVCacheAttention(max_positions) ) # ========================================================================== @@ -530,7 +530,7 @@ def forward( attentions=transformer_outputs.attentions, ) - # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention ==================== + # rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], 
beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: diff --git a/examples/cpu/inference/python/llm-modeling/modeling_llama.py b/examples/cpu/inference/python/llm-modeling/modeling_llama.py index 3558e07cc..80bd137b0 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_llama.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_llama.py @@ -71,8 +71,10 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None): self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) # ==================== Changes to apply ipex.llm layers ==================== - self._IPEXIndirectAccessKVCacheAttention = ipex.llm.modules.IndirectAccessKVCacheAttention( - self.max_position_embeddings + self._IPEXIndirectAccessKVCacheAttention = ( + ipex.llm.modules.IndirectAccessKVCacheAttention( + self.max_position_embeddings + ) ) self.ipex_rotary_emb = ipex.llm.modules.RotaryEmbedding( self.max_position_embeddings, @@ -128,14 +130,16 @@ def forward( kv_seq_len, ) - (attn_output, attn_weights, past_key_value) = self._IPEXIndirectAccessKVCacheAttention( - query_states, - key_states, - value_states, - math.sqrt(self.head_dim), - past_key_value, - None, - attention_mask, + (attn_output, attn_weights, past_key_value) = ( + self._IPEXIndirectAccessKVCacheAttention( + query_states, + key_states, + value_states, + math.sqrt(self.head_dim), + past_key_value, + None, + attention_mask, + ) ) # ========================================================================== @@ -519,7 +523,7 @@ def prepare_inputs_for_generation( ) return model_inputs - # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention ==================== + # rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: diff --git a/examples/cpu/inference/python/llm-modeling/modeling_opt.py b/examples/cpu/inference/python/llm-modeling/modeling_opt.py index 3847df848..1ce7859f5 100644 --- a/examples/cpu/inference/python/llm-modeling/modeling_opt.py +++ b/examples/cpu/inference/python/llm-modeling/modeling_opt.py @@ -56,8 +56,10 @@ def _handle_deprecated_argument(config_arg_name, config, fn_arg_name, kwargs): self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias) # ==================== Changes to apply ipex.llm layers ==================== - self._IPEXIndirectAccessKVCacheAttention = ipex.llm.modules.IndirectAccessKVCacheAttention( - config.max_position_embeddings + self._IPEXIndirectAccessKVCacheAttention = ( + ipex.llm.modules.IndirectAccessKVCacheAttention( + config.max_position_embeddings + ) ) # ========================================================================== @@ -722,7 +724,7 @@ def prepare_inputs_for_generation( ) return model_inputs - # ==================== rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention ==================== + # rewrite to _reorder_cache to work with ipex.llm.modules.IndirectAccessKVCacheAttention def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index b659efa90..b07ce9933 100644 --- 
a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -1,3 +1,4 @@ +# encoding: UTF-8 import os import argparse import json @@ -8,7 +9,6 @@ from tqdm import tqdm import math import torch.nn.functional as F -import re import deepspeed from deepspeed.accelerator import get_accelerator import deepspeed.comm as dist @@ -24,14 +24,25 @@ import sys -sys.path.append(sys.path[0] + '/../../') +sys.path.append(sys.path[0] + "/../../") try: - from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM + from llava.model.language_model.llava_llama import ( # noqa F401 + LlavaLlamaForCausalLM, + ) from llava.model.builder import load_pretrained_model from llava.conversation import conv_templates - from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token - from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from llava.mm_utils import ( + get_model_name_from_path, + process_images, + tokenizer_image_token, + ) + from llava.constants import ( # noqa F401 + IMAGE_TOKEN_INDEX, + DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN, + ) import lmms_eval from lmms_eval.api.instance import Instance from lmms_eval.api.model import lmms @@ -41,11 +52,15 @@ from lmms_eval.api.registry import ALL_TASKS from lmms_eval.tasks import initialize_tasks except ImportError: + def register_model(name): def decorator(func): return func + return decorator + from abc import ABC as lmms + Instance = None pass @@ -91,9 +106,15 @@ def decorator(func): "--ipex", action="store_true", help="use intel extension for pytorch." ) parser.add_argument( - "--disable-jit", action="store_true", help="disable converting model to torchscript mode." + "--disable-jit", + action="store_true", + help="disable converting model to torchscript mode.", +) +parser.add_argument( + "--quant-with-amp", + action="store_true", + help="by default static quant is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", ) -parser.add_argument("--quant-with-amp", action="store_true", help="by default static quant is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)") parser.add_argument("--quantized-model-path", default="./saved_result/best_model.pt") parser.add_argument( "--tasks", @@ -121,13 +142,13 @@ def decorator(func): default="AUTO", type=str, help="low precision mode for weight only quantization. " - "It indicates data type for computation for speedup at the cost " - "of accuracy. Unrelated to activation or weight data type." - "It is not supported yet to use lowp_mode=INT8 for INT8 weight, " - "falling back to lowp_mode=BF16 implicitly in this case." - "If set to AUTO, lowp_mode is determined by weight data type: " - "lowp_mode=BF16 is used for INT8 weight " - "and lowp_mode=INT8 used for INT4 weight", + "It indicates data type for computation for speedup at the cost " + "of accuracy. Unrelated to activation or weight data type." + "It is not supported yet to use lowp_mode=INT8 for INT8 weight, " + "falling back to lowp_mode=BF16 implicitly in this case." 
+ "If set to AUTO, lowp_mode is determined by weight data type: " + "lowp_mode=BF16 is used for INT8 weight " + "and lowp_mode=INT8 used for INT4 weight", ) parser.add_argument( "--weight-dtype", @@ -166,6 +187,7 @@ def decorator(func): use_ipex = args.ipex or args.ipex_weight_only_quantization + def get_int_from_env(env_keys, default): """Returns the first positive env value found in the `env_keys` list or the default.""" for e in env_keys: @@ -192,6 +214,7 @@ def get_int_from_env(env_keys, default): TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] + class HuggingFaceModel(BaseLM): _DEFAULT_MAX_LENGTH = 2048 @@ -207,7 +230,7 @@ def __init__( dtype: Optional[Union[str, torch.dtype]] = "auto", tp_number=1, config=None, - add_special_tokens = True, + add_special_tokens=True, ): super().__init__() @@ -246,18 +269,29 @@ def __init__( if model_type == "chatglm": # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided self.config = AutoConfig.from_pretrained( - model_id if config is None else config, torchscript=with_jit, trust_remote_code=True, torch_dtype=load_dtype, + model_id if config is None else config, + torchscript=with_jit, + trust_remote_code=True, + torch_dtype=load_dtype, ) else: self.config = AutoConfig.from_pretrained( - model_id if config is None else config, torchscript=with_jit, trust_remote_code=True + model_id if config is None else config, + torchscript=with_jit, + trust_remote_code=True, ) if re.search("gptbigcode", self.config.architectures[0], re.IGNORECASE): model_type = "gptbigcode" # For now, Falcon, baichuan and gptbigcode have accuracy issue with from_config with deepspeed meta device load. # TODO: we will change the scope once deepspeed providing the support - if world_size == 1 or model_type in ["falcon", "baichuan", "gptbigcode", "qwen", "yuan"]: + if world_size == 1 or model_type in [ + "falcon", + "baichuan", + "gptbigcode", + "qwen", + "yuan", + ]: self.model = model_class[0].from_pretrained( model_id, config=self.config, @@ -268,7 +302,7 @@ def __init__( else: with deepspeed.OnDevice(dtype=load_dtype, device="meta"): if model_type in ["t5"]: - self.model = model_class[0](config=self.config) + self.model = model_class[0](config=self.config) else: if model_class[0] == AutoModelForCausalLM: self.model = ( @@ -364,6 +398,7 @@ def write_checkpoints_json(): ipex_woq_enabled = args.ipex_weight_only_quantization if ipex_woq_enabled: from intel_extension_for_pytorch.quantization import WoqWeightDtype + if args.weight_dtype == "INT8": weight_dtype = WoqWeightDtype.INT8 elif args.weight_dtype == "INT4": @@ -410,7 +445,9 @@ def write_checkpoints_json(): self.num_beams = 1 if with_greedy else 4 self.iter = 0 - self.is_t5 = re.search("t5", self.base_model.config.architectures[0], re.IGNORECASE) + self.is_t5 = re.search( + "t5", self.base_model.config.architectures[0], re.IGNORECASE + ) def _get_target_nums(self, names): for n in names: @@ -433,38 +470,51 @@ def _get_past_key_values(self, input_bs, last_hidden_state=None): past_key_values = tuple( [ ( - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros([1, num_heads, 1, head_dim]).contiguous(), - torch.zeros([1, num_heads, 1, head_dim]).contiguous(), - beam_idx_tmp, - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - self.base_model.decoder.block[i].layer[1].EncDecAttention.k(last_hidden_state).view( - int(input_bs), - -1, - self.base_model.decoder.block[i] - .layer[1] - .EncDecAttention.n_heads, - 
self.base_model.decoder.block[i] - .layer[1] - .EncDecAttention.key_value_proj_dim, - ).transpose(0, 1).contiguous(), - self.base_model.decoder.block[i].layer[1].EncDecAttention.v(last_hidden_state).view( - int(input_bs), - -1, + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, num_heads, 1, head_dim]).contiguous(), + torch.zeros([1, num_heads, 1, head_dim]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), self.base_model.decoder.block[i] .layer[1] - .EncDecAttention.n_heads, + .EncDecAttention.k(last_hidden_state) + .view( + int(input_bs), + -1, + self.base_model.decoder.block[i] + .layer[1] + .EncDecAttention.n_heads, + self.base_model.decoder.block[i] + .layer[1] + .EncDecAttention.key_value_proj_dim, + ) + .transpose(0, 1) + .contiguous(), self.base_model.decoder.block[i] .layer[1] - .EncDecAttention.key_value_proj_dim, - ).transpose(0, 1).contiguous(), - beam_idx_tmp, - ) if self.is_t5 else - ( - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros([1, num_heads, 1, head_dim]).contiguous(), - torch.zeros([1, num_heads, 1, head_dim]).contiguous(), - beam_idx_tmp, + .EncDecAttention.v(last_hidden_state) + .view( + int(input_bs), + -1, + self.base_model.decoder.block[i] + .layer[1] + .EncDecAttention.n_heads, + self.base_model.decoder.block[i] + .layer[1] + .EncDecAttention.key_value_proj_dim, + ) + .transpose(0, 1) + .contiguous(), + beam_idx_tmp, + ) + if self.is_t5 + else ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, num_heads, 1, head_dim]).contiguous(), + torch.zeros([1, num_heads, 1, head_dim]).contiguous(), + beam_idx_tmp, + ) ) for i in range(num_hidden_layers) ] @@ -494,7 +544,7 @@ def _model_call( _attention_mask = [] _position_ids = [] if self.is_t5: - inputs = inputs['input_ids'] + inputs = inputs["input_ids"] for text in inputs: input_ids = text.to(self._device) input_bs = inputs.shape[0] * self.num_beams @@ -507,7 +557,11 @@ def _model_call( position_ids_batched = torch.stack(_position_ids) if self.is_t5: model_kwargs = {"attention_mask": attention_mask_batched} - model_kwargs = self.base_model._prepare_encoder_decoder_kwargs_for_generation(inputs, model_kwargs, "input_ids") + model_kwargs = ( + self.base_model._prepare_encoder_decoder_kwargs_for_generation( + inputs, model_kwargs, "input_ids" + ) + ) ( inputs, example_inputs, @@ -517,26 +571,34 @@ def _model_call( is_encoder_decoder=True, **model_kwargs, ) - past_key_values = self._get_past_key_values(input_bs, example_inputs["encoder_outputs"]["last_hidden_state"]) + past_key_values = self._get_past_key_values( + input_bs, example_inputs["encoder_outputs"]["last_hidden_state"] + ) if self.num_beams == 1: - decoder_input_ids = self.base_model._shift_right(labels['input_ids']) - _labels = labels['input_ids'] + decoder_input_ids = self.base_model._shift_right(labels["input_ids"]) + _labels = labels["input_ids"] else: - decoder_input_ids = self.base_model._shift_right(labels['input_ids'].repeat_interleave(self.num_beams, dim=0)) - _labels = labels['input_ids'].repeat_interleave(self.num_beams, dim=0) + decoder_input_ids = self.base_model._shift_right( + labels["input_ids"].repeat_interleave(self.num_beams, dim=0) + ) + _labels = labels["input_ids"].repeat_interleave(self.num_beams, dim=0) example_dict = { - "decoder_input_ids": decoder_input_ids, - "encoder_outputs": (example_inputs["encoder_outputs"]["last_hidden_state"],), - "labels": _labels, - } + "decoder_input_ids": decoder_input_ids, + "encoder_outputs": 
( + example_inputs["encoder_outputs"]["last_hidden_state"], + ), + "labels": _labels, + } else: past_key_values = self._get_past_key_values(input_bs) example_dict = {"input_ids": inputs} - model_inputs = self.base_model.prepare_inputs_for_generation(inputs, attention_mask=attention_mask_batched) + model_inputs = self.base_model.prepare_inputs_for_generation( + inputs, attention_mask=attention_mask_batched + ) has_position_ids = model_inputs.get("position_ids", None) is not None if self._with_jit: - example_dict["attention_mask"]= attention_mask_batched - example_dict["past_key_values"]= past_key_values + example_dict["attention_mask"] = attention_mask_batched + example_dict["past_key_values"] = past_key_values example_dict["return_dict"] = torch.tensor(False) if has_position_ids: example_dict["position_ids"] = position_ids_batched @@ -544,9 +606,7 @@ def _model_call( example_dict["return_last_logit"] = torch.tensor(True) with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( - enabled=True - if args.quant_with_amp or self._dtype == "bfloat16" - else False, + enabled=True if args.quant_with_amp or self._dtype == "bfloat16" else False, ): if self._with_jit and self.iter == 0: if self._dtype != "int8": @@ -652,14 +712,20 @@ def _collate(x): except ValueError: if not warn_stop_seq: print( - "Warning: a primary stop sequence is multi-token! Will default to EOS token for this tokenizer. Consider using `hf-causal-experimental` for multi-token stop sequence support for the time being." + "Warning: a primary stop sequence is multi-token! Will default to EOS token for" + " this tokenizer. Consider using `hf-causal-experimental` for multi-token stop" + " sequence support for the time being." ) warn_stop_seq = True primary_until = self.eot_token_id else: primary_until = None - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): - context = "่ฏฆ็ป†ๅˆ†ๆžๅนถๆฑ‚่งฃไปฅไธ‹ๆ•ฐๅญฆ้—ฎ้ข˜ใ€‚\n" + context.replace("้—ฎ้ข˜: ", "").replace("\n้€ๆญฅ่งฃ็ญ”:", "") + if re.search( + "yuan", self.base_model.config.architectures[0], re.IGNORECASE + ): + context = "่ฏฆ็ป†ๅˆ†ๆžๅนถๆฑ‚่งฃไปฅไธ‹ๆ•ฐๅญฆ้—ฎ้ข˜ใ€‚\n" + context.replace( + "้—ฎ้ข˜: ", "" + ).replace("\n้€ๆญฅ่งฃ็ญ”:", "") context_enc = torch.tensor( [self.tok_encode(context)[self.max_gen_toks - self.max_length :]] ).to(self.device) @@ -669,13 +735,17 @@ def _collate(x): ) with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( - enabled=True - if args.quant_with_amp or self._dtype == "bfloat16" - else False, + enabled=( + True if args.quant_with_amp or self._dtype == "bfloat16" else False + ), ): if self._with_jit and self.iter == 0: if self._dtype not in ["int8", "int4", "nf4"]: - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + if re.search( + "yuan", + self.base_model.config.architectures[0], + re.IGNORECASE, + ): input_bs = context_enc.shape[0] * self.num_beams attention_mask = torch.ones(len(context_enc[0])) position_ids = torch.arange(len(context_enc[0])) @@ -693,9 +763,15 @@ def _collate(x): ) model = torch.jit.freeze(model.eval()) example_dict = { - "input_ids": example_dict["input_ids"].repeat(input_bs, 1), - "attention_mask": example_dict["attention_mask"].repeat(input_bs, 1), - "position_ids": example_dict["position_ids"].repeat(input_bs, 1) + "input_ids": example_dict["input_ids"].repeat( + input_bs, 1 + ), + "attention_mask": example_dict["attention_mask"].repeat( + input_bs, 1 + ), + "position_ids": example_dict["position_ids"].repeat( + input_bs, 1 + ), } 
first_token_model = torch.jit.trace( self.model.eval(), @@ -703,24 +779,44 @@ def _collate(x): strict=False, check_trace=False, ) - first_token_model = torch.jit.freeze(first_token_model.eval()) + first_token_model = torch.jit.freeze( + first_token_model.eval() + ) else: model = torch.jit.load(args.quantized_model_path) model = torch.jit.freeze(model.eval()) - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): - first_token_model = torch.jit.load(args.quantized_model_path+"2") - first_token_model = torch.jit.freeze(first_token_model.eval()) - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): - ipex._set_optimized_model_for_generation(self.model, optimized_model=model, first_token_optimized_model=first_token_model) + if re.search( + "yuan", + self.base_model.config.architectures[0], + re.IGNORECASE, + ): + first_token_model = torch.jit.load( + args.quantized_model_path + "2" + ) + first_token_model = torch.jit.freeze( + first_token_model.eval() + ) + if re.search( + "yuan", self.base_model.config.architectures[0], re.IGNORECASE + ): + ipex._set_optimized_model_for_generation( + self.model, + optimized_model=model, + first_token_optimized_model=first_token_model, + ) else: - ipex._set_optimized_model_for_generation(self.model, optimized_model=model) + ipex._set_optimized_model_for_generation( + self.model, optimized_model=model + ) self.iter = self.iter + 1 cont = self._model_generate( context_enc, context_enc.shape[1] + max_gen_tokens, primary_until ) s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :]) - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + if re.search( + "yuan", self.base_model.config.architectures[0], re.IGNORECASE + ): s = s.replace("\n", "").split("")[0] for term in until: @@ -733,6 +829,7 @@ def _collate(x): return re_ord.get_original(res) + class HuggingFaceSeq2SeqModel(HuggingFaceModel): """Seq2Seq language modeling. 
You can find a set of supported models in the following documentation: @@ -860,6 +957,7 @@ def _loglikelihood_tokens( self.cache_hook.add_partial("loglikelihood", cache_key, answer) return results + class T5ModelLambada(HuggingFaceSeq2SeqModel): def _loglikelihood_tokens( self, @@ -887,16 +985,26 @@ def _loglikelihood_tokens( for cache_key, log_softmax, target_tokens, target_mask in output_iterator: length = target_mask.sum() - if length >= 1 and target_tokens[length-1].item() == self.tokenizer.encode(self.tokenizer.eos_token, add_special_tokens = False)[0]: + if ( + length >= 1 + and target_tokens[length - 1].item() + == self.tokenizer.encode( + self.tokenizer.eos_token, add_special_tokens=False + )[0] + ): length = length - 1 log_softmax = log_softmax[:length] target_tokens = target_tokens[:length] greedy_tokens = log_softmax.argmax(dim=-1) max_equal = (greedy_tokens == target_tokens).all() - target_text = self.tokenizer.decode(target_tokens, skip_special_tokens = True) - greedy_text = self.tokenizer.decode(greedy_tokens, skip_special_tokens = True) - max_text_equal = (greedy_text == target_text) + target_text = self.tokenizer.decode( + target_tokens, skip_special_tokens=True + ) + greedy_text = self.tokenizer.decode( + greedy_tokens, skip_special_tokens=True + ) + max_text_equal = greedy_text == target_text target_logits = torch.gather( log_softmax, 1, target_tokens.unsqueeze(-1) ).squeeze(-1) @@ -907,7 +1015,6 @@ def _loglikelihood_tokens( return results - @register_model("test") class LMMS(lmms): def __init__( @@ -921,7 +1028,7 @@ def __init__( dtype: Optional[Union[str, torch.dtype]] = "auto", tp_number=1, config=None, - add_special_tokens = True, + add_special_tokens=True, ) -> None: super().__init__() self._device = torch.device(device) @@ -944,11 +1051,19 @@ def __init__( elif dtype == "int8": load_dtype = torch.float32 infer_dtype = torch.int8 - self.amp_dtype = torch.bfloat16 if args.quant_with_amp or self._dtype == "bfloat16" else torch.float32 + self.amp_dtype = ( + torch.bfloat16 + if args.quant_with_amp or self._dtype == "bfloat16" + else torch.float32 + ) if re.search("llava", pretrained, re.IGNORECASE): - self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, get_model_name_from_path(pretrained)) + self._tokenizer, self._model, self._image_processor, self._max_length = ( + load_pretrained_model( + pretrained, None, get_model_name_from_path(pretrained) + ) + ) model_name = get_model_name_from_path(pretrained) - if 'llama-2' in model_name.lower(): + if "llama-2" in model_name.lower(): conv_mode = "llava_llama_2" elif "v1" in model_name.lower(): conv_mode = "llava_v1" @@ -964,7 +1079,9 @@ def __init__( ) self._tokenizer = self._image_processor.tokenizer self._config = AutoConfig.from_pretrained( - pretrained if config is None else config, torchscript=with_jit, trust_remote_code=True + pretrained if config is None else config, + torchscript=with_jit, + trust_remote_code=True, ) self._model = model_class[0].from_pretrained( pretrained, @@ -1052,8 +1169,11 @@ def write_checkpoints_json(): ipex_woq_enabled = args.ipex_weight_only_quantization if ipex_woq_enabled: from intel_extension_for_pytorch.quantization import WoqWeightDtype + weight_dtype = ( - WoqWeightDtype.INT4 if args.weight_dtype == "INT4" else WoqWeightDtype.INT8 + WoqWeightDtype.INT4 + if args.weight_dtype == "INT4" + else WoqWeightDtype.INT8 ) if args.lowp_mode == "INT8": @@ -1115,19 +1235,32 @@ def write_checkpoints_json(): "past_key_values": past_key_values, 
} if re.search("llava", pretrained, re.IGNORECASE): - sample_inputs["inputs_embeds"] = torch.zeros(batch_size, 1, 4096).to(self.amp_dtype) + sample_inputs["inputs_embeds"] = torch.zeros(batch_size, 1, 4096).to( + self.amp_dtype + ) elif re.search("git", pretrained, re.IGNORECASE): sample_inputs["input_ids"] = input_ids.repeat(self.batch_size, 1) - sample_inputs["attention_mask"] = attention_mask.repeat(self.batch_size, 1) + sample_inputs["attention_mask"] = attention_mask.repeat( + self.batch_size, 1 + ) sample_inputs["pixel_values"] = torch.zeros(batch_size, 3, 224, 224) - num_head = self.model.git.encoder.layer[0].attention.self.num_attention_heads - head_dim = int(self.model.git.encoder.layer[0].attention.self.hidden_size / num_head) + num_head = self.model.git.encoder.layer[ + 0 + ].attention.self.num_attention_heads + head_dim = int( + self.model.git.encoder.layer[0].attention.self.hidden_size + / num_head + ) past_key_values = tuple( [ ( torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros([batch_size, num_head, 1, head_dim]).contiguous(), - torch.zeros([batch_size, num_head, 1, head_dim]).contiguous(), + torch.zeros( + [batch_size, num_head, 1, head_dim] + ).contiguous(), + torch.zeros( + [batch_size, num_head, 1, head_dim] + ).contiguous(), torch.zeros(1, 4, dtype=torch.long), ) for i in range(self.model.config.num_hidden_layers) @@ -1135,7 +1268,8 @@ def write_checkpoints_json(): ) sample_inputs["past_key_values"] = past_key_values with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if self.amp_dtype == torch.bfloat16 else False,): + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): if self._dtype != "int8": traced_model = torch.jit.trace( self._model.eval(), @@ -1150,7 +1284,9 @@ def write_checkpoints_json(): traced_model(**sample_inputs) traced_model(**sample_inputs) - ipex._set_optimized_model_for_generation(self._model, optimized_model=traced_model) + ipex._set_optimized_model_for_generation( + self._model, optimized_model=traced_model + ) @property def config(self): @@ -1177,7 +1313,9 @@ def max_length(self): def pad_sequence(self, input_ids, batch_first, padding_value): if self.tokenizer.padding_side == "left": input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids] - input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value) + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, batch_first=batch_first, padding_value=padding_value + ) if self.tokenizer.padding_side == "left": input_ids = torch.flip(input_ids, [1]) return input_ids @@ -1198,7 +1336,9 @@ def rank(self): def world_size(self): return self._world_size - def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + def tok_encode( + self, string: str, left_truncate_len=None, add_special_tokens=None + ) -> List[int]: """ """ add_special_tokens = False if add_special_tokens is None else add_special_tokens encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) @@ -1213,9 +1353,13 @@ def tok_decode(self, tokens): def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: # TODO res = [] - pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + pbar = tqdm( + total=len(requests), disable=(self.rank != 0), desc="Model Responding" + ) - for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + for contexts, doc_to_target, 
doc_to_visual, doc_id, task, split in [ + reg.args for reg in requests + ]: # encode, pad, and truncate contexts for this batch if type(doc_to_target) == str: continuation = doc_to_target @@ -1226,7 +1370,10 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: if visuals: image = process_images(visuals, self._image_processor, self._config) if type(image) is list: - image = [_image.to(dtype=torch.float16, device=self.device) for _image in image] + image = [ + _image.to(dtype=torch.float16, device=self.device) + for _image in image + ] else: image = image.to(dtype=torch.float16, device=self.device) else: @@ -1234,12 +1381,17 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: prompts_input = contexts[0] - if image is not None and len(image) != 0 and DEFAULT_IMAGE_TOKEN not in prompts_input: + if ( + image is not None + and len(image) != 0 + and DEFAULT_IMAGE_TOKEN not in prompts_input + ): """ Three senarios: 1. No image, and there for, no image token should be added. 2. image token is already specified in the context, so we don't need to add it. - 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + 3. image token is not specified in the context and there is image inputs, so we need to add it. + In this case, we add the image token at the beginning of the context and add a new line. """ image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visuals) image_tokens = " ".join(image_tokens) @@ -1249,24 +1401,44 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: conv.append_message(conv.roles[0], prompts_input) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() - pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + pad_token_id = ( + self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None + else self.tokenizer.eos_token_id + ) + contxt_id = ( + tokenizer_image_token( + prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + .unsqueeze(0) + .to(self.device) + ) # Add the answer of the second role conv.messages[1][1] = continuation prompt = conv.get_prompt() - input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + input_ids = ( + tokenizer_image_token( + prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + .unsqueeze(0) + .to(self.device) + ) labels = input_ids.clone() # Context part no need to calculate for loss labels[0, : contxt_id.shape[1]] = -100 with torch.inference_mode(): - outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True) + outputs = self.model( + input_ids=input_ids, labels=labels, images=image, use_cache=True + ) loss = outputs["loss"] # loss = torch.exp(loss) logits = outputs["logits"] greedy_tokens = logits.argmax(dim=-1) cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq] - greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]] # [1, seq] + greedy_tokens = greedy_tokens[ + :, contxt_id.shape[1] : input_ids.shape[1] + ] # [1, seq] max_equal = (greedy_tokens == cont_toks).all() res.append((float(loss.item()), bool(max_equal))) pbar.update(1) @@ -1296,15 +1468,23 @@ def _collate(x): # we 
group requests by their generation_kwargs, # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling # in the same batch. - re_ords = lmms_utils.Collator([reg.args for reg in requests], _collate, grouping=True) + re_ords = lmms_utils.Collator( + [reg.args for reg in requests], _collate, grouping=True + ) chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) - num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + num_iters = ( + len(requests) // self.batch_size + if len(requests) % self.batch_size == 0 + else len(requests) // self.batch_size + 1 + ) pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") for chunk in chunks: contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) task = task[0] split = split[0] - visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] + visuals = [ + doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id + ] visuals = self.flatten(visuals) # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. @@ -1319,14 +1499,23 @@ def _collate(x): if isinstance(until, str): until = [until] elif not isinstance(until, list): - raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + raise ValueError( + f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}" + ) - if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__: + if ( + "image_aspect_ratio" in gen_kwargs.keys() + and "image_aspect_ratio" not in self._config.__dict__ + ): # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation - self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio") + self._config.image_aspect_ratio = gen_kwargs.pop( + "image_aspect_ratio" + ) # encode, pad, and truncate contexts for this batch if visuals: - image_tensor = process_images(visuals, self._image_processor, self._config) + image_tensor = process_images( + visuals, self._image_processor, self._config + ) else: image_tensor = None @@ -1335,14 +1524,24 @@ def _collate(x): question_input = [] for visual, context in zip(visuals, contexts): - if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: + if ( + image_tensor is not None + and len(image_tensor) != 0 + and DEFAULT_IMAGE_TOKEN not in context + ): """ Three senarios: 1. No image, and there for, no image token should be added. 2. image token is already specified in the context, so we don't need to add it. - 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + 3. image token is not specified in the context and there is image inputs, + so we need to add it. In this case, we add the image token at the beginning + of the context and add a new line. 
""" - image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] + image_tokens = ( + [DEFAULT_IMAGE_TOKEN] * len(visual) + if isinstance(visual, list) + else [DEFAULT_IMAGE_TOKEN] + ) image_tokens = " ".join(image_tokens) question = image_tokens + "\n" + context else: @@ -1366,9 +1565,10 @@ def _collate(x): prompt_question = conv.get_prompt() question_input.append(prompt_question) - # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) # preconfigure gen_kwargs with defaults - gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + gen_kwargs["image_sizes"] = [ + visuals[idx].size for idx in range(len(visuals)) + ] if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: @@ -1378,12 +1578,25 @@ def _collate(x): if "num_beams" not in gen_kwargs: gen_kwargs["num_beams"] = 1 - input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input] - pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(get_accelerator().current_device_name()) - attention_masks = input_ids.ne(pad_token_ids).to(get_accelerator().current_device_name()) + input_ids_list = [ + tokenizer_image_token( + prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + for prompt in question_input + ] + pad_token_ids = ( + self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None + else self.tokenizer.eos_token_id + ) + input_ids = self.pad_sequence( + input_ids_list, batch_first=True, padding_value=pad_token_ids + ).to(get_accelerator().current_device_name()) + attention_masks = input_ids.ne(pad_token_ids).to( + get_accelerator().current_device_name() + ) input_dict = { - "input_ids":input_ids, + "input_ids": input_ids, "attention_mask": attention_masks, "pad_token_id": pad_token_ids, "images": image_tensor.to(self.amp_dtype), @@ -1394,7 +1607,9 @@ def _collate(x): "max_new_tokens": gen_kwargs["max_new_tokens"], } elif re.search("git", self.model.config.architectures[0], re.IGNORECASE): - input_ids=self._image_processor(images=visuals, return_tensors="pt").pixel_values + input_ids = self._image_processor( + images=visuals, return_tensors="pt" + ).pixel_values gen_kwargs.pop("until", None) input_dict = { "pixel_values": input_ids.to(self.amp_dtype), @@ -1406,8 +1621,17 @@ def _collate(x): ): cont = self.model.generate(**input_dict) text_outputs = self.tokenizer.batch_decode( - cont[:, input_ids.shape[1]:] if re.search("llava", self.model.config.architectures[0], re.IGNORECASE) else cont, - skip_special_tokens=True) + ( + cont[:, input_ids.shape[1] :] + if re.search( + "llava", + self.model.config.architectures[0], + re.IGNORECASE, + ) + else cont + ), + skip_special_tokens=True, + ) except Exception as e: print(f"Error {e} in generating") cont = "" @@ -1463,7 +1687,7 @@ def _collate(x): dtype=args.dtype, tp_number=world_size, config=args.config_file, - add_special_tokens=False + add_special_tokens=False, ) results = evaluator.evaluate( @@ -1475,15 +1699,17 @@ def _collate(x): print(evaluator.make_table(results)) elif len(lmms_tasks) != 0: task_names = lmms_utils.pattern_match(lmms_tasks, ALL_TASKS) - lm = LMMS(pretrained=args.model, device="cpu", - batch_size=args.batch_size, - 
with_ipex=args.ipex, - with_jit=not args.disable_jit, - dtype=args.dtype, - tp_number=world_size, - config=args.config_file, - add_special_tokens=False - ) + lm = LMMS( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + tp_number=world_size, + config=args.config_file, + add_special_tokens=False, + ) task_dict = lmms_eval.tasks.get_task_dict(task_names, model_name="test") for task_name in task_dict.keys(): @@ -1501,6 +1727,6 @@ def _collate(x): task_dict=task_dict, # limit=10, # bootstrap_iters=100, - cli_args=args + cli_args=args, ) print(lmms_evaluator.make_table(results)) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py index 4bc7dce44..7b8fd3f03 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py @@ -18,7 +18,7 @@ import sys -sys.path.append(sys.path[0] + '/../../') +sys.path.append(sys.path[0] + "/../../") import logging @@ -55,7 +55,13 @@ from llava.model.builder import load_pretrained_model from llava.conversation import conv_templates from llava.mm_utils import get_model_name_from_path, tokenizer_image_token - from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from llava.constants import ( + IMAGE_TOKEN_INDEX, + DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN, + ) + MODEL_CLASSES["llava"] = (LlavaLlamaForCausalLM, AutoTokenizer) except ImportError: pass @@ -89,10 +95,15 @@ "--prompt", default=None, type=str, help="input prompt for self-defined if needed" ) parser.add_argument( - "--streaming", action="store_true", help="enable streaming mode for generation output (greedy search only)" + "--streaming", + action="store_true", + help="enable streaming mode for generation output (greedy search only)", ) parser.add_argument( - "--image-url", default="http://images.cocodataset.org/val2017/000000039769.jpg", type=str, help="image url for image-to-text task" + "--image-url", + default="http://images.cocodataset.org/val2017/000000039769.jpg", + type=str, + help="image url for image-to-text task", ) parser.add_argument( "--config-file", default=None, type=str, help="specific configuration file" @@ -145,7 +156,9 @@ ) parser.add_argument("--deployment-mode", action="store_true") parser.add_argument("--torch-compile", action="store_true") -parser.add_argument("--backend", default="ipex", type=str, help="backend of torch.compile") +parser.add_argument( + "--backend", default="ipex", type=str, help="backend of torch.compile" +) parser.add_argument("--profile", action="store_true") parser.add_argument("--benchmark", action="store_true") parser.add_argument("--num-iter", default=100, type=int, help="num iter") @@ -181,7 +194,7 @@ action="store_true", help="Indicate that the low-precision checkpoint is in the legacy format rather than the" " HuggingFace Optimum format for backward compatibility. It must be used with" - " --low-precision-checkpoint. Otherwise, it has no effect." + " --low-precision-checkpoint. 
Otherwise, it has no effect.", ) args = parser.parse_args() print(args) @@ -218,7 +231,7 @@ if model_type == "mpt" and args.prompt is None: config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) if model_type == "llava": - config.use_cache=True + config.use_cache = True if not hasattr(config, "lm_head_generation"): config.lm_head_generation = True @@ -229,11 +242,13 @@ torch_dtype=amp_dtype, config=config, low_cpu_mem_usage=True, - trust_remote_code=True + trust_remote_code=True, ) tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True) else: - tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) + tokenizer, model, image_processor, context_len = load_pretrained_model( + args.model_id + ) model = model.eval() model = model.to(memory_format=torch.channels_last) @@ -243,7 +258,14 @@ streamer = TextStreamer(tokenizer) else: streamer = None -generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=num_beams, max_new_tokens=args.max_new_tokens, min_new_tokens=args.max_new_tokens, streamer=streamer) +generate_kwargs = dict( + do_sample=False, + temperature=0.9, + num_beams=num_beams, + max_new_tokens=args.max_new_tokens, + min_new_tokens=args.max_new_tokens, + streamer=streamer, +) if re.search("gptbigcode", model.config.architectures[0], re.IGNORECASE): model_type = "gptbigcode" @@ -252,22 +274,27 @@ elif re.search("t5", model.config.architectures[0], re.IGNORECASE): generate_kwargs["max_length"] = generate_kwargs["max_new_tokens"] generate_kwargs.pop("max_new_tokens") -elif re.search("git", model.config.architectures[0], re.IGNORECASE) or re.search("llava", model.config.architectures[0], re.IGNORECASE): +elif re.search("git", model.config.architectures[0], re.IGNORECASE) or re.search( + "llava", model.config.architectures[0], re.IGNORECASE +): from PIL import Image import requests from io import BytesIO + model.config.batch_size = int(args.batch_size) * num_beams def load_image(image_file): - if image_file.startswith('http://') or image_file.startswith('https://'): + if image_file.startswith("http://") or image_file.startswith("https://"): response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert('RGB') + image = Image.open(BytesIO(response.content)).convert("RGB") else: - image = Image.open(image_file).convert('RGB') + image = Image.open(image_file).convert("RGB") return image + + if re.search("llava", model.config.architectures[0], re.IGNORECASE): model_name = get_model_name_from_path(args.model_id) - if 'llama-2' in model_name.lower(): + if "llama-2" in model_name.lower(): conv_mode = "llava_llama_2" elif "v1" in model_name.lower(): conv_mode = "llava_v1" @@ -277,13 +304,17 @@ def load_image(image_file): conv_mode = "llava_v0" conv = conv_templates[conv_mode].copy() if "mpt" in model_name.lower(): - roles = ('user', 'assistant') + roles = ("user", "assistant") else: roles = conv.roles if re.search("yuan", model.config.architectures[0], re.IGNORECASE): model.config.batch_size = int(args.batch_size) * num_beams + + def trace_handler(prof): print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) + + # to ipex if args.ipex: model = ipex.llm.optimize( @@ -294,6 +325,7 @@ def trace_handler(prof): ) elif args.ipex_weight_only_quantization: from intel_extension_for_pytorch.quantization import WoqWeightDtype + if args.weight_dtype == "INT8": weight_dtype = WoqWeightDtype.INT8 elif args.weight_dtype == "INT4": @@ -347,7 +379,10 @@ def trace_handler(prof): if 
args.torch_compile: if args.deployment_mode: - raise SystemExit("[ERROR] deployment_mode cannot co-work with torch.compile, please set deployment_mode to False if want to use torch.compile.") + raise SystemExit( + "[ERROR] deployment_mode cannot co-work with torch.compile, please set deployment_mode" + " to False if want to use torch.compile." + ) model.forward = torch.compile(model.forward, dynamic=True, backend=args.backend) @@ -364,9 +399,15 @@ def trace_handler(prof): image = load_image(args.image_url) image = [image] * args.batch_size if model.config.mm_use_im_start_end: - prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt + prompt = ( + DEFAULT_IM_START_TOKEN + + DEFAULT_IMAGE_TOKEN + + DEFAULT_IM_END_TOKEN + + "\n" + + prompt + ) else: - prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt + prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt conv.append_message(conv.roles[0], prompt) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() @@ -383,7 +424,9 @@ def trace_handler(prof): + args.model_id ) elif int(args.input_tokens) > 8192: - prompt = prompt_pool[model_type]["8192"] * int(int(args.input_tokens) / 8192) + prompt = prompt_pool[model_type]["8192"] * int( + int(args.input_tokens) / 8192 + ) elif args.input_tokens in prompt_pool[model_type]: prompt = prompt_pool[model_type][args.input_tokens] else: @@ -409,12 +452,33 @@ def trace_handler(prof): ) as prof: for i in range(5): if model_type == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) - image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] - output = model.generate(input_ids, images=image_tensor, **generate_kwargs) + input_ids = torch.stack( + [ + tokenizer_image_token( + pmt, + tokenizer, + IMAGE_TOKEN_INDEX, + return_tensors="pt", + ) + for pmt in prompt + ] + ) + image_tensor = [ + image_processor.preprocess(img, return_tensors="pt")[ + "pixel_values" + ].to(amp_dtype) + for img in image + ] + output = model.generate( + input_ids, images=image_tensor, **generate_kwargs + ) elif model_type == "git": - input_ids=tokenizer(images=prompt, return_tensors="pt").pixel_values - output = model.generate(pixel_values=input_ids, **generate_kwargs) + input_ids = tokenizer( + images=prompt, return_tensors="pt" + ).pixel_values + output = model.generate( + pixel_values=input_ids, **generate_kwargs + ) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) @@ -422,17 +486,34 @@ def trace_handler(prof): for i in range(num_iter): tic = time.time() if model_type == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) - image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] - output = model.generate(input_ids, images=image_tensor, **generate_kwargs) + input_ids = torch.stack( + [ + tokenizer_image_token( + pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + for pmt in prompt + ] + ) + image_tensor = [ + image_processor.preprocess(img, return_tensors="pt")[ + "pixel_values" + ].to(amp_dtype) + for img in image + ] + output = model.generate( + input_ids, images=image_tensor, **generate_kwargs + ) elif model_type == "git": - input_ids=tokenizer(images=prompt, return_tensors="pt").pixel_values + input_ids = tokenizer(images=prompt, 
return_tensors="pt").pixel_values output = model.generate(pixel_values=input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) gen_ids = output[0] if args.token_latency else output - gen_text = tokenizer.batch_decode(gen_ids[:, input_ids.shape[1]:] if model_type=="llava" else gen_ids, skip_special_tokens=True) + gen_text = tokenizer.batch_decode( + gen_ids[:, input_ids.shape[1] :] if model_type == "llava" else gen_ids, + skip_special_tokens=True, + ) toc = time.time() input_tokens_lengths = [x.shape[0] for x in input_ids] diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index d768a21a9..72ef693ac 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -19,12 +19,12 @@ AutoTokenizer, T5ForConditionalGeneration, AutoProcessor, - TextStreamer + TextStreamer, ) import sys -sys.path.append(sys.path[0] + '/../../') +sys.path.append(sys.path[0] + "/../../") import logging @@ -64,7 +64,13 @@ from llava.model.builder import load_pretrained_model from llava.conversation import conv_templates from llava.mm_utils import get_model_name_from_path, tokenizer_image_token - from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from llava.constants import ( + IMAGE_TOKEN_INDEX, + DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN, + ) + MODEL_CLASSES["llava"] = (LlavaLlamaForCausalLM, AutoTokenizer) except ImportError: pass @@ -109,7 +115,9 @@ "--max-new-tokens", default=32, type=int, help="output max new tokens" ) parser.add_argument( - "--streaming", action="store_true", help="enable streaming mode for generation output (greedy search only)" + "--streaming", + action="store_true", + help="enable streaming mode for generation output (greedy search only)", ) parser.add_argument("--input-tokens", default="32", type=str) parser.add_argument("--prompt", default=None, type=str) @@ -128,7 +136,10 @@ help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", ) parser.add_argument( - "--image-url", default="http://images.cocodataset.org/val2017/000000039769.jpg", type=str, help="image url for image-to-text task" + "--image-url", + default="http://images.cocodataset.org/val2017/000000039769.jpg", + type=str, + help="image url for image-to-text task", ) parser.add_argument("--print-memory", action="store_true") parser.add_argument("--token-latency", action="store_true") @@ -138,13 +149,13 @@ default="AUTO", type=str, help="low precision mode for weight only quantization. " - "It indicates data type for computation for speedup at the cost " - "of accuracy. Unrelated to activation or weight data type." - "It is not supported yet to use lowp_mode=INT8 for INT8 weight, " - "falling back to lowp_mode=BF16 implicitly in this case." - "If set to AUTO, lowp_mode is determined by weight data type: " - "lowp_mode=BF16 is used for INT8 weight " - "and lowp_mode=INT8 used for INT4 weight", + "It indicates data type for computation for speedup at the cost " + "of accuracy. Unrelated to activation or weight data type." + "It is not supported yet to use lowp_mode=INT8 for INT8 weight, " + "falling back to lowp_mode=BF16 implicitly in this case." 
+ "If set to AUTO, lowp_mode is determined by weight data type: " + "lowp_mode=BF16 is used for INT8 weight " + "and lowp_mode=INT8 used for INT4 weight", ) parser.add_argument( "--weight-dtype", @@ -308,7 +319,10 @@ def get_checkpoint_files(model_name_or_path): if args.config_file is None: if model_type == "chatglm": config = AutoConfig.from_pretrained( - args.model_id, torchscript=True, trust_remote_code=True, torch_dtype=load_dtype, + args.model_id, + torchscript=True, + trust_remote_code=True, + torch_dtype=load_dtype, ) else: config = AutoConfig.from_pretrained( @@ -323,7 +337,7 @@ def get_checkpoint_files(model_name_or_path): if model_type == "mpt" and args.prompt is None: config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) if model_type == "llava": - config.use_cache=True + config.use_cache = True if not hasattr(config, "lm_head_generation"): config.lm_head_generation = True @@ -348,9 +362,19 @@ def get_checkpoint_files(model_name_or_path): # TODO: we will change the scope once deepspeed providing the support if model_type in ["llava"]: - tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) + tokenizer, model, image_processor, context_len = load_pretrained_model( + args.model_id + ) model.config = config -elif world_size == 1 or model_type in ["falcon", "baichuan", "baichuan2", "gptbigcode", "git", "qwen", "yuan"]: +elif world_size == 1 or model_type in [ + "falcon", + "baichuan", + "baichuan2", + "gptbigcode", + "git", + "qwen", + "yuan", +]: model = model_class[0].from_pretrained( model_name, config=config, @@ -358,13 +382,15 @@ def get_checkpoint_files(model_name_or_path): torch_dtype=load_dtype, trust_remote_code=True, ) -else: # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load +else: # Construct model with fake meta tensors, later will be replaced during ds-inference ckpt load with deepspeed.OnDevice(dtype=load_dtype, device="meta"): - if model_type in ["t5"]: - model = model_class[0](config=config) + if model_type in ["t5"]: + model = model_class[0](config=config) else: model = ( - model_class[0].from_config(config, trust_remote_code=True).to(load_dtype) + model_class[0] + .from_config(config, trust_remote_code=True) + .to(load_dtype) ) if args.benchmark: @@ -435,6 +461,7 @@ def write_checkpoints_json(): ipex_woq_enabled = args.ipex_weight_only_quantization if ipex_woq_enabled: from intel_extension_for_pytorch.quantization import WoqWeightDtype + if args.weight_dtype == "INT8": weight_dtype = WoqWeightDtype.INT8 elif args.weight_dtype == "INT4": @@ -484,12 +511,20 @@ def write_checkpoints_json(): streamer = TextStreamer(tokenizer) else: streamer = None -generate_kwargs = dict(do_sample=False, num_beams=num_beams, max_new_tokens=args.max_new_tokens, min_new_tokens=args.max_new_tokens, streamer=streamer) +generate_kwargs = dict( + do_sample=False, + num_beams=num_beams, + max_new_tokens=args.max_new_tokens, + min_new_tokens=args.max_new_tokens, + streamer=streamer, +) if args.token_latency and not use_ipex: args.token_latency = False - logger.warning("--token-latency requires using ipex (--ipex or --ipex-weight-only-quantization). Disabling --token-latency.") + logger.warning( + "--token-latency requires using ipex (--ipex or --ipex-weight-only-quantization). Disabling --token-latency." 
+ ) if args.token_latency: if not hasattr(model.config, "token_latency"): model.config.token_latency = True @@ -502,6 +537,7 @@ def write_checkpoints_json(): if model_type == "git": from PIL import Image import requests + prompt = Image.open(requests.get(args.image_url, stream=True).raw) inputs = [prompt] * args.batch_size generate_kwargs.pop("min_new_tokens", None) @@ -509,15 +545,17 @@ def write_checkpoints_json(): from PIL import Image import requests from io import BytesIO + def load_image(image_file): - if image_file.startswith('http://') or image_file.startswith('https://'): + if image_file.startswith("http://") or image_file.startswith("https://"): response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert('RGB') + image = Image.open(BytesIO(response.content)).convert("RGB") else: - image = Image.open(image_file).convert('RGB') + image = Image.open(image_file).convert("RGB") return image + model_name = get_model_name_from_path(args.model_id) - if 'llama-2' in model_name.lower(): + if "llama-2" in model_name.lower(): conv_mode = "llava_llama_2" elif "v1" in model_name.lower(): conv_mode = "llava_v1" @@ -527,7 +565,7 @@ def load_image(image_file): conv_mode = "llava_v0" conv = conv_templates[conv_mode].copy() if "mpt" in model_name.lower(): - roles = ('user', 'assistant') + roles = ("user", "assistant") else: roles = conv.roles if args.prompt is not None: @@ -535,9 +573,15 @@ def load_image(image_file): image = load_image(args.image_url) image = [image] * args.batch_size if model.config.mm_use_im_start_end: - prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt + prompt = ( + DEFAULT_IM_START_TOKEN + + DEFAULT_IMAGE_TOKEN + + DEFAULT_IM_END_TOKEN + + "\n" + + prompt + ) else: - prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt + prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt conv.append_message(conv.roles[0], prompt) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() @@ -572,9 +616,9 @@ def load_image(image_file): input_sentences *= math.ceil(args.batch_size / len(input_sentences)) inputs = input_sentences[: args.batch_size] - input_size = tokenizer.batch_encode_plus(inputs, return_tensors="pt").input_ids.size( - dim=1 - ) + input_size = tokenizer.batch_encode_plus( + inputs, return_tensors="pt" + ).input_ids.size(dim=1) print("*** Prompt size: ", input_size) @@ -585,11 +629,25 @@ def generate(): input_tokens = tokenizer(images=inputs, return_tensors="pt") input_ids = input_tokens.pixel_values elif model_type == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in inputs]) - image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(infer_dtype) for img in image] + input_ids = torch.stack( + [ + tokenizer_image_token( + pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + for pmt in inputs + ] + ) + image_tensor = [ + image_processor.preprocess(img, return_tensors="pt")["pixel_values"].to( + infer_dtype + ) + for img in image + ] input_tokens = {"input_ids": input_ids, "images": image_tensor} else: - input_tokens = tokenizer.batch_encode_plus(inputs, return_token_type_ids=False, return_tensors="pt") + input_tokens = tokenizer.batch_encode_plus( + inputs, return_token_type_ids=False, return_tensors="pt" + ) input_ids = input_tokens.input_ids for t in input_tokens: if torch.is_tensor(input_tokens[t]): @@ -607,7 +665,10 @@ def generate(): o - i if model.config.model_type != "t5" else o for i, o in 
zip(input_tokens_lengths, output_tokens_lengths) ] - gen_text = tokenizer.batch_decode(gen_ids[:, input_ids.shape[1]:] if model_type=="llava" else gen_ids, skip_special_tokens=True) + gen_text = tokenizer.batch_decode( + gen_ids[:, input_ids.shape[1] :] if model_type == "llava" else gen_ids, + skip_special_tokens=True, + ) return zip(inputs, gen_text, total_new_tokens), outputs diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index a7ad2fa38..6bfe1a170 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -17,9 +17,6 @@ from pathlib import Path import argparse from typing import List, Optional -from transformers import ( - AutoConfig, -) import subprocess import re @@ -56,54 +53,101 @@ def main(args_in: Optional[List[str]] = None) -> None: action="store_true", help="by default static quant is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", ) - parser.add_argument("--quantized-model-path", default="", help="path to the quantized model file") - parser.add_argument("--qconfig-summary-file", default="", help="qconfig for static quantization") + parser.add_argument( + "--quantized-model-path", default="", help="path to the quantized model file" + ) + parser.add_argument( + "--qconfig-summary-file", default="", help="qconfig for static quantization" + ) parser.add_argument("--quant-model-name", default="best_model.pt") parser.add_argument( "--dataset", nargs="?", default="NeelNanda/pile-10k", - help="Calibration dataset for static quantization and GPTQ") - parser.add_argument("--ipex-smooth-quant", action="store_true", help="smoothquant forstatic quantization") + help="Calibration dataset for static quantization and GPTQ", + ) parser.add_argument( - "--calib-len", default=512, type=int, help="calibration dataset max or padding max length for SmoothQuant autotuning" + "--ipex-smooth-quant", + action="store_true", + help="smoothquant forstatic quantization", ) - parser.add_argument("--calib-iters", default=512, type=int, help="calibration iters for SmoothQuant autotuning") parser.add_argument( - "--calib-shuffle", action="store_true", help="whether to shuffle on calibration dataset for SmoothQuant autotuning" + "--calib-len", + default=512, + type=int, + help="calibration dataset max or padding max length for SmoothQuant autotuning", ) parser.add_argument( - "--calib-padding", action="store_true", help="whether to pad on calibration dataset for SmoothQuant autotuning" + "--calib-iters", + default=512, + type=int, + help="calibration iters for SmoothQuant autotuning", ) parser.add_argument( - "--calib-pad-val", default=1, type=int, help="calibration dataset padding value for SmoothQuant autotuning" + "--calib-shuffle", + action="store_true", + help="whether to shuffle on calibration dataset for SmoothQuant autotuning", + ) + parser.add_argument( + "--calib-padding", + action="store_true", + help="whether to pad on calibration dataset for SmoothQuant autotuning", + ) + parser.add_argument( + "--calib-pad-val", + default=1, + type=int, + help="calibration dataset padding value for SmoothQuant autotuning", ) parser.add_argument( - "--fallback-add", action="store_true", help="whether to fallback add ops to fp32 for SmoothQuant autotuning" + "--fallback-add", + action="store_true", + help="whether to fallback add ops to fp32 for SmoothQuant autotuning", ) parser.add_argument("--alpha", default=0.5, help="alpha value for smoothquant") parser.add_argument( - "--folding", 
default=False, type=bool, help="whether to fold mul into the previous layer" + "--folding", + default=False, + type=bool, + help="whether to fold mul into the previous layer", ) parser.add_argument( - "--init-alpha", default=0.5, type=float, help="a value to get baseline quantization error for auto-tuning" + "--init-alpha", + default=0.5, + type=float, + help="a value to get baseline quantization error for auto-tuning", ) parser.add_argument( - "--alpha-min", default=0.0, type=float, help="min value of auto-tuning alpha search space" + "--alpha-min", + default=0.0, + type=float, + help="min value of auto-tuning alpha search space", ) parser.add_argument( - "--alpha-max", default=1.0, type=float, help="max value of auto-tuning alpha search space" + "--alpha-max", + default=1.0, + type=float, + help="max value of auto-tuning alpha search space", ) parser.add_argument( - "--alpha-step", default=0.1, type=float, help="step_size of auto-tuning alpha search space" + "--alpha-step", + default=0.1, + type=float, + help="step_size of auto-tuning alpha search space", ) parser.add_argument( - "--shared-criterion", choices=["min", "mean", "max"], default="max", type=str - , help="criterion for input LayerNorm op of a transformer block" + "--shared-criterion", + choices=["min", "mean", "max"], + default="max", + type=str, + help="criterion for input LayerNorm op of a transformer block", ) parser.add_argument( - "--enable-blockwise-loss", default=False, type=bool, help="whether to enable block-wise auto-tuning" + "--enable-blockwise-loss", + default=False, + type=bool, + help="whether to enable block-wise auto-tuning", ) parser.add_argument( "--ipex-weight-only-quantization", @@ -146,24 +190,26 @@ def main(args_in: Optional[List[str]] = None) -> None: "--gptq", action="store_true", help="Run GPTQ calibration to generate optimized INT4 weight for weight-only quantization." - " This is recommended for INT4 to minimize accuracy drop after quantization." + " This is recommended for INT4 to minimize accuracy drop after quantization.", ) parser.add_argument( "--gptq-legacy-format", action="store_true", help="Indicate that the low-precision checkpoint is in the legacy format rather than the" " HuggingFace Optimum format for backward compatibility. It must be used with" - " --low-precision-checkpoint. Otherwise, it has no effect." + " --low-precision-checkpoint. Otherwise, it has no effect.", ) parser.add_argument( "--group-size", default=0, type=int, - help="For GPTQ and weight-only quantization only. Group size defines granularity of quantization the along input channel of weight. " - "The input channel size must be a multiple of the group size. It is effective for both INT8 and INT4 weight dtype. " - "It must be -1, 0 or a positive power of 2. -1 means group-size equals the input channel size (i.e., per-channel quantization). " - "0 means group-size is selected automatically, -1 for INT8 and 128 for INT4. " - "If --low-precision-checkpoint is given, this parameter is overwritten by data in the checkpoint file.") + help="For GPTQ and weight-only quantization only. Group size defines granularity of quantization the" + " along input channel of weight. The input channel size must be a multiple of the group size." + " It is effective for both INT8 and INT4 weight dtype. It must be -1, 0 or a positive power of 2. -1 means" + " group-size equals the input channel size (i.e., per-channel quantization). 0 means group-size is selected" + " automatically, -1 for INT8 and 128 for INT4. 
If --low-precision-checkpoint is given, this parameter is " + "overwritten by data in the checkpoint file.", + ) # inference related arguments. parser.add_argument( @@ -172,7 +218,9 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument("--benchmark", action="store_true") parser.add_argument("--input-tokens", default="32", type=str) parser.add_argument( - "--streaming", action="store_true", help="enable streaming mode for generation output (greedy search only)" + "--streaming", + action="store_true", + help="enable streaming mode for generation output (greedy search only)", ) parser.add_argument("--prompt", default=None, type=str) parser.add_argument("--num-iter", default=100, type=int, help="num iter") @@ -204,11 +252,13 @@ def main(args_in: Optional[List[str]] = None) -> None: else: group_size = -1 assert group_size == -1 or ( - group_size > 0 and - (group_size & (group_size-1) == 0) + group_size > 0 and (group_size & (group_size - 1) == 0) ), f"Invalid group size for WOQ: {group_size}" - if re.search("llava", str(args.model_name_or_path), re.IGNORECASE) and args.prompt is None: + if ( + re.search("llava", str(args.model_name_or_path), re.IGNORECASE) + and args.prompt is None + ): args.prompt = "What is this image?" if not args.autotp: if not args.ipex_weight_only_quantization and not args.ipex_smooth_quant: @@ -265,7 +315,10 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--weight-dtype", str(args.weight_dtype)]) infer_cmd.extend(["--lowp-mode", str(args.lowp_mode)]) if args.gptq: - print("LLM RUNTIME INFO: Weight dtype set to INT4 since `--gptq` is sepcified and `--weight-dtype` is ignored.") + print( + "LLM RUNTIME INFO: Weight dtype set to INT4 since `--gptq` is sepcified" + " and `--weight-dtype` is ignored." + ) if args.low_precision_checkpoint == "": gptq_cmd = [ "python", @@ -273,10 +326,16 @@ def main(args_in: Optional[List[str]] = None) -> None: ] gptq_cmd.extend(["--model", str(args.model_name_or_path)]) gptq_cmd.extend(["--output-dir", str(args.output_dir)]) - print("LLM RUNTIME INFO: Running GPTQ calibration with group_size {}...".format(group_size)) + print( + "LLM RUNTIME INFO: Running GPTQ calibration with group_size {}...".format( + group_size + ) + ) result = subprocess.run(gptq_cmd) if result.returncode != 0: - print("LLM RUNTIME ERROR: Running GPTQ calibration failed. Quit.") + print( + "LLM RUNTIME ERROR: Running GPTQ calibration failed. Quit." + ) quit() print("LLM RUNTIME INFO: Running GPTQ calibration finished.") infer_cmd.extend( @@ -339,7 +398,11 @@ def main(args_in: Optional[List[str]] = None) -> None: if result.returncode != 0: print("LLM RUNTIME ERROR: Quantizing model failed. Quit.") quit() - print("LLM RUNTIME INFO: Model quantized successfully, saved to {}.".format(str(args.output_dir) + "/best_model.pt")) + print( + "LLM RUNTIME INFO: Model quantized successfully, saved to {}.".format( + str(args.output_dir) + "/best_model.pt" + ) + ) else: qpath = Path(parent_path, "single_instance/run_quantization.py") @@ -364,7 +427,10 @@ def main(args_in: Optional[List[str]] = None) -> None: quant_cmd.extend(["--weight-dtype", str(args.weight_dtype)]) quant_cmd.extend(["--lowp-mode", str(args.lowp_mode)]) if args.gptq: - print("LLM RUNTIME INFO: Weight dtype set to INT4 since `--gptq` is sepcified and `--weight-dtype` is ignored.") + print( + "LLM RUNTIME INFO: Weight dtype set to INT4 since `--gptq` is sepcified" + " and `--weight-dtype` is ignored." 
+ ) if args.low_precision_checkpoint == "": gptq_cmd = [ "python", @@ -374,16 +440,25 @@ def main(args_in: Optional[List[str]] = None) -> None: gptq_cmd.extend(["--dataset", str(args.dataset)]) gptq_cmd.extend(["--group-size", str(group_size)]) gptq_cmd.extend(["--output-dir", str(args.output_dir)]) - print("LLM RUNTIME INFO: Running GPTQ calibration with group_size {}...".format(group_size)) + print( + "LLM RUNTIME INFO: Running GPTQ calibration with group_size {}...".format( + group_size + ) + ) result = subprocess.run(gptq_cmd) if result.returncode != 0: - print("LLM RUNTIME ERROR: Running GPTQ calibration failed. Quit.") + print( + "LLM RUNTIME ERROR: Running GPTQ calibration failed. Quit." + ) quit() - print("LLM RUNTIME INFO: Running GPTQ calibration finished.") + print( + "LLM RUNTIME INFO: Running GPTQ calibration finished." + ) quant_cmd.extend( [ "--low-precision-checkpoint", - str(args.output_dir) + f"/gptq_checkpoint_g{group_size}.pt", + str(args.output_dir) + + f"/gptq_checkpoint_g{group_size}.pt", ] ) else: @@ -421,15 +496,24 @@ def main(args_in: Optional[List[str]] = None) -> None: if args.enable_blockwise_loss: quant_cmd.extend(["--enable-blockwise-loss"]) quant_cmd.extend(["--dataset", str(args.dataset)]) - quant_cmd.extend(["--qconfig-summary-file", str(args.qconfig_summary_file)]) + quant_cmd.extend( + ["--qconfig-summary-file", str(args.qconfig_summary_file)] + ) print("LLM RUNTIME INFO: quantizing model ...") result = subprocess.run(quant_cmd) if result.returncode != 0: print("LLM RUNTIME ERROR: Quantizing model failed. Quit.") quit() - print("LLM RUNTIME INFO: Model quantized successfully, saved to {}.".format(str(args.output_dir) + "/best_model.pt")) + print( + "LLM RUNTIME INFO: Model quantized successfully, saved to {}.".format( + str(args.output_dir) + "/best_model.pt" + ) + ) infer_cmd.extend( - ["--quantized-model-path", str(args.output_dir)+"/"+str(args.quant_model_name)] + [ + "--quantized-model-path", + str(args.output_dir) + "/" + str(args.quant_model_name), + ] ) else: infer_cmd.extend( @@ -502,15 +586,22 @@ def main(args_in: Optional[List[str]] = None) -> None: "phi": ("/phi_local_shard"), } model_type = next( - (x for x in MODEL_CLASSES.keys() if x in args.model_name_or_path.lower()), "auto" + ( + x + for x in MODEL_CLASSES.keys() + if x in args.model_name_or_path.lower() + ), + "auto", ) work_path = Path(str(args.output_dir)) if not work_path.exists(): Path.mkdir(work_path) - model_path = Path(str(args.output_dir)+str(MODEL_CLASSES[model_type])) + model_path = Path(str(args.output_dir) + str(MODEL_CLASSES[model_type])) if not model_path.exists(): Path.mkdir(model_path) - shard_cmd.extend(["--save-path", str(args.output_dir)+str(MODEL_CLASSES[model_type])]) + shard_cmd.extend( + ["--save-path", str(args.output_dir) + str(MODEL_CLASSES[model_type])] + ) shard_cmd.extend(["--local_rank", str(args.local_rank)]) print("LLM RUNTIME INFO: sharding model...") result = subprocess.run(shard_cmd) @@ -519,7 +610,13 @@ def main(args_in: Optional[List[str]] = None) -> None: quit() print("LLM RUNTIME INFO: Model sharded successfully.") # use absolute path here to avoid path error in deepspeed - infer_cmd.extend(["-m", str(os.path.abspath(args.output_dir))+str(MODEL_CLASSES[model_type])]) + infer_cmd.extend( + [ + "-m", + str(os.path.abspath(args.output_dir)) + + str(MODEL_CLASSES[model_type]), + ] + ) else: model_name_or_path = args.model_name_or_path if os.path.exists(model_name_or_path): diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py 
b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 4a253faa9..5dbf4f249 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -1,3 +1,4 @@ +# encoding: UTF-8 import argparse import torch import intel_extension_for_pytorch as ipex @@ -6,7 +7,8 @@ import math import torch.nn.functional as F import re -sys.path.append(sys.path[0] + '/../../') + +sys.path.append(sys.path[0] + "/../../") from transformers import ( AutoConfig, AutoModelForCausalLM, @@ -45,7 +47,10 @@ parser.add_argument("--output_path", nargs="?", default="./logs") parser.add_argument("--device", default="cpu", type=str, help="cpu") parser.add_argument( - "--dtype", default="bfloat16", type=str, help="float32 or bfloat16 or int8 or int4 or nf4" + "--dtype", + default="bfloat16", + type=str, + help="float32 or bfloat16 or int8 or int4 or nf4", ) parser.add_argument( "--batch-size", default=1, type=int, help="For accuracy measurement only." @@ -57,11 +62,19 @@ "--ipex", action="store_true", help="use intel extension for pytorch." ) parser.add_argument( - "--disable-jit", action="store_true", help="disable converting model to torchscript mode." + "--disable-jit", + action="store_true", + help="disable converting model to torchscript mode.", ) parser.add_argument("--torch-compile", action="store_true") -parser.add_argument("--backend", default="ipex", type=str, help="backend of torch.compile") -parser.add_argument("--quant-with-amp", action="store_true", help="by default static quant is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)") +parser.add_argument( + "--backend", default="ipex", type=str, help="backend of torch.compile" +) +parser.add_argument( + "--quant-with-amp", + action="store_true", + help="by default static quant is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", +) parser.add_argument("--quantized-model-path", default="./saved_results/best_model.pt") parser.add_argument( "--tasks", @@ -79,19 +92,30 @@ args = parser.parse_args() - import lm_eval from lm_eval import evaluator, utils from lm_eval.base import BaseLM from typing import Union, List, Optional, Tuple from transformers import BatchEncoding import transformers + try: - from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM + from llava.model.language_model.llava_llama import ( # noqa F401 + LlavaLlamaForCausalLM, + ) from llava.model.builder import load_pretrained_model from llava.conversation import conv_templates - from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token - from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from llava.mm_utils import ( + get_model_name_from_path, + process_images, + tokenizer_image_token, + ) + from llava.constants import ( # noqa F401 + IMAGE_TOKEN_INDEX, + DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN, + ) import lmms_eval from lmms_eval.api.instance import Instance from lmms_eval.api.model import lmms @@ -101,16 +125,21 @@ from lmms_eval.api.registry import ALL_TASKS from lmms_eval.tasks import initialize_tasks except ImportError: + def register_model(name): def decorator(func): return func + return decorator + from abc import ABC as lmms + Instance = None pass TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] + class HuggingFaceModel(BaseLM): _DEFAULT_MAX_LENGTH = 2048 @@ 
-125,7 +154,7 @@ def __init__( max_length=None, dtype: Optional[Union[str, torch.dtype]] = "auto", config=None, - add_special_tokens = True, + add_special_tokens=True, ): super().__init__() @@ -161,18 +190,27 @@ def __init__( if model_type == "chatglm": # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided self.config = AutoConfig.from_pretrained( - model_id if config is None else config, torchscript=with_jit, trust_remote_code=True, torch_dtype=load_dtype, + model_id if config is None else config, + torchscript=with_jit, + trust_remote_code=True, + torch_dtype=load_dtype, ) else: self.config = AutoConfig.from_pretrained( - model_id if config is None else config, torchscript=with_jit, trust_remote_code=True + model_id if config is None else config, + torchscript=with_jit, + trust_remote_code=True, ) - if self._dtype in ("int8", "int4", "nf4") and not re.search("yuan", self.config.architectures[0], re.IGNORECASE): + if self._dtype in ("int8", "int4", "nf4") and not re.search( + "yuan", self.config.architectures[0], re.IGNORECASE + ): try: with ipex.OnDevice(dtype=torch.float, device="meta"): - self.model = model_class[0].from_config(self.config, trust_remote_code=True) + self.model = model_class[0].from_config( + self.config, trust_remote_code=True + ) except (RuntimeError, AttributeError) as e: - print('Warning: Loading model to meta device failed:', e) + print("Warning: Loading model to meta device failed:", e) self.model = model_class[0].from_pretrained( model_id, low_cpu_mem_usage=True, @@ -201,17 +239,27 @@ def __init__( if args.torch_compile: if dtype in ["int8", "int4", "nf4"]: - raise SystemExit("[ERROR] Currently this script does not support torch.compile with int8/int4/nf4 datatype, please set dtype to float32 or bfloat16 if want to use torch.compile.") + raise SystemExit( + "[ERROR] Currently this script does not support torch.compile with int8/int4/nf4 datatype," + " please set dtype to float32 or bfloat16 if want to use torch.compile." + ) if with_jit: - raise SystemExit("[ERROR] JIT cannot co-work with torch.compile, please set jit to False if want to use torch.compile.") - self.model.forward = torch.compile(self.model.forward, dynamic=True, backend=args.backend) + raise SystemExit( + "[ERROR] JIT cannot co-work with torch.compile, please set jit to False if want to use" + " torch.compile." 
+ ) + self.model.forward = torch.compile( + self.model.forward, dynamic=True, backend=args.backend + ) self.base_model = self.model self.iter = 0 self.num_beams = 1 if with_greedy else 4 self.tp_number = 1 - self.is_t5 = re.search("t5", self.base_model.config.architectures[0], re.IGNORECASE) + self.is_t5 = re.search( + "t5", self.base_model.config.architectures[0], re.IGNORECASE + ) def _get_target_nums(self, names): for n in names: @@ -234,26 +282,33 @@ def _get_past_key_values(self, input_bs, last_hidden_state=None): past_key_values = tuple( [ ( - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros([1, num_heads, 1, head_dim]).contiguous(), - torch.zeros([1, num_heads, 1, head_dim]).contiguous(), - beam_idx_tmp, - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - self.base_model.decoder.block[i].layer[1] - .EncDecAttention.k(last_hidden_state) - .view(int(input_bs), -1, num_heads, head_dim).transpose(0, 1) - .contiguous(), - self.base_model.decoder.block[i].layer[1] - .EncDecAttention.v(last_hidden_state) - .view(int(input_bs), -1, num_heads, head_dim).transpose(0, 1) - .contiguous(), - beam_idx_tmp, - ) if self.is_t5 else - ( - torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros([1, num_heads, 1, head_dim]).contiguous(), - torch.zeros([1, num_heads, 1, head_dim]).contiguous(), - beam_idx_tmp, + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, num_heads, 1, head_dim]).contiguous(), + torch.zeros([1, num_heads, 1, head_dim]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.base_model.decoder.block[i] + .layer[1] + .EncDecAttention.k(last_hidden_state) + .view(int(input_bs), -1, num_heads, head_dim) + .transpose(0, 1) + .contiguous(), + self.base_model.decoder.block[i] + .layer[1] + .EncDecAttention.v(last_hidden_state) + .view(int(input_bs), -1, num_heads, head_dim) + .transpose(0, 1) + .contiguous(), + beam_idx_tmp, + ) + if self.is_t5 + else ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, num_heads, 1, head_dim]).contiguous(), + torch.zeros([1, num_heads, 1, head_dim]).contiguous(), + beam_idx_tmp, + ) ) for i in range(num_hidden_layers) ] @@ -283,8 +338,11 @@ def _model_call( _attention_mask = [] _position_ids = [] if self.is_t5: - inputs = inputs['input_ids'] - elif hasattr(self.config, "_name_or_path") and self.config._name_or_path == "THUDM/chatglm2-6b": + inputs = inputs["input_ids"] + elif ( + hasattr(self.config, "_name_or_path") + and self.config._name_or_path == "THUDM/chatglm2-6b" + ): input_bs, input_len = inputs.shape bos = torch.tensor([64790, 64792]).repeat(input_bs, 1) inputs = torch.cat((bos, inputs), 1) @@ -300,8 +358,10 @@ def _model_call( position_ids_batched = torch.stack(_position_ids) if self.is_t5: model_kwargs = {"attention_mask": attention_mask_batched} - model_kwargs = self.base_model._prepare_encoder_decoder_kwargs_for_generation( - inputs, model_kwargs, "input_ids" + model_kwargs = ( + self.base_model._prepare_encoder_decoder_kwargs_for_generation( + inputs, model_kwargs, "input_ids" + ) ) ( inputs, @@ -312,33 +372,39 @@ def _model_call( is_encoder_decoder=True, **model_kwargs, ) - past_key_values = self._get_past_key_values(input_bs, example_inputs["encoder_outputs"]["last_hidden_state"]) + past_key_values = self._get_past_key_values( + input_bs, example_inputs["encoder_outputs"]["last_hidden_state"] + ) if self.num_beams == 1: - decoder_input_ids = self.base_model._shift_right(labels['input_ids']) + 
decoder_input_ids = self.base_model._shift_right(labels["input_ids"]) else: - decoder_input_ids = self.base_model._shift_right(labels['input_ids'].repeat_interleave(self.num_beams, dim=0)) + decoder_input_ids = self.base_model._shift_right( + labels["input_ids"].repeat_interleave(self.num_beams, dim=0) + ) example_dict = { "decoder_input_ids": decoder_input_ids, - "encoder_outputs": (example_inputs["encoder_outputs"]["last_hidden_state"],), + "encoder_outputs": ( + example_inputs["encoder_outputs"]["last_hidden_state"], + ), } else: past_key_values = self._get_past_key_values(input_bs) example_dict = {"input_ids": inputs} - model_inputs = self.base_model.prepare_inputs_for_generation(inputs, attention_mask=attention_mask_batched) + model_inputs = self.base_model.prepare_inputs_for_generation( + inputs, attention_mask=attention_mask_batched + ) has_position_ids = model_inputs.get("position_ids", None) is not None if self._with_jit: - example_dict["attention_mask"]= attention_mask_batched - example_dict["past_key_values"]= past_key_values + example_dict["attention_mask"] = attention_mask_batched + example_dict["past_key_values"] = past_key_values if has_position_ids: example_dict["position_ids"] = position_ids_batched if "return_last_logit" in model_inputs: example_dict["return_last_logit"] = torch.tensor(True) with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( - enabled=True - if args.quant_with_amp or self._dtype == "bfloat16" - else False, + enabled=True if args.quant_with_amp or self._dtype == "bfloat16" else False, ): if self._with_jit and self.iter == 0: if self._dtype not in ["int8", "int4", "nf4"]: @@ -442,14 +508,20 @@ def _collate(x): except ValueError: if not warn_stop_seq: print( - "Warning: a primary stop sequence is multi-token! Will default to EOS token for this tokenizer. Consider using `hf-causal-experimental` for multi-token stop sequence support for the time being." + "Warning: a primary stop sequence is multi-token! Will default to EOS token for" + " this tokenizer. Consider using `hf-causal-experimental` for multi-token stop" + " sequence support for the time being." 
) warn_stop_seq = True primary_until = self.eot_token_id else: primary_until = None - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): - context = "่ฏฆ็ป†ๅˆ†ๆžๅนถๆฑ‚่งฃไปฅไธ‹ๆ•ฐๅญฆ้—ฎ้ข˜ใ€‚\n" + context.replace("้—ฎ้ข˜: ", "").replace("\n้€ๆญฅ่งฃ็ญ”:", "") + if re.search( + "yuan", self.base_model.config.architectures[0], re.IGNORECASE + ): + context = "่ฏฆ็ป†ๅˆ†ๆžๅนถๆฑ‚่งฃไปฅไธ‹ๆ•ฐๅญฆ้—ฎ้ข˜ใ€‚\n" + context.replace( + "้—ฎ้ข˜: ", "" + ).replace("\n้€ๆญฅ่งฃ็ญ”:", "") context_enc = torch.tensor( [self.tok_encode(context)[self.max_gen_toks - self.max_length :]] ).to(self.device) @@ -458,13 +530,17 @@ def _collate(x): self.max_gen_toks, request_args.get("max_length", self.max_gen_toks) ) with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( - enabled=True - if args.quant_with_amp or self._dtype == "bfloat16" - else False, + enabled=( + True if args.quant_with_amp or self._dtype == "bfloat16" else False + ), ): if self._with_jit and self.iter == 0: if self._dtype not in ["int8", "int4", "nf4"]: - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): + if re.search( + "yuan", + self.base_model.config.architectures[0], + re.IGNORECASE, + ): input_bs = context_enc.shape[0] * self.num_beams attention_mask = torch.ones(len(context_enc[0])) position_ids = torch.arange(len(context_enc[0])) @@ -482,9 +558,15 @@ def _collate(x): ) model = torch.jit.freeze(model.eval()) example_dict = { - "input_ids": example_dict["input_ids"].repeat(input_bs, 1), - "attention_mask": example_dict["attention_mask"].repeat(input_bs, 1), - "position_ids": example_dict["position_ids"].repeat(input_bs, 1) + "input_ids": example_dict["input_ids"].repeat( + input_bs, 1 + ), + "attention_mask": example_dict["attention_mask"].repeat( + input_bs, 1 + ), + "position_ids": example_dict["position_ids"].repeat( + input_bs, 1 + ), } first_token_model = torch.jit.trace( self.model.eval(), @@ -492,25 +574,45 @@ def _collate(x): strict=False, check_trace=False, ) - first_token_model = torch.jit.freeze(first_token_model.eval()) + first_token_model = torch.jit.freeze( + first_token_model.eval() + ) else: model = torch.jit.load(args.quantized_model_path) model = torch.jit.freeze(model.eval()) - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): - first_token_model = torch.jit.load(args.quantized_model_path+"2") - first_token_model = torch.jit.freeze(first_token_model.eval()) - if re.search("yuan", self.base_model.config.architectures[0], re.IGNORECASE): - ipex._set_optimized_model_for_generation(self.model, optimized_model=model, first_token_optimized_model=first_token_model) + if re.search( + "yuan", + self.base_model.config.architectures[0], + re.IGNORECASE, + ): + first_token_model = torch.jit.load( + args.quantized_model_path + "2" + ) + first_token_model = torch.jit.freeze( + first_token_model.eval() + ) + if re.search( + "yuan", self.base_model.config.architectures[0], re.IGNORECASE + ): + ipex._set_optimized_model_for_generation( + self.model, + optimized_model=model, + first_token_optimized_model=first_token_model, + ) else: - ipex._set_optimized_model_for_generation(self.model, optimized_model=model) - + ipex._set_optimized_model_for_generation( + self.model, optimized_model=model + ) + self.iter = self.iter + 1 cont = self._model_generate( context_enc, context_enc.shape[1] + max_gen_tokens, primary_until ) s = self.tok_decode(cont[0].tolist()[context_enc.shape[1] :]) - if re.search("yuan", self.base_model.config.architectures[0], 
re.IGNORECASE): + if re.search( + "yuan", self.base_model.config.architectures[0], re.IGNORECASE + ): s = s.replace("\n", "").split("")[0] for term in until: @@ -523,6 +625,7 @@ def _collate(x): return re_ord.get_original(res) + class HuggingFaceSeq2SeqModel(HuggingFaceModel): """Seq2Seq language modeling. You can find a set of supported models in the following documentation: @@ -650,6 +753,7 @@ def _loglikelihood_tokens( self.cache_hook.add_partial("loglikelihood", cache_key, answer) return results + class T5ModelLambada(HuggingFaceSeq2SeqModel): def _loglikelihood_tokens( self, @@ -677,16 +781,26 @@ def _loglikelihood_tokens( for cache_key, log_softmax, target_tokens, target_mask in output_iterator: length = target_mask.sum() - if length >= 1 and target_tokens[length-1].item() == self.tokenizer.encode(self.tokenizer.eos_token, add_special_tokens = False)[0]: + if ( + length >= 1 + and target_tokens[length - 1].item() + == self.tokenizer.encode( + self.tokenizer.eos_token, add_special_tokens=False + )[0] + ): length = length - 1 log_softmax = log_softmax[:length] target_tokens = target_tokens[:length] greedy_tokens = log_softmax.argmax(dim=-1) max_equal = (greedy_tokens == target_tokens).all() - target_text = self.tokenizer.decode(target_tokens, skip_special_tokens = True) - greedy_text = self.tokenizer.decode(greedy_tokens, skip_special_tokens = True) - max_text_equal = (greedy_text == target_text) + target_text = self.tokenizer.decode( + target_tokens, skip_special_tokens=True + ) + greedy_text = self.tokenizer.decode( + greedy_tokens, skip_special_tokens=True + ) + max_text_equal = greedy_text == target_text target_logits = torch.gather( log_softmax, 1, target_tokens.unsqueeze(-1) ).squeeze(-1) @@ -709,7 +823,7 @@ def __init__( batch_size=1, dtype: Optional[Union[str, torch.dtype]] = "auto", config=None, - add_special_tokens = True, + add_special_tokens=True, ) -> None: super().__init__() self._device = torch.device(device) @@ -730,11 +844,19 @@ def __init__( elif dtype in ["int8", "int4", "nf4"]: load_dtype = torch.float32 infer_dtype = torch.int8 - self.amp_dtype = torch.bfloat16 if args.quant_with_amp or self._dtype == "bfloat16" else torch.float32 + self.amp_dtype = ( + torch.bfloat16 + if args.quant_with_amp or self._dtype == "bfloat16" + else torch.float32 + ) if re.search("llava", pretrained, re.IGNORECASE): - self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, get_model_name_from_path(pretrained)) + self._tokenizer, self._model, self._image_processor, self._max_length = ( + load_pretrained_model( + pretrained, None, get_model_name_from_path(pretrained) + ) + ) model_name = get_model_name_from_path(pretrained) - if 'llama-2' in model_name.lower(): + if "llama-2" in model_name.lower(): conv_mode = "llava_llama_2" elif "v1" in model_name.lower(): conv_mode = "llava_v1" @@ -750,7 +872,9 @@ def __init__( ) self._tokenizer = self._image_processor.tokenizer self._config = AutoConfig.from_pretrained( - pretrained if config is None else config, torchscript=with_jit, trust_remote_code=True + pretrained if config is None else config, + torchscript=with_jit, + trust_remote_code=True, ) self._model = model_class[0].from_pretrained( pretrained, @@ -772,10 +896,18 @@ def __init__( if args.torch_compile: if dtype in ["int8", "int4", "nf4"]: - raise SystemExit("[ERROR] Currently this script does not support torch.compile with int8/int4/nf4 datatype, please set dtype to float32 or bfloat16 if want to use torch.compile.") + raise 
SystemExit( + "[ERROR] Currently this script does not support torch.compile with int8/int4/nf4 datatype," + " please set dtype to float32 or bfloat16 if want to use torch.compile." + ) if with_jit: - raise SystemExit("[ERROR] JIT cannot co-work with torch.compile, please set jit to False if want to use torch.compile.") - self._model.forward = torch.compile(self._model.forward, dynamic=True, backend=args.backend) + raise SystemExit( + "[ERROR] JIT cannot co-work with torch.compile, please set jit to False if want to use" + " torch.compile." + ) + self._model.forward = torch.compile( + self._model.forward, dynamic=True, backend=args.backend + ) self._base_model = self._model @@ -803,10 +935,14 @@ def __init__( "past_key_values": past_key_values, } if re.search("llava", pretrained, re.IGNORECASE): - sample_inputs["inputs_embeds"] = torch.zeros(batch_size, 1, 4096).to(self.amp_dtype) + sample_inputs["inputs_embeds"] = torch.zeros(batch_size, 1, 4096).to( + self.amp_dtype + ) elif re.search("git", pretrained, re.IGNORECASE): sample_inputs["input_ids"] = input_ids.repeat(self.batch_size, 1) - sample_inputs["attention_mask"] = attention_mask.repeat(self.batch_size, 1) + sample_inputs["attention_mask"] = attention_mask.repeat( + self.batch_size, 1 + ) sample_inputs["pixel_values"] = torch.zeros(batch_size, 3, 224, 224) num_head = self.model.config.num_attention_heads head_dim = int(self.model.config.hidden_size / num_head) @@ -814,8 +950,12 @@ def __init__( [ ( torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - torch.zeros([batch_size, num_head, 1, head_dim]).contiguous(), - torch.zeros([batch_size, num_head, 1, head_dim]).contiguous(), + torch.zeros( + [batch_size, num_head, 1, head_dim] + ).contiguous(), + torch.zeros( + [batch_size, num_head, 1, head_dim] + ).contiguous(), torch.zeros(1, 4, dtype=torch.long), ) for i in range(self.model.config.num_hidden_layers) @@ -823,7 +963,8 @@ def __init__( ) sample_inputs["past_key_values"] = past_key_values with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if self.amp_dtype == torch.bfloat16 else False,): + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): if self._dtype != "int8": traced_model = torch.jit.trace( self._model.eval(), @@ -838,8 +979,9 @@ def __init__( traced_model(**sample_inputs) traced_model(**sample_inputs) - ipex._set_optimized_model_for_generation(self._model, optimized_model=traced_model) - + ipex._set_optimized_model_for_generation( + self._model, optimized_model=traced_model + ) @property def config(self): @@ -866,7 +1008,9 @@ def max_length(self): def pad_sequence(self, input_ids, batch_first, padding_value): if self.tokenizer.padding_side == "left": input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids] - input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value) + input_ids = torch.nn.utils.rnn.pad_sequence( + input_ids, batch_first=batch_first, padding_value=padding_value + ) if self.tokenizer.padding_side == "left": input_ids = torch.flip(input_ids, [1]) return input_ids @@ -887,7 +1031,9 @@ def rank(self): def world_size(self): return self._world_size - def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + def tok_encode( + self, string: str, left_truncate_len=None, add_special_tokens=None + ) -> List[int]: """ """ add_special_tokens = False if add_special_tokens is None else add_special_tokens encoding = self.tokenizer.encode(string, 
add_special_tokens=add_special_tokens) @@ -902,9 +1048,13 @@ def tok_decode(self, tokens): def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: # TODO res = [] - pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + pbar = tqdm( + total=len(requests), disable=(self.rank != 0), desc="Model Responding" + ) - for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [ + reg.args for reg in requests + ]: # encode, pad, and truncate contexts for this batch if type(doc_to_target) == str: continuation = doc_to_target @@ -915,7 +1065,10 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: if visuals: image = process_images(visuals, self._image_processor, self._config) if type(image) is list: - image = [_image.to(dtype=torch.float16, device=self.device) for _image in image] + image = [ + _image.to(dtype=torch.float16, device=self.device) + for _image in image + ] else: image = image.to(dtype=torch.float16, device=self.device) else: @@ -923,12 +1076,17 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: prompts_input = contexts[0] - if image is not None and len(image) != 0 and DEFAULT_IMAGE_TOKEN not in prompts_input: + if ( + image is not None + and len(image) != 0 + and DEFAULT_IMAGE_TOKEN not in prompts_input + ): """ Three senarios: 1. No image, and there for, no image token should be added. 2. image token is already specified in the context, so we don't need to add it. - 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + 3. image token is not specified in the context and there is image inputs, so we need to add it. + In this case, we add the image token at the beginning of the context and add a new line. 
""" image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visuals) image_tokens = " ".join(image_tokens) @@ -938,24 +1096,44 @@ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: conv.append_message(conv.roles[0], prompts_input) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() - pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + pad_token_id = ( + self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None + else self.tokenizer.eos_token_id + ) + contxt_id = ( + tokenizer_image_token( + prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + .unsqueeze(0) + .to(self.device) + ) # Add the answer of the second role conv.messages[1][1] = continuation prompt = conv.get_prompt() - input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + input_ids = ( + tokenizer_image_token( + prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + .unsqueeze(0) + .to(self.device) + ) labels = input_ids.clone() # Context part no need to calculate for loss labels[0, : contxt_id.shape[1]] = -100 with torch.inference_mode(): - outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True) + outputs = self.model( + input_ids=input_ids, labels=labels, images=image, use_cache=True + ) loss = outputs["loss"] # loss = torch.exp(loss) logits = outputs["logits"] greedy_tokens = logits.argmax(dim=-1) cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq] - greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]] # [1, seq] + greedy_tokens = greedy_tokens[ + :, contxt_id.shape[1] : input_ids.shape[1] + ] # [1, seq] max_equal = (greedy_tokens == cont_toks).all() res.append((float(loss.item()), bool(max_equal))) pbar.update(1) @@ -985,15 +1163,23 @@ def _collate(x): # we group requests by their generation_kwargs, # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling # in the same batch. - re_ords = lmms_utils.Collator([reg.args for reg in requests], _collate, grouping=True) + re_ords = lmms_utils.Collator( + [reg.args for reg in requests], _collate, grouping=True + ) chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) - num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + num_iters = ( + len(requests) // self.batch_size + if len(requests) % self.batch_size == 0 + else len(requests) // self.batch_size + 1 + ) pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") for chunk in chunks: contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) task = task[0] split = split[0] - visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] + visuals = [ + doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id + ] visuals = self.flatten(visuals) # we assume all gen kwargs in the batch are the same # this is safe to assume because the `grouper` object ensures it. 
@@ -1008,14 +1194,24 @@ def _collate(x): if isinstance(until, str): until = [until] elif not isinstance(until, list): - raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + raise ValueError( + f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}" + ) - if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__: - # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation - self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio") + if ( + "image_aspect_ratio" in gen_kwargs.keys() + and "image_aspect_ratio" not in self._config.__dict__ + ): + # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for + # next step of generation + self._config.image_aspect_ratio = gen_kwargs.pop( + "image_aspect_ratio" + ) # encode, pad, and truncate contexts for this batch if visuals: - image_tensor = process_images(visuals, self._image_processor, self._config) + image_tensor = process_images( + visuals, self._image_processor, self._config + ) else: image_tensor = None @@ -1024,14 +1220,23 @@ def _collate(x): question_input = [] for visual, context in zip(visuals, contexts): - if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: + if ( + image_tensor is not None + and len(image_tensor) != 0 + and DEFAULT_IMAGE_TOKEN not in context + ): """ Three senarios: 1. No image, and there for, no image token should be added. 2. image token is already specified in the context, so we don't need to add it. - 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + 3. image token is not specified in the context and there is image inputs, so we need to add it. + In this case, we add the image token at the beginning of the context and add a new line. 
""" - image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] + image_tokens = ( + [DEFAULT_IMAGE_TOKEN] * len(visual) + if isinstance(visual, list) + else [DEFAULT_IMAGE_TOKEN] + ) image_tokens = " ".join(image_tokens) question = image_tokens + "\n" + context else: @@ -1055,9 +1260,10 @@ def _collate(x): prompt_question = conv.get_prompt() question_input.append(prompt_question) - # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) # preconfigure gen_kwargs with defaults - gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + gen_kwargs["image_sizes"] = [ + visuals[idx].size for idx in range(len(visuals)) + ] if "max_new_tokens" not in gen_kwargs: gen_kwargs["max_new_tokens"] = 1024 if "temperature" not in gen_kwargs: @@ -1067,12 +1273,23 @@ def _collate(x): if "num_beams" not in gen_kwargs: gen_kwargs["num_beams"] = 1 - input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input] - pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id - input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device) + input_ids_list = [ + tokenizer_image_token( + prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + for prompt in question_input + ] + pad_token_ids = ( + self.tokenizer.pad_token_id + if self.tokenizer.pad_token_id is not None + else self.tokenizer.eos_token_id + ) + input_ids = self.pad_sequence( + input_ids_list, batch_first=True, padding_value=pad_token_ids + ).to(self.device) attention_masks = input_ids.ne(pad_token_ids).to(self.device) input_dict = { - "input_ids":input_ids, + "input_ids": input_ids, "attention_mask": attention_masks, "pad_token_id": pad_token_ids, "images": image_tensor.to(self.amp_dtype), @@ -1083,7 +1300,9 @@ def _collate(x): "max_new_tokens": gen_kwargs["max_new_tokens"], } elif re.search("git", self.model.config.architectures[0], re.IGNORECASE): - input_ids=self._image_processor(images=visuals, return_tensors="pt").pixel_values + input_ids = self._image_processor( + images=visuals, return_tensors="pt" + ).pixel_values gen_kwargs.pop("until", None) input_dict = { "pixel_values": input_ids.to(self.amp_dtype), @@ -1091,11 +1310,21 @@ def _collate(x): } try: with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if self.amp_dtype == torch.bfloat16 else False,): + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): cont = self.model.generate(**input_dict) text_outputs = self.tokenizer.batch_decode( - cont[:, input_ids.shape[1]:] if re.search("llava", self.model.config.architectures[0], re.IGNORECASE) else cont, - skip_special_tokens=True) + ( + cont[:, input_ids.shape[1] :] + if re.search( + "llava", + self.model.config.architectures[0], + re.IGNORECASE, + ) + else cont + ), + skip_special_tokens=True, + ) except Exception as e: print(f"Error {e} in generating") cont = "" @@ -1107,7 +1336,8 @@ def _collate(x): pbar.close() return res - + + lm_tasks = [] lmms_tasks = [] lm_all_tasks = lm_eval.tasks.ALL_TASKS @@ -1148,7 +1378,7 @@ def _collate(x): with_jit=not args.disable_jit, dtype=args.dtype, config=args.config_file, - add_special_tokens=False + add_special_tokens=False, ) results = evaluator.evaluate( @@ -1160,14 +1390,16 @@ def _collate(x): print(evaluator.make_table(results)) 
elif len(lmms_tasks) != 0: task_names = lmms_utils.pattern_match(lmms_tasks, ALL_TASKS) - lm = LMMS(pretrained=args.model, device="cpu", - batch_size=args.batch_size, - with_ipex=args.ipex, - with_jit=not args.disable_jit, - dtype=args.dtype, - config=args.config_file, - add_special_tokens=False - ) + lm = LMMS( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + config=args.config_file, + add_special_tokens=False, + ) task_dict = lmms_eval.tasks.get_task_dict(task_names, model_name="test") for task_name in task_dict.keys(): @@ -1185,6 +1417,6 @@ def _collate(x): task_dict=task_dict, # limit=10, # bootstrap_iters=100, - cli_args=args + cli_args=args, ) print(lmms_evaluator.make_table(results)) diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index dd5f3cb96..70c37c8ae 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -17,7 +17,7 @@ import sys -sys.path.append(sys.path[0] + '/../../') +sys.path.append(sys.path[0] + "/../../") import logging @@ -54,7 +54,13 @@ from llava.model.builder import load_pretrained_model from llava.conversation import conv_templates from llava.mm_utils import get_model_name_from_path, tokenizer_image_token - from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from llava.constants import ( + IMAGE_TOKEN_INDEX, + DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN, + ) + MODEL_CLASSES["llava"] = (LlavaLlamaForCausalLM, AutoTokenizer) except ImportError: pass @@ -88,10 +94,15 @@ "--prompt", default=None, type=str, help="input prompt for self-defined if needed" ) parser.add_argument( - "--streaming", action="store_true", help="enable streaming mode for generation output (greedy search only)" + "--streaming", + action="store_true", + help="enable streaming mode for generation output (greedy search only)", ) parser.add_argument( - "--image-url", default="http://images.cocodataset.org/val2017/000000039769.jpg", type=str, help="image url for image-to-text task" + "--image-url", + default="http://images.cocodataset.org/val2017/000000039769.jpg", + type=str, + help="image url for image-to-text task", ) parser.add_argument( "--config-file", default=None, type=str, help="specific configuration file" @@ -100,7 +111,9 @@ parser.add_argument("--ipex", action="store_true") parser.add_argument("--deployment-mode", action="store_true") parser.add_argument("--torch-compile", action="store_true") -parser.add_argument("--backend", default="ipex", type=str, help="backend of torch.compile") +parser.add_argument( + "--backend", default="ipex", type=str, help="backend of torch.compile" +) parser.add_argument("--profile", action="store_true") parser.add_argument("--benchmark", action="store_true") parser.add_argument("--num-iter", default=100, type=int, help="num iter") @@ -135,15 +148,23 @@ if model_type == "chatglm": # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided config = AutoConfig.from_pretrained( - args.model_id, torchscript=args.deployment_mode, trust_remote_code=True, torch_dtype=amp_dtype, + args.model_id, + torchscript=args.deployment_mode, + trust_remote_code=True, + torch_dtype=amp_dtype, ) else: config = AutoConfig.from_pretrained( - args.model_id, 
torchscript=args.deployment_mode, trust_remote_code=True, + args.model_id, + torchscript=args.deployment_mode, + trust_remote_code=True, ) else: config = AutoConfig.from_pretrained( - args.config_file, torchscript=args.deployment_mode, trust_remote_code=True, torch_dtype=amp_dtype, + args.config_file, + torchscript=args.deployment_mode, + trust_remote_code=True, + torch_dtype=amp_dtype, ) if not hasattr(config, "text_max_length") and args.prompt is None: config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens) @@ -159,11 +180,13 @@ torch_dtype=amp_dtype, config=config, low_cpu_mem_usage=True, - trust_remote_code=True + trust_remote_code=True, ) tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True) else: - tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) + tokenizer, model, image_processor, context_len = load_pretrained_model( + args.model_id + ) model = model.eval() model = model.to(memory_format=torch.channels_last) num_beams = 1 if args.greedy else 4 @@ -172,7 +195,14 @@ streamer = TextStreamer(tokenizer) else: streamer = None -generate_kwargs = dict(do_sample=False, temperature=0.9, num_beams=num_beams, max_new_tokens=args.max_new_tokens, min_new_tokens=args.max_new_tokens, streamer=streamer) +generate_kwargs = dict( + do_sample=False, + temperature=0.9, + num_beams=num_beams, + max_new_tokens=args.max_new_tokens, + min_new_tokens=args.max_new_tokens, + streamer=streamer, +) if re.search("gptbigcode", model.config.architectures[0], re.IGNORECASE): model_type = "gptbigcode" @@ -181,22 +211,27 @@ elif re.search("t5", model.config.architectures[0], re.IGNORECASE): generate_kwargs["max_length"] = generate_kwargs["max_new_tokens"] generate_kwargs.pop("max_new_tokens") -elif re.search("git", model.config.architectures[0], re.IGNORECASE) or re.search("llava", model.config.architectures[0], re.IGNORECASE): +elif re.search("git", model.config.architectures[0], re.IGNORECASE) or re.search( + "llava", model.config.architectures[0], re.IGNORECASE +): from PIL import Image import requests from io import BytesIO + model.config.batch_size = int(args.batch_size) * num_beams def load_image(image_file): - if image_file.startswith('http://') or image_file.startswith('https://'): + if image_file.startswith("http://") or image_file.startswith("https://"): response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert('RGB') + image = Image.open(BytesIO(response.content)).convert("RGB") else: - image = Image.open(image_file).convert('RGB') + image = Image.open(image_file).convert("RGB") return image + + if re.search("llava", model.config.architectures[0], re.IGNORECASE): model_name = get_model_name_from_path(args.model_id) - if 'llama-2' in model_name.lower(): + if "llama-2" in model_name.lower(): conv_mode = "llava_llama_2" elif "v1" in model_name.lower(): conv_mode = "llava_v1" @@ -206,13 +241,17 @@ def load_image(image_file): conv_mode = "llava_v0" conv = conv_templates[conv_mode].copy() if "mpt" in model_name.lower(): - roles = ('user', 'assistant') + roles = ("user", "assistant") else: roles = conv.roles if re.search("yuan", model.config.architectures[0], re.IGNORECASE): model.config.batch_size = int(args.batch_size) * num_beams + + def trace_handler(prof): print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=-1)) + + # to ipex if args.ipex: model = ipex.llm.optimize( @@ -223,7 +262,10 @@ def trace_handler(prof): ) if args.torch_compile: if args.deployment_mode: - raise 
SystemExit("[ERROR] deployment_mode cannot co-work with torch.compile, please set deployment_mode to False if want to use torch.compile.") + raise SystemExit( + "[ERROR] deployment_mode cannot co-work with torch.compile, please set deployment_mode" + " to False if want to use torch.compile." + ) model.forward = torch.compile(model.forward, dynamic=True, backend=args.backend) @@ -243,9 +285,15 @@ def trace_handler(prof): image = load_image(args.image_url) image = [image] * args.batch_size if model.config.mm_use_im_start_end: - prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt + prompt = ( + DEFAULT_IM_START_TOKEN + + DEFAULT_IMAGE_TOKEN + + DEFAULT_IM_END_TOKEN + + "\n" + + prompt + ) else: - prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt + prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt conv.append_message(conv.roles[0], prompt) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() @@ -262,7 +310,9 @@ def trace_handler(prof): + args.model_id ) elif int(args.input_tokens) > 8192: - prompt = prompt_pool[model_type]["8192"] * int(int(args.input_tokens) / 8192) + prompt = prompt_pool[model_type]["8192"] * int( + int(args.input_tokens) / 8192 + ) elif args.input_tokens in prompt_pool[model_type]: prompt = prompt_pool[model_type][args.input_tokens] else: @@ -283,17 +333,34 @@ def trace_handler(prof): for i in range(num_iter): tic = time.time() if model_type == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) - image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] - output = model.generate(input_ids, images=image_tensor, **generate_kwargs) + input_ids = torch.stack( + [ + tokenizer_image_token( + pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + for pmt in prompt + ] + ) + image_tensor = [ + image_processor.preprocess(img, return_tensors="pt")[ + "pixel_values" + ].to(amp_dtype) + for img in image + ] + output = model.generate( + input_ids, images=image_tensor, **generate_kwargs + ) elif model_type == "git": - input_ids=tokenizer(images=prompt, return_tensors="pt").pixel_values + input_ids = tokenizer(images=prompt, return_tensors="pt").pixel_values output = model.generate(pixel_values=input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) gen_ids = output[0] if args.token_latency else output - gen_text = tokenizer.batch_decode(gen_ids[:, input_ids.shape[1]:] if model_type=="llava" else gen_ids, skip_special_tokens=True) + gen_text = tokenizer.batch_decode( + gen_ids[:, input_ids.shape[1] :] if model_type == "llava" else gen_ids, + skip_special_tokens=True, + ) toc = time.time() input_tokens_lengths = [x.shape[0] for x in input_ids] @@ -317,12 +384,33 @@ def trace_handler(prof): ) as prof: for i in range(5): if model_type == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) - image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] - output = model.generate(input_ids, images=image_tensor, **generate_kwargs) + input_ids = torch.stack( + [ + tokenizer_image_token( + pmt, + tokenizer, + IMAGE_TOKEN_INDEX, + return_tensors="pt", + ) + for pmt in prompt + ] + ) + image_tensor = [ + image_processor.preprocess(img, return_tensors="pt")[ + "pixel_values" + 
].to(amp_dtype) + for img in image + ] + output = model.generate( + input_ids, images=image_tensor, **generate_kwargs + ) elif model_type == "git": - input_ids=tokenizer(images=prompt, return_tensors="pt").pixel_values - output = model.generate(pixel_values=input_ids, **generate_kwargs) + input_ids = tokenizer( + images=prompt, return_tensors="pt" + ).pixel_values + output = model.generate( + pixel_values=input_ids, **generate_kwargs + ) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) diff --git a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py b/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py index df3a5ae52..b347ee71f 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py +++ b/examples/cpu/inference/python/llm/single_instance/run_int4_gpt-j_on_cnndailymail.py @@ -45,13 +45,21 @@ help="the INT4 model file path. If provided, calibration with GPTQ and quantization are skipped.", ) parser.add_argument("--output-dir", nargs="?", default="./saved_results") -parser.add_argument("--fp32", action="store_true", - help="Run float32 model without quantization. Cannot use this option along with --bf16.") -parser.add_argument("--bf16", action="store_true", - help="Run bfloat16 model without quantization. Cannot use this option along with --fp32.") +parser.add_argument( + "--fp32", + action="store_true", + help="Run float32 model without quantization. Cannot use this option along with --bf16.", +) +parser.add_argument( + "--bf16", + action="store_true", + help="Run bfloat16 model without quantization. Cannot use this option along with --fp32.", +) args = parser.parse_args() -assert not (args.fp32 and args.bf16), "--fp32 and --bf16 cannot be used at the same time" +assert not ( + args.fp32 and args.bf16 +), "--fp32 and --bf16 cannot be used at the same time" random.seed(9973) logger = logging.getLogger("INT4 GPT-J") @@ -71,7 +79,15 @@ class CNNDAILYMAIL(object): - def __init__(self, model_path, data_path, device="cpu", is_calib=False, num_samples=20, max_len=1920): + def __init__( + self, + model_path, + data_path, + device="cpu", + is_calib=False, + num_samples=20, + max_len=1920, + ): self.model_path = model_path self.data_path = data_path self.device = device @@ -87,7 +103,7 @@ def __init__(self, model_path, data_path, device="cpu", is_calib=False, num_samp self.load_dataset() def load_dataset(self): - """ Loads dataset""" + """Loads dataset""" with open(self.data_path, "r") as fid: list_data_dict = json.load(fid) self.list_data_dict = copy.deepcopy(list_data_dict) @@ -96,11 +112,14 @@ def load_dataset(self): self.num_samples = min(self.num_samples, len(list_data_dict)) if self.is_calib: - list_data_dict = list_data_dict[:self.num_samples] + list_data_dict = list_data_dict[: self.num_samples] else: list_data_dict = random.choices(list_data_dict, k=self.num_samples) - prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"] + prompt_input, prompt_no_input = ( + PROMPT_DICT["prompt_input"], + PROMPT_DICT["prompt_no_input"], + ) sources = [prompt_input.format_map(example) for example in list_data_dict] targets = [f"{example['output']}" for example in list_data_dict] @@ -116,7 +135,7 @@ def load_dataset(self): self.targets = targets def load_tokenizer(self): - """ Returns the tokenizer """ + """Returns the tokenizer""" self.tokenizer = AutoTokenizer.from_pretrained( self.model_path, 
model_max_length=2048, @@ -127,7 +146,13 @@ def load_tokenizer(self): @torch.no_grad() def tokenize_function(self, text): - example = self.tokenizer(text, truncation=True, max_length=self.max_len, return_tensors="pt", padding=self.padding) + example = self.tokenizer( + text, + truncation=True, + max_length=self.max_len, + return_tensors="pt", + padding=self.padding, + ) return example def __len__(self): @@ -158,6 +183,7 @@ def collate_batch(self, batch): parent_path = Path(__file__).parent.absolute() Path(args.output_dir).mkdir(parents=True, exist_ok=True) + def load_original_model(args): logger.info("Loading model {}...".format(args.model)) config = AutoConfig.from_pretrained(args.model, torchscript=True) @@ -168,14 +194,15 @@ def load_original_model(args): logger.info("model loaded.") return user_model, tokenizer -dataset_id = 'cnn_dailymail' -dataset_version = '3.0.0' + +dataset_id = "cnn_dailymail" +dataset_version = "3.0.0" dataset_split = "validation" if args.dataset_path == "": instruction_template = "Summarize the following news article:" logger.info("Loading {} split of {} dataset...".format(dataset_split, args.model)) dataset = load_dataset(dataset_id, name=dataset_version, split=dataset_split) - train = dict((x['id'], x) for x in dataset) + train = dict((x["id"], x) for x in dataset) inputs = [] for i in tqdm(range(len(dataset))): sample = dataset[i] @@ -185,8 +212,10 @@ def load_original_model(args): x["output"] = sample["highlights"] inputs.append(x) - val_data_path = os.path.join(args.output_dir, "cnn_dailymail_{}.json".format(dataset_split)) - with open(val_data_path, 'w') as write_f: + val_data_path = os.path.join( + args.output_dir, "cnn_dailymail_{}.json".format(dataset_split) + ) + with open(val_data_path, "w") as write_f: json.dump(inputs, write_f, indent=4, ensure_ascii=False) logger.info("{} data saved at {}".format(dataset_split, val_data_path)) @@ -214,25 +243,32 @@ def load_original_model(args): logger.info("Calibration with GPTQ will take an hour or so. Please wait.") user_model, tokenizer = load_original_model(args) calib_iters = 128 - calib_dataset = CNNDAILYMAIL(args.model, val_data_path, is_calib=True, num_samples=calib_iters) + calib_dataset = CNNDAILYMAIL( + args.model, val_data_path, is_calib=True, num_samples=calib_iters + ) calib_dataloader = DataLoader( calib_dataset, batch_size=batch_size, shuffle=False, - collate_fn=calib_dataset.collate_batch + collate_fn=calib_dataset.collate_batch, ) - compressed_model = ipex.quantization.gptq( + compressed_model = ipex.quantization.gptq( model=user_model, dataloader=calib_dataloader, - group_size=128, + group_size=128, use_max_length=True, compression_dtype=torch.int32, compression_dim=1, scale_dtype=torch.float16, - save_dir=args.output_dir) + save_dir=args.output_dir, + ) - logger.info("Calibration finished. Low-precision checkpoint generated as {}.".format(args.output_dir)) + logger.info( + "Calibration finished. Low-precision checkpoint generated as {}.".format( + args.output_dir + ) + ) # Quit here because we want to use different environment variables to run GPTQ and benchmark. # So, run this script twice and specify the GPTQ checkpoint file for the second run. 
quit() @@ -243,12 +279,12 @@ def load_original_model(args): logger.info("Loading low_precision_checkpoint...") low_precision_checkpoint = torch.load(low_precision_checkpoint_file_path) config_dict = { - "weight_key": "qweight", - "scale_key": "scales", - "zero_point_key": "qzeros", - "bias_key": "bias", - "g_idx_key": "g_idx" - } + "weight_key": "qweight", + "scale_key": "scales", + "zero_point_key": "qzeros", + "bias_key": "bias", + "g_idx_key": "g_idx", + } state_dict_and_config = (low_precision_checkpoint, config_dict) logger.info("low_precision_checkpoint loaded.") @@ -316,7 +352,11 @@ def load_original_model(args): self_jit = torch.jit.freeze(self_jit.eval()) Path(args.output_dir).mkdir(parents=True, exist_ok=True) self_jit.save(args.output_dir + "/int4_model.pt") - logger.info("Quantization finished. INT4 model saved to {}.".format(args.output_dir + "/int4_model.pt")) + logger.info( + "Quantization finished. INT4 model saved to {}.".format( + args.output_dir + "/int4_model.pt" + ) + ) else: user_model, tokenizer = load_original_model(args) logger.info("INT4 model is given. Quantization skipped.") @@ -338,6 +378,7 @@ def load_original_model(args): predictions = [] ground_truths = [] + def postprocess_text(preds, targets): preds = [pred.strip() for pred in preds] targets = [target.strip() for target in targets] @@ -347,22 +388,30 @@ def postprocess_text(preds, targets): return preds, targets + # Only run 1000 samples. It saves a lot of time and it's a good approximation of results on the whole dataset iters = 1000 -val_dataset = CNNDAILYMAIL(args.model, val_data_path, is_calib=False, max_len=max_len, num_samples=iters) +val_dataset = CNNDAILYMAIL( + args.model, val_data_path, is_calib=False, max_len=max_len, num_samples=iters +) sources = val_dataset.sources targets = val_dataset.targets logger.info("Start running accuracy task...") logger.info("Number of samples to run = {}".format(iters)) with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( enabled=(False if args.fp32 else True), - dtype=(None if args.fp32 else torch.bfloat16) + dtype=(None if args.fp32 else torch.bfloat16), ): for i in tqdm(range(len(sources))): input_ids, actual_lens, att_mask = val_dataset[i] input_lens = input_ids.shape[-1] t0 = time.time() - out_tokens = user_model.generate(input_ids, attention_mask=att_mask, **generate_kwargs, pad_token_id=tokenizer.pad_token_id) + out_tokens = user_model.generate( + input_ids, + attention_mask=att_mask, + **generate_kwargs, + pad_token_id=tokenizer.pad_token_id, + ) t1 = time.time() print("Inference time: {}".format(round(t1 - t0, 3))) print("Seq len: {}".format(input_ids.shape[-1])) @@ -379,7 +428,12 @@ def postprocess_text(preds, targets): if i == iters - 1: break -result = metric.compute(predictions=predictions, references=ground_truths, use_stemmer=True, use_aggregator=False) +result = metric.compute( + predictions=predictions, + references=ground_truths, + use_stemmer=True, + use_aggregator=False, +) result = {k: round(np.mean(v) * 100, 4) for k, v in result.items()} logger.info("Accuracy test results:") logger.info(result) diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index c36e9e522..44fe0ef76 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -7,7 +7,6 @@ import torch from torch.utils.data import DataLoader -import transformers from 
transformers import AutoConfig from transformers import TextStreamer import intel_extension_for_pytorch as ipex @@ -48,7 +47,9 @@ "--max-new-tokens", default=32, type=int, help="output max new tokens" ) parser.add_argument( - "--streaming", action="store_true", help="enable streaming mode for generation output (greedy search only)" + "--streaming", + action="store_true", + help="enable streaming mode for generation output (greedy search only)", ) parser.add_argument("--dataset", nargs="?", default="") parser.add_argument("--split", nargs="?", default="validation", const="validation") @@ -66,9 +67,14 @@ help="by default it is int8-fp32 mixed, to enable int8 mixed amp bf16 (work on platforms like SPR)", ) parser.add_argument( - "--image-url", default="http://images.cocodataset.org/val2017/000000039769.jpg", type=str, help="image url for image-to-text task" + "--image-url", + default="http://images.cocodataset.org/val2017/000000039769.jpg", + type=str, + help="image url for image-to-text task", +) +parser.add_argument( + "--qconfig-summary-file", default="", help="qconfig for static quantization" ) -parser.add_argument("--qconfig-summary-file", default="", help="qconfig for static quantization") parser.add_argument("--quantized-model-path", default="./saved_results/best_model.pt") parser.add_argument("--benchmark", action="store_true") parser.add_argument("--input-tokens", default="32", type=str) @@ -77,43 +83,77 @@ parser.add_argument("--num-warmup", default=10, type=int, help="num warmup") parser.add_argument("--batch-size", default=1, type=int, help="batch size") parser.add_argument( - "--calib-len", default=512, type=int, help="calibration dataset max or padding max length for SmoothQuant autotuning" + "--calib-len", + default=512, + type=int, + help="calibration dataset max or padding max length for SmoothQuant autotuning", +) +parser.add_argument( + "--calib-iters", + default=512, + type=int, + help="calibration iters for SmoothQuant autotuning", ) -parser.add_argument("--calib-iters", default=512, type=int, help="calibration iters for SmoothQuant autotuning") parser.add_argument( - "--calib-shuffle", action="store_true", help="whether to shuffle on calibration dataset for SmoothQuant autotuning" + "--calib-shuffle", + action="store_true", + help="whether to shuffle on calibration dataset for SmoothQuant autotuning", ) parser.add_argument( - "--calib-padding", action="store_true", help="whether to pad on calibration dataset for SmoothQuant autotuning" + "--calib-padding", + action="store_true", + help="whether to pad on calibration dataset for SmoothQuant autotuning", ) parser.add_argument( - "--calib-pad-val", default=1, type=int, help="calibration dataset padding value for SmoothQuant autotuning" + "--calib-pad-val", + default=1, + type=int, + help="calibration dataset padding value for SmoothQuant autotuning", ) parser.add_argument( - "--fallback-add", action="store_true", help="whether to fallback add ops to fp32 for SmoothQuant autotuning" + "--fallback-add", + action="store_true", + help="whether to fallback add ops to fp32 for SmoothQuant autotuning", ) parser.add_argument("--alpha", default=0.5, help="alpha value for smoothquant") parser.add_argument( "--folding", action="store_true", help="whether to fold mul into the previous layer" ) parser.add_argument( - "--init-alpha", default=0.5, type=float, help="a value to get baseline quantization error for auto-tuning" + "--init-alpha", + default=0.5, + type=float, + help="a value to get baseline quantization error for auto-tuning", 
) parser.add_argument( - "--alpha-min", default=0.0, type=float, help="min value of auto-tuning alpha search space" + "--alpha-min", + default=0.0, + type=float, + help="min value of auto-tuning alpha search space", ) parser.add_argument( - "--alpha-max", default=1.0, type=float, help="max value of auto-tuning alpha search space" + "--alpha-max", + default=1.0, + type=float, + help="max value of auto-tuning alpha search space", ) parser.add_argument( - "--alpha-step", default=0.1, type=float, help="step_size of auto-tuning alpha search space" + "--alpha-step", + default=0.1, + type=float, + help="step_size of auto-tuning alpha search space", ) parser.add_argument( - "--shared-criterion", choices=["min", "mean", "max"], default="max", type=str - , help="criterion for input LayerNorm op of a transformer block" + "--shared-criterion", + choices=["min", "mean", "max"], + default="max", + type=str, + help="criterion for input LayerNorm op of a transformer block", ) parser.add_argument( - "--enable-blockwise-loss", action="store_true", help="whether to enable block-wise auto-tuning" + "--enable-blockwise-loss", + action="store_true", + help="whether to enable block-wise auto-tuning", ) parser.add_argument("--token-latency", action="store_true") parser.add_argument("--greedy", action="store_true") @@ -182,7 +222,7 @@ action="store_true", help="Indicate that the low-precision checkpoint is in the legacy format rather than the" " HuggingFace Optimum format for backward compatibility. It must be used with" - " --low-precision-checkpoint. Otherwise, it has no effect." + " --low-precision-checkpoint. Otherwise, it has no effect.", ) args = parser.parse_args() @@ -207,11 +247,16 @@ if "chatglm" in args.model_id.lower(): # chatglm modeling is from remote hub and its torch_dtype in config.json need to be overrided config = AutoConfig.from_pretrained( - args.model_id, torchscript=True, trust_remote_code=True, torch_dtype=torch.float + args.model_id, + torchscript=True, + trust_remote_code=True, + torch_dtype=torch.float, ) else: config = AutoConfig.from_pretrained( - args.model_id, torchscript=True, trust_remote_code=True, + args.model_id, + torchscript=True, + trust_remote_code=True, ) else: config = AutoConfig.from_pretrained( @@ -223,7 +268,9 @@ model = FALCONConfig(args.model_id) elif re.search("GPTJ", config.architectures[0], re.IGNORECASE): model = GPTJConfig(args.model_id) -elif re.search("llama", config.architectures[0], re.IGNORECASE) and not re.search("llava", config.architectures[0], re.IGNORECASE): +elif re.search("llama", config.architectures[0], re.IGNORECASE) and not re.search( + "llava", config.architectures[0], re.IGNORECASE +): model = LLAMAConfig(args.model_id) elif re.search("gptneox", config.architectures[0], re.IGNORECASE): model = GPTNEOXConfig(args.model_id) @@ -254,27 +301,36 @@ elif re.search("git", config.architectures[0], re.IGNORECASE): from PIL import Image import requests + model = GitConfig(args.model_id) elif re.search("llava", config.architectures[0], re.IGNORECASE): from PIL import Image import requests from io import BytesIO + try: from llava.conversation import conv_templates from llava.mm_utils import get_model_name_from_path, tokenizer_image_token - from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from llava.constants import ( + IMAGE_TOKEN_INDEX, + DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IM_END_TOKEN, + ) except ImportError: pass model = LlavaConfig(args.model_id) + def 
load_image(image_file): - if image_file.startswith('http://') or image_file.startswith('https://'): + if image_file.startswith("http://") or image_file.startswith("https://"): response = requests.get(image_file) - image = Image.open(BytesIO(response.content)).convert('RGB') + image = Image.open(BytesIO(response.content)).convert("RGB") else: - image = Image.open(image_file).convert('RGB') + image = Image.open(image_file).convert("RGB") return image + model_name = get_model_name_from_path(args.model_id) - if 'llama-2' in model_name.lower(): + if "llama-2" in model_name.lower(): conv_mode = "llava_llama_2" elif "v1" in model_name.lower(): conv_mode = "llava_v1" @@ -284,7 +340,7 @@ def load_image(image_file): conv_mode = "llava_v0" conv = conv_templates[conv_mode].copy() if "mpt" in model_name.lower(): - roles = ('user', 'assistant') + roles = ("user", "assistant") else: roles = conv.roles elif re.search("phi3", config.architectures[0], re.IGNORECASE): @@ -318,7 +374,7 @@ def load_image(image_file): num_beams=num_beams, max_new_tokens=args.max_new_tokens, min_new_tokens=args.max_new_tokens, - streamer=streamer + streamer=streamer, ) if re.search("t5", config.architectures[0], re.IGNORECASE): generate_kwargs["max_length"] = generate_kwargs["max_new_tokens"] @@ -334,12 +390,16 @@ def load_image(image_file): beam_idx_tmp = torch.zeros( (2048, int(args.batch_size * num_beams)), dtype=torch.long ).contiguous() + + def _get_target_nums(names): for n in names: if hasattr(user_model.config, n): return getattr(user_model.config, n) print(f"Not found target {names[0]}") exit(0) + + num_heads_names = ["num_attention_heads", "n_head", "num_heads", "n_heads"] num_layers_names = ["num_hidden_layers", "n_layer", "num_layers", "n_layers"] hidden_size_names = ["hidden_size", "n_embd"] @@ -370,6 +430,7 @@ def _get_target_nums(names): ] ) + def get_example_inputs(model): if model.use_global_past_key_value: global global_past_key_value @@ -446,8 +507,8 @@ def get_example_inputs(model): ] pixel_inputs = torch.ones(batch_size, 3, 224, 224) example_inputs = ( - input_ids.unsqueeze(0).repeat(batch_size,1), - attention_mask.unsqueeze(0).repeat(batch_size,1), + input_ids.unsqueeze(0).repeat(batch_size, 1), + attention_mask.unsqueeze(0).repeat(batch_size, 1), tuple(past_key_value), pixel_inputs, ) @@ -469,11 +530,14 @@ def get_example_inputs(model): tuple(past_key_value), ) else: - raise RuntimeError("Your model does not match existing example inputs used in ipex quantization, exiting...") + raise RuntimeError( + "Your model does not match existing example inputs used in ipex quantization, exiting..." 
+ ) if hasattr(model, "extra_inputs"): example_inputs = example_inputs + model.extra_inputs return example_inputs + if args.ipex_smooth_quant: if args.qconfig_summary_file != "": qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=args.alpha) @@ -486,9 +550,10 @@ def get_example_inputs(model): deployment_mode=True, ) pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True) - user_model.trace_graph.save(args.output_dir + '/' + args.quant_model_name) + user_model.trace_graph.save(args.output_dir + "/" + args.quant_model_name) quant_model = user_model.trace_graph else: + class Evaluator: def __init__( self, dataset, tokenizer, args, batch_size=1, pad_val=1, pad_max=512 @@ -499,11 +564,11 @@ def __init__( self.pad_val = pad_val self.pad_max = pad_max self.args = args - + # tokenize the dataset self.dataset = self.dataset.map(self.tokenize_function, batched=True) self.dataset.set_format(type="torch", columns=["input_ids"]) - + @torch.no_grad() def tokenize_function(self, examples): if "prompt" in examples: @@ -513,7 +578,7 @@ def tokenize_function(self, examples): elif "code" in examples: example = self.tokenizer(examples["code"]) return example - + @torch.no_grad() def collate_batch(self, batch): position_ids_padded = [] @@ -530,6 +595,7 @@ def collate_batch(self, batch): ) else: from torch.nn.functional import pad + pad_len = int(args.calib_len) - input_ids.shape[0] input_ids = pad( input_ids, (0, pad_len), value=int(args.calib_pad_val) @@ -574,17 +640,23 @@ def collate_batch(self, batch): model_kwargs = { "attention_mask": torch.vstack(attention_mask_padded), } - model_kwargs = user_model._prepare_encoder_decoder_kwargs_for_generation( - torch.vstack(input_ids_padded), model_kwargs, "input_ids" + model_kwargs = ( + user_model._prepare_encoder_decoder_kwargs_for_generation( + torch.vstack(input_ids_padded), model_kwargs, "input_ids" + ) ) - input_ids, example_inputs = user_model._expand_inputs_for_generation( - input_ids=torch.vstack(input_ids_padded), - expand_size=num_beams, - is_encoder_decoder=True, - **model_kwargs, + input_ids, example_inputs = ( + user_model._expand_inputs_for_generation( + input_ids=torch.vstack(input_ids_padded), + expand_size=num_beams, + is_encoder_decoder=True, + **model_kwargs, + ) ) input_bs = int(args.batch_size * num_beams) - last_hidden_state = example_inputs["encoder_outputs"]["last_hidden_state"] + last_hidden_state = example_inputs["encoder_outputs"][ + "last_hidden_state" + ] global_past_key_value = tuple( [ ( @@ -593,16 +665,24 @@ def collate_batch(self, batch): torch.zeros([1, n_heads, 1, head_dim]).contiguous(), beam_idx_tmp, torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), - user_model.decoder.block[i].layer[1].EncDecAttention.k(last_hidden_state) - .view(input_bs, -1, n_heads, head_dim).transpose(0, 1), - user_model.decoder.block[i].layer[1].EncDecAttention.v(last_hidden_state) - .view(input_bs, -1, n_heads, head_dim).transpose(0, 1), + user_model.decoder.block[i] + .layer[1] + .EncDecAttention.k(last_hidden_state) + .view(input_bs, -1, n_heads, head_dim) + .transpose(0, 1), + user_model.decoder.block[i] + .layer[1] + .EncDecAttention.v(last_hidden_state) + .view(input_bs, -1, n_heads, head_dim) + .transpose(0, 1), beam_idx_tmp, ) for i in range(n_layers) ] ) - decoder_input_ids = (torch.zeros(input_bs).to(torch.long).unsqueeze(1)) + decoder_input_ids = ( + torch.zeros(input_bs).to(torch.long).unsqueeze(1) + ) model_inputs = ( decoder_input_ids, torch.vstack(attention_mask_padded), @@ -610,13 +690,15 @@ def 
collate_batch(self, batch): (last_hidden_state,), ) else: - raise RuntimeError("Your model does not match existing example inputs used in ipex smooth quant, exiting...") + raise RuntimeError( + "Your model does not match existing example inputs used in ipex smooth quant, exiting..." + ) if hasattr(model, "extra_inputs"): model_inputs = model_inputs + model.extra_inputs return (model_inputs, last_ind) - + calib_dataset = load_dataset( args.dataset if args.dataset else model.default_dataset, split="train" ) @@ -624,7 +706,11 @@ def collate_batch(self, batch): calib_dataset = calib_dataset.shuffle(seed=42) user_model.eval() calib_evaluator = Evaluator( - calib_dataset, tokenizer, args, batch_size=args.batch_size, pad_max=int(args.input_tokens) if model.name=="t5" else 512 + calib_dataset, + tokenizer, + args, + batch_size=args.batch_size, + pad_max=int(args.input_tokens) if model.name == "t5" else 512, ) calib_dataloader = DataLoader( calib_evaluator.dataset, @@ -642,7 +728,7 @@ def calib_func(prepared_model): example_inputs = get_example_inputs(model) from intel_extension_for_pytorch.quantization import prepare, convert - + if model.use_ipex_autotune: qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping() user_model = ipex.llm.optimize( @@ -658,17 +744,21 @@ def calib_func(prepared_model): "weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}, } - - smoothquant_args = {"alpha": args.alpha if args.alpha == "auto" \ - else literal_eval(args.alpha), "folding": args.folding} + + smoothquant_args = { + "alpha": ( + args.alpha if args.alpha == "auto" else literal_eval(args.alpha) + ), + "folding": args.folding, + } if args.alpha == "auto": smoothquant_args["auto_alpha_args"] = { - "init_alpha": float(args.init_alpha), - "alpha_min": float(args.alpha_min), - "alpha_max": float(args.alpha_max), - "alpha_step": float(args.alpha_step), - "shared_criterion": args.shared_criterion, - "enable_blockwise_loss": args.enable_blockwise_loss + "init_alpha": float(args.init_alpha), + "alpha_min": float(args.alpha_min), + "alpha_max": float(args.alpha_max), + "alpha_step": float(args.alpha_step), + "shared_criterion": args.shared_criterion, + "enable_blockwise_loss": args.enable_blockwise_loss, } # using specified sq recipes for llama2-7b if re.search("llama", config.architectures[0], re.IGNORECASE): @@ -679,21 +769,23 @@ def calib_func(prepared_model): "alpha_max": 0.99, "alpha_step": 0.01, "shared_criterion": "mean", - "enable_blockwise_loss": False - } + "enable_blockwise_loss": False, + } prepared_model = ipex.quantization.autotune( user_model, calib_dataloader, calib_func=calib_func, op_type_dict=op_type_dict, - smoothquant_args=smoothquant_args + smoothquant_args=smoothquant_args, ) pathlib.Path(args.output_dir).mkdir(parents=True, exist_ok=True) prepared_model.save_qconf_summary(args.output_dir + "/best_configure.json") else: - qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=args.alpha) + qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping( + alpha=args.alpha + ) user_model = ipex.llm.optimize( user_model.eval(), dtype=amp_dtype, @@ -723,15 +815,23 @@ def calib_func(prepared_model): quant_model = self_jit if model.name == "yuan": input_bs = int(args.batch_size * num_beams) - example_inputs = (example_inputs[0].repeat(input_bs, 1), example_inputs[1].repeat(input_bs, 1), example_inputs[2].repeat(input_bs, 1)) + example_inputs = ( + example_inputs[0].repeat(input_bs, 1), + example_inputs[1].repeat(input_bs, 1), + example_inputs[2].repeat(input_bs, 1), + ) 
self_jit_first = torch.jit.trace( - convert_model.eval(), example_inputs, strict=False, check_trace=False + convert_model.eval(), + example_inputs, + strict=False, + check_trace=False, ) self_jit_first = torch.jit.freeze(self_jit_first.eval()) self_jit_first.save(args.output_dir + "/" + args.quant_model_name + "2") elif args.ipex_weight_only_quantization: from intel_extension_for_pytorch.quantization import WoqWeightDtype + if args.weight_dtype == "INT8": weight_dtype = WoqWeightDtype.INT8 elif args.weight_dtype == "INT4": @@ -796,7 +896,11 @@ def calib_func(prepared_model): quant_model = self_jit if model.name == "yuan": input_bs = int(args.batch_size * num_beams) - example_inputs = (example_inputs[0].repeat(input_bs, 1), example_inputs[1].repeat(input_bs, 1), example_inputs[2].repeat(input_bs, 1)) + example_inputs = ( + example_inputs[0].repeat(input_bs, 1), + example_inputs[1].repeat(input_bs, 1), + example_inputs[2].repeat(input_bs, 1), + ) self_jit_first = torch.jit.trace( user_model.eval(), example_inputs, strict=False, check_trace=False ) @@ -826,9 +930,15 @@ def calib_func(prepared_model): print("warning: loading failed.", e) self_jit = quant_model if model.name == "yuan": - ipex._set_optimized_model_for_generation(user_model, optimized_model=self_jit, first_token_optimized_model=self_jit_first) + ipex._set_optimized_model_for_generation( + user_model, + optimized_model=self_jit, + first_token_optimized_model=self_jit_first, + ) else: - ipex._set_optimized_model_for_generation(user_model, optimized_model=self_jit) + ipex._set_optimized_model_for_generation( + user_model, optimized_model=self_jit + ) if model.name == "git": prompt = Image.open(requests.get(args.image_url, stream=True).raw) @@ -838,9 +948,15 @@ def calib_func(prepared_model): image = load_image(args.image_url) image = [image] * args.batch_size if user_model.config.mm_use_im_start_end: - prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt + prompt = ( + DEFAULT_IM_START_TOKEN + + DEFAULT_IMAGE_TOKEN + + DEFAULT_IM_END_TOKEN + + "\n" + + prompt + ) else: - prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt + prompt = DEFAULT_IMAGE_TOKEN + "\n" + prompt conv.append_message(conv.roles[0], prompt) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() @@ -853,7 +969,9 @@ def calib_func(prepared_model): if args.prompt is not None: prompt = args.prompt elif int(args.input_tokens) > 8192: - prompt = prompt_pool[model.name]["8192"] * int(int(args.input_tokens) / 8192) + prompt = prompt_pool[model.name]["8192"] * int( + int(args.input_tokens) / 8192 + ) elif args.input_tokens in prompt_pool[model.name]: prompt = prompt_pool[model.name][args.input_tokens] else: @@ -878,9 +996,23 @@ def calib_func(prepared_model): for i in range(num_iter): tic = time.time() if model.name == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) - image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] - output = user_model.generate(input_ids, images=image_tensor, **generate_kwargs) + input_ids = torch.stack( + [ + tokenizer_image_token( + pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + for pmt in prompt + ] + ) + image_tensor = [ + image_processor.preprocess(img, return_tensors="pt")[ + "pixel_values" + ].to(amp_dtype) + for img in image + ] + output = user_model.generate( + input_ids, images=image_tensor, **generate_kwargs + ) elif model.name == 
"git": input_ids = tokenizer(images=prompt, return_tensors="pt").pixel_values output = user_model.generate(pixel_values=input_ids, **generate_kwargs) @@ -888,7 +1020,10 @@ def calib_func(prepared_model): input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = user_model.generate(input_ids, **generate_kwargs) gen_ids = output[0] if args.token_latency else output - gen_text = tokenizer.batch_decode(gen_ids[:, input_ids.shape[1]:] if model.name=="llava" else gen_ids, skip_special_tokens=True) + gen_text = tokenizer.batch_decode( + gen_ids[:, input_ids.shape[1] :] if model.name == "llava" else gen_ids, + skip_special_tokens=True, + ) toc = time.time() input_tokens_lengths = [x.shape[0] for x in input_ids] output_tokens_lengths = [x.shape[0] for x in gen_ids] @@ -917,17 +1052,42 @@ def trace_handler(prof): ) as prof: for i in range(5): if model.name == "llava": - input_ids = torch.stack([tokenizer_image_token(pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') for pmt in prompt]) - image_tensor = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'].to(amp_dtype) for img in image] - output = user_model.generate(input_ids, images=image_tensor, **generate_kwargs) + input_ids = torch.stack( + [ + tokenizer_image_token( + pmt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt" + ) + for pmt in prompt + ] + ) + image_tensor = [ + image_processor.preprocess(img, return_tensors="pt")[ + "pixel_values" + ].to(amp_dtype) + for img in image + ] + output = user_model.generate( + input_ids, images=image_tensor, **generate_kwargs + ) elif model.name == "git": - input_ids = tokenizer(images=prompt, return_tensors="pt").pixel_values - output = user_model.generate(pixel_values=input_ids, **generate_kwargs) + input_ids = tokenizer( + images=prompt, return_tensors="pt" + ).pixel_values + output = user_model.generate( + pixel_values=input_ids, **generate_kwargs + ) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = user_model.generate(input_ids, **generate_kwargs) gen_ids = output[0] if args.token_latency else output - gen_text = tokenizer.batch_decode(gen_ids[:, input_ids.shape[1]:] if model.name=="llava" else gen_ids, skip_special_tokens=True) + gen_text = tokenizer.batch_decode( + ( + gen_ids[:, input_ids.shape[1] :] + if model.name == "llava" + else gen_ids + ), + skip_special_tokens=True, + ) prof.step() print("\n", "-" * 10, "Summary:", "-" * 10) diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/inference/python/llm/utils/create_shard_model.py index 9daac65a2..fcd10d89f 100644 --- a/examples/cpu/inference/python/llm/utils/create_shard_model.py +++ b/examples/cpu/inference/python/llm/utils/create_shard_model.py @@ -1,9 +1,15 @@ import torch import argparse -from transformers import AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, AutoProcessor +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + T5ForConditionalGeneration, + AutoProcessor, +) + # Here import ipex for Baichuan loading compatibility, for other models we can ignore this import -import intel_extension_for_pytorch +import intel_extension_for_pytorch # noqa F401 # supported models MODEL_CLASSES = { @@ -33,6 +39,7 @@ try: from llava.model.language_model.llava_llama import LlavaLlamaForCausalLM from llava.model.builder import load_pretrained_model + MODEL_CLASSES["llava"] = (LlavaLlamaForCausalLM, AutoTokenizer) except ImportError: pass @@ -69,7 +76,7 @@ ) args = parser.parse_args() print(args) -if args.local_rank == 0 
: +if args.local_rank == 0: model_type = next( (x for x in MODEL_CLASSES.keys() if x in args.model_id.lower()), "auto" ) @@ -80,7 +87,9 @@ elif args.dtype == "bfloat16": load_dtype = torch.bfloat16 if model_type != "llava": - tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True) + tokenizer = model_class[1].from_pretrained( + args.model_id, trust_remote_code=True + ) model = model_class[0].from_pretrained( args.model_id, torch_dtype=load_dtype, @@ -88,8 +97,14 @@ trust_remote_code=True, ) else: - tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_id) - model.save_pretrained(save_directory=args.save_path, max_shard_size=args.max_shard_size, safe_serialization=False) + tokenizer, model, image_processor, context_len = load_pretrained_model( + args.model_id + ) + model.save_pretrained( + save_directory=args.save_path, + max_shard_size=args.max_shard_size, + safe_serialization=False, + ) tokenizer.save_pretrained(save_directory=args.save_path) if model_type == "llava": image_processor.save_pretrained(save_directory=args.save_path) diff --git a/examples/cpu/inference/python/llm/utils/model_class/baichuan.py b/examples/cpu/inference/python/llm/utils/model_class/baichuan.py index 4c7550912..37fd4adae 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/baichuan.py +++ b/examples/cpu/inference/python/llm/utils/model_class/baichuan.py @@ -1,9 +1,10 @@ import torch from .llm import LLMConfig, EXAMPLE_INPUTS_MODE -import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch as ipex # noqa F401 import re + class BaichuanConfig(LLMConfig): def __init__(self, model_id): self.name = "baichuan" @@ -21,7 +22,9 @@ def get_user_model(self, config, benchmark): super().get_user_model(config, benchmark) input_ids = torch.ones(32).to(torch.long).unsqueeze(0) attention_mask = torch.ones_like(input_ids) - example_inputs = self.model.prepare_inputs_for_generation(input_ids, attention_mask=attention_mask) + example_inputs = self.model.prepare_inputs_for_generation( + input_ids, attention_mask=attention_mask + ) if example_inputs.get("position_ids", None) is not None: self.example_inputs_mode = EXAMPLE_INPUTS_MODE.MASK_KV_POS return self.model diff --git a/examples/cpu/inference/python/llm/utils/model_class/bloom.py b/examples/cpu/inference/python/llm/utils/model_class/bloom.py index 5da3a39f9..0b239a80a 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/bloom.py +++ b/examples/cpu/inference/python/llm/utils/model_class/bloom.py @@ -1,10 +1,11 @@ -import torch +import torch # noqa F401 from .llm import LLMConfig, EXAMPLE_INPUTS_MODE -from transformers.models.bloom.modeling_bloom import BloomForCausalLM -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers.models.bloom.modeling_bloom import BloomForCausalLM # noqa F401 +from transformers import AutoModelForCausalLM, AutoTokenizer # noqa F401 + +import intel_extension_for_pytorch as ipex # noqa F401 -import intel_extension_for_pytorch as ipex class BloomConfig(LLMConfig): def __init__(self, model_id): @@ -15,4 +16,4 @@ def __init__(self, model_id): # for smooth quant self.default_dataset = "NeelNanda/pile-10k" self.use_global_past_key_value = True - self.use_ipex_autotune = True \ No newline at end of file + self.use_ipex_autotune = True diff --git a/examples/cpu/inference/python/llm/utils/model_class/chatglm.py b/examples/cpu/inference/python/llm/utils/model_class/chatglm.py index aa8153033..3c447cc99 100644 --- 
a/examples/cpu/inference/python/llm/utils/model_class/chatglm.py +++ b/examples/cpu/inference/python/llm/utils/model_class/chatglm.py @@ -2,6 +2,7 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class ChatGLMConfig(LLMConfig): def __init__(self, model_id): self.name = "chatglm" @@ -17,4 +18,4 @@ def __init__(self, model_id): def get_user_model(self, config, benchmark): super().get_user_model(config, benchmark) self.model.config.num_hidden_layers = self.model.config.num_layers - return self.model \ No newline at end of file + return self.model diff --git a/examples/cpu/inference/python/llm/utils/model_class/codegen.py b/examples/cpu/inference/python/llm/utils/model_class/codegen.py index 807bc6f5a..455b31911 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/codegen.py +++ b/examples/cpu/inference/python/llm/utils/model_class/codegen.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class CodeGenConfig(LLMConfig): def __init__(self, model_id): self.name = "codegen" @@ -9,4 +10,4 @@ def __init__(self, model_id): # for smooth quant self.default_dataset = "NeelNanda/pile-10k" self.use_global_past_key_value = True - self.use_ipex_autotune = True \ No newline at end of file + self.use_ipex_autotune = True diff --git a/examples/cpu/inference/python/llm/utils/model_class/falcon.py b/examples/cpu/inference/python/llm/utils/model_class/falcon.py index 1a30daf63..b85993d35 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/falcon.py +++ b/examples/cpu/inference/python/llm/utils/model_class/falcon.py @@ -2,6 +2,7 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class FALCONConfig(LLMConfig): def __init__(self, model_id): self.name = "falcon" @@ -17,7 +18,9 @@ def get_user_model(self, config, benchmark): super().get_user_model(config, benchmark) input_ids = torch.ones(32).to(torch.long).unsqueeze(0) attention_mask = torch.ones_like(input_ids) - example_inputs = self.model.prepare_inputs_for_generation(input_ids, attention_mask=attention_mask) + example_inputs = self.model.prepare_inputs_for_generation( + input_ids, attention_mask=attention_mask + ) if example_inputs.get("position_ids", None) is not None: self.example_inputs_mode = EXAMPLE_INPUTS_MODE.MASK_KV_POS return self.model diff --git a/examples/cpu/inference/python/llm/utils/model_class/git.py b/examples/cpu/inference/python/llm/utils/model_class/git.py index fd11b49cc..2f2c58ddb 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/git.py +++ b/examples/cpu/inference/python/llm/utils/model_class/git.py @@ -1,7 +1,8 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE from transformers import AutoModelForCausalLM, AutoProcessor import torch -import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch as ipex # noqa F401 + class GitConfig(LLMConfig): def __init__(self, model_id): @@ -25,4 +26,4 @@ def get_user_model(self, config, benchmark): return self.model def get_tokenizer(self): - return AutoProcessor.from_pretrained(self.model_id, trust_remote_code=True) \ No newline at end of file + return AutoProcessor.from_pretrained(self.model_id, trust_remote_code=True) diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptbigcode.py b/examples/cpu/inference/python/llm/utils/model_class/gptbigcode.py index 6d5624bb0..416b12bd5 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/gptbigcode.py +++ b/examples/cpu/inference/python/llm/utils/model_class/gptbigcode.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class 
GPTJBigCodeConfig(LLMConfig): def __init__(self, model_id): self.name = "gptbigcode" @@ -9,4 +10,4 @@ def __init__(self, model_id): # for smooth quant self.default_dataset = "NeelNanda/pile-10k" self.use_global_past_key_value = True - self.use_ipex_autotune = True \ No newline at end of file + self.use_ipex_autotune = True diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptj.py b/examples/cpu/inference/python/llm/utils/model_class/gptj.py index fcbbf147e..a3d79e796 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/gptj.py +++ b/examples/cpu/inference/python/llm/utils/model_class/gptj.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class GPTJConfig(LLMConfig): def __init__(self, model_id): self.name = "gpt-j" diff --git a/examples/cpu/inference/python/llm/utils/model_class/gptneox.py b/examples/cpu/inference/python/llm/utils/model_class/gptneox.py index da4f7c471..f8d073204 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/gptneox.py +++ b/examples/cpu/inference/python/llm/utils/model_class/gptneox.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class GPTNEOXConfig(LLMConfig): def __init__(self, model_id): self.name = "gpt-neox" diff --git a/examples/cpu/inference/python/llm/utils/model_class/llama.py b/examples/cpu/inference/python/llm/utils/model_class/llama.py index 31e75fb83..a3b4ea169 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/llama.py +++ b/examples/cpu/inference/python/llm/utils/model_class/llama.py @@ -5,6 +5,7 @@ import intel_extension_for_pytorch as ipex + class LLAMAConfig(LLMConfig): def __init__(self, model_id): self.name = "llama" @@ -16,7 +17,7 @@ def __init__(self, model_id): self.default_dataset = "NeelNanda/pile-10k" self.use_global_past_key_value = True self.use_ipex_autotune = True - + def get_user_model(self, config, benchmark): if benchmark: try: @@ -24,11 +25,17 @@ def get_user_model(self, config, benchmark): self.model = LlamaForCausalLM._from_config(config) except (RuntimeError, AttributeError): self.model = LlamaForCausalLM.from_pretrained( - self.model_id, config=config, low_cpu_mem_usage=True, torch_dtype=torch.half + self.model_id, + config=config, + low_cpu_mem_usage=True, + torch_dtype=torch.half, ) else: self.model = LlamaForCausalLM.from_pretrained( - self.model_id, config=config, low_cpu_mem_usage=True, torch_dtype=torch.float + self.model_id, + config=config, + low_cpu_mem_usage=True, + torch_dtype=torch.float, ) return self.model diff --git a/examples/cpu/inference/python/llm/utils/model_class/llava.py b/examples/cpu/inference/python/llm/utils/model_class/llava.py index fa6f50c13..f774ea33f 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/llava.py +++ b/examples/cpu/inference/python/llm/utils/model_class/llava.py @@ -1,9 +1,11 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + try: from llava.model.builder import load_pretrained_model except ImportError: pass + class LlavaConfig(LLMConfig): def __init__(self, model_id): self.name = "llava" @@ -14,14 +16,16 @@ def __init__(self, model_id): # for smooth quant self.use_global_past_key_value = True self.use_ipex_autotune = True - self.tokenizer, self.model, self.image_processor, _ = load_pretrained_model(self.model_id) - + self.tokenizer, self.model, self.image_processor, _ = load_pretrained_model( + self.model_id + ) + def get_user_model(self, config, benchmark): self.model.config = config return self.model def get_tokenizer(self): return self.tokenizer - + def 
get_image_processor(self): return self.image_processor diff --git a/examples/cpu/inference/python/llm/utils/model_class/llm.py b/examples/cpu/inference/python/llm/utils/model_class/llm.py index fc1b71bbc..6a44ec8eb 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/llm.py +++ b/examples/cpu/inference/python/llm/utils/model_class/llm.py @@ -6,6 +6,7 @@ import intel_extension_for_pytorch as ipex + class EXAMPLE_INPUTS_MODE(IntEnum): MASK_KV = 1 KV_MASK = 2 @@ -15,39 +16,48 @@ class EXAMPLE_INPUTS_MODE(IntEnum): MASK_KV_PIXEL = 6 EMBEDS_MASK_KV = 7 + class LLMConfig(ABC): @abstractmethod def __init__(self, model_id): - ''' - self.name: model name - self.model_id: model id - self.to_channels_last: channels last model - self.example_inputs_mode: - MASK_KV: input_ids+attn_mask+past_kv - KV_MASK: input_ids+past_kv+attn_mask - MASK_POS_KV: input_ids+attn_mask+position_ids+past_kv - MASK_KV_POS: input_ids+attn_mask+past_kv+position_ids - MASK_KV_ENC: input_ids+attn_mask+past_kv+encoder_output - - # if support smooth quant - self.default_dataset: default dataset - self.use_global_past_key_value: - use_global_past_key_value in collate_batch - self.use_ipex_autotune: - use_ipex_autotune in ipex_smooth_quant - ''' + """ + self.name: model name + self.model_id: model id + self.to_channels_last: channels last model + self.example_inputs_mode: + MASK_KV: input_ids+attn_mask+past_kv + KV_MASK: input_ids+past_kv+attn_mask + MASK_POS_KV: input_ids+attn_mask+position_ids+past_kv + MASK_KV_POS: input_ids+attn_mask+past_kv+position_ids + MASK_KV_ENC: input_ids+attn_mask+past_kv+encoder_output + + # if support smooth quant + self.default_dataset: default dataset + self.use_global_past_key_value: + use_global_past_key_value in collate_batch + self.use_ipex_autotune: + use_ipex_autotune in ipex_smooth_quant + """ self.model_id = model_id def get_user_model(self, config, benchmark): if benchmark: try: with ipex.OnDevice(dtype=torch.float, device="meta"): - self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + self.model = AutoModelForCausalLM.from_config( + config, trust_remote_code=True + ) except (RuntimeError, AttributeError): - self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=True) + self.model = AutoModelForCausalLM.from_config( + config, trust_remote_code=True + ) else: self.model = AutoModelForCausalLM.from_pretrained( - self.model_id, torch_dtype=torch.float, config=config, low_cpu_mem_usage=True, trust_remote_code=True + self.model_id, + torch_dtype=torch.float, + config=config, + low_cpu_mem_usage=True, + trust_remote_code=True, ) return self.model diff --git a/examples/cpu/inference/python/llm/utils/model_class/mistral.py b/examples/cpu/inference/python/llm/utils/model_class/mistral.py index 73dbf00b1..936e9695f 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/mistral.py +++ b/examples/cpu/inference/python/llm/utils/model_class/mistral.py @@ -10,4 +10,4 @@ def __init__(self, model_id): # for smooth quant self.use_global_past_key_value = True - self.use_ipex_autotune = True \ No newline at end of file + self.use_ipex_autotune = True diff --git a/examples/cpu/inference/python/llm/utils/model_class/mixtral.py b/examples/cpu/inference/python/llm/utils/model_class/mixtral.py index 4daaf2794..610ec9352 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/mixtral.py +++ b/examples/cpu/inference/python/llm/utils/model_class/mixtral.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class 
MixtralConfig(LLMConfig): def __init__(self, model_id): self.name = "mixtral" diff --git a/examples/cpu/inference/python/llm/utils/model_class/mpt.py b/examples/cpu/inference/python/llm/utils/model_class/mpt.py index a04343a68..b06e1613a 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/mpt.py +++ b/examples/cpu/inference/python/llm/utils/model_class/mpt.py @@ -1,6 +1,7 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE from transformers.models.mpt.modeling_mpt import MptForCausalLM + class MPTConfig(LLMConfig): def __init__(self, model_id): self.name = "mpt" diff --git a/examples/cpu/inference/python/llm/utils/model_class/opt.py b/examples/cpu/inference/python/llm/utils/model_class/opt.py index 9660ac9de..b7fbc34fe 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/opt.py +++ b/examples/cpu/inference/python/llm/utils/model_class/opt.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class OPTConfig(LLMConfig): def __init__(self, model_id): self.name = "opt" diff --git a/examples/cpu/inference/python/llm/utils/model_class/phi.py b/examples/cpu/inference/python/llm/utils/model_class/phi.py index bab8494c3..f437b347b 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/phi.py +++ b/examples/cpu/inference/python/llm/utils/model_class/phi.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class PhiConfig(LLMConfig): def __init__(self, model_id): self.name = "phi" @@ -12,6 +13,7 @@ def __init__(self, model_id): self.use_global_past_key_value = True self.use_ipex_autotune = True + class Phi3Config(LLMConfig): def __init__(self, model_id): self.name = "phi-3" @@ -22,4 +24,4 @@ def __init__(self, model_id): # for smooth quant self.default_dataset = "NeelNanda/pile-10k" self.use_global_past_key_value = True - self.use_ipex_autotune = True \ No newline at end of file + self.use_ipex_autotune = True diff --git a/examples/cpu/inference/python/llm/utils/model_class/qwen.py b/examples/cpu/inference/python/llm/utils/model_class/qwen.py index da5d8cbbd..e1995b1a4 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/qwen.py +++ b/examples/cpu/inference/python/llm/utils/model_class/qwen.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class QwenConfig(LLMConfig): def __init__(self, model_id): self.name = "qwen" diff --git a/examples/cpu/inference/python/llm/utils/model_class/stablelm.py b/examples/cpu/inference/python/llm/utils/model_class/stablelm.py index 0ef5a693e..99e2f9731 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/stablelm.py +++ b/examples/cpu/inference/python/llm/utils/model_class/stablelm.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class StableLMConfig(LLMConfig): def __init__(self, model_id): self.name = "stablelm" diff --git a/examples/cpu/inference/python/llm/utils/model_class/t5.py b/examples/cpu/inference/python/llm/utils/model_class/t5.py index 43345aa0e..86d1610d6 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/t5.py +++ b/examples/cpu/inference/python/llm/utils/model_class/t5.py @@ -3,6 +3,7 @@ from transformers import T5ForConditionalGeneration import intel_extension_for_pytorch as ipex + class T5Config(LLMConfig): def __init__(self, model_id): self.name = "t5" @@ -44,7 +45,9 @@ def get_user_model(self, config, benchmark): ) input_ids = torch.ones(32).to(torch.long).unsqueeze(0) attention_mask = torch.ones_like(input_ids) - example_inputs = self.model.prepare_inputs_for_generation(input_ids, 
attention_mask=attention_mask) + example_inputs = self.model.prepare_inputs_for_generation( + input_ids, attention_mask=attention_mask + ) if example_inputs.get("position_ids", None) is not None: self.example_inputs_mode = EXAMPLE_INPUTS_MODE.MASK_KV_POS return self.model diff --git a/examples/cpu/inference/python/llm/utils/model_class/yuan.py b/examples/cpu/inference/python/llm/utils/model_class/yuan.py index 54ddf85ab..710fdd141 100644 --- a/examples/cpu/inference/python/llm/utils/model_class/yuan.py +++ b/examples/cpu/inference/python/llm/utils/model_class/yuan.py @@ -1,5 +1,6 @@ from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + class YuanConfig(LLMConfig): def __init__(self, model_id): self.name = "yuan" diff --git a/examples/cpu/inference/python/llm/utils/run_gptq.py b/examples/cpu/inference/python/llm/utils/run_gptq.py index 68826b8e5..39fb3e8d0 100644 --- a/examples/cpu/inference/python/llm/utils/run_gptq.py +++ b/examples/cpu/inference/python/llm/utils/run_gptq.py @@ -1,15 +1,15 @@ -''' +""" Ported from Intel(R) Extension for Transformers -https://github.com/intel/intel-extension-for-transformers/blob/53bed434f16cba1fff6cdb30749d3ea545e56ee5/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py +https://github.com/intel/intel-extension-for-transformers/blob/53bed434f16cba1fff6cdb30749d3ea545e56ee5/examples/huggingface/pytorch/language-modeling/quantization/run_clm_no_trainer.py # noqa With unused code removed. -''' +""" import argparse import sys -sys.path.append('./') + +sys.path.append("./") import time import re -from pathlib import Path import torch from datasets import load_dataset from torch.nn.functional import pad @@ -29,7 +29,9 @@ class Evaluator: - def __init__(self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False): + def __init__( + self, dataset, tokenizer, batch_size=8, pad_val=1, pad_max=196, is_calib=False + ): self.dataset = dataset self.tokenizer = tokenizer self.batch_size = batch_size @@ -56,7 +58,11 @@ def collate_batch(self, batch): pad_len = self.pad_max - input_ids.shape[0] last_ind.append(input_ids.shape[0] - 1) if self.is_calib: - input_ids = input_ids[:self.pad_max] if len(input_ids) > self.pad_max else input_ids + input_ids = ( + input_ids[: self.pad_max] + if len(input_ids) > self.pad_max + else input_ids + ) else: input_ids = pad(input_ids, (0, pad_len), value=self.pad_val) input_ids_padded.append(input_ids) @@ -101,9 +107,11 @@ def evaluate(self, model): def get_user_model(): from transformers import AutoModelForCausalLM, AutoModel, AutoTokenizer + torchscript = False if re.search("llama", args.model.lower()): - from transformers import LlamaForCausalLM, AutoTokenizer + from transformers import LlamaForCausalLM + user_model = LlamaForCausalLM.from_pretrained( args.model, torchscript=torchscript, # torchscript will force `return_dict=False` to avoid jit errors @@ -111,6 +119,7 @@ def get_user_model(): tokenizer = AutoTokenizer.from_pretrained(args.model) elif re.search("mpt-7b-chat", args.model.lower()): from mpt_7b.modeling_mpt import MPTForCausalLM + user_model = MPTForCausalLM.from_pretrained( args.model, torchscript=torchscript, # torchscript will force `return_dict=False` to avoid jit errors @@ -120,6 +129,7 @@ def get_user_model(): user_model.config.use_cache = True elif re.search("falcon-7b-instruct", args.model.lower()): from falcon_7b_instruct.modelling_RW import RWForCausalLM + user_model = RWForCausalLM.from_pretrained( args.model, torchscript=torchscript, # torchscript will force 
`return_dict=False` to avoid jit errors @@ -156,7 +166,9 @@ def get_user_model(): calib_dataset = load_dataset(args.dataset, split="train") # calib_dataset = datasets.load_from_disk('/your/local/dataset/pile-10k/') # use this if trouble with connecting to HF calib_dataset = calib_dataset.shuffle(seed=42) -calib_evaluator = Evaluator(calib_dataset, tokenizer, batch_size=1, pad_max=512, is_calib=True) +calib_evaluator = Evaluator( + calib_dataset, tokenizer, batch_size=1, pad_max=512, is_calib=True +) calib_dataloader = DataLoader( calib_evaluator.dataset, batch_size=1, @@ -164,13 +176,13 @@ def get_user_model(): collate_fn=calib_evaluator.collate_batch, ) -compressed_model = ipex.quantization.gptq( - model=user_model, - dataloader=calib_dataloader, - group_size=args.group_size, - act_order=args.act_order, - nsamples=args.nsamples, - use_max_length=args.use_max_length, - pad_max_length=args.pad_max_length, - save_dir=args.output_dir) - +compressed_model = ipex.quantization.gptq( + model=user_model, + dataloader=calib_dataloader, + group_size=args.group_size, + act_order=args.act_order, + nsamples=args.nsamples, + use_max_length=args.use_max_length, + pad_max_length=args.pad_max_length, + save_dir=args.output_dir, +) diff --git a/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py index 423b139e3..a135b2b09 100644 --- a/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py +++ b/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py @@ -1,12 +1,13 @@ import torch import torchvision.models as models -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() data = torch.rand(128, 3, 224, 224) #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model, dtype=torch.bfloat16) ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py b/examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py index 504507e7b..fff892228 100644 --- a/examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py +++ b/examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py @@ -1,12 +1,13 @@ import torch import torchvision.models as models -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() data = torch.rand(128, 3, 224, 224) #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model) ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/resnet50_general_inference_script.py b/examples/cpu/inference/python/resnet50_general_inference_script.py index 0352afe5b..e60c78769 100644 --- a/examples/cpu/inference/python/resnet50_general_inference_script.py +++ b/examples/cpu/inference/python/resnet50_general_inference_script.py @@ -1,6 +1,7 @@ import torch import torchvision.models as models + def inference(model, data): with torch.no_grad(): # warm up @@ -9,14 +10,16 @@ def inference(model, data): # measure import time + start = time.time() for _ in range(100): output = model(data) end = time.time() - print('Inference took {:.2f} ms in average'.format((end - start) / 100 * 1000)) + print("Inference took {:.2f} ms in 
average".format((end - start) / 100 * 1000)) + def main(args): - model = models.resnet50(weights='ResNet50_Weights.DEFAULT') + model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() data = torch.rand(128, 3, 224, 224) @@ -26,9 +29,9 @@ def main(args): model = model.to(memory_format=torch.channels_last) data = data.to(memory_format=torch.channels_last) - if args.dtype == 'float32': + if args.dtype == "float32": model = ipex.optimize(model, dtype=torch.float32) - elif args.dtype == 'bfloat16': + elif args.dtype == "bfloat16": model = ipex.optimize(model, dtype=torch.bfloat16) else: # int8 from intel_extension_for_pytorch.quantization import prepare, convert @@ -44,17 +47,21 @@ def main(args): model = convert(model) - with torch.cpu.amp.autocast(enabled=args.dtype == 'bfloat16'): + with torch.cpu.amp.autocast(enabled=args.dtype == "bfloat16"): with torch.no_grad(): model = torch.jit.trace(model, data) model = torch.jit.freeze(model) inference(model, data) -if __name__ == '__main__': + +if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser() - parser.add_argument('--dtype', default='float32', choices=['float32', 'bfloat16', 'int8']) + parser.add_argument( + "--dtype", default="float32", choices=["float32", "bfloat16", "int8"] + ) main(parser.parse_args()) diff --git a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py index 3656cd868..682230a15 100644 --- a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py +++ b/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py @@ -1,13 +1,14 @@ import torch import torchvision.models as models -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() data = torch.rand(128, 3, 224, 224) # Beta Feature #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model, dtype=torch.bfloat16, weights_prepack=False) model = torch.compile(model, backend="ipex") ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py b/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py index 92a14c75f..bacff3afc 100644 --- a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py +++ b/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py @@ -8,6 +8,7 @@ # Beta Feature #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model, weights_prepack=False) model = torch.compile(model, backend="ipex") ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py index 988e83163..52af5f448 100644 --- a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py +++ b/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py @@ -1,12 +1,13 @@ import torch import torchvision.models as models -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() data = torch.rand(128, 3, 224, 224) #################### code changes #################### # noqa F401 import 
intel_extension_for_pytorch as ipex + model = ipex.optimize(model, dtype=torch.bfloat16) ###################################################### # noqa F401 diff --git a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py b/examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py index f69a8cd8a..62874b84c 100644 --- a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py +++ b/examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py @@ -1,12 +1,13 @@ import torch import torchvision.models as models -model = models.resnet50(weights='ResNet50_Weights.DEFAULT') +model = models.resnet50(weights="ResNet50_Weights.DEFAULT") model.eval() data = torch.rand(128, 3, 224, 224) #################### code changes #################### # noqa F401 import intel_extension_for_pytorch as ipex + model = ipex.optimize(model) ###################################################### # noqa F401 diff --git a/examples/cpu/serving/torchserve/quantize_model.py b/examples/cpu/serving/torchserve/quantize_model.py index fe8b37ff6..6bac0ce41 100644 --- a/examples/cpu/serving/torchserve/quantize_model.py +++ b/examples/cpu/serving/torchserve/quantize_model.py @@ -22,7 +22,7 @@ n_iter = 100 for i in range(n_iter): model(dummy_tensor) - + # convert and deploy model = convert(model) @@ -30,4 +30,4 @@ model = torch.jit.trace(model, dummy_tensor) model = torch.jit.freeze(model) -torch.jit.save(model, './rn50_int8_jit.pt') +torch.jit.save(model, "./rn50_int8_jit.pt") diff --git a/examples/cpu/serving/triton/bert_base/1/model.py b/examples/cpu/serving/triton/bert_base/1/model.py index d92497fa4..d53cfbc86 100644 --- a/examples/cpu/serving/triton/bert_base/1/model.py +++ b/examples/cpu/serving/triton/bert_base/1/model.py @@ -34,34 +34,48 @@ import intel_extension_for_pytorch as ipex import json + def make_model(model_name, input_shape, device, bfloat16): - print(f"{{ origin: '{model_name}', input shape: {input_shape}, enabled bfloat16: {bfloat16}}}") + print( + f"{{ origin: '{model_name}', input shape: {input_shape}, enabled bfloat16: {bfloat16}}}" + ) # Download PyTorch model config = AutoConfig.from_pretrained( - model_name, return_dict=False, torchscript=True, num_labels=2) + model_name, return_dict=False, torchscript=True, num_labels=2 + ) model = BertModel.from_pretrained(model_name, config=config) model = model.eval() vocab_size = model.config.vocab_size data = torch.randint(vocab_size, size=input_shape) - print('Optimizing model in IPEX:') + print("Optimizing model in IPEX:") try: - model = ipex.optimize(model, level="O1",auto_kernel_selection=True, conv_bn_folding=False, dtype=torch.bfloat16 if bfloat16 else torch.float32) - with torch.no_grad(), torch.cpu.amp.autocast(enabled=bfloat16): - model = torch.jit.trace(model, data, check_trace=False, strict=False) - model = torch.jit.freeze(model) - except Exception as e: print(e) - - print('Trigger Init Model Execution') + model = ipex.optimize( + model, + level="O1", + auto_kernel_selection=True, + conv_bn_folding=False, + dtype=torch.bfloat16 if bfloat16 else torch.float32, + ) + with torch.no_grad(), torch.cpu.amp.autocast(enabled=bfloat16): + model = torch.jit.trace(model, data, check_trace=False, strict=False) + model = torch.jit.freeze(model) + except Exception as e: + print(e) + + print("Trigger Init Model Execution") # Enable fusion path (need to run forward propagation twice) with torch.no_grad(), torch.cpu.amp.autocast(enabled=bfloat16): model(data) model(data) return model.to(device) + def 
compute_batch_set(full_batch, batches): if batches is None or len(batches) == 0: - return [full_batch,] + return [ + full_batch, + ] batches = sorted(batches, reverse=True) batch_set = [] @@ -74,6 +88,7 @@ def compute_batch_set(full_batch, batches): return batch_set + def execute_model(models, inputs, batches, dynamic_shape, bfloat16): input_batches = [x.shape[0] for x in inputs] @@ -92,10 +107,10 @@ def execute_model(models, inputs, batches, dynamic_shape, bfloat16): # Execute the model model_outputs = [] with torch.no_grad(), torch.cpu.amp.autocast(enabled=bfloat16): - for i in range(len(splits)): - inp = splitted_inputs[i] - out = models[0 if dynamic_shape else splits[i]](inp)[1] - model_outputs.append(out) + for i in range(len(splits)): + inp = splitted_inputs[i] + out = models[0 if dynamic_shape else splits[i]](inp)[1] + model_outputs.append(out) # Re-combine results full_output = torch.concat(model_outputs, 0) @@ -127,57 +142,64 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = json.loads(args['model_config']) - self.device = torch.device('cpu') + self.model_config = json.loads(args["model_config"]) + self.device = torch.device("cpu") # Get INPUT0 configuration - input0_config = pb_utils.get_input_config_by_name( - self.model_config, "INPUT0") - seq_length = input0_config['dims'][0] + input0_config = pb_utils.get_input_config_by_name(self.model_config, "INPUT0") + seq_length = input0_config["dims"][0] # Get OUTPUT0 configuration output0_config = pb_utils.get_output_config_by_name( - self.model_config, "OUTPUT0") + self.model_config, "OUTPUT0" + ) # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.batches = [] self.dynamic_shape = True self.bfloat16 = False - parameters = self.model_config['parameters'] + parameters = self.model_config["parameters"] - if 'origin' in parameters: - origin = parameters['origin']['string_value'] + if "origin" in parameters: + origin = parameters["origin"]["string_value"] else: - raise pb_utils.TritonModelException("Origin model name should be defined") + raise pb_utils.TritonModelException("Origin model name should be defined") - if 'batches' in parameters: - self.batches = json.loads(parameters['batches']['string_value']) + if "batches" in parameters: + self.batches = json.loads(parameters["batches"]["string_value"]) - if 'dynamic_shape' in parameters: - self.dynamic_shape = json.loads(parameters['dynamic_shape']['string_value']) + if "dynamic_shape" in parameters: + self.dynamic_shape = json.loads(parameters["dynamic_shape"]["string_value"]) - if 'bfloat16' in parameters: - self.bfloat16 = json.loads(parameters['bfloat16']['string_value']) + if "bfloat16" in parameters: + self.bfloat16 = json.loads(parameters["bfloat16"]["string_value"]) self.models_cpu = dict() # Dynamic shapes supported in fp32/bf6 mode for PyTorch+IPEX if self.dynamic_shape: - input_shape = [1, seq_length if seq_length > 0 else 128] - self.models_cpu[0] = make_model(origin, input_shape, self.device, self.bfloat16) + input_shape = [1, seq_length if seq_length > 0 else 128] + self.models_cpu[0] = make_model( + origin, input_shape, self.device, self.bfloat16 + ) else: - if seq_length <= 0: - raise pb_utils.TritonModelException("Dynamic shapes switched off but input size is not defined") + if seq_length <= 0: + raise pb_utils.TritonModelException( + "Dynamic shapes switched off but input size is not defined" + 
) - if self.batches is None or len(self.batches) == 0: - self.batches = [1] + if self.batches is None or len(self.batches) == 0: + self.batches = [1] - for batch in self.batches: - input_shape = [batch, seq_length] - self.models_cpu[batch] = make_model(origin, input_shape, self.device, self.bfloat16) + for batch in self.batches: + input_shape = [batch, seq_length] + self.models_cpu[batch] = make_model( + origin, input_shape, self.device, self.bfloat16 + ) def execute(self, requests): """`execute` must be implemented in every Python model. `execute` @@ -208,16 +230,21 @@ def execute(self, requests): in_0_cpu = dlpack.from_dlpack(in_0).to(self.device) inputs.append(in_0_cpu) - outputs = execute_model(self.models_cpu, inputs, self.batches, self.dynamic_shape, self.bfloat16) + outputs = execute_model( + self.models_cpu, inputs, self.batches, self.dynamic_shape, self.bfloat16 + ) # Convert model outputs to triton responses responses = [] for cur_bert_output in outputs: pooler_output = cur_bert_output.cpu().detach().numpy() - out_tensor_0 = pb_utils.Tensor("OUTPUT0", pooler_output.astype(self.output0_dtype)) + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", pooler_output.astype(self.output0_dtype) + ) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0]) + output_tensors=[out_tensor_0] + ) responses.append(inference_response) return responses @@ -227,4 +254,4 @@ def finalize(self): Implementing `finalize` function is optional. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') \ No newline at end of file + print("Cleaning up...") diff --git a/examples/cpu/serving/triton/bert_large/1/model.py b/examples/cpu/serving/triton/bert_large/1/model.py index d92497fa4..d53cfbc86 100644 --- a/examples/cpu/serving/triton/bert_large/1/model.py +++ b/examples/cpu/serving/triton/bert_large/1/model.py @@ -34,34 +34,48 @@ import intel_extension_for_pytorch as ipex import json + def make_model(model_name, input_shape, device, bfloat16): - print(f"{{ origin: '{model_name}', input shape: {input_shape}, enabled bfloat16: {bfloat16}}}") + print( + f"{{ origin: '{model_name}', input shape: {input_shape}, enabled bfloat16: {bfloat16}}}" + ) # Download PyTorch model config = AutoConfig.from_pretrained( - model_name, return_dict=False, torchscript=True, num_labels=2) + model_name, return_dict=False, torchscript=True, num_labels=2 + ) model = BertModel.from_pretrained(model_name, config=config) model = model.eval() vocab_size = model.config.vocab_size data = torch.randint(vocab_size, size=input_shape) - print('Optimizing model in IPEX:') + print("Optimizing model in IPEX:") try: - model = ipex.optimize(model, level="O1",auto_kernel_selection=True, conv_bn_folding=False, dtype=torch.bfloat16 if bfloat16 else torch.float32) - with torch.no_grad(), torch.cpu.amp.autocast(enabled=bfloat16): - model = torch.jit.trace(model, data, check_trace=False, strict=False) - model = torch.jit.freeze(model) - except Exception as e: print(e) - - print('Trigger Init Model Execution') + model = ipex.optimize( + model, + level="O1", + auto_kernel_selection=True, + conv_bn_folding=False, + dtype=torch.bfloat16 if bfloat16 else torch.float32, + ) + with torch.no_grad(), torch.cpu.amp.autocast(enabled=bfloat16): + model = torch.jit.trace(model, data, check_trace=False, strict=False) + model = torch.jit.freeze(model) + except Exception as e: + print(e) + + print("Trigger Init Model Execution") # Enable fusion path (need to run forward propagation twice) with 
torch.no_grad(), torch.cpu.amp.autocast(enabled=bfloat16): model(data) model(data) return model.to(device) + def compute_batch_set(full_batch, batches): if batches is None or len(batches) == 0: - return [full_batch,] + return [ + full_batch, + ] batches = sorted(batches, reverse=True) batch_set = [] @@ -74,6 +88,7 @@ def compute_batch_set(full_batch, batches): return batch_set + def execute_model(models, inputs, batches, dynamic_shape, bfloat16): input_batches = [x.shape[0] for x in inputs] @@ -92,10 +107,10 @@ def execute_model(models, inputs, batches, dynamic_shape, bfloat16): # Execute the model model_outputs = [] with torch.no_grad(), torch.cpu.amp.autocast(enabled=bfloat16): - for i in range(len(splits)): - inp = splitted_inputs[i] - out = models[0 if dynamic_shape else splits[i]](inp)[1] - model_outputs.append(out) + for i in range(len(splits)): + inp = splitted_inputs[i] + out = models[0 if dynamic_shape else splits[i]](inp)[1] + model_outputs.append(out) # Re-combine results full_output = torch.concat(model_outputs, 0) @@ -127,57 +142,64 @@ def initialize(self, args): """ # You must parse model_config. JSON string is not parsed here - self.model_config = json.loads(args['model_config']) - self.device = torch.device('cpu') + self.model_config = json.loads(args["model_config"]) + self.device = torch.device("cpu") # Get INPUT0 configuration - input0_config = pb_utils.get_input_config_by_name( - self.model_config, "INPUT0") - seq_length = input0_config['dims'][0] + input0_config = pb_utils.get_input_config_by_name(self.model_config, "INPUT0") + seq_length = input0_config["dims"][0] # Get OUTPUT0 configuration output0_config = pb_utils.get_output_config_by_name( - self.model_config, "OUTPUT0") + self.model_config, "OUTPUT0" + ) # Convert Triton types to numpy types self.output0_dtype = pb_utils.triton_string_to_numpy( - output0_config['data_type']) + output0_config["data_type"] + ) self.batches = [] self.dynamic_shape = True self.bfloat16 = False - parameters = self.model_config['parameters'] + parameters = self.model_config["parameters"] - if 'origin' in parameters: - origin = parameters['origin']['string_value'] + if "origin" in parameters: + origin = parameters["origin"]["string_value"] else: - raise pb_utils.TritonModelException("Origin model name should be defined") + raise pb_utils.TritonModelException("Origin model name should be defined") - if 'batches' in parameters: - self.batches = json.loads(parameters['batches']['string_value']) + if "batches" in parameters: + self.batches = json.loads(parameters["batches"]["string_value"]) - if 'dynamic_shape' in parameters: - self.dynamic_shape = json.loads(parameters['dynamic_shape']['string_value']) + if "dynamic_shape" in parameters: + self.dynamic_shape = json.loads(parameters["dynamic_shape"]["string_value"]) - if 'bfloat16' in parameters: - self.bfloat16 = json.loads(parameters['bfloat16']['string_value']) + if "bfloat16" in parameters: + self.bfloat16 = json.loads(parameters["bfloat16"]["string_value"]) self.models_cpu = dict() # Dynamic shapes supported in fp32/bf6 mode for PyTorch+IPEX if self.dynamic_shape: - input_shape = [1, seq_length if seq_length > 0 else 128] - self.models_cpu[0] = make_model(origin, input_shape, self.device, self.bfloat16) + input_shape = [1, seq_length if seq_length > 0 else 128] + self.models_cpu[0] = make_model( + origin, input_shape, self.device, self.bfloat16 + ) else: - if seq_length <= 0: - raise pb_utils.TritonModelException("Dynamic shapes switched off but input size is not defined") + if 
seq_length <= 0: + raise pb_utils.TritonModelException( + "Dynamic shapes switched off but input size is not defined" + ) - if self.batches is None or len(self.batches) == 0: - self.batches = [1] + if self.batches is None or len(self.batches) == 0: + self.batches = [1] - for batch in self.batches: - input_shape = [batch, seq_length] - self.models_cpu[batch] = make_model(origin, input_shape, self.device, self.bfloat16) + for batch in self.batches: + input_shape = [batch, seq_length] + self.models_cpu[batch] = make_model( + origin, input_shape, self.device, self.bfloat16 + ) def execute(self, requests): """`execute` must be implemented in every Python model. `execute` @@ -208,16 +230,21 @@ def execute(self, requests): in_0_cpu = dlpack.from_dlpack(in_0).to(self.device) inputs.append(in_0_cpu) - outputs = execute_model(self.models_cpu, inputs, self.batches, self.dynamic_shape, self.bfloat16) + outputs = execute_model( + self.models_cpu, inputs, self.batches, self.dynamic_shape, self.bfloat16 + ) # Convert model outputs to triton responses responses = [] for cur_bert_output in outputs: pooler_output = cur_bert_output.cpu().detach().numpy() - out_tensor_0 = pb_utils.Tensor("OUTPUT0", pooler_output.astype(self.output0_dtype)) + out_tensor_0 = pb_utils.Tensor( + "OUTPUT0", pooler_output.astype(self.output0_dtype) + ) inference_response = pb_utils.InferenceResponse( - output_tensors=[out_tensor_0]) + output_tensors=[out_tensor_0] + ) responses.append(inference_response) return responses @@ -227,4 +254,4 @@ def finalize(self): Implementing `finalize` function is optional. This function allows the model to perform any necessary clean ups before exit. """ - print('Cleaning up...') \ No newline at end of file + print("Cleaning up...") diff --git a/examples/cpu/training/distributed_data_parallel_training.py b/examples/cpu/training/distributed_data_parallel_training.py index 82d83f2aa..c2601ee83 100644 --- a/examples/cpu/training/distributed_data_parallel_training.py +++ b/examples/cpu/training/distributed_data_parallel_training.py @@ -2,44 +2,41 @@ import torch import torch.distributed as dist import torchvision -import oneccl_bindings_for_pytorch as torch_ccl +import oneccl_bindings_for_pytorch as torch_ccl # noqa F401 import intel_extension_for_pytorch as ipex LR = 0.001 DOWNLOAD = True -DATA = 'datasets/cifar10/' - -os.environ['MASTER_ADDR'] = '127.0.0.1' -os.environ['MASTER_PORT'] = '29500' -os.environ['RANK'] = os.environ.get('PMI_RANK', 0) -os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', 1) -dist.init_process_group( -backend='ccl', -init_method='env://' +DATA = "datasets/cifar10/" + +os.environ["MASTER_ADDR"] = "127.0.0.1" +os.environ["MASTER_PORT"] = "29500" +os.environ["RANK"] = os.environ.get("PMI_RANK", 0) +os.environ["WORLD_SIZE"] = os.environ.get("PMI_SIZE", 1) +dist.init_process_group(backend="ccl", init_method="env://") +rank = os.environ["RANK"] + +transform = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize((224, 224)), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] ) -rank = os.environ['RANK'] - -transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) -]) train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, + root=DATA, + train=True, + transform=transform, + download=DOWNLOAD, ) dist_sampler = 
torch.utils.data.distributed.DistributedSampler(train_dataset) train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128, - sampler=dist_sampler + dataset=train_dataset, batch_size=128, sampler=dist_sampler ) model = torchvision.models.resnet50() criterion = torch.nn.CrossEntropyLoss() -optimizer = torch.optim.SGD(model.parameters(), lr = LR, momentum=0.9) +optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9) model.train() model, optimizer = ipex.optimize(model, optimizer=optimizer) @@ -51,13 +48,16 @@ loss = criterion(output, target) loss.backward() optimizer.step() - print('batch_id: {}'.format(batch_idx)) + print("batch_id: {}".format(batch_idx)) if rank == 0: - torch.save({ - 'model_state_dict': model.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), - }, 'checkpoint.pth') + torch.save( + { + "model_state_dict": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + }, + "checkpoint.pth", + ) dist.destroy_process_group() -print("Execution finished") \ No newline at end of file +print("Execution finished") diff --git a/examples/cpu/training/llm/finetune.py b/examples/cpu/training/llm/finetune.py index 230f941af..ae09cf482 100644 --- a/examples/cpu/training/llm/finetune.py +++ b/examples/cpu/training/llm/finetune.py @@ -4,7 +4,7 @@ """ import os -from typing import List +from typing import tuple import fire import torch @@ -44,10 +44,10 @@ def train( lora_r: int = 8, lora_alpha: int = 16, lora_dropout: float = 0.05, - lora_target_modules: List[str] = [ + lora_target_modules: tuple[str] = ( "q_proj", "v_proj", - ], + ), # llm hyperparams train_on_inputs: bool = True, # if False, masks out inputs in loss group_by_length: bool = False, # faster, but produces an odd training loss curve @@ -123,9 +123,7 @@ def train( base_model, attn_implementation="eager" ) else: - model = AutoModelForCausalLM.from_pretrained( - base_model - ) + model = AutoModelForCausalLM.from_pretrained(base_model) tokenizer = AutoTokenizer.from_pretrained(base_model) @@ -179,7 +177,7 @@ def generate_and_tokenize_prompt(data_point): config = LoraConfig( r=lora_r, lora_alpha=lora_alpha, - target_modules=lora_target_modules, + target_modules=list(lora_target_modules), lora_dropout=lora_dropout, bias="none", task_type="CAUSAL_LM", diff --git a/examples/cpu/training/llm/utils/prompter.py b/examples/cpu/training/llm/utils/prompter.py index 0915f2aa9..269afabe1 100644 --- a/examples/cpu/training/llm/utils/prompter.py +++ b/examples/cpu/training/llm/utils/prompter.py @@ -56,9 +56,7 @@ def generate_prompt( instruction=instruction, input=input ) else: - res = self.template["prompt_no_input"].format( - instruction=instruction - ) + res = self.template["prompt_no_input"].format(instruction=instruction) if label: res = f"{res}{label}" if self._verbose: diff --git a/examples/cpu/training/single_instance_training_bf16.py b/examples/cpu/training/single_instance_training_bf16.py index 9a7c5cdcd..fa596e686 100644 --- a/examples/cpu/training/single_instance_training_bf16.py +++ b/examples/cpu/training/single_instance_training_bf16.py @@ -4,23 +4,22 @@ LR = 0.001 DOWNLOAD = True -DATA = 'datasets/cifar10/' +DATA = "datasets/cifar10/" -transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) -]) +transform = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize((224, 224)), + torchvision.transforms.ToTensor(), + 
torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] +) train_dataset = torchvision.datasets.CIFAR10( root=DATA, train=True, transform=transform, download=DOWNLOAD, ) -train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 -) +train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128) model = torchvision.models.resnet50() criterion = torch.nn.CrossEntropyLoss() @@ -41,9 +40,12 @@ optimizer.step() print(batch_idx) -torch.save({ - 'model_state_dict': model.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), -}, 'checkpoint.pth') +torch.save( + { + "model_state_dict": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + }, + "checkpoint.pth", +) print("Execution finished") diff --git a/examples/cpu/training/single_instance_training_fp32.py b/examples/cpu/training/single_instance_training_fp32.py index a1bab1a81..ae2b970ad 100644 --- a/examples/cpu/training/single_instance_training_fp32.py +++ b/examples/cpu/training/single_instance_training_fp32.py @@ -4,23 +4,22 @@ LR = 0.001 DOWNLOAD = True -DATA = 'datasets/cifar10/' +DATA = "datasets/cifar10/" -transform = torchvision.transforms.Compose([ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) -]) +transform = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize((224, 224)), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] +) train_dataset = torchvision.datasets.CIFAR10( root=DATA, train=True, transform=transform, download=DOWNLOAD, ) -train_loader = torch.utils.data.DataLoader( - dataset=train_dataset, - batch_size=128 -) +train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128) model = torchvision.models.resnet50() criterion = torch.nn.CrossEntropyLoss() @@ -39,9 +38,12 @@ optimizer.step() print(batch_idx) -torch.save({ - 'model_state_dict': model.state_dict(), - 'optimizer_state_dict': optimizer.state_dict(), -}, 'checkpoint.pth') +torch.save( + { + "model_state_dict": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + }, + "checkpoint.pth", +) print("Execution finished") diff --git a/scripts/tools/setup/flake8.py b/scripts/tools/setup/flake8.py index df6200779..338ce97da 100644 --- a/scripts/tools/setup/flake8.py +++ b/scripts/tools/setup/flake8.py @@ -23,25 +23,31 @@ def check_flake8_errors(base_dir, filepath): black_cmd.append(filepath) flak8_cmd.append(filepath) + ret_blk = 0 # Auto format python code. - blk_output = subprocess.check_output( - black_cmd, - cwd=base_dir, - stderr=subprocess.STDOUT, - ) - output_string = blk_output.decode("utf-8") - print(output_string) - if output_string.find("reformatted") == -1: - ret_blk = 0 - else: - ret_blk = 1 + try: + blk_output = subprocess.check_output( + black_cmd, + cwd=base_dir, + stderr=subprocess.STDOUT, + ) + + output_string = blk_output.decode("utf-8") + print(output_string) + if output_string.find("reformatted") == -1: + ret_blk = 0 + else: + ret_blk = 1 - # Check code style. - ret_flak8 = subprocess.call(flak8_cmd, cwd=base_dir) - status_code = ret_flak8 + ret_blk - print("status code: ", status_code) + # Check code style. 
+ ret_flak8 = subprocess.call(flak8_cmd, cwd=base_dir) + status_code = ret_flak8 + ret_blk + print("status code: ", status_code) - return status_code + return status_code + except subprocess.CalledProcessError as e: + print(e.output) + return 1 if __name__ == "__main__": @@ -52,8 +58,9 @@ def check_flake8_errors(base_dir, filepath): base_pydir = os.path.join(base_dir, "intel_extension_for_pytorch") base_scripts = os.path.join(base_dir, "scripts") base_cpu_uts = os.path.join(base_dir, "tests/cpu") + base_cpu_example = os.path.join(base_dir, "examples/cpu") - Check_dir = [setupfile, base_pydir, base_scripts, base_cpu_uts] + Check_dir = [setupfile, base_pydir, base_scripts, base_cpu_uts, base_cpu_example] ret = sum([check_flake8_errors(base_dir, path) for path in Check_dir]) if ret > 0: print("ERROR: flake8 found format errors!") From b7bc2ade7c583b5a49cf59f869e4d92503f614ff Mon Sep 17 00:00:00 2001 From: Ryan Tao <65508217+RanTao123@users.noreply.github.com> Date: Sat, 11 May 2024 09:49:40 +0800 Subject: [PATCH 055/199] Improve the first token performance for weight dtype = int8 by dequant weight upfront (#2530) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * weight dequant upfront with low_p_mode = 0, 1, 2 * add header file * add code for low_p mode = 3 and fusion_type = 0, 1, 2, 3 * modify the code according to the comment * delete dispatcher lowp_mode3_dequant_upfront * update dequantGemmTpp code * add dequant upfront for int8 * implement dequant upfront with int8 * delete unused code. * delete unnecessary changes. * delete unnecessary changes. * delete code for weight dtype = int4 * modify the format * delete unnecessary change. * delete x.reshape and apply ConvertTPP * delete unused codes. * delete unused codes. * delete unnecessary empty lines. * modify according to the comment. * remove unnecessary change. * implement brgemm count != 1 for y_buf == true * add DEQUANT_UPFRONT_THRESHOLD * clang-format * remove unused code * keep the precision.
* repair the bug * improve the performance * improve performance * code standardization * code standardization * modify according to comment * remove unnecessary change * clang-format * Refine code and add UT * Remove lm_head_generation * Fix clang-format issue * reduce unnecessary change * Refine code * Refine code * Fix bugs and simplify code * Fix clang-format issue * Simplify code * Simplify code * Remove unused code and refine post op implementation * Fix UT failure on old platforms * Fix flake8 issue * Use TPP fused post op when Tout == TGemmOut and post op is not add * Fix UT * fix clang-format and flake8 issues --------- Co-authored-by: root Co-authored-by: Xia, Weiwen --- csrc/cpu/aten/kernels/WoqTppKrnl.cpp | 325 +++++++++++++++++- csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp | 2 +- csrc/cpu/tpp/kernels/TPPGEMMKrnl.h | 148 ++++++-- csrc/cpu/tpp/tensor_helper.h | 3 +- tests/cpu/test_quantization_default_recipe.py | 120 ++++--- 5 files changed, 520 insertions(+), 78 deletions(-) diff --git a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp index e8675a7fa..f6782c937 100644 --- a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp @@ -5,6 +5,7 @@ #include #include #include +#include "csrc/cpu/tpp/kernels/TPPGEMMKrnl.h" #include "csrc/cpu/tpp/woq/tla.h" #ifdef __GNUC__ @@ -117,6 +118,7 @@ at::Tensor map_nf4_tensor_to_float(const at::Tensor& t) { #define QUANT_A_THRESHOLD 30720 #define SMALL_BATCH_THRESHOLD 32 +#define DEQUANT_UPFRONT_THRESHOLD 1024 #define PARALLEL_M_THRESHOLD 128 constexpr long PREFETCH_K_DIST = 64; // TODO(jgong5): do not hard-code constexpr long LOOP_K_UNROLL = 4; // TODO(jgong5): do not hard-code @@ -1675,6 +1677,297 @@ class DequantGemmTPP< long ldc; }; +// Compared to qlinear_woq_affine_impl, +// this function dequantize weight upfront before gemm to improve the +// performance for the first token. +template < + typename T, + typename TComp, + typename TGemmOut, + typename Tout, + typename TScale, + typename TZero, + int quant_a_mode = -1, + int quant_w_mode = 0> +void qlinear_woq_affine_dequant_upfront_impl( + const at::Tensor& x, + const at::Tensor& qw_packed, + const at::Tensor& scales, // dtype is TComp + const at::Tensor& b, // dtype is TGemmOut + at::Tensor y, + const int qw_type, + int fusion_type, + const TensorList& others_list, + int64_t quant_block_k, + const std::optional& zps = std::nullopt) { // dtype is TComp + const bool sym_quant = is_sym_quant(qw_type); + auto x_sizes = x.sizes(); + auto w_sizes = qw_packed.sizes(); + auto Nc = w_sizes[0]; + auto Nb = w_sizes[3]; + auto Kc = w_sizes[1]; + auto Kb = w_sizes[2]; + auto N = Nc * Nb; + auto K = Kc * Kb; + TLA_ASSERT( + !(std::is_same() || std::is_same()), + "WOQ dequant upfront does not support uint8_t computation"); + + auto quant_block_multiple = quant_block_k == 0 ? 1 : quant_block_k / Kb; + auto quant_k_blocks = + quant_block_k == 0 ? 1 : (K + quant_block_k - 1) / quant_block_k; + int scales_kc = quant_w_mode == QUANT_W_PER_CHANNEL ? QUANT_W_PER_K_BLOCK + : quant_k_blocks; + + auto pw = GetVLAPtr((uint8_t*)qw_packed.data_ptr(), {Kc, Kb * Nb}); + auto ldy = N; + + torch::Tensor dqw = + torch::empty({Nc, Kc, Kb, Nb}, c10::CppTypeToScalarType::value); + if (std::is_same()) { + // reshape to VNNI format + dqw = dqw.view({Nc, Kc, Kb / 2, Nb, 2}); + } + auto dqw_ptr = GetVLAPtr(dqw, {Kc, Kb, Nb}); + auto tin0 = others_list.size() > 0 + ? others_list[0].to(c10::CppTypeToScalarType::value) + : at::Tensor{}; + auto tin1 = others_list.size() > 1 + ? 
others_list[1].to(c10::CppTypeToScalarType::value) + : at::Tensor{}; + auto pscales = GetVLAPtr(scales, {scales_kc, Nb}); + auto pzps = sym_quant ? GetVLAPtr(nullptr, {1, 1}) + : GetVLAPtr(zps.value(), {scales_kc, Nb}); + product_dispatcher< + std::tuple, + std::tuple< + enumerate_dispatcher, + enumerate_dispatcher>>:: + call( + std::make_tuple(qw_type, Nb), + [&](auto tuple) { + auto qw_type_ = std::get<0>(tuple); + auto block_n = std::get<1>(tuple); + auto loop_scheme = "bA"; + auto dequant_loop = + ThreadedLoop<2>({{Nc}, {0, Kc, Kc}}, loop_scheme); + constexpr const int N_GROUP_SIZE = get_n_group_size(block_n); + dequant_loop( + [&](int* idx) { + int nc = idx[0]; + int kc_start = idx[1]; + int kc_end = kc_start + Kc; + for (int kc = kc_start; kc < kc_end; kc++) { + int32_t k_groups = -1; + int32_t quant_offset = kc / quant_block_multiple; + TScale* scale_w = nullptr; + TZero* zp_w = nullptr; + if constexpr (quant_w_mode == QUANT_W_PER_CHANNEL) { + scale_w = pscales[nc][0]; + if (!sym_quant) { + zp_w = pzps[nc][0]; + } + } else { + scale_w = pscales[nc][quant_offset]; + if (!sym_quant) { + zp_w = pzps[nc][quant_offset]; + } + } + Dequantize::call( + pw[nc][kc], + Kb, + block_n, + scale_w, + zp_w, + dqw_ptr[nc][kc][0]); + } + }, + [&]() {}, + [&]() {}); + + auto x_reshaped = x.dim() == 3 ? x : x.view({x_sizes[0], 1, K}); + auto M = y.numel() / y.size(-1); + auto block_m = 64L; + auto rem = M % block_m; + auto ldy = N; + if (Nc % 4 == 0) { + Nc /= 4; + Nb *= 4; + } else if (Nc % 2 == 0) { + Nc /= 2; + Nb *= 2; + } + auto gelu_fwd_tpp_ptr = fusion_type == FUSE_GELU_ERF + ? std::make_shared>( + GeluFwdTPP(block_m, Nb, ldy, ldy)) + : nullptr; + auto gelu_fwd_tpp_rem_ptr = fusion_type == FUSE_GELU_ERF + ? std::make_shared>( + GeluFwdTPP(rem, Nb, ldy, ldy)) + : nullptr; + bool has_add_post_op = + fusion_type == FUSE_ADD || fusion_type == FUSE_ADD_ADD; + auto add_tpp_ptr = has_add_post_op + ? std::make_shared>( + AddTPP(block_m, Nb, ldy, ldy)) + : nullptr; + auto add_tpp_rem_ptr = has_add_post_op + ? std::make_shared>( + AddTPP(rem, Nb, ldy, ldy)) + : nullptr; + auto gelu_tanh_fwd_tpp_ptr = fusion_type == FUSE_GELU_TANH + ? std::make_shared>( + GeluTanhFwdTPP(block_m, Nb, ldy, ldy)) + : nullptr; + auto gelu_tanh_fwd_tpp_rem_ptr = fusion_type == FUSE_GELU_TANH + ? 
std::make_shared>( + GeluTanhFwdTPP(rem, Nb, ldy, ldy)) + : nullptr; + auto in0_ptr = GetVLAPtr(tin0, {Nc, Nb}); + auto in1_ptr = GetVLAPtr(tin1, {Nc, Nb}); + + // For add/add_add, using aten add is faster than TPP fused kernel + auto tpp_linear_with_post_op = [&](at::Tensor& in, + at::Tensor& out, + int fuse_type = 0) { + if (fuse_type == FUSE_GELU_ERF) { + tpp_linear_gelu(in, dqw, b, out); + } else if (fuse_type == FUSE_ADD || fuse_type == FUSE_ADD_ADD) { + TLA_ASSERT( + false, + "fuse_type should not be ADD or ADD_ADD since it's slower than aten add"); + } else if (fuse_type == FUSE_GELU_TANH) { + tpp_linear_gelu_tanh(in, dqw, b, out); + } else { + tpp_linear_bias(in, dqw, b, out); + } + }; + + // Maybe convert x to TComp and then call tpp_linear + // We can only call tpp linear with post op when Tout == TGemmOut + // Otherwise, we need to convert the output to Tout then apply post + // op ourselves + auto maybe_cvt_x_and_compute = [&](at::Tensor& y, + int fuse_type = 0) { + if constexpr (!std::is_same()) { + auto x_comp = at::empty( + x_reshaped.sizes(), + x_reshaped.options().dtype( + c10::CppTypeToScalarType::value)); + auto cvt_x_tpp = ConvertTPP(block_m, Kb, K, K); + auto cvt_x_rem_tpp = ConvertTPP(rem, Kb, K, K); + auto cvt_loop = torch_ipex::tpp::ThreadedLoop<2>( + {{0, M, block_m}, {Kc}}, "AB"); + auto in_ptr = GetVLAPtr(x, {Kc, Kb}); + auto out_ptr = GetVLAPtr(x_comp, {Kc, Kb}); + cvt_loop([&](int* ind) { + int m = ind[0], kc = ind[1]; + if (m + block_m <= M) { + cvt_x_tpp(in_ptr[m][kc], out_ptr[m][kc]); + } else { + cvt_x_rem_tpp(in_ptr[m][kc], out_ptr[m][kc]); + } + }); + tpp_linear_with_post_op(x_comp, y, fuse_type); + } else { + tpp_linear_with_post_op(x_reshaped, y, fuse_type); + } + }; + + // If Tout != TGemmOut, such as the lowp-mode=bf16 case, we need a + // buffer for output + if constexpr (!std::is_same()) { + auto y_gemm = at::empty( + {M, y.size(-1)}, + y.options().dtype(c10::CppTypeToScalarType::value)); + maybe_cvt_x_and_compute(y_gemm); + auto cvt_y_tpp = + ConvertTPP(block_m, Nb, ldy, ldy); + auto cvt_y_rem_tpp = + ConvertTPP(rem, Nb, ldy, ldy); + auto post_loop = torch_ipex::tpp::ThreadedLoop<2>( + {{0, M, block_m}, {Nc}}, "AB"); + auto in_ptr = GetVLAPtr(y_gemm, {Nc, Nb}); + auto out_ptr = GetVLAPtr(y, {Nc, Nb}); + // Convert y to T and handle post ops + if (fusion_type == 0) { + post_loop([&](int* ind) { + int m = ind[0], nc = ind[1]; + if (m + block_m <= M) { + cvt_y_tpp(in_ptr[m][nc], out_ptr[m][nc]); + } else { + cvt_y_rem_tpp(in_ptr[m][nc], out_ptr[m][nc]); + } + }); + } else if (fusion_type == FUSE_GELU_ERF) { + post_loop([&](int* ind) { + int m = ind[0], nc = ind[1]; + if (m + block_m <= M) { + cvt_y_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*gelu_fwd_tpp_ptr)(out_ptr[m][nc], out_ptr[m][nc]); + } else { + cvt_y_rem_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*gelu_fwd_tpp_rem_ptr)(out_ptr[m][nc], out_ptr[m][nc]); + } + }); + } else if (fusion_type == FUSE_GELU_TANH) { + post_loop([&](int* ind) { + int m = ind[0], nc = ind[1]; + if (m + block_m <= M) { + cvt_y_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*gelu_tanh_fwd_tpp_ptr)(out_ptr[m][nc], out_ptr[m][nc]); + } else { + cvt_y_rem_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*gelu_tanh_fwd_tpp_rem_ptr)( + out_ptr[m][nc], out_ptr[m][nc]); + } + }); + } else if (fusion_type == FUSE_ADD) { + post_loop([&](int* ind) { + int m = ind[0], nc = ind[1]; + if (m + block_m <= M) { + cvt_y_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*add_tpp_ptr)( + out_ptr[m][nc], in0_ptr[m][nc], out_ptr[m][nc]); + } else { + cvt_y_rem_tpp(in_ptr[m][nc], 
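                  // (Continuing the Tout != TGemmOut case noted above, e.g.
                  // lowp-mode=bf16: the GEMM wrote into the temporary y_gemm
                  // buffer, and these post loops convert each (block_m x Nb)
                  // tile to Tout and apply the requested post op in the same
                  // pass, so the intermediate buffer is traversed only once.
                  // In the Tout == TGemmOut branch further down, add/add_add
                  // is instead delegated to aten add_, which the comment there
                  // notes is faster than the fused TPP add.)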
out_ptr[m][nc]); + (*add_tpp_rem_ptr)( + out_ptr[m][nc], in0_ptr[m][nc], out_ptr[m][nc]); + } + }); + } else if (fusion_type == FUSE_ADD_ADD) { + post_loop([&](int* ind) { + int m = ind[0], nc = ind[1]; + if (m + block_m <= M) { + cvt_y_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*add_tpp_ptr)( + out_ptr[m][nc], in0_ptr[m][nc], out_ptr[m][nc]); + (*add_tpp_ptr)( + out_ptr[m][nc], in1_ptr[m][nc], out_ptr[m][nc]); + } else { + cvt_y_rem_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*add_tpp_rem_ptr)( + out_ptr[m][nc], in0_ptr[m][nc], out_ptr[m][nc]); + (*add_tpp_rem_ptr)( + out_ptr[m][nc], in1_ptr[m][nc], out_ptr[m][nc]); + } + }); + } + } else { // Tout == TGemmOut + // For add/add_add, using aten add is faster than TPP fused kernel + if (fusion_type == FUSE_ADD || fusion_type == FUSE_ADD_ADD) { + maybe_cvt_x_and_compute(y, 0); + for (auto& tin : others_list) { + y.add_(tin.view(y.sizes())); + } + } else { + maybe_cvt_x_and_compute(y, fusion_type); + } + } + }, + [](auto tuple) { failing_fallback(); }); +} + // If T != TComp // T -> TComp -> GEMM -> TComp -> bias/PostOp -> Tout // If T == TComp (we can save intermediate output buffer and schedule M/N/K @@ -1720,6 +2013,33 @@ void qlinear_woq_affine_impl( TLA_ASSERT(Nb % 16 == 0, "Nb must be a multiple of 16"); + // For first token with large M, go to the dequant upfront path + // Now it only supports INT8 weight + if constexpr (!std::is_same()) { + if (M >= DEQUANT_UPFRONT_THRESHOLD && !is_4bit_flag) { + qlinear_woq_affine_dequant_upfront_impl< + T, + TComp, + TGemmOut, + Tout, + TScale, + TZero, + quant_a_mode, + quant_w_mode>( + x, + qw_packed, + scales, + b, + y, + qw_type, + fusion_type, + others_list, + quant_block_k, + zps); + return; + } + } + // select BLOCK_M according to M // TODO(jgong5): improve the heuristic auto BLOCK_M = [&]() -> long { @@ -2324,7 +2644,10 @@ at::Tensor qlinear_woq_pack( auto K = is_4bit_flag ? 
sizes[1] * 2 : sizes[1]; TLA_ASSERT(N % block_n == 0, "N must be multiple of block_n"); TLA_ASSERT(K % block_k == 0, "K must be multiple of block_k"); - TLA_ASSERT(block_n % 16 == 0, "block_n must be multiple of 16 for int4"); + if (is_4bit_flag) { + TLA_ASSERT( + block_n % 16 == 0, "block_n must be multiple of 16 for 4bit weight"); + } if (lowp_mode == LOWP_MODE_INT8) { TLA_ASSERT( block_k % 4 == 0, diff --git a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp index be16ecb82..11e8787bc 100644 --- a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp +++ b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp @@ -275,7 +275,7 @@ static at::Tensor _shuffle_input_channels_if_needed( auto& g_idx = context.g_idx_.value(); auto K = input.size(-1); std::vector input_shape = {input.numel() / K, K}; - return woq_shuffle_tensor_by_group_idx( + return woq_shuffle_tensor_by_group_idx( input, input_shape, g_idx, context.group_size_); } return input; diff --git a/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h b/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h index 08ea1c64b..3ab141406 100644 --- a/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h +++ b/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h @@ -31,7 +31,9 @@ REGISTER_LOCAL_SCOPE( REGISTER_LOCAL_SCOPE( tpp_linear_gelu_krnl, "tpp_linear_gelu_krnl"); // linear bias + gelu - +REGISTER_LOCAL_SCOPE( + tpp_linear_gelu_tanh_krnl, + "tpp_linear_gelu_tanh_krnl"); // linear bias + gelu_tanh REGISTER_LOCAL_SCOPE( tpp_linear_mul_krnl, "tpp_linear_mul_krnl"); // linear bias + mul @@ -65,7 +67,7 @@ inline at::Tensor wt_tensor_for_first_token(at::Tensor& t) { auto C2 = sizes[2]; auto K2 = sizes[3]; auto C3 = sizes[4]; - if (K2 >= 32) + if (K2 > 32) return t; auto t_new = t.new_empty({K1 / RBS, C1, C2, RBS * K2, C3}); auto in = GetVLAPtr(t, {RBS, C1, C2, K2 * C3}); @@ -86,7 +88,7 @@ inline at::Tensor wt_tensor_for_first_token(at::Tensor& t) { return t_new; } -template +template inline void tpp_linear_bias( const at::Tensor& t_in, const at::Tensor& t_wt, @@ -119,9 +121,9 @@ inline void tpp_linear_bias( auto wt_V = GetVLAPtr(t_wt_V, {Nc, Hc * Hk}); - auto bias = GetVLAPtr(t_bias, {Hk}); + auto bias = GetVLAPtr(t_bias, {Hk}); - auto out = GetVLAPtr(t_out, {Nk, Hk}); + auto out = GetVLAPtr(t_out, {Nk, Hk}); auto Ncb = Nc; auto BSb = 64L; @@ -130,14 +132,15 @@ inline void tpp_linear_bias( Ncb = NCB_BLOCK_SIZE; bool with_bias = (t_bias.numel() > 0); - auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); - auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); - auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); - auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); - auto brgemm_tpp = SCOPEITGEMM( - (BrgemmTPP(BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); - auto brgemm_tpp_rem = SCOPEITGEMM( - (BrgemmTPP(rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); + auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); + auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); + auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); + auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); + constexpr int b_vnni = std::is_same(); + auto brgemm_tpp = SCOPEITGEMM((BrgemmTPP( + BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); + auto brgemm_tpp_rem = SCOPEITGEMM((BrgemmTPP( + rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); { RECORD_SCOPE(tpp_linear_krnl, {t_in, t_wt_V}); @@ -441,7 +444,7 @@ inline void tpp_linear_add_add( } } -template +template inline void tpp_linear_gelu( const at::Tensor& t_in, const at::Tensor& t_wt, 
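// Note on the template changes in this file: these kernels were previously
// hard-coded to bfloat16 and are now parameterized on the element type T so
// the WOQ dequant-upfront path can call them for other compute dtypes. The
// extra trailing b_vnni argument passed to BrgemmTPP flags whether the weight
// (B) operand is stored in the 2-row interleaved VNNI layout; it is derived
// from the element type, since 16-bit weights are VNNI-packed for AMX/VNNI
// microkernels while fp32 weights stay plain row-major.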
@@ -469,8 +472,8 @@ inline void tpp_linear_gelu( auto in = GetVLAPtr(t_in, {Nc, Hc}); auto wt_V = GetVLAPtr(t_wt_V, {Nc, Hc * Hk}); - auto bias = GetVLAPtr(t_bias, {Hk}); - auto out = GetVLAPtr(t_out, {Nk, Hk}); + auto bias = GetVLAPtr(t_bias, {Hk}); + auto out = GetVLAPtr(t_out, {Nk, Hk}); auto Ncb = Nc; auto BSb = 64L; @@ -478,16 +481,109 @@ inline void tpp_linear_gelu( if (large_cache_opt) Ncb = NCB_BLOCK_SIZE; bool with_bias = (t_bias.numel() > 0); - auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); - auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); - auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); - auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); - auto brgemm_tpp = SCOPEITGEMM( - (BrgemmTPP(BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); - auto brgemm_tpp_rem = SCOPEITGEMM( - (BrgemmTPP(rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); - auto gelu_fwd_tpp = SCOPEIT(GeluFwdTPP(BSb, Hk, K, K), ACT); - auto gelu_fwd_tpp_rem = SCOPEIT(GeluFwdTPP(rem, Hk, K, K), ACT); + auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); + auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); + auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); + auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); + constexpr int b_vnni = std::is_same(); + auto brgemm_tpp = SCOPEITGEMM((BrgemmTPP( + BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); + auto brgemm_tpp_rem = SCOPEITGEMM((BrgemmTPP( + rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); + auto gelu_fwd_tpp = SCOPEIT(GeluFwdTPP(BSb, Hk, K, K), ACT); + auto gelu_fwd_tpp_rem = SCOPEIT(GeluFwdTPP(rem, Hk, K, K), ACT); + + { + RECORD_SCOPE(tpp_linear_gelu_krnl, {t_in, t_wt_V}); + + auto loop_scheme = large_cache_opt ? GEMM_LOOP_SCHEME : "aCb"; + auto igemm_loop = torch_ipex::tpp::ThreadedLoop<3>( + {{0, Nc, Ncb, false}, {0, BS, BSb}, {Nk}}, loop_scheme); + igemm_loop( + [&](int* ind) { + int nc = ind[0], s1 = ind[1], nk = ind[2]; + auto count = nc + Ncb < Nc ? 
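          // Structure of this blocked loop (shared by the tpp_linear_*
          // variants in this file): the reduction dimension is walked in
          // chunks of Ncb blocks while the output tile doubles as the
          // accumulator, so
          //   - on the first chunk (nc == 0) the tile is seeded with the
          //     bias, or zeroed when there is no bias;
          //   - every chunk issues a batched-reduce GEMM over `count` blocks;
          //   - only on the last chunk is the GELU applied, because the
          //     accumulator is not final before that.
          // The is_rem branch repeats the same steps with the remainder-sized
          // kernels for the tail rows of the batch.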
Ncb : Nc - nc; + bool is_rem = (s1 + BSb > BS); + if (!is_rem) { + if (nc == 0) { + if (with_bias) { + copy_bias_tpp(bias[nk], out[s1][nk]); + } else { + zero_tpp(out[s1][nk]); + } + } + brgemm_tpp(in[s1][nc], wt_V[nk][nc], out[s1][nk], count, true); + if (!(nc + Ncb < Nc)) { // last nc iter + gelu_fwd_tpp(out[s1][nk], out[s1][nk]); + } + } else { + if (nc == 0) { + if (with_bias) { + copy_bias_tpp_rem(bias[nk], out[s1][nk]); + } else { + zero_tpp_rem(out[s1][nk]); + } + } + brgemm_tpp_rem(in[s1][nc], wt_V[nk][nc], out[s1][nk], count, false); + brgemm_tpp.config(); + if (!(nc + Ncb < Nc)) { // last nc iter + gelu_fwd_tpp_rem(out[s1][nk], out[s1][nk]); + } + } + }, + [&]() { brgemm_tpp.config(); }, + [&]() { brgemm_tpp.release(); }); + } +} + +template +inline void tpp_linear_gelu_tanh( + const at::Tensor& t_in, + const at::Tensor& t_wt, + const at::Tensor& t_bias, + at::Tensor& t_out) { + auto t_wt_ = t_wt; + auto in_sizes = t_in.sizes(); + auto BS = in_sizes[0] * in_sizes[1]; + bool large_cache_opt = false; + if (BS > FT_OPT_SIZE) { // first token compute + t_wt_ = wt_tensor_for_first_token(t_wt_); + large_cache_opt = true; + } + + auto wt_sizes = t_wt_.sizes(); + auto C = in_sizes[2]; + + auto Nc = wt_sizes[1]; + auto Hc = C / Nc; + auto Nk = wt_sizes[0]; + auto Hk = wt_sizes[3]; + auto K = Nk * Hk; + + auto t_wt_V = torch_ipex::tpp::wt_tensor_for_fwd(Nk, Hk, Nc, Hc, t_wt_); + + auto in = GetVLAPtr(t_in, {Nc, Hc}); + auto wt_V = GetVLAPtr(t_wt_V, {Nc, Hc * Hk}); + auto bias = GetVLAPtr(t_bias, {Hk}); + auto out = GetVLAPtr(t_out, {Nk, Hk}); + + auto Ncb = Nc; + auto BSb = 64L; + auto rem = BS % 64; + if (large_cache_opt) + Ncb = NCB_BLOCK_SIZE; + bool with_bias = (t_bias.numel() > 0); + auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); + auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); + auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); + auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); + constexpr int b_vnni = std::is_same(); + auto brgemm_tpp = SCOPEITGEMM((BrgemmTPP( + BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); + auto brgemm_tpp_rem = SCOPEITGEMM((BrgemmTPP( + rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); + auto gelu_fwd_tpp = SCOPEIT(GeluTanhFwdTPP(BSb, Hk, K, K), ACT); + auto gelu_fwd_tpp_rem = SCOPEIT(GeluTanhFwdTPP(rem, Hk, K, K), ACT); { RECORD_SCOPE(tpp_linear_gelu_krnl, {t_in, t_wt_V}); diff --git a/csrc/cpu/tpp/tensor_helper.h b/csrc/cpu/tpp/tensor_helper.h index 80cef62d2..a25542d7f 100644 --- a/csrc/cpu/tpp/tensor_helper.h +++ b/csrc/cpu/tpp/tensor_helper.h @@ -165,7 +165,8 @@ inline at::Tensor wt_tensor_for_fwd( if (input.dtype() == at::kBFloat16) { return wt_tensor_n2v(Nk, Hk, Nc, Hc, input); } else { - AT_ASSERT(false, "Unsupported datatype!"); + // unsupported data type + return input; } } } else { diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py index 9d613f16e..8643f630a 100644 --- a/tests/cpu/test_quantization_default_recipe.py +++ b/tests/cpu/test_quantization_default_recipe.py @@ -723,7 +723,7 @@ def test(feature, has_bias): [9, 4095, 4095], [9, 4096, 4096], [196, 4095, 16383], - [192, 4096, 16384], + [1024, 512, 512], ] use_bias_list = [True, False] cases = itertools.product(shape_list, use_bias_list) @@ -739,15 +739,6 @@ def __init__(self, input_channel, output_channel, has_bias): def forward(self, x): return self.linear(x) - def tpp_is_used(N, K): - num_threads = torch.get_num_threads() - block_n = 32 if N // 64 // num_threads < 4 
else 64 - block_k = 64 - while K % block_k != 0: - block_k //= 2 - assert block_k > 0 - return N % block_n == 0 and K % block_k == 0 - def test(feature, has_bias, w_dtype): model = M(feature[1], feature[2], has_bias) m = model.eval() @@ -763,33 +754,51 @@ def test(feature, has_bias, w_dtype): weight_int8, w_scales, w_zero_points = quantize_per_channel( weight, w_dtype ) - weight_fp32 = dequantize_per_channel( - weight_int8, w_scales, w_zero_points.int(), w_dtype, weight.shape - ) - weight_bf16 = weight_fp32.bfloat16() - weight_fp16 = weight_fp32.half() data_bf16 = data.bfloat16() data_fp16 = data_bf16.half() bias_fp32 = m.linear.bias # if M >= 32, compute in bf16 # if M < 32, compute in fp32 or fp16. Depends on fp16 support. if feature[0] >= 32: + weight_bf16 = dequantize_per_channel( + weight_int8, + w_scales.bfloat16(), + w_zero_points.bfloat16(), + w_dtype, + weight.shape, + ).bfloat16() output1 = torch.matmul( data_bf16.float(), weight_bf16.float().T + ).float() + if has_bias: + output1 = output1 + bias_fp32 + output1 = output1.bfloat16() + # For reference kernel + weight_bf16_ref = dequantize_per_channel( + weight_int8, + w_scales.float(), + w_zero_points.float(), + w_dtype, + weight.shape, ).bfloat16() + output1_ref = torch.matmul(data_bf16, weight_bf16_ref.T) if has_bias: - output1 = output1 + bias_fp32.bfloat16() + output1_ref = output1_ref + bias_fp32.bfloat16() + output1_ref = output1_ref.bfloat16() else: - output1_fp32 = torch.matmul( - data_bf16.float(), weight_bf16.float().T + weight_fp16 = dequantize_per_channel( + weight_int8, + w_scales.half(), + w_zero_points.half(), + w_dtype, + weight.shape, ) - if has_bias: - output1_fp32 = output1_fp32 + bias_fp32 output1_fp16 = torch.matmul( data_fp16.float(), weight_fp16.float().T ).half() if has_bias: output1_fp16 = output1_fp16 + bias_fp32.half() + output1_fp16 = output1_fp16.bfloat16() with torch.autocast( device_type="cpu", enabled=True, dtype=torch.bfloat16 ): @@ -804,22 +813,22 @@ def test(feature, has_bias, w_dtype): output2 = woq_model(data) output2 = output2.bfloat16() if feature[0] < 32: - try: - torch.testing.assert_close( - output1_fp32.bfloat16(), output2, atol=0.01, rtol=0.1 - ) - except Exception as e: - torch.testing.assert_close( - output1_fp16.bfloat16(), output2, atol=0.01, rtol=0.1 - ) + torch.testing.assert_close( + output1_fp16, output2, atol=0.01, rtol=0.1 + ) else: - torch.testing.assert_close(output1, output2) + # Use try...except to handle numeric differences between optimized and ref kernels + try: + torch.testing.assert_close(output1, output2) + except Exception: + torch.testing.assert_close(output1_ref, output2) shape_list = [ [3, 31, 31], [4, 64, 64], [9, 128, 128], [196, 63, 255], + [1024, 512, 512], ] use_bias_list = [True, False] w_dtype_list = [WoqWeightDtype.INT8, WoqWeightDtype.INT4] @@ -839,6 +848,7 @@ def forward(self, x): shape_list = [ [2, 24, 24], [8, 64, 64], + [1024, 512, 512], ] use_bias_list = [True, False] w_dtype_list = [WoqWeightDtype.INT8, WoqWeightDtype.INT4] @@ -901,6 +911,7 @@ def test(feature, has_bias, w_dtype): [4, 4096, 4096], [9, 4095, 4095], [196, 4095, 16383], + [1024, 512, 512], ] use_bias_list = [True, False] w_dtype_list = [WoqWeightDtype.INT8, WoqWeightDtype.INT4] @@ -958,6 +969,7 @@ def test(feature, has_bias): [4, 4096, 4095], [9, 4095, 4095], [196, 4095, 16383], + [1024, 512, 512], ] use_bias_list = [True, False] cases = itertools.product(shape_list, use_bias_list) @@ -1014,6 +1026,7 @@ def test(feature, has_bias): [4, 4096, 4095], [9, 4095, 4095], [196, 4095, 4095], + 
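            # An M of 1024 reaches DEQUANT_UPFRONT_THRESHOLD in WoqTppKrnl.cpp,
            # so this shape also exercises the new dequantize-weight-upfront
            # GEMM path in addition to the small-M tile-by-tile dequant path.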
[1024, 512, 512], ] use_bias_list = [True, False] cases = itertools.product(shape_list, use_bias_list) @@ -1032,13 +1045,14 @@ def forward(self, x): bias_list = [False, True] bf16_list = [False, True] - cases = itertools.product(bias_list, bf16_list) - for bias, bf16 in cases: + batch_size_list = [4, 1024] + cases = itertools.product(bias_list, bf16_list, batch_size_list) + for bias, bf16, bs in cases: with torch.cpu.amp.autocast( enabled=bf16, dtype=torch.bfloat16 if bf16 else None ): model = Mod(bias).eval() - data = torch.rand(4, 64) + data = torch.rand(bs, 64) qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( lowp_mode=2 ) @@ -1067,13 +1081,14 @@ def forward(self, x): bias_list = [False, True] bf16_list = [False, True] - cases = itertools.product(bias_list, bf16_list) - for bias, bf16 in cases: + batch_size_list = [4, 1024] + cases = itertools.product(bias_list, bf16_list, batch_size_list) + for bias, bf16, bs in cases: with torch.cpu.amp.autocast( enabled=bf16, dtype=torch.bfloat16 if bf16 else None ): model = Mod(bias).eval() - data = torch.rand(4, 64) + data = torch.rand(bs, 64) qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( lowp_mode=2 ) @@ -1105,14 +1120,17 @@ def forward(self, x, others): bias_list = [False, True] bf16_list = [False, True] others_len_list = [1, 2] - cases = itertools.product(bias_list, bf16_list, others_len_list) - for bias, bf16, others_len in cases: + batch_size_list = [4, 1024] + cases = itertools.product( + bias_list, bf16_list, others_len_list, batch_size_list + ) + for bias, bf16, others_len, bs in cases: with torch.cpu.amp.autocast( enabled=bf16, dtype=torch.bfloat16 if bf16 else None ): model = Mod(bias).eval() - data = torch.rand(4, 64) - others = [torch.rand(4, 64)] * others_len + data = torch.rand(bs, 64) + others = [torch.rand(bs, 64)] * others_len fused_op = ( torch.ops.torch_ipex.woq_linear_add if others_len == 1 @@ -1177,14 +1195,13 @@ def __init__(self): def forward(self, x): return self.linear(x) - # When lowp_mode=BF16, only case of batch size >= 32 uses BF16. - data = torch.rand(32, 64) m = M() lowp_mode_list = [WoqLowpMode.NONE, WoqLowpMode.FP16, WoqLowpMode.BF16] act_dtype_list = [torch.bfloat16, torch.half] compute_dtype_list = [None, torch.half, torch.bfloat16] - cases = itertools.product(lowp_mode_list, act_dtype_list) + batch_size_list = [4, 1024] + cases = itertools.product(lowp_mode_list, act_dtype_list, batch_size_list) # lowp_mode does not affect weight observer for int8 qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping() weight = copy.deepcopy(m.linear.weight) @@ -1192,7 +1209,11 @@ def forward(self, x): weight_int8, w_scales, w_zps = quantize_per_channel(weight, w_dtype) weight_fp32 = dequantize_per_channel(weight_int8, w_scales, w_zps, w_dtype) bias_fp32 = copy.deepcopy(m.linear.bias) - for lowp_mode, act_dtype in cases: + for lowp_mode, act_dtype, bs in cases: + # When lowp_mode=BF16, only case of batch size >= 32 uses BF16. 
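            # i.e. for small batches the kernel computes in fp32/fp16 rather
            # than bf16, so comparing those runs against a bf16 reference would
            # be comparing two different compute paths; skip them here.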
+ if lowp_mode == WoqLowpMode.BF16 and bs < 32: + continue + data = torch.rand(bs, 64) if lowp_mode == WoqLowpMode.NONE: compute_dtype_list[0] = act_dtype compute_dtype = compute_dtype_list[int(lowp_mode)] @@ -1292,8 +1313,6 @@ def _fakequant_by_group(self, t, quant_a_mode, groupsize): return out def test_weight_only_quantization_act_quant_mode(self): - M, N, K = 4, 64, 128 - groupsize = 64 class Mod(nn.Module): def __init__(self, has_bias): @@ -1303,7 +1322,7 @@ def __init__(self, has_bias): def forward(self, x): return self.linear(x) - def test(has_bias, act_quant_mode): + def test(has_bias, act_quant_mode, M): dtype = torch.bfloat16 model = Mod(has_bias) m = model.eval() @@ -1349,11 +1368,14 @@ def test(has_bias, act_quant_mode): y_ref = y_ref.to(dtype) torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-1) + N, K = 64, 512 + groupsize = 64 has_bias_list = [False, True] quant_mode_list = [0, 1, 2, 3] - cases = itertools.product(has_bias_list, quant_mode_list) - for has_bias, quant_mode in cases: - test(has_bias, quant_mode) + batch_size_list = [4, 1024] + cases = itertools.product(has_bias_list, quant_mode_list, batch_size_list) + for has_bias, quant_mode, M in cases: + test(has_bias, quant_mode, M) def test_weight_only_quantization_group_size(self): class Mod(nn.Module): From 3727406ebbccb34bd150351cddb451d37092f460 Mon Sep 17 00:00:00 2001 From: blzheng Date: Sat, 11 May 2024 12:43:28 +0800 Subject: [PATCH 056/199] refactor rope part and optimize mask memory for Phi3 (#2828) --- csrc/cpu/aten/MaskedMultiHeadAttention.cpp | 25 ++ csrc/cpu/aten/MaskedMultiHeadAttention.h | 19 +- csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp | 294 ++++++++++++------ .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 87 ++++++ .../llm/distributed/run_generation_tp.py | 6 +- .../run_generation_with_deepspeed.py | 8 +- examples/cpu/inference/python/llm/run.py | 1 + .../llm/single_instance/run_generation.py | 6 +- .../llm/single_instance/run_quantization.py | 6 +- .../models/cpu/modules/attentions.py | 2 + .../models/reference/fusions/mha_fusion.py | 146 +++++++-- .../transformers/models/reference/models.py | 27 +- .../models/reference/modules/attentions.py | 14 +- tests/cpu/test_cpu_ops.py | 35 +++ 14 files changed, 516 insertions(+), 160 deletions(-) diff --git a/csrc/cpu/aten/MaskedMultiHeadAttention.cpp b/csrc/cpu/aten/MaskedMultiHeadAttention.cpp index 71618ecb6..5c634a088 100644 --- a/csrc/cpu/aten/MaskedMultiHeadAttention.cpp +++ b/csrc/cpu/aten/MaskedMultiHeadAttention.cpp @@ -6,6 +6,7 @@ namespace torch_ipex { namespace cpu { IPEX_DEFINE_DISPATCH(masked_multihead_self_attention_kernel_stub); +IPEX_DEFINE_DISPATCH(prepare_4d_causal_attention_mask_kernel_stub); /* *Caculate the masked multihead attention for decoder layer in decoder only @@ -54,6 +55,21 @@ masked_multihead_self_attention_forward_cpu( add_casual_mask); } +at::Tensor prepare_4d_causal_attention_mask_forward_cpu( + at::Tensor& attention_mask, + at::Tensor& inputs_embeds, + at::Tensor& past_kv_len, + at::Tensor& finfo_min, + int64_t sliding_window) { + return prepare_4d_causal_attention_mask_kernel_stub( + kCPU, + attention_mask, + inputs_embeds, + past_kv_len, + finfo_min, + sliding_window); +} + } // namespace cpu } // namespace torch_ipex @@ -69,4 +85,13 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { c10::DispatchKey::CPU, torch_ipex::cpu::masked_multihead_self_attention_forward_cpu); } + +TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { + m.def( + "prepare_4d_causal_attention_mask(Tensor attention_mask, Tensor inputs_embeds, Tensor past_kv_len, 
Tensor finfo_min, int sliding_window)-> (Tensor)"); + m.impl( + "prepare_4d_causal_attention_mask", + c10::DispatchKey::CPU, + torch_ipex::cpu::prepare_4d_causal_attention_mask_forward_cpu); +} } // namespace diff --git a/csrc/cpu/aten/MaskedMultiHeadAttention.h b/csrc/cpu/aten/MaskedMultiHeadAttention.h index 8c25183dd..bc2525c69 100644 --- a/csrc/cpu/aten/MaskedMultiHeadAttention.h +++ b/csrc/cpu/aten/MaskedMultiHeadAttention.h @@ -22,7 +22,14 @@ masked_multihead_self_attention( const c10::optional& head_mask /* optional */, const c10::optional& attention_mask /* optional */, c10::optional add_casual_mask /* optional */); -} + +at::Tensor prepare_4d_causal_attention_mask_forward_cpu( + at::Tensor& attention_mask, + at::Tensor& inputs_embeds, + at::Tensor& past_kv_len, + at::Tensor& finfo_min, + int64_t sliding_window); +} // namespace using masked_multihead_self_attention_kernel_fn = std::tuple (*)( @@ -42,6 +49,16 @@ using masked_multihead_self_attention_kernel_fn = IPEX_DECLARE_DISPATCH( masked_multihead_self_attention_kernel_fn, masked_multihead_self_attention_kernel_stub); +using prepare_4d_causal_attention_mask_kernel_fn = at::Tensor (*)( + at::Tensor& attention_mask, + at::Tensor& inputs_embeds, + at::Tensor& past_kv_len, + at::Tensor& finfo_min, + int64_t sliding_window); + +IPEX_DECLARE_DISPATCH( + prepare_4d_causal_attention_mask_kernel_fn, + prepare_4d_causal_attention_mask_kernel_stub); } // namespace cpu } // namespace torch_ipex diff --git a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp index f4d8529bc..d409c6667 100644 --- a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp @@ -282,6 +282,80 @@ inline Vectorized exp_u20(Vectorized data) { } #endif +// out = val * a + b +template +inline void _scale_attn_mask_fusion_kernel( + T1* a, + T2* b, + const int& size, + T1* out, + T1& val) { + auto vec_size = at::vec::Vectorized::size(); + auto vec_scale = at::vec::Vectorized(val); + for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { + auto tmp0 = at::vec::Vectorized::loadu(a + i); + auto tmp1 = at::vec::Vectorized::loadu(b + i); + auto tmp2 = at::vec::convert(tmp1); + auto tmp3 = tmp0 * vec_scale + tmp2; + _store(out + i, tmp3); + } + for (long i = vec_size * (size / vec_size); i < size; i++) { + auto tmp0 = a[i]; + auto tmp1 = (T1)b[i]; + out[i] = tmp0 * val + tmp1; + } +} + +// out = val * a + b +template +inline void _scale_attn_mask_fusion_kernel( + T1* a, + T1* b, + const int& size, + T1* out, + T1& val) { + auto vec_size = at::vec::Vectorized::size(); + auto vec_scale = at::vec::Vectorized(val); + for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { + auto tmp0 = at::vec::Vectorized::loadu(a + i); + auto tmp1 = at::vec::Vectorized::loadu(b + i); + auto tmp2 = tmp0 * vec_scale + tmp1; + _store(out + i, tmp2); + } + for (long i = vec_size * (size / vec_size); i < size; i++) { + auto tmp0 = a[i]; + auto tmp1 = b[i]; + out[i] = tmp0 * val + tmp1; + } +} + +// out = b ? 
val * a : -inf +template +inline void _scale_attn_mask_fusion_kernel( + T1* a, + bool* b, + const int& size, + T1* out, + T1& val) { + auto vec_size = at::vec::Vectorized::size(); + auto vec_scale = at::vec::Vectorized(val); + auto neg_inf = -std::numeric_limits::infinity(); + auto vec_neg_inf = at::vec::Vectorized(neg_inf); + for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { + auto tmp0 = at::vec::Vectorized::loadu(a + i); + auto tmp1 = at::vec::Vectorized::loadu(b + i); + auto tmp2 = at::vec::convert(tmp1); + auto tmp3 = + at::vec::Vectorized::blendv(vec_neg_inf, tmp0 * vec_scale, tmp2); + _store(out + i, tmp3); + } + for (long i = vec_size * (size / vec_size); i < size; i++) { + auto tmp0 = a[i]; + auto tmp1 = b[i]; + out[i] = tmp1 ? tmp0 * val : neg_inf; + } +} + // 1) out = exp(a - val) // 2) val = sum(out) template @@ -366,7 +440,11 @@ inline void _mul_reduce_max_fusion_kernel( *@param attention_mask: attention mask *@param scale: scaling factor applied prior to softmax */ -template +template < + typename scalar_t, + typename mask_t, + int64_t q_split_size, + int64_t kv_split_size> inline typename std::enable_if_t, void> cpu_flash_attention( const at::Tensor& output, @@ -388,14 +466,9 @@ cpu_flash_attention( at::Tensor key = k.transpose(1, 2); at::Tensor value = v.transpose(1, 2); - bool is_bool_mask = attention_mask.has_value() && - attention_mask.value().scalar_type() == ScalarType::Bool; using accum_t = at::opmath_type; using Vec = at::vec::Vectorized; accum_t scaling_factor = calculate_scale(query, scale).as_float_unchecked(); - if (attention_mask.has_value() && is_bool_mask) { - attention_mask.value() = attention_mask.value().to(at::kFloat); - } // Sizes TORCH_CHECK( @@ -458,8 +531,8 @@ cpu_flash_attention( scalar_t* q_data = query.data_ptr(); scalar_t* k_data = key.data_ptr(); scalar_t* v_data = value.data_ptr(); - accum_t* mask_data = attention_mask.has_value() - ? attention_mask.value().data_ptr() + mask_t* mask_data = attention_mask.has_value() + ? attention_mask.value().data_ptr() : nullptr; scalar_t* out_data = output.data_ptr(); accum_t* lse_data = logsumexp.data_ptr(); @@ -523,31 +596,15 @@ cpu_flash_attention( // And apply scaling factor if (attention_mask.has_value()) { for (int64_t row = 0; row < qBlockSize; ++row) { - if (is_bool_mask) { - // qk <- attn_mask ? qk : -inf - auto neg_inf = -std::numeric_limits::infinity(); - at::vec::map2( - [neg_inf, scaling_factor](Vec x, Vec m) { - return Vec::blendv( - Vec(neg_inf), x * Vec(scaling_factor), m); - }, - qk_data + row * kvBlockSize, - qk_data + row * kvBlockSize, - mask_data + i * mStrideB + j * mStrideH + - (m + row) * mStrideM + n, - kvBlockSize); - } else { - // qk <- qk + attn_mask - at::vec::map2( - [scaling_factor](Vec x, Vec y) { - return x * Vec(scaling_factor) + y; - }, - qk_data + row * kvBlockSize, - qk_data + row * kvBlockSize, - mask_data + i * mStrideB + j * mStrideH + - (m + row) * mStrideM + n, - kvBlockSize); - } + // qk <- attn_mask ? 
qk : -inf, if attn_mask is bool + // qk <- qk + attn_mask, else + _scale_attn_mask_fusion_kernel( + qk_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM + n, + kvBlockSize, + qk_data + row * kvBlockSize, + scaling_factor); } } // Update coefficients with Softmax @@ -634,7 +691,11 @@ cpu_flash_attention( } // Half/BFloat16 -template +template < + typename scalar_t, + typename mask_t, + int64_t q_split_size, + int64_t kv_split_size> inline typename std::enable_if_t, void> cpu_flash_attention( const at::Tensor& output, @@ -662,14 +723,9 @@ cpu_flash_attention( const int vnni_pack = (!is_fp16 || (is_fp16 && utils::isa_has_amx_fp16_support())) ? 1 : 0; - bool is_bool_mask = attention_mask.has_value() && - attention_mask.value().scalar_type() == ScalarType::Bool; using accum_t = at::opmath_type; using Vec = at::vec::Vectorized; accum_t scaling_factor = calculate_scale(query, scale).as_float_unchecked(); - if (attention_mask.has_value()) { - attention_mask.value() = attention_mask.value().to(at::kFloat); - } // Sizes TORCH_CHECK( @@ -734,8 +790,8 @@ cpu_flash_attention( scalar_t* q_data = query.data_ptr(); scalar_t* k_data = key.data_ptr(); scalar_t* v_data = value.data_ptr(); - accum_t* mask_data = attention_mask.has_value() - ? attention_mask.value().data_ptr() + mask_t* mask_data = attention_mask.has_value() + ? attention_mask.value().data_ptr() : nullptr; scalar_t* out_data = output.data_ptr(); accum_t* lse_data = logsumexp.data_ptr(); @@ -1185,31 +1241,15 @@ cpu_flash_attention( // And apply scaling factor if (attention_mask.has_value()) { for (int64_t row = 0; row < qBlockSize; ++row) { - if (is_bool_mask) { - // qk <- attn_mask ? qk : -inf - auto neg_inf = -std::numeric_limits::infinity(); - at::vec::map2( - [neg_inf, scaling_factor](Vec x, Vec m) { - return Vec::blendv( - Vec(neg_inf), x * Vec(scaling_factor), m); - }, - qk_data + row * kvBlockSize, - qk_data + row * kvBlockSize, - mask_data + i * mStrideB + j * mStrideH + - (m + row) * mStrideM + n, - kvBlockSize); - } else { - // qk <- qk + attn_mask - at::vec::map2( - [scaling_factor](Vec x, Vec y) { - return x * Vec(scaling_factor) + y; - }, - qk_data + row * kvBlockSize, - qk_data + row * kvBlockSize, - mask_data + i * mStrideB + j * mStrideH + - (m + row) * mStrideM + n, - kvBlockSize); - } + // qk <- attn_mask ? qk : -inf, if attn_mask is bool + // qk <- qk + attn_mask, else + _scale_attn_mask_fusion_kernel( + qk_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM + n, + kvBlockSize, + qk_data + row * kvBlockSize, + scaling_factor); } } // Update coefficients with Softmax @@ -1363,6 +1403,21 @@ cpu_flash_attention( }); } +#define AT_DISPATCH_MASK_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH( \ + TYPE, \ + NAME, \ + AT_PRIVATE_CASE_TYPE_USING_HINT( \ + at::ScalarType::Bool, mask_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE_USING_HINT( \ + at::ScalarType::Float, mask_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE_USING_HINT( \ + at::ScalarType::Double, mask_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE_USING_HINT( \ + at::ScalarType::BFloat16, mask_t, __VA_ARGS__) \ + AT_PRIVATE_CASE_TYPE_USING_HINT( \ + at::ScalarType::Half, mask_t, __VA_ARGS__)) + void flash_attention_kernel_impl( const at::Tensor& output, const at::Tensor& logsumexp, @@ -1377,39 +1432,81 @@ void flash_attention_kernel_impl( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, query.scalar_type(), "flash_attention", [&] { - if (q_seq_len >= 768) { - cpu_flash_attention( - output, - logsumexp, - query, - key, - value, - dropout_p, - is_causal, - attention_mask, - scale); - } else if (q_seq_len >= 192) { - cpu_flash_attention( - output, - logsumexp, - query, - key, - value, - dropout_p, - is_causal, - attention_mask, - scale); + if (!attention_mask.has_value()) { + if (q_seq_len >= 768) { + cpu_flash_attention( + output, + logsumexp, + query, + key, + value, + dropout_p, + is_causal, + attention_mask, + scale); + } else if (q_seq_len >= 192) { + cpu_flash_attention( + output, + logsumexp, + query, + key, + value, + dropout_p, + is_causal, + attention_mask, + scale); + } else { + cpu_flash_attention( + output, + logsumexp, + query, + key, + value, + dropout_p, + is_causal, + attention_mask, + scale); + } } else { - cpu_flash_attention( - output, - logsumexp, - query, - key, - value, - dropout_p, - is_causal, - attention_mask, - scale); + AT_DISPATCH_MASK_TYPES( + attention_mask.value().scalar_type(), + "flash_attention_mask", + [&]() { + if (q_seq_len >= 768) { + cpu_flash_attention( + output, + logsumexp, + query, + key, + value, + dropout_p, + is_causal, + attention_mask, + scale); + } else if (q_seq_len >= 192) { + cpu_flash_attention( + output, + logsumexp, + query, + key, + value, + dropout_p, + is_causal, + attention_mask, + scale); + } else { + cpu_flash_attention( + output, + logsumexp, + query, + key, + value, + dropout_p, + is_causal, + attention_mask, + scale); + } + }); } }); } @@ -1442,6 +1539,7 @@ std::tuple flash_attention_kernel( TORCH_CHECK( !attention_mask.has_value() || dtype == attention_mask.value().scalar_type() || + attention_mask.value().scalar_type() == ScalarType::Float || attention_mask.value().scalar_type() == ScalarType::Bool, "IPEX flash_attention: Mask should have the same data type as Q/K/V or Bool"); TORCH_CHECK( diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index f479c27f7..a01181b1e 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -1471,11 +1471,98 @@ masked_multihead_self_attention_kernel_impl( add_casual_mask.value_or(true)); } } + +template +void attention_mask_2d_to_4d( + const T* attention_mask_ptr, + T* causal_4d_mask_ptr, + at::Tensor& finfo_min, + int64_t batch_size, + int64_t seq_length, + int64_t src_length, + int64_t past_key_value_length, + int64_t length, + int64_t diagonal) { + T finfo_min_val = finfo_min.item(); +#pragma omp parallel for collapse(2) + for (int64_t b = 0; b < batch_size; ++b) { + for (int64_t l = 0; l < seq_length; ++l) { + for (int64_t c = 0; c < length; ++c) { + int64_t idx = b * seq_length * length + l * length + c; + int64_t mask_idx = l * length + c; + T value = finfo_min_val; 
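              // Start from "fully masked" (finfo_min) and clear to 0 only when
              // column c lies inside the sliding causal window of query row l.
              // With diagonal = past_kv_len - sliding_window + 1, the window is
              //   l + past_kv_len - (sliding_window - 1) <= c <= l + past_kv_len
              // and any column the 2-D padding mask marks as padded is forced
              // back to finfo_min afterwards. This mirrors what HuggingFace's
              // _prepare_4d_causal_attention_mask produces for Phi-3, which the
              // new test in test_cpu_ops.py checks against when transformers is
              // available.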
+ if (l + diagonal <= c && l + past_key_value_length >= c) { + value = 0; + } + if (c < src_length) { + T inverted_mask_value = 1.0 - attention_mask_ptr[b * src_length + c]; + if (inverted_mask_value != 0) { + value = finfo_min_val; + } + } + causal_4d_mask_ptr[idx] = value; + } + } + } +} + +at::Tensor prepare_4d_causal_attention_mask_kernel_impl( + at::Tensor& attention_mask, + at::Tensor& inputs_embeds, + at::Tensor& past_kv_len, + at::Tensor& finfo_min, + int64_t sliding_window) { + auto dtype = inputs_embeds.scalar_type(); + int64_t batch_size = inputs_embeds.size(0); + int64_t seq_length = inputs_embeds.size(1); + int64_t src_length = attention_mask.size(-1); + int64_t past_key_value_length = past_kv_len.item(); + int64_t length = seq_length + past_key_value_length; + int64_t diagonal = past_key_value_length - sliding_window + 1; + + at::Tensor causal_4d_mask = torch::empty( + {batch_size, 1, seq_length, length}, inputs_embeds.options()); + attention_mask = attention_mask.to(inputs_embeds.dtype()); + + if (dtype == at::kFloat) { + float* attention_mask_ptr = attention_mask.data_ptr(); + float* causal_4d_mask_ptr = causal_4d_mask.data_ptr(); + attention_mask_2d_to_4d( + attention_mask_ptr, + causal_4d_mask_ptr, + finfo_min, + batch_size, + seq_length, + src_length, + past_key_value_length, + length, + diagonal); + } else if (dtype == at::kBFloat16) { + at::BFloat16* attention_mask_ptr = attention_mask.data_ptr(); + at::BFloat16* causal_4d_mask_ptr = causal_4d_mask.data_ptr(); + attention_mask_2d_to_4d( + attention_mask_ptr, + causal_4d_mask_ptr, + finfo_min, + batch_size, + seq_length, + src_length, + past_key_value_length, + length, + diagonal); + } + + return causal_4d_mask; +} } // anonymous namespace IPEX_REGISTER_DISPATCH( masked_multihead_self_attention_kernel_stub, &masked_multihead_self_attention_kernel_impl); +IPEX_REGISTER_DISPATCH( + prepare_4d_causal_attention_mask_kernel_stub, + &prepare_4d_causal_attention_mask_kernel_impl); + } // namespace cpu } // namespace torch_ipex diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py index 7b8fd3f03..5397b05de 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py @@ -423,10 +423,8 @@ def trace_handler(prof): "[ERROR] model prompt is not supported, please use --prompt for this model: " + args.model_id ) - elif int(args.input_tokens) > 8192: - prompt = prompt_pool[model_type]["8192"] * int( - int(args.input_tokens) / 8192 - ) + # elif int(args.input_tokens) > 8192: + # prompt = prompt_pool[model_type]["8192"] * int(int(args.input_tokens) / 8192) elif args.input_tokens in prompt_pool[model_type]: prompt = prompt_pool[model_type][args.input_tokens] else: diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 72ef693ac..4ec30d387 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -603,10 +603,10 @@ def load_image(image_file): "[ERROR] model prompt is not supported, please use --prompt for this model: " + args.model_id ) - elif int(args.input_tokens) > 8192: - input_sentences.append( - prompt_pool[model_type]["8192"] * int(int(args.input_tokens) / 8192) - ) + # elif int(args.input_tokens) > 8192: + 
# input_sentences.append( + # prompt_pool[model_type]["8192"] * int(int(args.input_tokens) / 8192) + # ) elif args.input_tokens in prompt_pool[model_type]: input_sentences.append(prompt_pool[model_type][args.input_tokens]) else: diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 6bfe1a170..16c254520 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -583,6 +583,7 @@ def main(args_in: Optional[List[str]] = None) -> None: "qwen": ("/qwen_local_shard"), "git": ("/git_local_shard"), "yuan": ("/yuan_local_shard"), + "phi-3": ("/phi-3_local_shard"), "phi": ("/phi_local_shard"), } model_type = next( diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index 70c37c8ae..1c4c04a0f 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -309,10 +309,8 @@ def trace_handler(prof): "[ERROR] model prompt is not supported, please use --prompt for this model: " + args.model_id ) - elif int(args.input_tokens) > 8192: - prompt = prompt_pool[model_type]["8192"] * int( - int(args.input_tokens) / 8192 - ) + # elif int(args.input_tokens) > 8192: + # prompt = prompt_pool[model_type]["8192"] * int(int(args.input_tokens) / 8192) elif args.input_tokens in prompt_pool[model_type]: prompt = prompt_pool[model_type][args.input_tokens] else: diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 44fe0ef76..e5024a201 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -968,10 +968,8 @@ def calib_func(prepared_model): prompt_pool = json.load(f) if args.prompt is not None: prompt = args.prompt - elif int(args.input_tokens) > 8192: - prompt = prompt_pool[model.name]["8192"] * int( - int(args.input_tokens) / 8192 - ) + # elif int(args.input_tokens) > 8192: + # prompt = prompt_pool[model.name]["8192"] * int(int(args.input_tokens) / 8192) elif args.input_tokens in prompt_pool[model.name]: prompt = prompt_pool[model.name][args.input_tokens] else: diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py index 7d893b648..29491dfe0 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py @@ -39,6 +39,8 @@ def __init__(self, module, config, tpp=False, woq=False): ] if "long_factor" in config.rope_scaling: extra_inputs["long_factor"] = config.rope_scaling["long_factor"] + if "type" in config.rope_scaling: + extra_inputs["type"] = config.rope_scaling["type"] if hasattr(config, "original_max_position_embeddings"): extra_inputs["original_max_position_embeddings"] = ( config.original_max_position_embeddings diff --git a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py index d3d077894..1d1a4e113 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py @@ -5,10 +5,68 @@ from torch.nn 
import functional as F +@torch.library.impl("myops::longrope", "cpu") +def longrope( + inv_freq, + max_seq_len_cached, + max_position_embeddings, + sin_cos, + sin_cached, + cos_cached, + sin_cos_long, + sin_cached_long, + cos_cached_long, + seq_len, + rope_type, +): + if seq_len > max_seq_len_cached: + if rope_type == 1: # Phi3ForCausalLM + return ( + max_position_embeddings, + sin_cos_long, + sin_cached_long, + cos_cached_long, + ) + elif rope_type == 2: # Falcon + t = torch.arange(seq_len, dtype=inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, inv_freq) + sin_cos = torch.cat( + (freqs.sin().repeat(1, 2), freqs.cos().repeat(1, 2)), dim=-1 + ) + emb = torch.cat((freqs, freqs), dim=-1).float() + cos_cached = emb.cos()[None, :, :] + sin_cached = emb.sin()[None, :, :] + return seq_len, sin_cos, sin_cached, cos_cached + else: # Default + t = torch.arange(seq_len, dtype=inv_freq.dtype) + freqs = torch.einsum("i,j->ij", t, inv_freq) + sin_cos = torch.cat((torch.sin(freqs), torch.cos(freqs)), dim=1) + emb = torch.cat((freqs, freqs), dim=-1) + cos_cached = emb.cos()[None, None, :, :] + sin_cached = emb.sin()[None, None, :, :] + return ( + seq_len, + sin_cos, + sin_cached[:, :, :seq_len, ...], + cos_cached[:, :, :seq_len, ...], + ) + return max_seq_len_cached, sin_cos, sin_cached, cos_cached + + +torch.library.define( + "myops::longrope", + "(Tensor inv_freq, Tensor max_seq_len_cached, Tensor max_position_embeddings, Tensor sin_cos, " + + " Tensor sin_cached, Tensor cos_cached, Tensor? sin_cos_long, Tensor? sin_cached_long, " + + "Tensor? cos_cached_long, Tensor seq_len, Tensor rope_type) -> (Tensor, Tensor, Tensor, Tensor)", +) + + class RotaryEmbedding(torch.nn.Module): def __init__(self, max_position_embeddings, dim, backbone, base=10000, kwargs=None): super().__init__() self.scaling_factor = 1.0 + self.max_position_embeddings = max_position_embeddings + self.max_seq_len_cached = max_position_embeddings if kwargs is not None and "short_factor" in kwargs: self.short_factor = kwargs["short_factor"] ext_factors = torch.tensor(self.short_factor, dtype=torch.float32) @@ -19,23 +77,28 @@ def __init__(self, max_position_embeddings, dim, backbone, base=10000, kwargs=No inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) if kwargs is not None and "long_factor" in kwargs: self.long_factor = kwargs["long_factor"] - new_ext_factors = torch.tensor(self.long_factor, dtype=torch.float32) - new_inv_freq = 1.0 / ( - new_ext_factors * base ** (torch.arange(0, dim, 2).float() / dim) + ext_factors_long = torch.tensor(self.long_factor, dtype=torch.float32) + inv_freq_long = 1.0 / ( + ext_factors_long * base ** (torch.arange(0, dim, 2).float() / dim) ) - self.new_inv_freq = new_inv_freq if kwargs is not None and "original_max_position_embeddings" in kwargs: self.original_max_position_embeddings = kwargs[ "original_max_position_embeddings" ] scale = max_position_embeddings / self.original_max_position_embeddings if scale > 1.0: - self.scaling_factor = math.sqrt( - 1 - + math.log(scale) / math.log(self.original_max_position_embeddings) - ) + if "type" in kwargs and kwargs["type"] == "su": + self.scaling_factor = math.sqrt( + 1 + + math.log(scale) + / math.log(self.original_max_position_embeddings) + ) + elif "type" in kwargs and kwargs["type"] == "yarn": + self.scaling_factor = 0.1 * math.log(scale) + 1.0 + self.max_seq_len_cached = self.original_max_position_embeddings self.register_buffer("inv_freq", inv_freq, persistent=False) - self.max_seq_len_cached = max_position_embeddings + if backbone == 
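        # The myops::longrope registration above follows the standard
        # torch.library pattern, sketched here with a hypothetical op:
        #
        #     torch.library.define("myops::toy", "(Tensor x) -> Tensor")
        #
        #     @torch.library.impl("myops::toy", "cpu")
        #     def toy(x):
        #         return x + 1
        #
        #     y = torch.ops.myops.toy(torch.ones(2))
        #
        # Wrapping the cache-growth logic in a custom op presumably keeps the
        # data-dependent branch on seq_len opaque to TorchScript tracing;
        # forward() below just calls torch.ops.myops.longrope and updates the
        # cached buffers from its outputs.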
"Phi3ForCausalLM" and "long_factor" not in kwargs: + self.max_seq_len_cached = self.max_seq_len_cached + 256 t = torch.arange( self.max_seq_len_cached, dtype=self.inv_freq.dtype, @@ -68,35 +131,50 @@ def __init__(self, max_position_embeddings, dim, backbone, base=10000, kwargs=No self.emb.sin()[None, None, :, :] * self.scaling_factor, persistent=False, ) - - def forward(self, seq_len=None): - if seq_len is not None and seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, dtype=self.inv_freq.dtype) if hasattr(self, "long_factor"): - freqs = torch.einsum("i,j->ij", t, self.new_inv_freq) - else: - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - if ( - self.model_backbone == "FalconForCausalLM" - or self.model_backbone == "RWForCausalLM" - ): - self.sin_cos = torch.cat( - (freqs.sin().repeat(1, 2), freqs.cos().repeat(1, 2)), dim=-1 + t_long = torch.arange( + max_position_embeddings, dtype=self.inv_freq.dtype ) - self.emb = torch.cat((freqs, freqs), dim=-1).float() - self.cos_cached = self.emb.cos()[None, :, :] - self.sin_cached = self.emb.sin()[None, :, :] - else: - self.sin_cos = ( - torch.cat((torch.sin(freqs), torch.cos(freqs)), dim=1) + freqs_long = torch.einsum("i,j->ij", t_long, inv_freq_long) + self.sin_cos_long = ( + torch.cat((torch.sin(freqs_long), torch.cos(freqs_long)), dim=1) * self.scaling_factor ) - self.emb = torch.cat((freqs, freqs), dim=-1) - self.cos_cached = self.emb.cos()[None, None, :, :] * self.scaling_factor - self.sin_cached = self.emb.sin()[None, None, :, :] * self.scaling_factor - self.cos_cached[:, :, :seq_len, ...] - self.sin_cached[:, :, :seq_len, ...] + self.emb_long = torch.cat((freqs_long, freqs_long), dim=-1) + self.register_buffer( + "cos_cached_long", + self.emb_long.cos()[None, None, :, :] * self.scaling_factor, + persistent=False, + ) + self.register_buffer( + "sin_cached_long", + self.emb_long.sin()[None, None, :, :] * self.scaling_factor, + persistent=False, + ) + + def forward(self, seq_len=None): + rope_type = 0 + if self.model_backbone == "Phi3ForCausalLM" and hasattr(self, "long_factor"): + rope_type = 1 + elif self.model_backbone in ["FalconForCausalLM", "RWForCausalLM"]: + rope_type = 2 + if seq_len is not None: + max_seq_len_cached, self.sin_cos, self.sin_cached, self.cos_cached = ( + torch.ops.myops.longrope( + torch.tensor(self.inv_freq).contiguous(), + torch.tensor(self.max_seq_len_cached).contiguous(), + torch.tensor(self.max_position_embeddings).contiguous(), + self.sin_cos.contiguous(), + self.sin_cached.contiguous(), + self.cos_cached.contiguous(), + self.sin_cos_long.contiguous() if rope_type == 1 else None, + self.sin_cached_long.contiguous() if rope_type == 1 else None, + self.cos_cached_long.contiguous() if rope_type == 1 else None, + torch.tensor(seq_len).contiguous(), + torch.tensor(rope_type).contiguous(), + ) + ) + self.max_seq_len_cached = max_seq_len_cached.item() return self.sin_cos, self.sin_cached, self.cos_cached diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index a9dd50b55..4fb722d6a 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -3231,8 +3231,6 @@ def Phi3Model_forward( else: raise ValueError("You have to specify either input_ids or inputs_embeds") - past_key_values_length = 0 - if self.gradient_checkpointing and self.training: if use_cache: 
logger.warning_once( @@ -3267,13 +3265,24 @@ def Phi3Model_forward( else None ) else: - # 4d mask is passed through the layers - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) + if self.config.sliding_window is not None: + # 4d mask is passed through the layers + if attention_mask is not None and len(attention_mask.shape) == 2: + attention_mask = torch.ops.torch_ipex.prepare_4d_causal_attention_mask( + attention_mask, + inputs_embeds, + torch.tensor(past_key_values_length).contiguous(), + torch.tensor(torch.finfo(inputs_embeds.dtype).min).contiguous(), + self.config.sliding_window, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) hidden_states = inputs_embeds diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 84472c564..87d13866d 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -1966,6 +1966,16 @@ def _PhiAttention_forward( return attn_output, attn_weights, past_key_value +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + batch, slen, num_key_value_heads, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, :, None, :].expand( + batch, slen, num_key_value_heads, n_rep, head_dim + ) + return hidden_states.reshape(batch, slen, num_key_value_heads * n_rep, head_dim) + + def _Phi3Attention_forward( self, hidden_states: torch.Tensor, @@ -1993,8 +2003,6 @@ def _Phi3Attention_forward( value_states = value_states.view( bsz, q_len, self.num_key_value_heads, self.head_dim ) - key_states = _repeat_kv(key_states, self.num_key_value_groups) - value_states = _repeat_kv(value_states, self.num_key_value_groups) (attn_output, attn_weights, past_key_value) = self._IPEXScaleDotProduct( query_states, key_states, @@ -2181,6 +2189,8 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): ] if "long_factor" in config.rope_scaling: extra_inputs["long_factor"] = config.rope_scaling["long_factor"] + if "type" in config.rope_scaling: + extra_inputs["type"] = config.rope_scaling["type"] if hasattr(config, "original_max_position_embeddings"): extra_inputs["original_max_position_embeddings"] = ( config.original_max_position_embeddings diff --git a/tests/cpu/test_cpu_ops.py b/tests/cpu/test_cpu_ops.py index 63c519a1f..c9fcd0f0f 100644 --- a/tests/cpu/test_cpu_ops.py +++ b/tests/cpu/test_cpu_ops.py @@ -1511,6 +1511,41 @@ def test_flash_attention_stride0(self): math_ref = torch._scaled_dot_product_attention_math(q2, k2, v2)[0] torch.testing.assert_close(actual, math_ref, atol=1e-5, rtol=5e-6) + def test_prepare_4d_causal_attention_mask(self): + for dtype in [torch.float32, torch.bfloat16]: + for sliding_window in [10, 40]: + for seq_len in [1, 32]: + inputs_embeds = torch.rand((1, seq_len, 768), dtype=dtype) + finfo_min = torch.finfo(dtype).min + past_key_values_length = 0 + if seq_len == 1: + past_key_values_length = 32 + attention_mask = torch.ones( + (1, past_key_values_length + seq_len), dtype=torch.long + ) + output = torch.ops.torch_ipex.prepare_4d_causal_attention_mask( + attention_mask, + inputs_embeds, + 
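                        # finfo_min is passed as a tensor (the kernel reads it
                        # back with .item()), and the try/except below only runs
                        # the comparison when transformers still exposes
                        # _prepare_4d_causal_attention_mask to compare against.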
torch.tensor(past_key_values_length).contiguous(), + torch.tensor(finfo_min).contiguous(), + sliding_window, + ) + try: + from transformers.modeling_attn_mask_utils import ( + _prepare_4d_causal_attention_mask, + ) + + output_ref = _prepare_4d_causal_attention_mask( + attention_mask, + (inputs_embeds.shape[0], inputs_embeds.shape[1]), + inputs_embeds, + past_key_values_length, + sliding_window, + ) + self.assertEqual(output, output_ref) + except ImportError: + pass + if __name__ == "__main__": test = unittest.main() From 5384c00dd2129740313a9c88fb5f7969be0c0939 Mon Sep 17 00:00:00 2001 From: Cao E Date: Sat, 11 May 2024 14:38:36 +0800 Subject: [PATCH 057/199] Leverage flash attention for fp16 first_token_masked_mha (#2846) * leverage flash attention for fp16 first_token_masked_mha * fix format --------- Co-authored-by: Zhang, Mingxu --- .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 33 +++--- csrc/cpu/vec/vec512/vec512_half.h | 102 ++++++++++++++++++ 2 files changed, 118 insertions(+), 17 deletions(-) diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index a01181b1e..d01cfbf96 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -1294,30 +1294,28 @@ first_token_masked_mha( auto key_lenght = key.size(1); auto kv_head_num = key.size(2); auto head_size = key.size(3); - if (origin_type == at::kHalf) { - key = key.to(at::kFloat); - query = query.to(at::kFloat); - value = value.to(at::kFloat); - key_cache = key_cache.to(at::kFloat); - value_cache = value_cache.to(at::kFloat); - } if (add_casual_mask) { - auto casual_mask = - at::full({query_length, key_lenght}, -1e6, query.options()); + auto casual_mask = at::full( + {query_length, key_lenght}, + origin_type == at::kHalf ? -6e4 : -1e6, + query.options()); casual_mask = at::triu(casual_mask, 1); casual_mask = casual_mask.unsqueeze(0).unsqueeze(0); attention_mask = attention_mask + casual_mask; } - if (key.scalar_type() != at::kBFloat16 && key.scalar_type() != at::kFloat) { + if (key.scalar_type() != at::kBFloat16 && key.scalar_type() != at::kFloat && + key.scalar_type() != at::kHalf) { TORCH_CHECK( false, - "key and value must be float or bfloat16 to use ipex::masked_multihead_self_attention_kernel_impl"); + "key and value must be float, float16 or bfloat16 to use ipex::masked_multihead_self_attention_kernel_impl"); } if (key.scalar_type() == at::kFloat) { copy_key_value(key_cache, key, value_cache, value, beam_batch); - } else { + } else if (key.scalar_type() == at::kBFloat16) { copy_key_value( key_cache, key, value_cache, value, beam_batch); + } else { + copy_key_value(key_cache, key, value_cache, value, beam_batch); } // support MGQ/MQA // expand the head dimensiopn of key/value to be same to the query @@ -1344,6 +1342,11 @@ first_token_masked_mha( attention_mask, 1. 
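          // fp16 inputs now stay in half precision through this flash-attention
          // call instead of being widened to fp32 first; only the non-flash
          // fallback below still converts to float. The -6e4 fill chosen above
          // for half-precision masks matters here: float16's largest finite
          // magnitude is about 65504, so a -1e6 fill would overflow to -inf
          // once cast to half, while -6e4 still acts as "minus infinity" after
          // the softmax.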
/ scale_attn)); } else { + if (origin_type == at::kHalf) { + key = key.to(at::kFloat); + query = query.to(at::kFloat); + value = value.to(at::kFloat); + } key = key.permute({0, 2, 1, 3}); query = query.permute({0, 2, 1, 3}); value = value.permute({0, 2, 1, 3}); @@ -1355,13 +1358,9 @@ first_token_masked_mha( attn_outputs = attn_weights.matmul(value); if (origin_type == at::kHalf) { attn_weights = attn_weights.to(origin_type); + attn_outputs = attn_outputs.to(origin_type); } } - if (origin_type == at::kHalf) { - attn_outputs = attn_outputs.to(origin_type); - key_cache = key_cache.to(origin_type); - value_cache = value_cache.to(origin_type); - } return std::make_tuple( attn_outputs, attn_weights, key_cache, value_cache, beam_idx); } diff --git a/csrc/cpu/vec/vec512/vec512_half.h b/csrc/cpu/vec/vec512/vec512_half.h index 54b586900..abef5a0e6 100644 --- a/csrc/cpu/vec/vec512/vec512_half.h +++ b/csrc/cpu/vec/vec512/vec512_half.h @@ -53,6 +53,75 @@ IPEX_FORCE_INLINE void move_ker(at::Half* out, const float* in, int64_t len) { cvt_fp32_to_fp16(out, in, len); } +template <> +IPEX_FORCE_INLINE void move_ker( + at::Half* out, + const at::Half* in, + int64_t len) { + int64_t i = 0; +#pragma unroll(4) + for (i = 0; i < len - 31; i += 32) { + auto in0 = _mm512_loadu_si512(in + i); + _mm512_storeu_si512(out + i, in0); + } + + if (i < len) { + auto mask = (1 << (len - i)) - 1; + auto in0 = _mm512_maskz_loadu_epi16(mask, in + i); + _mm512_mask_storeu_epi16(out + i, mask, in0); + } +} + +static IPEX_FORCE_INLINE void zero_ker(at::Half* out, int64_t len) { + int64_t i = 0; + __m512i zero_512 = _mm512_setzero_si512(); +#pragma unroll(4) + for (i = 0; i < len - 31; i += 32) { + _mm512_storeu_si512(out + i, zero_512); + } + + if (i < len) { + auto mask = ((1 << (len - i)) - 1); + _mm512_mask_storeu_epi16(out + i, mask, zero_512); + } +} + +template <> +IPEX_FORCE_INLINE void add_ker( + at::Half* inout, + const at::Half* in, + int64_t len) { + int64_t i = 0; +#pragma unroll(2) + for (i = 0; i < len - 31; i += 32) { + auto inout1 = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(inout + i))); + auto inout2 = + cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(inout + i + 16))); + auto in1 = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(in + i))); + auto in2 = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(in + i + 16))); + inout1 = _mm512_add_ps(inout1, in1); + inout2 = _mm512_add_ps(inout2, in2); + _mm256_storeu_si256((__m256i*)(inout + i), cvt_fp32_to_fp16(inout1)); + _mm256_storeu_si256((__m256i*)(inout + i + 16), cvt_fp32_to_fp16(inout2)); + } + + if (i < len - 15) { + auto inout1 = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(inout + i))); + auto in1 = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(in + i))); + inout1 = _mm512_add_ps(inout1, in1); + _mm256_storeu_si256((__m256i*)(inout + i), cvt_fp32_to_fp16(inout1)); + i += 16; + } + + if (i < len) { + auto mask = (1 << (len - i)) - 1; + auto inout1 = cvt_fp16_to_fp32(_mm256_maskz_loadu_epi16(mask, inout + i)); + auto in1 = cvt_fp16_to_fp32(_mm256_maskz_loadu_epi16(mask, in + i)); + inout1 = _mm512_add_ps(inout1, in1); + _mm256_mask_storeu_epi16(inout + i, mask, cvt_fp32_to_fp16(inout1)); + } +} + template <> IPEX_FORCE_INLINE void add_ker(float* inout, const at::Half* in, int64_t len) { int64_t i = 0; @@ -85,6 +154,39 @@ IPEX_FORCE_INLINE void add_ker(float* inout, const at::Half* in, int64_t len) { } } +template <> +IPEX_FORCE_INLINE void add_ker(at::Half* inout, const float* in, int64_t len) { + int64_t i = 0; +#pragma unroll(2) + for (i = 0; i < len - 
31; i += 32) { + auto in1 = _mm512_loadu_ps(in + i); + auto in2 = _mm512_loadu_ps(in + i + 16); + auto inout1 = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(inout + i))); + auto inout2 = + cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(inout + i + 16))); + inout1 = _mm512_add_ps(inout1, in1); + inout2 = _mm512_add_ps(inout2, in2); + _mm256_storeu_si256((__m256i*)(inout + i), cvt_fp32_to_fp16(inout1)); + _mm256_storeu_si256((__m256i*)(inout + i + 16), cvt_fp32_to_fp16(inout2)); + } + + if (i < len - 15) { + auto in1 = _mm512_loadu_ps(in + i); + auto inout1 = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(inout + i))); + inout1 = _mm512_add_ps(inout1, in1); + _mm256_storeu_si256((__m256i*)(inout + i), cvt_fp32_to_fp16(inout1)); + i += 16; + } + + if (i < len) { + auto mask = (1 << (len - i)) - 1; + auto in1 = _mm512_maskz_loadu_ps(mask, in + i); + auto inout1 = cvt_fp16_to_fp32(_mm256_maskz_loadu_epi16(mask, inout + i)); + inout1 = _mm512_add_ps(inout1, in1); + _mm256_mask_storeu_epi16(inout + i, mask, cvt_fp32_to_fp16(inout1)); + } +} + } // namespace kernel } // namespace cpu } // namespace torch_ipex From 0f9b825a23b04d205f66663edee9931faed74e0f Mon Sep 17 00:00:00 2001 From: Xu Han Date: Sat, 11 May 2024 16:44:16 +0800 Subject: [PATCH 058/199] add cpu example to auto clang-format. (#2866) --- CMakeLists.txt | 1 + csrc/cpu/CMakeLists.txt | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0eca75dd3..8cee416f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,6 +18,7 @@ set(IPEX_ROOT_DIR ${PROJECT_SOURCE_DIR}) set(IPEX_CSRC_ROOT_DIR ${IPEX_ROOT_DIR}/csrc) set(IPEX_CPU_ROOT_DIR ${IPEX_ROOT_DIR}/csrc/cpu) set(IPEX_GPU_ROOT_DIR ${IPEX_ROOT_DIR}/csrc/gpu) +set(IPEX_CPU_EXAMPLE_ROOT_DIR ${IPEX_ROOT_DIR}/examples/cpu) set(IPEX_THIRD_PARTY_ROOT "${PROJECT_SOURCE_DIR}/third_party") set(IPEX_FRONT_END_ROOT_DIR ${IPEX_ROOT_DIR}/intel_extension_for_pytorch/csrc) list(APPEND CMAKE_MODULE_PATH ${IPEX_ROOT_DIR}/cmake/Modules) diff --git a/csrc/cpu/CMakeLists.txt b/csrc/cpu/CMakeLists.txt index a09380761..74c7057d3 100644 --- a/csrc/cpu/CMakeLists.txt +++ b/csrc/cpu/CMakeLists.txt @@ -153,6 +153,11 @@ if(CLANG_FORMAT) file(GLOB_RECURSE ALL_CPU_NATIVE_CSRC_FILES ${IPEX_CPU_ROOT_DIR}/**.[ch] ${IPEX_CPU_ROOT_DIR}/**.[ch]pp) add_custom_target(CL_FORMAT_CPU_NATIVE_CSRC COMMAND ${CLANG_FORMAT_EXEC} -i -style=file ${ALL_CPU_NATIVE_CSRC_FILES}) add_dependencies(${PLUGIN_NAME_CPU} CL_FORMAT_CPU_NATIVE_CSRC) + + # clang-foamt CPU examples + file(GLOB_RECURSE ALL_CPU_EXAMPLE_NATIVE_CSRC_FILES ${IPEX_CPU_EXAMPLE_ROOT_DIR}/**.[ch] ${IPEX_CPU_EXAMPLE_ROOT_DIR}/**.[ch]pp) + add_custom_target(CL_FORMAT_CPU_EXAMPLE_NATIVE_CSRC COMMAND ${CLANG_FORMAT_EXEC} -i -style=file ${ALL_CPU_EXAMPLE_NATIVE_CSRC_FILES}) + add_dependencies(${PLUGIN_NAME_CPU} CL_FORMAT_CPU_EXAMPLE_NATIVE_CSRC) endif() if(USE_LIBXSMM) From 2ec5bc44be875c4a86f4248c42bcdbccd4b8510a Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Sat, 11 May 2024 18:02:04 +0800 Subject: [PATCH 059/199] update oneDNN to df3022638a on main (#2863) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 240d32a66..417f0c7a9 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 240d32a66a0ce709017708483a93ef64c7a70161 +Subproject commit 417f0c7a91aaf3c1566ace488fc825bc3719abbd From 30d9938bf4f1fe3c9feb5479f2294bc1ce404cab Mon Sep 17 00:00:00 2001 From: Cao E Date: Mon, 13 May 2024 10:11:13 +0800 Subject: [PATCH 060/199] 
Use device-agnostic APIs to avoid autocast warnings (#2844) * use device-agnostic APIs to avoid autocast warnings * fix format --------- Co-authored-by: Zhang, Mingxu Co-authored-by: WeizhuoZhang-intel Co-authored-by: Xia Weiwen --- csrc/cpu/autocast/autocast_mode.cpp | 2 +- intel_extension_for_pytorch/jit/_trace.py | 2 +- .../quantization/_quantization_state.py | 16 ++++++++-------- .../quantization/_quantization_state_utils.py | 8 ++++---- .../quantization/_quantize.py | 4 ++-- tests/cpu/test_autocast.py | 18 +++++++++--------- 6 files changed, 25 insertions(+), 25 deletions(-) diff --git a/csrc/cpu/autocast/autocast_mode.cpp b/csrc/cpu/autocast/autocast_mode.cpp index 4aee2f32b..4a7533847 100644 --- a/csrc/cpu/autocast/autocast_mode.cpp +++ b/csrc/cpu/autocast/autocast_mode.cpp @@ -6,7 +6,7 @@ namespace torch_ipex { namespace autocast { at::ScalarType get_autocast_dtype() { - return at::autocast::get_autocast_cpu_dtype(); + return at::autocast::get_autocast_dtype(at::kCPU); } Tensor cpu_cached_cast(at::ScalarType to_type, const Tensor& arg) { diff --git a/intel_extension_for_pytorch/jit/_trace.py b/intel_extension_for_pytorch/jit/_trace.py index 77ea42e24..6292f4506 100644 --- a/intel_extension_for_pytorch/jit/_trace.py +++ b/intel_extension_for_pytorch/jit/_trace.py @@ -56,7 +56,7 @@ def jit_trace_wrapper(f): def wrapper(*args, **kwargs): prev = torch.is_autocast_cache_enabled() # For running CPU workload, disable autocast cache - if torch.is_autocast_cpu_enabled(): + if torch.is_autocast_enabled("cpu"): torch.set_autocast_cache_enabled(False) # For running XPU workload and the platform unsupports 2d block, diff --git a/intel_extension_for_pytorch/quantization/_quantization_state.py b/intel_extension_for_pytorch/quantization/_quantization_state.py index 8bd86d88d..ebaecc438 100644 --- a/intel_extension_for_pytorch/quantization/_quantization_state.py +++ b/intel_extension_for_pytorch/quantization/_quantization_state.py @@ -472,8 +472,8 @@ def op_weight_convert_before_hook( if w_dtype != torch.float32: weight = weight.to(w_dtype) if ( - torch.is_autocast_cpu_enabled() - and torch.get_autocast_cpu_dtype() == torch.bfloat16 + torch.is_autocast_enabled("cpu") + and torch.get_autocast_dtype("cpu") == torch.bfloat16 ): if weight.dtype == torch.bfloat16: weight = weight.to(dtype=torch.float32) @@ -510,8 +510,8 @@ def op_weight_convert_before_hook( scale, zp, dtype = quant_info weight = op.weight if ( - torch.is_autocast_cpu_enabled() - and torch.get_autocast_cpu_dtype() == torch.bfloat16 + torch.torch.is_autocast_enabled("cpu") + and torch.get_autocast_dtype("cpu") == torch.bfloat16 ): if weight.dtype == torch.bfloat16: weight = weight.to(dtype=torch.float32) @@ -538,8 +538,8 @@ def op_weight_convert_before_hook( ): scale, zp, dtype = quant_info if ( - torch.is_autocast_cpu_enabled() - and torch.get_autocast_cpu_dtype() == torch.bfloat16 + torch.torch.is_autocast_enabled("cpu") + and torch.get_autocast_dtype("cpu") == torch.bfloat16 ): if weights[tensor_arg_idx].dtype == torch.bfloat16: weights[tensor_arg_idx] = weights[tensor_arg_idx].to( @@ -574,8 +574,8 @@ def op_weight_convert_before_hook( tensor_arg_idx + 1 ] if ( - torch.is_autocast_cpu_enabled() - and torch.get_autocast_cpu_dtype() == torch.bfloat16 + torch.torch.is_autocast_enabled("cpu") + and torch.get_autocast_dtype("cpu") == torch.bfloat16 ): weight_if_bf16 = w_ih.dtype == torch.bfloat16 if weight_if_bf16: diff --git a/intel_extension_for_pytorch/quantization/_quantization_state_utils.py 
b/intel_extension_for_pytorch/quantization/_quantization_state_utils.py index 1d59eedac..7fa343b27 100644 --- a/intel_extension_for_pytorch/quantization/_quantization_state_utils.py +++ b/intel_extension_for_pytorch/quantization/_quantization_state_utils.py @@ -410,8 +410,8 @@ def iterate_and_apply_convert( ]: ch_axis = 1 if ( - torch.is_autocast_cpu_enabled() - and torch.get_autocast_cpu_dtype() == torch.bfloat16 + torch.is_autocast_enabled("cpu") + and torch.get_autocast_dtype("cpu") == torch.bfloat16 ): # do autocast in Python side if args.dtype == torch.float32: @@ -437,8 +437,8 @@ def iterate_and_apply_convert( or str(type(op)) in conv_linear_modules ): if ( - torch.is_autocast_cpu_enabled() - and torch.get_autocast_cpu_dtype() == torch.bfloat16 + torch.is_autocast_enabled("cpu") + and torch.get_autocast_dtype("cpu") == torch.bfloat16 ): if args.dtype == torch.bfloat16: args = args.to(dtype=torch.float32) diff --git a/intel_extension_for_pytorch/quantization/_quantize.py b/intel_extension_for_pytorch/quantization/_quantize.py index c21b867b3..170823135 100644 --- a/intel_extension_for_pytorch/quantization/_quantize.py +++ b/intel_extension_for_pytorch/quantization/_quantize.py @@ -473,8 +473,8 @@ def convert(model, inplace=False): # which will reduce the dtype conversion. # TODO: check whether can be removed or not? if ( - torch.is_autocast_cpu_enabled() - and torch.get_autocast_cpu_dtype() == torch.bfloat16 + torch.torch.is_autocast_enabled("cpu") + and torch.get_autocast_dtype("cpu") == torch.bfloat16 ): convert_model = nn.utils._model_convert.convert_model_data_type( convert_model, torch.bfloat16 diff --git a/tests/cpu/test_autocast.py b/tests/cpu/test_autocast.py index f59a0ae55..b1ef8cc8c 100644 --- a/tests/cpu/test_autocast.py +++ b/tests/cpu/test_autocast.py @@ -60,9 +60,9 @@ def setUp(self): def test_set_autocast_dtype(self): with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16): - self.assertEqual(torch.get_autocast_cpu_dtype(), torch.bfloat16) + self.assertEqual(torch.get_autocast_dtype("cpu"), torch.bfloat16) with torch.cpu.amp.autocast(enabled=True, dtype=torch.float16): - self.assertEqual(torch.get_autocast_cpu_dtype(), torch.float16) + self.assertEqual(torch.get_autocast_dtype("cpu"), torch.float16) def test_forward_dtype(self): rand_seed = int(get_rand_seed()) @@ -803,9 +803,9 @@ def cast(val, to_type): if add_kwargs is None: add_kwargs = {} - self.assertFalse(torch.is_autocast_cpu_enabled()) + self.assertFalse(torch.torch.is_autocast_enabled("cpu")) with torch.cpu.amp.autocast(enabled=True, dtype=autocast_type): - self.assertTrue(torch.is_autocast_cpu_enabled()) + self.assertTrue(torch.torch.is_autocast_enabled("cpu")) out_type = out_type if out_type is not None else run_as_type output = output_method = None @@ -860,7 +860,7 @@ def compare(first, second): # as the C++-side autocasting, and should be bitwise accurate. 
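As a point of reference for the migration in this patch, the device-agnostic query pattern can be exercised on its own. A minimal sketch, assuming a PyTorch build (the 2.4 nightlies pinned in dependency_version.yml or later) that exposes the device-typed autocast accessors used above:

```python
import torch

# Device-agnostic autocast queries; these replace the CPU-specific
# torch.is_autocast_cpu_enabled() / torch.get_autocast_cpu_dtype() calls.
with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
    assert torch.is_autocast_enabled("cpu")
    assert torch.get_autocast_dtype("cpu") == torch.bfloat16

# Outside the context manager autocast is off again for CPU.
assert not torch.is_autocast_enabled("cpu")
```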
output_to_compare = output if output is not None else output_method with torch.cpu.amp.autocast(enabled=False, dtype=autocast_type): - self.assertFalse(torch.is_autocast_cpu_enabled()) + self.assertFalse(torch.torch.is_autocast_enabled("cpu")) if module is not None and hasattr(module, op): control = getattr(module, op)( @@ -875,8 +875,8 @@ def compare(first, second): self.assertTrue( comparison, "torch.{} result did not match control".format(op) ) - self.assertTrue(torch.is_autocast_cpu_enabled()) - self.assertFalse(torch.is_autocast_cpu_enabled()) + self.assertTrue(torch.torch.is_autocast_enabled("cpu")) + self.assertFalse(torch.torch.is_autocast_enabled("cpu")) def _run_autocast_pass_test( self, @@ -900,9 +900,9 @@ def cast(val, to_type): if add_kwargs is None: add_kwargs = {} - self.assertFalse(torch.is_autocast_cpu_enabled()) + self.assertFalse(torch.torch.is_autocast_enabled("cpu")) with torch.cpu.amp.autocast(enabled=True, dtype=autocast_type): - self.assertTrue(torch.is_autocast_cpu_enabled()) + self.assertTrue(torch.torch.is_autocast_enabled("cpu")) out_type = out_type if out_type is not None else run_as_type # Try module.* variant, if requested: From c608a02aa9879b9832a3caf23fe6c3cade9fbc38 Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Mon, 13 May 2024 13:24:31 +0800 Subject: [PATCH 061/199] Enable stride paged attention (#2875) --- csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp | 71 +++++++++++--------- tests/cpu/test_paged_attention.py | 20 +++++- 2 files changed, 59 insertions(+), 32 deletions(-) diff --git a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp index 32a316a94..0696c9d10 100644 --- a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp @@ -117,9 +117,14 @@ void single_query_cached_kv_attention_kernel( {num_seqs, num_heads, max_context_len}, query.options().dtype(at::ScalarType::Float)); auto attn_weights_ptr = attn_weights.data_ptr(); - auto kv_block_stride = key_cache.stride(0); - auto q_stride = query.stride(0); - auto attn_weights_stride = attn_weights.stride(0); + auto kv_block_strideN = key_cache.stride(0); + auto kv_block_strideP = key_cache.stride(1); + auto kv_block_strideH = key_cache.stride(2); + + auto q_strideN = query.stride(0); + auto q_strideH = query.stride(1); + auto attn_weights_strideN = attn_weights.stride(0); + auto attn_weights_strideH = attn_weights.stride(1); if (alibi_slopes.has_value()) { auto alibi_slopes_size = alibi_slopes.value().size(0); @@ -135,15 +140,15 @@ void single_query_cached_kv_attention_kernel( auto context_len = context_lens_ptr[seq_id]; if (token_id >= context_len) continue; - auto attn_w_pos = attn_weights_ptr + seq_id * attn_weights_stride + - head_id * max_context_len + token_id; - auto q_ptr_start = query_ptr + seq_id * q_stride + head_id * head_size; + auto attn_w_pos = attn_weights_ptr + seq_id * attn_weights_strideN + + head_id * attn_weights_strideH + token_id; + auto q_ptr_start = query_ptr + seq_id * q_strideN + head_id * q_strideH; auto block_id = block_tables_ptr [seq_id * max_num_blocks_per_seq + token_id / block_size]; auto block_offset = token_id % block_size; - auto k_cache_start = key_cache_ptr + block_id * kv_block_stride + - block_offset * num_kv_heads * head_size + - head_mapping_ptr[head_id] * head_size; + auto k_cache_start = key_cache_ptr + block_id * kv_block_strideN + + block_offset * kv_block_strideP + + head_mapping_ptr[head_id] * kv_block_strideH; reduce_head( q_ptr_start, k_cache_start, attn_w_pos, 
head_size); } @@ -157,8 +162,8 @@ void single_query_cached_kv_attention_kernel( auto max_val = -10000.0f; float sum = 0.0f; auto context_len = context_lens_ptr[seq_id]; - auto attn_w_start = attn_weights_ptr + seq_id * attn_weights_stride + - head_id * max_context_len; + auto attn_w_start = attn_weights_ptr + seq_id * attn_weights_strideN + + head_id * attn_weights_strideH; #if defined(CPU_CAPABILITY_AVX512) if (alibi_slopes_ptr != nullptr) { auto alibi_slope = alibi_slopes_ptr[head_id]; @@ -221,7 +226,12 @@ void single_query_cached_kv_attention_kernel( at::zeros({thread_numbers, num_seqs, num_heads}, at::kByte); auto flag_access = private_attn_out_flag.accessor(); auto private_attn_out_ptr = private_attn_outs.data_ptr(); - auto private_attn_out_stride = private_attn_outs.stride(0); + auto private_attn_out_strideT = private_attn_outs.stride(0); + auto private_attn_out_strideN = private_attn_outs.stride(1); + auto private_attn_out_strideH = private_attn_outs.stride(2); + auto attn_out_strideN = out.stride(0); + auto attn_out_strideH = out.stride(1); + // mul and accumulate #pragma omp parallel for collapse(3) for (auto seq_id = 0; seq_id < num_seqs; seq_id++) { @@ -232,17 +242,17 @@ void single_query_cached_kv_attention_kernel( if (token_id >= context_len) continue; auto attn_w = attn_weights_ptr - [seq_id * attn_weights_stride + head_id * max_context_len + + [seq_id * attn_weights_strideN + head_id * attn_weights_strideH + token_id]; auto block_id = block_tables_ptr [seq_id * max_num_blocks_per_seq + token_id / block_size]; auto block_offset = token_id % block_size; - auto v_cache_start = value_cache_ptr + block_id * kv_block_stride + - block_offset * num_kv_heads * head_size + - head_mapping_ptr[head_id] * head_size; + auto v_cache_start = value_cache_ptr + block_id * kv_block_strideN + + block_offset * kv_block_strideP + + head_mapping_ptr[head_id] * kv_block_strideH; auto attn_out_start = private_attn_out_ptr + - thread_id * private_attn_out_stride + seq_id * q_stride + - head_id * head_size; + thread_id * private_attn_out_strideT + seq_id * attn_out_strideN + + head_id * attn_out_strideH; mul_attenion_weights_and_value_of_head( attn_w, v_cache_start, @@ -262,8 +272,8 @@ void single_query_cached_kv_attention_kernel( #pragma omp parallel for collapse(2) for (auto seq_id = 0; seq_id < num_seqs; seq_id++) { for (auto hi = 0; hi < num_heads; hi++) { - auto thr0_head_start = - private_attn_out_ptr + (seq_id * num_heads + hi) * head_size; + auto thr0_head_start = private_attn_out_ptr + + seq_id * private_attn_out_strideN + hi * private_attn_out_strideH; if (flag_access[0][seq_id][hi] == 0) { torch_ipex::cpu::kernel::zero_ker(thr0_head_start, head_size); } @@ -271,10 +281,10 @@ void single_query_cached_kv_attention_kernel( if (flag_access[thread_id][seq_id][hi] == 0) { continue; } - auto attn_out_head_stride = thread_id * private_attn_out_stride + - (seq_id * num_heads + hi) * head_size; + auto attn_out_head_offset = thread_id * private_attn_out_strideT + + seq_id * private_attn_out_strideN + hi * private_attn_out_strideH; auto private_attn_out_start = - private_attn_out_ptr + attn_out_head_stride; + private_attn_out_ptr + attn_out_head_offset; torch_ipex::cpu::kernel::add_ker( thr0_head_start, private_attn_out_start, head_size); } @@ -327,16 +337,19 @@ void reshape_and_cache_kernel( auto value_cache_ptr = value_cache.data_ptr(); auto value_ptr = value.data_ptr(); auto slot_mapping_ptr = slot_mapping.data_ptr(); - auto cache_stride = key_cache.stride(0); - auto state_stride = key.stride(0); + 
auto cache_strideN = key_cache.stride(0); + auto cache_strideP = key_cache.stride(1); + auto cache_strideH = key_cache.stride(2); + auto state_strideN = key.stride(0); + auto state_strideH = key.stride(1); #pragma omp parallel for collapse(2) for (auto ti = 0; ti < num_tokens; ti++) { for (auto hi = 0; hi < head_num; hi++) { auto block_id = slot_mapping_ptr[ti] / block_size; auto block_offset = slot_mapping_ptr[ti] % block_size; - auto cache_offset = block_id * cache_stride + - block_offset * key_cache.stride(1) + hi * head_size; - auto state_offset = ti * state_stride + hi * head_size; + auto cache_offset = block_id * cache_strideN + + block_offset * cache_strideP + hi * cache_strideH; + auto state_offset = ti * state_strideN + hi * state_strideH; auto key_cache_start = key_cache_ptr + cache_offset; auto key_ptr_start = key_ptr + state_offset; auto value_cache_start = value_cache_ptr + cache_offset; @@ -410,8 +423,6 @@ void reshape_and_cache_cpu_kernel_impl( TORCH_CHECK( key_cache.scalar_type() == value_cache.scalar_type(), "key_cache and value_cache should have the same data type"); - TORCH_CHECK(key_cache.is_contiguous(), "key_cache should be contiguous"); - TORCH_CHECK(value_cache.is_contiguous(), "value_cache should be contiguous"); TORCH_CHECK( slot_mapping.is_contiguous(), "slot_mapping should be contiguous"); RECORD_FUNCTION( diff --git a/tests/cpu/test_paged_attention.py b/tests/cpu/test_paged_attention.py index 792e4add9..cefe7f48c 100644 --- a/tests/cpu/test_paged_attention.py +++ b/tests/cpu/test_paged_attention.py @@ -122,9 +122,25 @@ def _test_paged_attention_func( max_seq_len = 1024 scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_head = num_head - query = torch.empty( - num_seqs, num_query_heads, head_size, dtype=dtype, device="cpu" + qkv = torch.empty( + num_seqs, + (num_query_heads + num_kv_head * 2) * head_size, + dtype=dtype, + device="cpu", + ) + query, _, _ = qkv.split( + [ + num_query_heads * head_size, + num_kv_head * head_size, + num_kv_head * head_size, + ], + dim=1, ) + query = query.view(num_seqs, num_query_heads, head_size) + # import pdb + + # pdb.set_trace() + print(query.shape, query.stride()) query.uniform_(-scale, scale) assert num_query_heads % num_kv_head == 0 num_queries_per_kv = num_query_heads // num_kv_head From ea63c1eed7660c95ec79e514eb65ade064403e3e Mon Sep 17 00:00:00 2001 From: jiayisunx Date: Mon, 13 May 2024 14:56:35 +0800 Subject: [PATCH 062/199] skip test_multi_stream_graph_mode_torchdynamo (#2871) Co-authored-by: WeizhuoZhang-intel --- tests/cpu/test_graph_capture.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/cpu/test_graph_capture.py b/tests/cpu/test_graph_capture.py index e0165bab7..79be7594b 100644 --- a/tests/cpu/test_graph_capture.py +++ b/tests/cpu/test_graph_capture.py @@ -618,6 +618,7 @@ def test_multi_stream_graph_mode_jit(self): not ipex.cpu.runtime.is_runtime_ext_enabled(), "Skip when IPEX Runtime extension is not enabled", ) + @unittest.skipIf(True, "Core dump.") # TODO: Jiayi to re-enable @runtime_thread_affinity_test_env def test_multi_stream_graph_mode_torchdynamo(self): model = Conv_IF_Relu().to(memory_format=torch.channels_last) @@ -670,6 +671,7 @@ def test_multi_stream_graph_mode_jit_autocast(self): not ipex.cpu.runtime.is_runtime_ext_enabled(), "Skip when IPEX Runtime extension is not enabled", ) + @unittest.skipIf(True, "Core dump.") # TODO: Jiayi to re-enable @runtime_thread_affinity_test_env def test_multi_stream_graph_mode_torchdynamo_autocast(self): model = 
Conv_IF_Relu().to(memory_format=torch.channels_last) From 61f06ac2db80e2a87d9c8a9c3b1186c91a50086a Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Mon, 13 May 2024 16:01:11 +0800 Subject: [PATCH 063/199] Update dependency_version.yml 20240513 (#2874) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 77fb7f143..6e06969f5 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240505+cpu + version: 2.4.0.dev20240508+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240505+cpu + version: 2.2.0.dev20240508+cpu torchvision: - version: 0.19.0.dev20240505+cpu + version: 0.19.0.dev20240508+cpu transformers: version: 4.38.1 From a324d72b74371bab01953a4ac0861b7fde678b6d Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Mon, 13 May 2024 17:50:35 +0800 Subject: [PATCH 064/199] Minor adaptation for inplace rope/varelenSdpa (#2879) --- .../llm/modules/mha_fusion.py | 15 +++++++++++++-- .../transformers/models/cpu/fusions/mha_fusion.py | 8 ++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/intel_extension_for_pytorch/llm/modules/mha_fusion.py b/intel_extension_for_pytorch/llm/modules/mha_fusion.py index fc9d3e62b..c3f410e63 100644 --- a/intel_extension_for_pytorch/llm/modules/mha_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/mha_fusion.py @@ -139,9 +139,14 @@ def apply_function( runtime_module = cls.runtime_ops.get_module_from_device( query.device.type, IPEXCustomOpType.ROPE, False ) - query, key = runtime_module.rotary_embedding( + + query_, key_ = runtime_module.rotary_embedding( query, key, sin, cos, rotary_dim, rotary_half, position_ids ) + + # keep the inplace context as used in TGI + query.copy_(query_) + key.copy_(key_) return query, key @@ -467,7 +472,13 @@ def reshape_and_cache( ): return cls.runtime_ops.get_module_from_device( key.device.type, IPEXCustomOpType.PAGED_ATTENTION, False - ).reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) + ).reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping.int() if slot_mapping.dtype is torch.long else slot_mapping, + ) @classmethod def single_query_cached_kv_attention( diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py index 629b32cc5..0e01d94dc 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py @@ -376,7 +376,7 @@ def __init__(self): super().__init__() @classmethod - def apply_functions( + def apply_function( cls, query, # [total_q, num_head, head_size] key, # [total_k, num_head_k, head_size] @@ -451,8 +451,8 @@ def apply_functions( attn_mask=attn_mask if not is_causal else None, is_causal=is_causal, ) - out.copy_(out_.transpose(1, 2).reshape(-1, out.shape[-2], out.shape[-1])) - + out_ = out_.permute(0, 2, 1, 3) + out.copy_(out_[q_mask]) return out def forward( @@ -472,7 +472,7 @@ def forward( return_softmax, gen_, ): - self.apply_function( + return self.apply_function( query, key, value, From ec863c0da1690723e35ec15646cf04e068d132f1 Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Tue, 14 May 2024 09:06:43 +0800 Subject: [PATCH 065/199] Add fast path for greedy and 
bs=1 (#2878) --- .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index d01cfbf96..e497942b5 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -553,8 +553,6 @@ scale_dot_product_for_indirect_access_kv_cache( auto attn_out_ptr = attn_outs.data_ptr(); // torch_ipex::cpu::kernel::zero_ker(attn_out_ptr, attn_outs.numel()); auto attn_w_ptr = attn_weights.data_ptr(); - long new_beam_idx[beam_batch][offset + query.size(1) + 1]; - auto b_ptr = beam_idx.data_ptr(); auto thread_numbers = omp_get_max_threads(); auto max_parallel_parts = thread_numbers * 4; @@ -564,7 +562,10 @@ scale_dot_product_for_indirect_access_kv_cache( : std::max(seq_len / max_parallel_parts, 1L); kv_block_size = std::min(kv_block_size, 32L); auto kv_block_count = (seq_len + kv_block_size - 1) / kv_block_size; - if (offset > 0) { + auto need_update_beam_idx = offset > 0 and bs > 1; + auto b_ptr = beam_idx.data_ptr(); + long new_beam_idx[beam_batch][offset + query.size(1) + 1]; + if (need_update_beam_idx) { // according to the last decoded token to get the target beam for the past // token for (int i = 0; i < bs; i++) { @@ -597,6 +598,7 @@ scale_dot_product_for_indirect_access_kv_cache( attn_w_pos[0] = 0.0f; auto kc_token_start = ti * kc_token_stride; auto kc_t_beam_start = kc_token_start; + auto beam = need_update_beam_idx ? new_beam_idx[bi][ti] : 0; if (ti > query_ti + offset) { // only caculate the innerproduct for // the past token and current token @@ -637,8 +639,8 @@ scale_dot_product_for_indirect_access_kv_cache( false, nullptr); } else { - kc_t_beam_start = kc_t_beam_start + - new_beam_idx[bi][ti] * kv_head * head_size; + kc_t_beam_start = + kc_t_beam_start + beam * kv_head * head_size; if (cur_len > 1) { auto beam_size = beam_batch / bs; kc_t_beam_start = @@ -755,6 +757,7 @@ scale_dot_product_for_indirect_access_kv_cache( attn_out_head_stride + query_ti * head_size; auto vc_token_start = vi * kc_token_stride; + auto beam = need_update_beam_idx ? new_beam_idx[bi][vi] : 0; if (vi == query_ti + offset) { // caculate the attention values // for the current token auto vc_t_beam_start = vc_token_start; @@ -796,8 +799,8 @@ scale_dot_product_for_indirect_access_kv_cache( nullptr, flag_access[thread_id][bi][hi]); } else { - auto vc_t_beam_start = vc_token_start + - new_beam_idx[bi][vi] * kv_head * head_size; + auto vc_t_beam_start = + vc_token_start + beam * kv_head * head_size; if (cur_len > 1) { auto beam_size = beam_batch / bs; vc_t_beam_start = @@ -909,7 +912,8 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto attn_w_ptr = attn_weights.data_ptr(); long new_beam_idx[beam_batch][offset + query.size(1) + 1]; auto b_ptr = beam_idx.data_ptr(); - if (offset > 0) { + auto need_update_beam_idx = offset > 0 && bs > 1; + if (need_update_beam_idx) { // according to the last decoded token to get the target beam for the past // token for (int i = 0; i < bs; i++) { @@ -939,6 +943,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( attn_w_pos[0] = 0.0f; auto kc_token_start = ti * kc_token_stride; auto kc_t_beam_start = kc_token_start; + auto beam = need_update_beam_idx ? 
new_beam_idx[bi][ti] : 0; if (ti > query_ti + offset) { // only caculate the innerproduct for // the past token and current token attn_w_pos[0] = -10000.0f; @@ -978,8 +983,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( false, nullptr); } else { - kc_t_beam_start = kc_t_beam_start + - new_beam_idx[bi][ti] * kv_head * head_size; + kc_t_beam_start = kc_t_beam_start + beam * kv_head * head_size; if (cur_len > 1) { auto beam_size = beam_batch / bs; kc_t_beam_start = @@ -1065,6 +1069,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( query_ti * head_size; auto vc_token_start = vi * kc_token_stride; + auto beam = need_update_beam_idx ? new_beam_idx[bi][vi] : 0; if (vi == query_ti + offset) { // caculate the attention values // for the current token auto vc_t_beam_start = vc_token_start; @@ -1107,7 +1112,7 @@ scale_dot_product_for_indirect_access_kv_cache_half( flag_access[thread_id][bi][hi]); } else { auto vc_t_beam_start = - vc_token_start + new_beam_idx[bi][vi] * kv_head * head_size; + vc_token_start + beam * kv_head * head_size; if (cur_len > 1) { auto beam_size = beam_batch / bs; vc_t_beam_start = @@ -1412,6 +1417,7 @@ masked_multihead_self_attention_kernel_impl( value.options()); beam_idx = at::empty({max_positions, beam_batch}, beam_idx.options()); auto beam_idx_access = beam_idx.accessor(); +#pragma omp parallel for collapse(2) for (auto i = 0; i < max_positions; i++) { for (auto j = 0; j < beam_batch; j++) { if (key.size(0) == beam_batch) { @@ -1483,7 +1489,7 @@ void attention_mask_2d_to_4d( int64_t length, int64_t diagonal) { T finfo_min_val = finfo_min.item(); -#pragma omp parallel for collapse(2) + for (int64_t b = 0; b < batch_size; ++b) { for (int64_t l = 0; l < seq_length; ++l) { for (int64_t c = 0; c < length; ++c) { From 701f7c86e23faad6848d98231b5e8c552431940b Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 14 May 2024 11:57:38 +0800 Subject: [PATCH 066/199] Phi3: not create causal mask in masked mha (#2877) --- .../transformers/models/reference/modules/attentions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 87d13866d..2bbd4bd50 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -2011,6 +2011,7 @@ def _Phi3Attention_forward( past_key_value, None, attention_mask, + add_casual_mask=False, ) attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) From 2d6e604159ba5dd38d806abb7e3302708c26ccbb Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Wed, 15 May 2024 13:32:35 +0800 Subject: [PATCH 067/199] Update dependency_version.yml 20240515 (#2884) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 6e06969f5..fddc3e804 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240508+cpu + version: 2.4.0.dev20240514+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240508+cpu + version: 2.2.0.dev20240514+cpu torchvision: - version: 0.19.0.dev20240508+cpu + version: 0.19.0.dev20240514+cpu transformers: version: 4.38.1 From 
c3203f7f386a217c90c6b3481c65df153149ecc5 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 16 May 2024 12:03:51 +0800 Subject: [PATCH 068/199] update oneDNN to 2fa152e201 on main (#2889) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 417f0c7a9..0430e47c6 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 417f0c7a91aaf3c1566ace488fc825bc3719abbd +Subproject commit 0430e47c6b2704627977b99ab5556aa0ba6908ce From a5793e1104c7544aa9e74a6b8bedffb9c8d90524 Mon Sep 17 00:00:00 2001 From: Xu Han Date: Thu, 16 May 2024 20:47:36 +0800 Subject: [PATCH 069/199] Xu add lintrunner (#2890) * add lintrunner config. * update lintrunner init version. * update python code check guide. * remove flake8 install requirement file. --- .lintrunner.toml | 31 +++++++++++++++++++++ CONTRIBUTING.md | 4 ++- scripts/tools/setup/requirements-flake8.txt | 10 ------- 3 files changed, 34 insertions(+), 11 deletions(-) create mode 100644 .lintrunner.toml delete mode 100644 scripts/tools/setup/requirements-flake8.txt diff --git a/.lintrunner.toml b/.lintrunner.toml new file mode 100644 index 000000000..fed5d458c --- /dev/null +++ b/.lintrunner.toml @@ -0,0 +1,31 @@ +[[linter]] +code = 'FLAKE8' +include_patterns = ['*.py'] +exclude_patterns = [ + '.git/**', +] +command = [ + 'python3', + 'scripts/tools/setup/flake8.py', + '--', + '@{{PATHSFILE}}' +] + +init_command = [ + 'python', + '-m', + 'lintrunner_adapters', + 'run', + 'pip_init', + '--dry-run={{DRYRUN}}', + 'flake8==3.8.2', + 'flake8-bugbear==20.1.4', + 'flake8-comprehensions==3.3.0', + 'flake8-executable==2.0.4', + # 'git+https://github.com/malfet/flake8-coding.git', + 'flake8-pyi==20.5.0', + 'mccabe==0.6.1', + 'pycodestyle==2.6.0', + 'pyflakes==2.2.0', + 'black==24.3.0', +] \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 795728fd7..d888a816c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -180,7 +180,9 @@ For example, if you wanted to run the test `MayContainAlias`, which is part of t ### Python Code We can find python code style utils in `scripts/tools/setup` folder. Please install the related dependency python modules: ```bash -pip install -r scripts/tools/setup/requirements-flake8.txt +pip install lintrunner +pip install lintrunner-adapters +lintrunner init ``` Please run flake8.py to auto-format python code and check the python code style. 
The script will return results, please manual modify code follow the output information, and until it shows pass: ```bash diff --git a/scripts/tools/setup/requirements-flake8.txt b/scripts/tools/setup/requirements-flake8.txt deleted file mode 100644 index 5689e9542..000000000 --- a/scripts/tools/setup/requirements-flake8.txt +++ /dev/null @@ -1,10 +0,0 @@ -flake8==3.8.2 -flake8-bugbear==20.1.4 -flake8-comprehensions==3.3.0 -flake8-executable==2.0.4 -git+https://github.com/malfet/flake8-coding.git -flake8-pyi==20.5.0 -mccabe -pycodestyle==2.6.0 -pyflakes==2.2.0 -black==24.3.0 From a94cdbd106b6a7751bb95eb51ebfcb1f8ea0a81c Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Fri, 17 May 2024 09:15:38 +0800 Subject: [PATCH 070/199] Update dependency_version.yml 20240517 (#2895) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index fddc3e804..56bca9060 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240514+cpu + version: 2.4.0.dev20240516+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240514+cpu + version: 2.2.0.dev20240516+cpu torchvision: - version: 0.19.0.dev20240514+cpu + version: 0.19.0.dev20240516+cpu transformers: version: 4.38.1 From 6eb32019c650c92b4e93f3d56b1f5d5e7885d927 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Mon, 20 May 2024 09:23:43 +0800 Subject: [PATCH 071/199] Update dependency_version.yml 20240520 (#2904) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 56bca9060..b258442c5 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240516+cpu + version: 2.4.0.dev20240519+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240516+cpu + version: 2.2.0.dev20240519+cpu torchvision: - version: 0.19.0.dev20240516+cpu + version: 0.19.0.dev20240519+cpu transformers: version: 4.38.1 From 34a349e456023b5e9ba13815b31d2c1bad3bdeaf Mon Sep 17 00:00:00 2001 From: zhuhaozhe Date: Mon, 20 May 2024 12:44:24 +0800 Subject: [PATCH 072/199] Fix DDP resume training (#2902) * refine state_dict in ipex.optimize * remove debug print * fix ut --- .../nn/utils/_parameter_wrapper.py | 149 +++++++----------- .../nn/utils/_weight_prepack.py | 7 +- tests/cpu/ipex-optimize-ddp-static-graph.py | 57 +++++++ tests/cpu/test_ipex_optimize.py | 44 ++++++ 4 files changed, 156 insertions(+), 101 deletions(-) create mode 100644 tests/cpu/ipex-optimize-ddp-static-graph.py diff --git a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py index 25d10ba77..70d0e1769 100644 --- a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py +++ b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py @@ -245,23 +245,56 @@ def remove_empty_tensor(out): return out +def found_wrapper(parameter, params_attr): + for _, v in params_attr.items(): + if parameter is v.parameter: + return v + return None + + def patch_state_dict(model, params_attr, mode): - def cast_back_state_dict(self, *args, destination=None, prefix="", keep_vars=False): - with torch.no_grad(), 
contextlib.ExitStack() as stack: - for v in params_attr.values(): - if mode == "inference": - stack.enter_context(v.inference_cast_save()) + def get_parammeter_from_model(model, name_list): + if name_list[0] == "module" and not hasattr(model, "module"): + # for DDP model, there is an extra module + name_list = name_list[1:] + model_or_param = model + for attr in name_list: + model_or_param = getattr(model_or_param, attr) + return model_or_param + + def to_public_fp32(model, state_dict, params_attr): + data_ptr_dict = {} + for k, v in state_dict.items(): + v_ptr = v.data_ptr() + if v_ptr in data_ptr_dict: + # use cached tensor for multiple parameters share same tensor data + state_dict[k] = data_ptr_dict[v_ptr] + continue + # k = "submodule_name.submodule_name.attr_name" + # for example, "attn.linear.weight" + name_list = k.split(".") + param = get_parammeter_from_model(model, name_list) + param_wrapper = found_wrapper(param, params_attr) + if param_wrapper: + if mode == "inference" and param_wrapper.original_dtype is not None: + state_dict[k] = v.to(param_wrapper.original_dtype) elif mode == "training": - stack.enter_context(v.training_cast_save()) + state_dict[k] = param_wrapper._training_cast_to_fp32() else: assert mode == "prepack" - stack.enter_context(v.prepack_cast_save()) - out = self._original_state_dict( + state_dict[k] = param_wrapper._unpack_cast_to_fp32() + data_ptr_dict[v_ptr] = state_dict[k] + return state_dict + + def cast_back_state_dict(self, *args, destination=None, prefix="", keep_vars=False): + with torch.no_grad(), contextlib.ExitStack() as stack: + state_dict = self._original_state_dict( *args, destination=destination, prefix=prefix, keep_vars=keep_vars ) # We don't save the _ipex_module_empty_weight_tensor or _ipex_module_empty_bias_tensor Parameter in the state dict - out = remove_empty_tensor(out) - return out + state_dict = remove_empty_tensor(state_dict) + state_dict = to_public_fp32(self, state_dict, params_attr) + return state_dict if not hasattr(model, "_original_state_dict"): setattr(model, "_original_state_dict", model.state_dict) # noqa: B010 @@ -362,52 +395,9 @@ def cast_for_training(self, dtype, split): requires_grad=self.master_parameter.requires_grad, ) - def inference_cast_save(self): - @contextlib.contextmanager - def ctx(): - if self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.original_dtype) - try: - yield - finally: - if self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.casted_dtype) - - return ctx() - - def training_cast_save(self): - @contextlib.contextmanager - def ctx(): - self._training_cast_before_save() - try: - yield - finally: - self._training_cast_after_save() - - return ctx() - - def prepack_cast_save(self): - @contextlib.contextmanager - def ctx(): - self._cast_unpack_before_save() - try: - yield - finally: - self._cast_unpack_after_save() - - return ctx() - - def _inference_cast_before_save(self): - if self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.original_dtype) - - def _inference_cast_after_save(self): - if self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.casted_dtype) - - def _training_cast_before_save(self): + def _training_cast_to_fp32(self): if self.original_dtype is None: - return + return self.parameter.data.detach() assert self.original_dtype in ( torch.float, torch.float32, @@ -417,51 +407,20 @@ def _training_cast_before_save(self): fp32_param = torch.ops.torch_ipex.cat_bfloat16_float( 
self.parameter.data, self.parameter_trail ) - with torch.no_grad(): - self.parameter.data = fp32_param - else: - # will save parameter for non-split case - with torch.no_grad(): - self.parameter.data = self.master_parameter.data - - def _training_cast_after_save(self): - if self.original_dtype is None: - return - if self.split: - assert self.casted_dtype == torch.bfloat16 - top, self.parameter_trail = torch.ops.torch_ipex.split_float_bfloat16( - self.parameter.data - ) - with torch.no_grad(): - self.parameter.data = top + return fp32_param.detach() else: - self.parameter.data = self.master_parameter.data.to(self.casted_dtype) + return self.master_parameter.data.detach() - def _cast_unpack_before_save(self): + def _unpack_cast_to_fp32(self): + fp32_param = self.parameter.data if self.split is not None: - self._training_cast_before_save() + fp32_param = self._training_cast_to_fp32() elif self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.original_dtype) + fp32_param = self.parameter.to(self.original_dtype) if self.op_ctx is None: - return - with torch.no_grad(): - if self.master_parameter is not None: - self.parameter.data = self.op_ctx.to_public(self.master_parameter) - else: - self.parameter.data = self.op_ctx.to_public(self.parameter) - - def _cast_unpack_after_save(self): - if self.split is not None: - self._training_cast_after_save() - elif self.original_dtype is not None: - self.parameter.data = self.parameter.to(self.casted_dtype) - if self.op_ctx is None: - return - with torch.no_grad(): - if self.master_parameter is None: - self.parameter.data = self.op_ctx.pack(self.parameter) - if self.parameter_trail is not None: - self.parameter_trail = self.op_ctx.pack(self.parameter_trail) + return fp32_param + else: + return self.op_ctx.to_public(fp32_param) def can_prepack(self, module, is_training): if self.num_modules != 1: diff --git a/intel_extension_for_pytorch/nn/utils/_weight_prepack.py b/intel_extension_for_pytorch/nn/utils/_weight_prepack.py index 67725ac2e..604232ecd 100644 --- a/intel_extension_for_pytorch/nn/utils/_weight_prepack.py +++ b/intel_extension_for_pytorch/nn/utils/_weight_prepack.py @@ -430,18 +430,13 @@ def weight_prepack_with_ipex(model, optimizer, params_attr, device_type="cpu"): patch_state_dict, get_shared_parameter_status, IPEX_WEIGHT_PREPACK_MODULE_CPU, + found_wrapper, ) is_training = optimizer is not None if len(params_attr) == 0: get_shared_parameter_status(model, params_attr) - def found_wrapper(parameter, params_attr): - for _, v in params_attr.items(): - if parameter is v.parameter: - return v - return None - def convert(m, optimizer, params_attr): # already packed for reentrancy test if m.__class__ in IPEX_WEIGHT_PREPACK_MODULE_CPU().values(): diff --git a/tests/cpu/ipex-optimize-ddp-static-graph.py b/tests/cpu/ipex-optimize-ddp-static-graph.py new file mode 100644 index 000000000..62937b4ca --- /dev/null +++ b/tests/cpu/ipex-optimize-ddp-static-graph.py @@ -0,0 +1,57 @@ +import torch +import os +import intel_extension_for_pytorch as ipex +from torch.nn.parallel import DistributedDataParallel as DDP +import torch.distributed as dist +import argparse + + +class Module(torch.nn.Module): + def __init__( + self, + ): + super(Module, self).__init__() + self.linear = torch.nn.Linear(1024, 1024, bias=False) + + def forward(self, x): + return self.linear(x) + + +torch.manual_seed(10) +model = Module() +optim = torch.optim.SGD(model.parameters(), lr=1) + +opt_model, opt = ipex.optimize( + model, dtype=torch.bfloat16, optimizer=optim, 
inplace=False, weights_prepack=False +) + + +def env2int(env_list, default=-1): + for e in env_list: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +rank = env2int(["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0) + +os.environ["MASTER_ADDR"] = "127.0.0.1" +os.environ["MASTER_PORT"] = "29510" +dist.init_process_group("gloo", world_size=2, rank=rank) +my_rank = dist.get_rank() +parser = argparse.ArgumentParser() +parser.add_argument("--get-state-dict", action="store_true") +args = parser.parse_args() + +opt_model = DDP(opt_model, static_graph=True) +for i in range(10): + input = torch.randn(1024, 1024).bfloat16() + output = opt_model(input) + if i == 5 and my_rank == 0 and args.get_state_dict: + state_dict = opt_model.state_dict() + loss = output.sum() + loss.backward() + opt.step() + if i == 9: + print(f"Resume training successfully, final lose = {loss.item()}") diff --git a/tests/cpu/test_ipex_optimize.py b/tests/cpu/test_ipex_optimize.py index 95e3cbf8f..fc2b2b02e 100644 --- a/tests/cpu/test_ipex_optimize.py +++ b/tests/cpu/test_ipex_optimize.py @@ -24,6 +24,7 @@ from common_utils import TestModule, _empty_weight_bias_parameter_names from intel_extension_for_pytorch.optim._lamb import Lamb import os +import subprocess try: import transformers @@ -914,6 +915,49 @@ def run_and_recursively_call_ipex_optimize( graph_mode, ) + def test_ddp_strict_graph(self): + # check if the model can be trained with DDP in strict graph mode + # with calling "statce_dict" during training, and also check there + # is no difference for final lose between two training. + def get_loss(line): + loss = line.split(" = ")[-1] + if loss.endswith("]"): + loss = loss[:-3] + return float(loss) + + num = 0 + loc = os.path.dirname(os.path.abspath(__file__)) + loss = -1 + with subprocess.Popen( + "python -m intel_extension_for_pytorch.cpu.launch --ccl_worker_count=1" + + f" --nproc_per_node=2 --distributed --nnodes 1 {loc}/ipex-optimize-ddp-static-graph.py --get-state-dict", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) as p: + for line in p.stdout.readlines(): + line = str(line, "utf-8").strip() + if "Resume training successfully" in line: + loss = get_loss(line) + num = num + 1 + assert num == 2, "training not finished." + + num = 0 + with subprocess.Popen( + "python -m intel_extension_for_pytorch.cpu.launch --ccl_worker_count=1" + + f" --nproc_per_node=2 --distributed --nnodes 1 {loc}/ipex-optimize-ddp-static-graph.py", + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) as p: + for line in p.stdout.readlines(): + line = str(line, "utf-8").strip() + if "Resume training successfully" in line: + loss_ = get_loss(line) + self.assertEqual(loss_, loss) + num = num + 1 + assert num == 2, "training not finished." 
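The multi-process launch above guards the DDP static-graph path; the underlying contract — a state_dict() call in the middle of bf16 training hands back plain fp32 tensors without perturbing later steps — can be sketched single-process as below. This is illustrative only: it assumes intel_extension_for_pytorch is installed and mirrors the helper script's weights_prepack=False setting.

```python
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Linear(1024, 1024, bias=False)
optim = torch.optim.SGD(model.parameters(), lr=1.0)
# bf16 training setup mirroring the helper script (no weight prepacking)
opt_model, opt = ipex.optimize(
    model, dtype=torch.bfloat16, optimizer=optim, weights_prepack=False
)

for step in range(3):
    out = opt_model(torch.randn(16, 1024).bfloat16())
    if step == 1:
        # mid-training checkpoint: state_dict should expose public fp32 weights
        sd = opt_model.state_dict()
        assert sd["weight"].dtype == torch.float32
    out.sum().backward()
    opt.step()
    opt.zero_grad()
```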
+ if __name__ == "__main__": test = unittest.main() From bb14ce18b9d6504f122e3d1a12aebe6c966111de Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Mon, 20 May 2024 22:18:19 +0800 Subject: [PATCH 073/199] Refine tpp flags in IPEX linear (#2906) --- csrc/cpu/aten/AddLayerNorm.cpp | 29 ++++- csrc/cpu/aten/AddLayerNorm.h | 10 ++ .../inference/python/llm-modeling/README.md | 5 +- .../cpu/inference/python/llm-modeling/run.py | 3 +- examples/cpu/training/llm/README.md | 7 +- .../cpu/training/llm/templates/alpaca.json | 6 - .../llm/functional/__init__.py | 4 + .../llm/functional/fusions.py | 87 +++++++++++++ .../llm/functional/utils.py | 20 +++ .../llm/modules/mha_fusion.py | 19 +-- .../nn/utils/_parameter_wrapper.py | 1 + .../models/cpu/fusions/mha_fusion.py | 58 +++++++++ tests/cpu/test_ipex_llm_module.py | 116 ++++++++++++++++++ tests/cpu/test_tpp_linear.py | 20 +++ 14 files changed, 367 insertions(+), 18 deletions(-) delete mode 100644 examples/cpu/training/llm/templates/alpaca.json create mode 100644 intel_extension_for_pytorch/llm/functional/utils.py diff --git a/csrc/cpu/aten/AddLayerNorm.cpp b/csrc/cpu/aten/AddLayerNorm.cpp index 22c400bfb..fe5415a80 100644 --- a/csrc/cpu/aten/AddLayerNorm.cpp +++ b/csrc/cpu/aten/AddLayerNorm.cpp @@ -4,7 +4,7 @@ // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp #include "AddLayerNorm.h" - +#include #include namespace torch_ipex { @@ -57,5 +57,32 @@ at::Tensor dil_add_layernorm( return at::layer_norm(add_res, normalized_shape, weight_opt, bias_opt, eps); } } + +// register as a python op +at::Tensor add_layernorm( + const at::Tensor& a, + const at::Tensor& b, + int64_t alpha, + at::IntArrayRef normalized_shape, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + double eps) { + RECORD_FUNCTION("add_layernorm", c10::ArrayRef({})); + return dil_add_layernorm( + a, b, alpha, normalized_shape, weight_opt, bias_opt, eps, false); +} + } // namespace cpu } // namespace torch_ipex + +namespace { + +TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { + m.def( + "add_layernorm(Tensor a, Tensor b, int alpha, int[] normalized_shape, Tensor ? weight_opt, \ + Tensor ? 
bias_opt, float eps) -> Tensor"); + m.impl( + "add_layernorm", c10::DispatchKey::CPU, torch_ipex::cpu::add_layernorm); +} + +} // namespace \ No newline at end of file diff --git a/csrc/cpu/aten/AddLayerNorm.h b/csrc/cpu/aten/AddLayerNorm.h index 67b26a00f..76b74a421 100644 --- a/csrc/cpu/aten/AddLayerNorm.h +++ b/csrc/cpu/aten/AddLayerNorm.h @@ -81,6 +81,16 @@ at::Tensor dil_add_layernorm( float eps, bool cuda_enable); +// register as a python op +at::Tensor add_layernorm( + const at::Tensor& a, + const at::Tensor& b, + int64_t alpha, + at::IntArrayRef normalized_shape, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + double eps); + namespace { at::Tensor add_layer_norm_kernel_impl( diff --git a/examples/cpu/inference/python/llm-modeling/README.md b/examples/cpu/inference/python/llm-modeling/README.md index 2587a188c..694b7b605 100644 --- a/examples/cpu/inference/python/llm-modeling/README.md +++ b/examples/cpu/inference/python/llm-modeling/README.md @@ -37,7 +37,10 @@ ipex.llm.functional.rms_norm ipex.llm.functional.fast_layer_norm ipex.llm.functional.indirect_access_kv_cache_attention ipex.llm.functional.varlen_attention - +ipex.llm.functional.add_layer_norm +ipex.llm.functional.add_rms_norm +ipex.llm.functional.silu_mul +ipex.llm.functional.gelu_mul ``` ### Generation related fusions diff --git a/examples/cpu/inference/python/llm-modeling/run.py b/examples/cpu/inference/python/llm-modeling/run.py index 2090797eb..6a34ca845 100644 --- a/examples/cpu/inference/python/llm-modeling/run.py +++ b/examples/cpu/inference/python/llm-modeling/run.py @@ -5,7 +5,6 @@ import argparse from transformers import ( AutoTokenizer, - LlamaTokenizer, AutoModelForCausalLM, ) import transformers @@ -22,7 +21,7 @@ MODEL_CLASSES = { "gpt-j": (AutoModelForCausalLM, AutoTokenizer), - "llama": (AutoModelForCausalLM, LlamaTokenizer), + "llama": (AutoModelForCausalLM, AutoTokenizer), "opt": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/training/llm/README.md b/examples/cpu/training/llm/README.md index 50dad1f90..79b18ce4d 100644 --- a/examples/cpu/training/llm/README.md +++ b/examples/cpu/training/llm/README.md @@ -29,8 +29,13 @@ export HOSTFILE=hostfile # Quick Start Scripts ## Run the model ``` -# Get the dataset here https://github.com/tloen/alpaca-lora/blob/main/alpaca_data.json +# Get the dataset here: https://github.com/tloen/alpaca-lora/blob/main/alpaca_data.json export DATASET="./alpaca_data.json" + +# Get the dataset template here: https://github.com/tloen/alpaca-lora/blob/main/templates/alpaca.json +mkdir ./templates +mv alpaca.json ./templates + # Env vars export LOCAL_BATCH_SIZE=32 #32 is default one, you can choose per need export MODEL_NAME_OR_PATH="YOUR LOCAL PATH or MODEL_ID (HF)" diff --git a/examples/cpu/training/llm/templates/alpaca.json b/examples/cpu/training/llm/templates/alpaca.json deleted file mode 100644 index e486439c4..000000000 --- a/examples/cpu/training/llm/templates/alpaca.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "description": "Template used by Alpaca-LoRA.", - "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", - "prompt_no_input": "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", - "response_split": "### Response:" -} diff --git a/intel_extension_for_pytorch/llm/functional/__init__.py b/intel_extension_for_pytorch/llm/functional/__init__.py index 82f0b5e45..023dd4832 100644 --- a/intel_extension_for_pytorch/llm/functional/__init__.py +++ b/intel_extension_for_pytorch/llm/functional/__init__.py @@ -4,4 +4,8 @@ fast_layer_norm, indirect_access_kv_cache_attention, varlen_attention, + add_layer_norm, + add_rms_norm, + silu_mul, + gelu_mul, ) diff --git a/intel_extension_for_pytorch/llm/functional/fusions.py b/intel_extension_for_pytorch/llm/functional/fusions.py index 7251bc525..04f05dc49 100644 --- a/intel_extension_for_pytorch/llm/functional/fusions.py +++ b/intel_extension_for_pytorch/llm/functional/fusions.py @@ -8,6 +8,8 @@ VarlenAttention, ) +from .utils import _get_function_from_device + def rotary_embedding( query: torch.Tensor, @@ -209,3 +211,88 @@ def varlen_attention( return_softmax, gen_, ) + + +def silu_mul(x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None): + r""" + Applies PyTorch silu on input x, and them mul input y: + out = silu(x)*y + + Args: + x (torch.Tensor): input to apply silu. + y (torch.Tensor): input for mul to apply on silu(x). + out (torch.Tensor): buffer to get the results. + + """ + f = _get_function_from_device(x.device.type, silu_mul) + return f(x, y, out) + + +def gelu_mul( + x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None, approximate="none" +): + r""" + Applies PyTorch gelu on input x, and them mul input y: + out = gelu(x)*y + + Args: + x (torch.Tensor): input to apply gelu. + y (torch.Tensor): input for mul to apply on gelu(x). + out (torch.Tensor): buffer to get the results. + approximate (str): approximate config for gelu. + + """ + f = _get_function_from_device(x.device.type, gelu_mul) + return f(x, y, out, approximate) + + +def add_rms_norm( + residual: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + add_back: bool = False, +): + r""" + Add residual on input x and apply RMSnorm on the result. + + Args: + residual (torch.Tensor): residual to add with x. If residual is None, + it means only apply rmsnorm on x. + x (torch.Tensor) : the input tensor to add residual and apply RMSNorm. + weight (torch.Tensor): the weight to apply RMSnorm. + bias (torch.Tensor): the bias to apply RMSnorm. + eps (float) : the variance_epsilon to apply RMSnorm. + add_back (bool) : whether to store the result of (x + residual) back + to the residual buffer (if residual is not None). Default is False. + + """ + f = _get_function_from_device(x.device.type, add_rms_norm) + return f(residual, x, weight, bias, eps, add_back) + + +def add_layer_norm( + residual: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + add_back: bool = False, +): + r""" + Add residual on input x and apply layernorm on the result. + + Args: + residual (torch.Tensor): residual to add with x. If residual is None, + it means only apply layernorm on x. + x (torch.Tensor) : the input tensor to add residual and apply layernorm. + weight (torch.Tensor): the weight to apply layernorm. + bias (torch.Tensor): the bias to apply layernorm. + eps (float) : the variance_epsilon to apply layernorm. + add_back (bool) : whether to store the result of (x + residual) back + to the residual buffer (if residual is not None). Default is False. 
+ + """ + f = _get_function_from_device(x.device.type, add_layer_norm) + return f(residual, x, weight, bias, eps, add_back) diff --git a/intel_extension_for_pytorch/llm/functional/utils.py b/intel_extension_for_pytorch/llm/functional/utils.py new file mode 100644 index 000000000..ba834a1c7 --- /dev/null +++ b/intel_extension_for_pytorch/llm/functional/utils.py @@ -0,0 +1,20 @@ +import sys +from intel_extension_for_pytorch.transformers.models.cpu.fusions.mha_fusion import ( # noqa F401 + silu_mul_cpu, + gelu_mul_cpu, + add_rms_norm_cpu, + add_layer_norm_cpu, +) + + +def _get_function_from_device(device_type: str, f): + assert device_type in [ + "cpu", + "xpu", + ], "The device is not in the supported device list." + target_f_name = f.__name__ + "_" + device_type + assert hasattr( + sys.modules[__name__], target_f_name + ), f"Target function {f.__name__} on {device_type} haven't implemented yet." + target_f = getattr(sys.modules[__name__], target_f_name) + return target_f diff --git a/intel_extension_for_pytorch/llm/modules/mha_fusion.py b/intel_extension_for_pytorch/llm/modules/mha_fusion.py index c3f410e63..940fea611 100644 --- a/intel_extension_for_pytorch/llm/modules/mha_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/mha_fusion.py @@ -9,13 +9,15 @@ class RotaryEmbedding(nn.Module): [module init and forward] Applies RotaryEmbedding (see https://huggingface.co/papers/2104.09864) on the `query ` or `key` before their multi-head attention computation. Args: - module init: - - max_position_embeddings (int): size (max) of the position embeddings. - - pos_embd_dim (int): dimension of the position embeddings. - - base (int) : Default: 10000. Base to generate the frequency of position embeddings. - - backbone (str): Default: None. The exact transformers model backbone - (e.g., "GPTJForCausalLM", get from model.config.architectures[0], - see https://huggingface.co/EleutherAI/gpt-j-6b/blob/main/config.json#L4). + max_position_embeddings (int): size (max) of the position embeddings. + pos_embd_dim (int): dimension of the position embeddings. + base (int) : Default: 10000. Base to generate the frequency of position embeddings. + backbone (str): Default: None. The exact transformers model backbone + (e.g., "GPTJForCausalLM", get from model.config.architectures[0], + see https://huggingface.co/EleutherAI/gpt-j-6b/blob/main/config.json#L4). + extra_rope_config (dict): like phi-3 model, it uses original_max_position_embeddings, + long_factor and short_factor, see details: + https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json#L23. 
forward: - input (torch.Tensor) : input to be applied with position embeddings, @@ -70,12 +72,14 @@ def __init__( pos_embd_dim: int, base=10000, backbone: str = None, + extra_rope_config: dict = None, ): super().__init__() self.model_backbone = backbone self.max_position_embeddings = max_position_embeddings self.pos_embd_dim = pos_embd_dim self.base = base + self.extra_rope_config = extra_rope_config def forward( self, @@ -107,6 +111,7 @@ def forward( self.pos_embd_dim, self.base, self.model_backbone, + self.extra_rope_config, ) return runtime_module( x, diff --git a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py index 70d0e1769..3e3894d01 100644 --- a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py +++ b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py @@ -546,6 +546,7 @@ def linear_prepack(self, module, is_training): if not hasattr(module, "out_features"): setattr(module, "out_features", module.weight.shape[0]) # noqa: B010 + module.tpp_fallback = False if module.use_tpp: from intel_extension_for_pytorch.nn.utils import ( Apply_TPPLinear_weight_prepack, diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py index 0e01d94dc..2de1e0ddb 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py @@ -488,3 +488,61 @@ def forward( return_softmax, gen_, ) + + +def add_rms_norm_cpu( + add: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + add_back: bool, +): + assert bias is None, "bias is not supported in add_rmsnorm yet" + if add is not None: + if add_back: + add.add_(x) + input = add + else: + input = add + x + else: + input = x + + return torch.ops.torch_ipex.rmsnorm(input, weight, eps) + + +def add_layer_norm_cpu( + add: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + add_back: bool, +): + if add is not None: + out = torch.ops.torch_ipex.add_layernorm( + x, add, 1, [x.size(-1)], weight, bias, eps + ) + if add_back: + add.add_(x) + return out + else: + return torch.nn.functional.layer_norm( + x, [x.size(-1)], weight=weight, bias=bias, eps=eps + ) + + +@torch.compile(dynamic=True, options={"fx_graph_cache": True}) +def silu_mul_cpu(x, y, out=None): + res = torch.nn.functional.silu(x) * y + if out is not None: + out.copy_(res) + return res + + +@torch.compile(dynamic=True, options={"fx_graph_cache": True}) +def gelu_mul_cpu(x, y, out=None, approximate="none"): + res = torch.nn.functional.gelu(x, approximate=approximate) * y + if out is not None: + out.copy_(res) + return res diff --git a/tests/cpu/test_ipex_llm_module.py b/tests/cpu/test_ipex_llm_module.py index f3ee00552..1674b2b01 100644 --- a/tests/cpu/test_ipex_llm_module.py +++ b/tests/cpu/test_ipex_llm_module.py @@ -148,6 +148,49 @@ def apply(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor): return torch.cat([c, d], dim=-1) +def add_rmsnorm(residual, x, weight, bias, eps, add_back): + orig_dtype = x.dtype + x = x.to(torch.float32) + if residual is not None: + x = residual + x + variance = x.pow(2).mean(dim=-1, keepdim=True) + out = x * torch.rsqrt(variance + eps) + out = out.to(orig_dtype) * weight + if add_back and residual is not None: + residual.copy_(x.to(orig_dtype)) + return out + + +def add_layernorm(residual, x, weight, bias, 
eps, add_back): + if residual is None: + return torch.nn.functional.layer_norm( + x, [x.size(-1)], weight=weight, bias=bias, eps=eps + ) + x = residual + x + out = torch.nn.functional.layer_norm( + x, [x.size(-1)], weight=weight, bias=bias, eps=eps + ) + if add_back: + residual.copy_(x) + return out + + +def silu_mul(x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None): + if out is None: + out = torch.empty_like(x) + out = torch.nn.functional.silu(x) * y + return out + + +def gelu_mul( + x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None, approximate="none" +): + if out is None: + out = torch.empty_like(x) + out = torch.nn.functional.gelu(x, approximate=approximate) * y + return out + + class TestLLMModules(TestCase): def test_linearfusion_args0(self): x1 = torch.rand(1, 4, 4096) @@ -314,6 +357,79 @@ def test_rotary_embedding_tgi(self): self.assertEqual(ipex_q, ref_q) self.assertEqual(ref_k, ipex_k) + def test_add_layernorm(self): + for add_back in [True, False]: + for dtype in [torch.float, torch.bfloat16]: + for residual_is_none in [True, False]: + weight = torch.nn.Parameter(torch.randn(4096)).to(dtype) + eps = 1e-6 + x = torch.rand(1, 32, 4096).to(dtype) + if residual_is_none: + residual = None + else: + if add_back: + target_residual = x + x + residual = x + x_ = copy.deepcopy(x) + residual_ = x_ if not residual_is_none else None + ref_out = add_layernorm(residual_, x_, weight, None, eps, add_back) + ipex_out = ipex.llm.functional.add_layer_norm( + residual, x, weight, None, eps, add_back + ) + if not residual_is_none: + if add_back: + self.assertEqual(residual, target_residual) + self.assertEqual(residual_, target_residual) + else: + self.assertEqual(residual, x) + self.assertEqual(residual_, x) + self.assertEqual(ref_out, ipex_out) + + def test_add_rmsnorm(self): + for add_back in [True, False]: + for dtype in [torch.float, torch.bfloat16]: + for residual_is_none in [True, False]: + weight = torch.nn.Parameter(torch.randn(4096)).to(dtype) + eps = 1e-6 + x = torch.rand(1, 32, 4096).to(dtype) + if residual_is_none: + residual = None + else: + if add_back: + target_residual = x + x + residual = x + x_ = copy.deepcopy(x) + residual_ = x_ if not residual_is_none else None + ref_out = add_rmsnorm(residual_, x_, weight, None, eps, add_back) + ipex_out = ipex.llm.functional.add_rms_norm( + residual, x, weight, None, eps, add_back + ) + if not residual_is_none: + if add_back: + self.assertEqual(residual, target_residual) + self.assertEqual(residual_, target_residual) + else: + self.assertEqual(residual, x) + self.assertEqual(residual_, x) + self.assertEqual(ref_out, ipex_out) + + def test_gelu_mul(self): + for dtype in [torch.float, torch.bfloat16]: + for approximate in ["tanh", "none"]: + x = torch.rand(1, 32, 4096).to(dtype) + x_ = copy.deepcopy(x) + ref_out = gelu_mul(x_, x_, approximate=approximate) + ipex_out = ipex.llm.functional.gelu_mul(x_, x_, approximate=approximate) + self.assertEqual(ref_out, ipex_out) + + def test_silu_mul(self): + for dtype in [torch.float, torch.bfloat16]: + x = torch.rand(1, 32, 4096).to(dtype) + x_ = copy.deepcopy(x) + ref_out = silu_mul(x_, x_) + ipex_out = ipex.llm.functional.silu_mul(x_, x_) + self.assertEqual(ref_out, ipex_out) + if __name__ == "__main__": test = unittest.main() diff --git a/tests/cpu/test_tpp_linear.py b/tests/cpu/test_tpp_linear.py index 0f8d1f593..a121be8e7 100644 --- a/tests/cpu/test_tpp_linear.py +++ b/tests/cpu/test_tpp_linear.py @@ -102,6 +102,26 @@ def forward(self, x): class TestTPPlinear(TestCase): + def 
test_tpp_linear_fallback_flag(self): + x1 = torch.rand(1, 1, 4097) + x2 = copy.deepcopy(x1) + for dtype in [torch.float, torch.bfloat16]: + model = Linear_tpp_fallback_dnnl().eval() + + with torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if dtype is torch.bfloat16 else False + ): + ref_out = model(x1) + + model = ipex.optimize(model, dtype=dtype) + with torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if dtype is torch.bfloat16 else False + ): + model = torch.jit.script(model) + model = torch.jit.freeze(model) + out = model(x2) + self.assertEqual(out, ref_out) + def test_tpp_linear_fallback(self): x1 = torch.rand(1, 1, 4097) x2 = copy.deepcopy(x1) From 091a6a2e16cb48fd917b8b2d9d570bb0025f6f01 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Tue, 21 May 2024 07:13:05 +0800 Subject: [PATCH 074/199] Update dependency_version.yml 20240521 (#2908) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index b258442c5..7ffa00ac3 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240519+cpu + version: 2.4.0.dev20240520+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240519+cpu + version: 2.2.0.dev20240520+cpu torchvision: - version: 0.19.0.dev20240519+cpu + version: 0.19.0.dev20240520+cpu transformers: version: 4.38.1 From 0285cc50d20025c53b82de886bed53476f505a99 Mon Sep 17 00:00:00 2001 From: Zaili Wang <109502517+ZailiWang@users.noreply.github.com> Date: Tue, 21 May 2024 14:13:14 +0800 Subject: [PATCH 075/199] rel 2.3.0 backport to main (#2881) * r230 backport to main * add #2848 * minor format fix * update int8 recipe table & version numbers in serving example part * flake8 format correction --- README.md | 31 +- docker/Dockerfile.prebuilt | 11 +- docker/README.md | 2 +- docs/_static/htmls/tbl_deepspeed.html | 34 +- docs/_static/htmls/tbl_single.html | 117 ++++-- docs/tutorials/api_doc.rst | 63 +++ docs/tutorials/examples.md | 2 +- docs/tutorials/features/fast_bert.md | 2 +- .../features/sq_recipe_tuning_api.md | 2 +- docs/tutorials/installation.md | 4 +- docs/tutorials/introduction.rst | 2 +- docs/tutorials/llm.rst | 10 +- docs/tutorials/llm/llm_optimize.md | 4 +- docs/tutorials/releases.md | 69 ++++ examples/cpu/inference/cpp/README.md | 8 +- .../inference/python/llm-modeling/README.md | 180 +++++++-- .../cpu/inference/python/llm-modeling/run.py | 95 +++-- examples/cpu/inference/python/llm/README.md | 97 +++-- .../inference/python/llm/llm_sq_recipes.md | 3 +- examples/cpu/serving/torchserve/README.md | 8 +- examples/cpu/serving/triton/Dockerfile | 8 +- examples/cpu/serving/triton/requirements.txt | 8 +- .../llm/functional/fusions.py | 155 +++++--- .../llm/modules/linear_fusion.py | 126 +++++- .../llm/modules/mha_fusion.py | 367 ++++++++++-------- 25 files changed, 969 insertions(+), 439 deletions(-) diff --git a/README.md b/README.md index 992b97429..f2f90b7d6 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Intelยฎ Extension for PyTorch\* -**CPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   
[๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.2.0%2Bcpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
    +**CPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.3.0%2Bcpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
    **GPU** [๐Ÿ’ปmain branch](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main)   |   [๐ŸŒฑQuick Start](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/getting_started.html)   |   [๐Ÿ“–Documentations](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)   |   [๐ŸƒInstallation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu)   |   [๐Ÿ’ปLLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/inference/python/llm)
    Intelยฎ Extension for PyTorch\* extends PyTorch\* with up-to-date features optimizations for an extra performance boost on Intel hardware. Optimizations take advantage of Intelยฎ Advanced Vector Extensions 512 (Intelยฎ AVX-512) Vector Neural Network Instructions (VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) on Intel CPUs as well as Intel Xe Matrix Extensions (XMX) AI engines on Intel discrete GPUs. Moreover, Intelยฎ Extension for PyTorch* provides easy GPU acceleration for Intel discrete GPUs through the PyTorch* xpu device. @@ -19,28 +19,35 @@ In the current technological landscape, Generative AI (GenAI) workloads and mode | MODEL FAMILY | MODEL NAME (Huggingface hub) | FP32 | BF16 | Static quantization INT8 | Weight only quantization INT8 | Weight only quantization INT4 | |:---:|:---:|:---:|:---:|:---:|:---:|:---:| |LLAMA| meta-llama/Llama-2-7b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | +|LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Meta-Llama-3-8B | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|LLAMA| meta-llama/Meta-Llama-3-70B | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | |GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | |GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | |DOLLY| databricks/dolly-v2-12b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|FALCON| tiiuae/falcon-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | |FALCON| tiiuae/falcon-40b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | |OPT| facebook/opt-30b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | |OPT| facebook/opt-1.3b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | |Bloom| bigscience/bloom-1b7 | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | +|CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | |Baichuan| baichuan-inc/Baichuan2-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | -|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | +|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | |Baichuan| baichuan-inc/Baichuan-13B-Chat | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | | |ChatGLM| THUDM/chatglm3-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | |ChatGLM| THUDM/chatglm2-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | |GPTBigCode| bigcode/starcoder | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | |Mistral| mistralai/Mistral-7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | -|Mixtral| mistralai/Mixtral-8x7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | -|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸจ | | -|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|Mixtral| mistralai/Mixtral-8x7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | ๐ŸŸจ | +|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|LLaVA| liuhaotian/llava-v1.5-7b | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|GIT| microsoft/git-base | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|Yuan| IEITYuan/Yuan2-102B-hf | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸจ | | +|Phi| microsoft/phi-2 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | - ๐ŸŸฉ signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32). 
@@ -49,6 +56,10 @@ In the current technological landscape, Generative AI (GenAI) workloads and mode *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future. +In addition, Intelยฎ Extension for PyTorch* introduces module level optimization APIs (prototype feature) since release 2.3.0. +The feature provides optimized alternatives for several commonly used LLM modules and functionalities for the optimizations of the niche or customized LLMs. +Please read [**LLM module level optimization practice**](./examples/cpu/inference/python/llm-modeling) to better understand how to optimize your own LLM and achieve better performance. + ## Support The team tracks bugs and enhancement requests using [GitHub issues](https://github.com/intel/intel-extension-for-pytorch/issues/). Before submitting a suggestion or bug report, search the existing GitHub issues to see if your issue has already been reported. diff --git a/docker/Dockerfile.prebuilt b/docker/Dockerfile.prebuilt index c004228f5..e6561cc50 100644 --- a/docker/Dockerfile.prebuilt +++ b/docker/Dockerfile.prebuilt @@ -17,7 +17,8 @@ RUN apt-get update -y && \ apt-get upgrade -y && \ apt-get install -y --no-install-recommends --fix-missing \ ${PYTHON} \ - ${PYTHON}-pip + ${PYTHON}-pip \ + ${PYTHON}-dev RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \ pip \ @@ -27,10 +28,10 @@ RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \ # Some TF tools expect a "python" binary RUN ln -s $(which ${PYTHON}) /usr/local/bin/python -ARG IPEX_VERSION=2.2.0 -ARG PYTORCH_VERSION=2.2.0 -ARG TORCHAUDIO_VERSION=2.2.0 -ARG TORCHVISION_VERSION=0.17.0 +ARG IPEX_VERSION=2.3.0 +ARG PYTORCH_VERSION=2.3.0 +ARG TORCHAUDIO_VERSION=2.3.0 +ARG TORCHVISION_VERSION=0.18.0 ARG TORCH_CPU_URL=https://download.pytorch.org/whl/cpu/torch_stable.html RUN \ diff --git a/docker/README.md b/docker/README.md index 72147b211..b263f74a5 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,7 +2,7 @@ * Notes - If you use linux kernerl under version 5.4 in host, upgrade it. + If you use linux kernel under version 5.4 in host, upgrade it. * How to build an image diff --git a/docs/_static/htmls/tbl_deepspeed.html b/docs/_static/htmls/tbl_deepspeed.html index 2b0dd5bc7..20b94d8bd 100644 --- a/docs/_static/htmls/tbl_deepspeed.html +++ b/docs/_static/htmls/tbl_deepspeed.html @@ -26,6 +26,18 @@

    (tbl_deepspeed.html, DeepSpeed verified-models table: rows are added for LLAMA / meta-llama/Meta-Llama-3-8B and
    LLAMA / meta-llama/Meta-Llama-3-70B with 🟩 / 🟩 status cells; one Baichuan2-13B-Chat status cell is updated
    from 🟨 to 🟩; rows are added for Stablelm / stabilityai/stablelm-2-1_6b, Qwen / Qwen/Qwen-7B-Chat and
    GIT / microsoft/git-base, each with 🟩 / 🟩 status cells; the 🟩 / 🟨 legend rows are unchanged.)
\ No newline at end of file
diff --git a/docs/_static/htmls/tbl_single.html b/docs/_static/htmls/tbl_single.html
index 8eaefecf8..3d1aa537f 100644
--- a/docs/_static/htmls/tbl_single.html
+++ b/docs/_static/htmls/tbl_single.html
    (tbl_single.html, single-instance verified-models table: updated to mirror the support matrix in README.md above —
    rows are added for meta-llama/Meta-Llama-3-8B, meta-llama/Meta-Llama-3-70B, tiiuae/falcon-7b,
    liuhaotian/llava-v1.5-7b, microsoft/git-base, IEITYuan/Yuan2-102B-hf and microsoft/phi-2, and a number of
    🟩 / 🟨 status cells in the existing rows are refreshed to match; the legend rows are unchanged.)
+ \ No newline at end of file diff --git a/docs/tutorials/api_doc.rst b/docs/tutorials/api_doc.rst index 7080252eb..1a161c0a3 100644 --- a/docs/tutorials/api_doc.rst +++ b/docs/tutorials/api_doc.rst @@ -18,7 +18,70 @@ General .. currentmodule:: intel_extension_for_pytorch .. autoclass:: verbose +LLM Module Level Optimizations (Prototype) +****************************************** +Module level optimization APIs are provided for optimizing customized LLMs. + +.. automodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearSilu + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearSiluMul + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: Linear2SiluMul + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearRelu + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearNewGelu + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearGelu + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearMul + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearAdd + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: LinearAddAdd + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: RotaryEmbedding + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: RMSNorm + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: FastLayerNorm + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: IndirectAccessKVCacheAttention + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: PagedAttention + +.. currentmodule:: intel_extension_for_pytorch.llm.modules +.. autoclass:: VarlenAttention + +.. automodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: rotary_embedding + +.. currentmodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: rms_norm + +.. currentmodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: fast_layer_norm + +.. currentmodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: indirect_access_kv_cache_attention + +.. currentmodule:: intel_extension_for_pytorch.llm.functional +.. autofunction:: varlen_attention Fast Bert (Prototype) ************************ diff --git a/docs/tutorials/examples.md b/docs/tutorials/examples.md index f8bdd5982..f90505a2d 100644 --- a/docs/tutorials/examples.md +++ b/docs/tutorials/examples.md @@ -359,5 +359,5 @@ $ ldd example-app ## Intelยฎ AI Reference Models -Use cases that have already been optimized by Intel engineers are available at [Intelยฎ AI Reference Models](https://github.com/IntelAI/models/tree/pytorch-r2.2.0-models) (former Model Zoo). A number of PyTorch use cases for benchmarking are also available in the [benchmarks](https://github.com/IntelAI/models/tree/pytorch-r2.2.0-models/benchmarks#pytorch-use-cases). You can get performance benefits out-of-the-box by simply running scripts in the Intelยฎ AI Reference Models. +Use cases that have already been optimized by Intel engineers are available at [Intelยฎ AI Reference Models](https://github.com/IntelAI/models/tree/pytorch-r2.3-models) (former Model Zoo). A number of PyTorch use cases for benchmarking are also available in the [benchmarks](https://github.com/IntelAI/models/tree/pytorch-r2.3-models/benchmarks#pytorch-use-cases). 
You can get performance benefits out-of-the-box by simply running scripts in the Intelยฎ AI Reference Models. diff --git a/docs/tutorials/features/fast_bert.md b/docs/tutorials/features/fast_bert.md index 12725f6b6..a16862e6f 100644 --- a/docs/tutorials/features/fast_bert.md +++ b/docs/tutorials/features/fast_bert.md @@ -9,7 +9,7 @@ Currently `ipex.fast_bert` API is only well optimized for training. For inferenc ### Prerequisite -- Transformers 4.6.0 ~ 4.31.0 +- Transformers 4.6.0 ~ 4.38.1 ### Usage Example diff --git a/docs/tutorials/features/sq_recipe_tuning_api.md b/docs/tutorials/features/sq_recipe_tuning_api.md index 6928a063c..0ef8e0918 100644 --- a/docs/tutorials/features/sq_recipe_tuning_api.md +++ b/docs/tutorials/features/sq_recipe_tuning_api.md @@ -15,6 +15,6 @@ SmoothQuant will introduce alpha to calculate the ratio of input and weight upda | shared_criterion | "mean" | ["min", "mean","max"] | criterion for input LayerNorm op of a transformer block. | | enable_blockwise_loss | False | [True, False] | whether to enable block-wise auto-tuning | -For LLM examples, please refer to [example](https://github.com/intel/intel-extension-for-pytorch/tree/v2.2.0%2Bcpu/examples/cpu/inference/python/llm). +For LLM examples, please refer to [example](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0%2Bcpu/examples/cpu/inference/python/llm). **Note**: When defining dataloaders for calibration, please follow INC's dataloader [format](https://github.com/intel/neural-compressor/blob/master/docs/source/dataloader.md). diff --git a/docs/tutorials/installation.md b/docs/tutorials/installation.md index 1383ed7a8..b42f81066 100644 --- a/docs/tutorials/installation.md +++ b/docs/tutorials/installation.md @@ -1,8 +1,8 @@ Installation ============ -Select your preferences and follow the installation instructions provided on the [Installation page](../../../index.html#installation?platform=cpu&version=v2.2.0%2Bcpu). +Select your preferences and follow the installation instructions provided on the [Installation page](../../../index.html#installation?platform=cpu&version=v2.3.0%2Bcpu). After successful installation, refer to the [Quick Start](getting_started.md) and [Examples](examples.md) sections to start using the extension in your code. -**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.2.0%2Bcpu/examples/cpu/inference/python/llm). +**NOTE:** For detailed instructions on installing and setting up the environment for Large Language Models (LLM), as well as example scripts, refer to the [LLM best practices](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0%2Bcpu/examples/cpu/inference/python/llm). diff --git a/docs/tutorials/introduction.rst b/docs/tutorials/introduction.rst index 3edf7f1c4..0f0439dd1 100644 --- a/docs/tutorials/introduction.rst +++ b/docs/tutorials/introduction.rst @@ -16,7 +16,7 @@ the `Large Language Models (LLM) `_ section. 
Get Started ----------- -- `Installation <../../../index.html#installation?platform=cpu&version=v2.2.0%2Bcpu>`_ +- `Installation <../../../index.html#installation?platform=cpu&version=v2.3.0%2Bcpu>`_ - `Quick Start `_ - `Examples `_ diff --git a/docs/tutorials/llm.rst b/docs/tutorials/llm.rst index 72eb62c2a..e1e117e5d 100644 --- a/docs/tutorials/llm.rst +++ b/docs/tutorials/llm.rst @@ -30,8 +30,14 @@ Verified for distributed inference mode via DeepSpeed *Note*: The above verified models (including other models in the same model family, like "codellama/CodeLlama-7b-hf" from LLAMA family) are well supported with all optimizations like indirect access KV cache, fused ROPE, and prepacked TPP Linear (fp32/bf16). We are working in progress to better support the models in the tables with various data types. In addition, more models will be optimized in the future. -Please check `LLM best known practice <../../examples/cpu/inference/python/llm>`_ for instructions to install/setup environment and example scripts. +Please check `LLM best known practice `_ for instructions to install/setup environment and example scripts. +Module Level Optimization API for customized LLM (Prototype) +------------------------------------------------------------ + +In the past year, LLM has been flourishing with many open-sourced models contributed to the community, while researchers are building their own LLMs from transformer blocks with variants in implementation details. To help LLM researchers and developers improve their productivity, Intelยฎ Extension for PyTorch* provides module level optimizations for commonly used LLM modules and functionalities, which are operators or certain operator combinations in nature. + +Please check `LLM module level optimization practice `_ to better understand how to use `module level APIs `_ to optimize your LLM and achieve better performance. Demos ----- @@ -143,4 +149,4 @@ Operators fusion is generally used to enable sub-graph fusion to reduce the memo Distributed Inference ~~~~~~~~~~~~~~~~~~~~~ -All above optimizations already help you to get very good performance with single instance. To furthly reduce the inference latency and improve throughput, tensor parallel is also enabled in our soluction. You can firstly use DeepSpeed to auto shard the model and then apply above optimizations with the frontend API function provided by Intelยฎ Extension for PyTorch. +All above optimizations already help you to get very good performance with single instance. To further reduce the inference latency and improve throughput, tensor parallel is also enabled in our solution. You can firstly use DeepSpeed to auto shard the model and then apply above optimizations with the frontend API function provided by Intelยฎ Extension for PyTorch. diff --git a/docs/tutorials/llm/llm_optimize.md b/docs/tutorials/llm/llm_optimize.md index efc4278c2..44203cc80 100644 --- a/docs/tutorials/llm/llm_optimize.md +++ b/docs/tutorials/llm/llm_optimize.md @@ -9,7 +9,7 @@ API documentation is available at [API Docs page](https://intel.github.io/intel- ## Pseudocode of Common Usage Scenarios -The following sections show pseudocode snippets to invoke Intelยฎ Extension for PyTorch\* APIs to work with LLM models. Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.2.0%2Bcpu/examples/cpu/inference/python/llm). +The following sections show pseudocode snippets to invoke Intelยฎ Extension for PyTorch\* APIs to work with LLM models. 
Complete examples can be found at [the Example directory](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0%2Bcpu/examples/cpu/inference/python/llm). ### FP32/BF16 @@ -98,7 +98,7 @@ model = ipex.llm.optimize(model, quantization_config=qconfig, low_precision_chec Distributed inference can be performed with `DeepSpeed`. Based on original Intelยฎ Extension for PyTorch\* scripts, the following code changes are required. -Check [LLM distributed inference examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.2.0%2Bcpu/examples/cpu/inference/python/llm/distributed) for complete codes. +Check [LLM distributed inference examples](https://github.com/intel/intel-extension-for-pytorch/tree/v2.3.0%2Bcpu/examples/cpu/inference/python/llm/distributed) for complete codes. ``` python import torch diff --git a/docs/tutorials/releases.md b/docs/tutorials/releases.md index a7456b000..974774be5 100644 --- a/docs/tutorials/releases.md +++ b/docs/tutorials/releases.md @@ -1,6 +1,75 @@ Releases ============= +## 2.3.0 + +We are excited to announce the release of Intelยฎ Extension for PyTorch* 2.3.0+cpu which accompanies PyTorch 2.3. This release mainly brings you the new feature on Large Language Model (LLM) called module level LLM optimization API, which provides module level optimizations for commonly used LLM modules and functionalities, and targets to optimize customized LLM modeling for scenarios like private models, self-customized models, LLM serving frameworks, etc. This release also extends the list of optimized LLM models to a broader level and includes a set of bug fixing and small optimizations. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product. + +### Highlights + +- Large Language Model (LLM) optimization + + [Intelยฎ Extension for PyTorch*](https://github.com/intel/intel-extension-for-pytorch) provides a new feature called module level LLM optimization API, which provides module level optimizations for commonly used LLM modules and functionalities. LLM creators can then use this new API set to replace related parts in models by themselves, with which to reach peak performance. 
+ + There are 3 categories of module level LLM optimization APIs in general: + + - Linear post-op APIs + + ```python + # using module init and forward + ipex.llm.modules.linearMul + ipex.llm.modules.linearGelu + ipex.llm.modules.linearNewGelu + ipex.llm.modules.linearAdd + ipex.llm.modules.linearAddAdd + ipex.llm.modules.linearSilu + ipex.llm.modules.linearSiluMul + ipex.llm.modules.linear2SiluMul + ipex.llm.modules.linearRelu + ``` + + - Attention related APIs + + ```python + # using module init and forward + ipex.llm.modules.RotaryEmbedding + ipex.llm.modules.RMSNorm + ipex.llm.modules.FastLayerNorm + ipex.llm.modules.VarlenAttention + ipex.llm.modules.PagedAttention + ipex.llm.modules.IndirectAccessKVCacheAttention + + # using as functions + ipex.llm.functional.rotary_embedding + ipex.llm.functional.rms_norm + ipex.llm.functional.fast_layer_norm + ipex.llm.functional.indirect_access_kv_cache_attention + ipex.llm.functional.varlen_attention + ``` + + - Generation related APIs + + ```python + # using for optimizing huggingface generation APIs with prompt sharing + ipex.llm.generation.hf_beam_sample + ipex.llm.generation.hf_beam_search + ipex.llm.generation.hf_greedy_search + ipex.llm.generation.hf_sample + ``` + + More detailed introduction on how to apply this API set and example code walking you through can be found [here](https://github.com/intel/intel-extension-for-pytorch/tree/release/2.3/examples/cpu/inference/python/llm-modeling). + +- Bug fixing and other optimization + + - Optimized the performance of LLM [#2561](https://github.com/intel/intel-extension-for-pytorch/commit/ade45387ecc4e707754de9db6fc2be0af186e2ba) [#2584](https://github.com/intel/intel-extension-for-pytorch/commit/05d07645e1ae5eeeff15abda31a6ba5806dd2bb2) [#2617](https://github.com/intel/intel-extension-for-pytorch/commit/adb563834a4f6bd327d7307c493c8fe1648e6211) [#2663](https://github.com/intel/intel-extension-for-pytorch/commit/214dea0c8e7b2864a0c2d1a1c32fb7815ca68070) [#2733](https://github.com/intel/intel-extension-for-pytorch/commit/f5b941c3b7ea8fe1a387617a9329467d1e1b544a) + - Supported Act Order of GPTQ [#2550](https://github.com/intel/intel-extension-for-pytorch/commit/be636289eef628b995e79a475c58f8a4d93e4890) [#2568](https://github.com/intel/intel-extension-for-pytorch/commit/9fcc4897492333330fb6bd156b1178d55347d292) + - Improved the warning and the logging information for better user experience [#2641](https://github.com/intel/intel-extension-for-pytorch/commit/e0bf673cf3ea4063a7e168ec221f421fbd378fb3) [#2675](https://github.com/intel/intel-extension-for-pytorch/commit/770275a755ea0445675720a3f6f14e77c491fceb) + - Added TorchServe CPU Example [#2613](https://github.com/intel/intel-extension-for-pytorch/commit/1f6fe6423dde7ccecc1565e73dc81d9cb281bc1f) + - Upgraded oneDNN to v3.4.1 [#2747](https://github.com/intel/intel-extension-for-pytorch/commit/e2a9af49874fcf39097036c08848cd37cadc0084) + - Misc fix and enhancement [#2468](https://github.com/intel/intel-extension-for-pytorch/commit/f88a7d127a6a3017db508454c7d332d7b2ad83f6) [#2627](https://github.com/intel/intel-extension-for-pytorch/commit/bc32ea463084d711e4a9aae85e38dd5d7d427849) [#2631](https://github.com/intel/intel-extension-for-pytorch/commit/f55a2bfa5d505fb7c7a6225c1c6206b5926777ab) [#2704](https://github.com/intel/intel-extension-for-pytorch/commit/eae477f76356b5a83640941787a168f680334775) + +**Full Changelog**: https://github.com/intel/intel-extension-for-pytorch/compare/v2.2.0+cpu...v2.3.0+cpu + ## 2.2.0 We are excited to announce the release 
of Intelยฎ Extension for PyTorch\* 2.2.0+cpu which accompanies PyTorch 2.2. This release mainly brings in our latest optimization on Large Language Model (LLM) including new dedicated API set (`ipex.llm`), new capability for auto-tuning accuracy recipe for LLM, and a broader list of optimized LLM models, together with a set of bug fixing and small optimization. We want to sincerely thank our dedicated community for your contributions. As always, we encourage you to try this release and feedback as to improve further on this product. diff --git a/examples/cpu/inference/cpp/README.md b/examples/cpu/inference/cpp/README.md index 49afbdd80..e4d30b2b1 100644 --- a/examples/cpu/inference/cpp/README.md +++ b/examples/cpu/inference/cpp/README.md @@ -16,15 +16,15 @@ We can have `libtorch` and `libintel-ext-pt` installed via the following command Download zip file of `libtorch` and decompress it: ```bash -wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.2.0%2Bcpu.zip -unzip libtorch-cxx11-abi-shared-with-deps-2.2.0+cpu.zip +wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.3.0%2Bcpu.zip +unzip libtorch-cxx11-abi-shared-with-deps-2.3.0+cpu.zip ``` Download and execute `libintel-ext-pt` installation script: ```bash -wget https://intel-extension-for-pytorch.s3.amazonaws.com/libipex/cpu/libintel-ext-pt-cxx11-abi-2.2.0%2Bcpu.run -bash libintel-ext-pt-cxx11-abi-2.2.0+cpu.run install ./libtorch +wget https://intel-extension-for-pytorch.s3.amazonaws.com/libipex/cpu/libintel-ext-pt-cxx11-abi-2.3.0%2Bcpu.run +bash libintel-ext-pt-cxx11-abi-2.3.0+cpu.run install ./libtorch ``` *Note:* If your C++ project has pre-C\+\+11 library dependencies, diff --git a/examples/cpu/inference/python/llm-modeling/README.md b/examples/cpu/inference/python/llm-modeling/README.md index 694b7b605..196843409 100644 --- a/examples/cpu/inference/python/llm-modeling/README.md +++ b/examples/cpu/inference/python/llm-modeling/README.md @@ -1,15 +1,16 @@ -# 1. LLM Optimization Overview +๏ปฟ# 1. LLM Module Level Optimizations Overview -ipex.llm provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. -To further provide optimized modules or functions to help build modelings, ipex supports the following module/function level APIs: +Intelยฎ Extension for PyTorch* provides dedicated optimization for running Large Language Models (LLMs) faster, including technical points like paged attention, ROPE fusion, etc. 
+To further provide optimized modules or functions to help build modelings, `ipex.llm` supports the following module/function level APIs: -``` +```python import intel_extension_for_pytorch as ipex ``` -### linear post-op fusions -``` -#using module init and forward +## Linear post-op fusions + +```python +# using module init and forward ipex.llm.modules.linearMul ipex.llm.modules.linearGelu ipex.llm.modules.linearNewGelu @@ -21,9 +22,10 @@ ipex.llm.modules.linear2SiluMul ipex.llm.modules.linearRelu ``` -### Attention related fusions -``` -#using module init and forward +## Attention related fusions + +```python +# using module init and forward ipex.llm.modules.RotaryEmbedding ipex.llm.modules.RMSNorm ipex.llm.modules.FastLayerNorm @@ -31,7 +33,7 @@ ipex.llm.modules.VarlenAttention ipex.llm.modules.PagedAttention ipex.llm.modules.IndirectAccessKVCacheAttention -#using as functions +# using as functions ipex.llm.functional.rotary_embedding ipex.llm.functional.rms_norm ipex.llm.functional.fast_layer_norm @@ -43,8 +45,11 @@ ipex.llm.functional.silu_mul ipex.llm.functional.gelu_mul ``` -### Generation related fusions -``` +## Generation related fusions + +```python +# using for optimizing huggingface generation APIs with prompt sharing +ipex.llm.generation.hf_beam_sample ipex.llm.generation.hf_beam_search ipex.llm.generation.hf_greedy_search ipex.llm.generation.hf_sample @@ -52,39 +57,21 @@ ipex.llm.generation.hf_sample
-# 2. Show cases of ipex.llm optimized modules and functions based modeling -We provide LLAMA, GPTJ and OPT modeling as show cases that apply the optimized modules or functions from ipex.llm layers. - -| MODEL FAMILY | MODEL NAME (Huggingface hub) | -|:---:|:---:| -|LLAMA| "meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-13b-hf", etc. | -|GPT-J| "EleutherAI/gpt-j-6b", etc. | -|OPT| "facebook/opt-30b", "facebook/opt-1.3b", etc. | - -## How To Run LLM with ipex.llm +# 2. Showcases of ipex.llm optimized modules and functions based modeling -**ipex.llm provides a single script to facilitate running generation tasks as below:** -Note that please setup ENV according to the ../llm/README.md +We provide optimized LLAMA, GPT-J and OPT modeling files on the basis of [huggingface modeling APIs](https://huggingface.co/docs/transformers/en/main_classes/model) and a entry script `run.py` as showcases that apply the optimized modules or functions from `ipex.llm`. -``` -python run.py --help # for more detailed usages -``` +## Running example script -| Key args of run.py | Notes | -|---|---| -| model name | use "-m MODEL_NAME" to choose models to run | -| generation | default: beam search (beam size = 4), "--greedy" for greedy search | -| input tokens | default: 32, provide fixed sizes for input prompt size, use "--input-tokens" for [32, 64, 128, 256, 512, 1024, 2016, 2017, 2048, 4096, 8192]; if "--input-tokens" is not used, use "--prompt" to choose other strings as inputs| -| output tokens | default: 32, use "--max-new-tokens" to choose any other size | -| batch size | default: 1, use "--batch-size" to choose any other size | -| generation iterations | use "--num-iter" and "--num-warmup" to control the repeated iterations of generation, default: 100-iter/10-warmup | -| ipex prepack | apply ipex weight prepack optimization by "--use-ipex-optimize"| -| profiling | enable pytorch profiling by " --profile"| +Please refer to the [instructions](../llm/README.md#3-environment-setup) for environment setup. -*Note:* You may need to log in your HuggingFace account to access the model files. Please refer to [HuggingFace login](https://huggingface.co/docs/huggingface_hub/quick-start#login). +The detail usage of `run.py` can be obtained by running +```bash +python run.py --help +``` -## Run commands +Example commands are listed below: ```bash # The following "OMP_NUM_THREADS" and "numactl" settings are based on the assumption that @@ -92,9 +79,118 @@ python run.py --help # for more detailed usages # Please adjust the settings per your hardware. # Running FP32 model -OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype float32 --use-ipex-optimize +OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype float32 # Running BF16 model -OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype bfloat16 --use-ipex-optimize +OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py -m meta-llama/Llama-2-7b-hf --dtype bfloat16 +``` + +*Note:* You may need to log in your HuggingFace account to access the model files. Please refer to [HuggingFace login](https://huggingface.co/docs/huggingface_hub/quick-start#login). + +
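For a quick sanity check of the `ipex.llm.functional` fusions listed in section 1, they can also be called directly on plain tensors. The sketch below is illustrative only (shapes and the `approximate` setting are arbitrary) and follows the `silu_mul` / `gelu_mul` signatures shown earlier:

```python
import torch
import intel_extension_for_pytorch as ipex

# Illustrative hidden-state shapes only.
x = torch.randn(1, 32, 4096)
y = torch.randn(1, 32, 4096)

silu_out = ipex.llm.functional.silu_mul(x, y)                      # silu(x) * y
gelu_out = ipex.llm.functional.gelu_mul(x, y, approximate="tanh")  # gelu(x) * y
```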
+ +# 3. Optimize your own LLM with ipex.llm + +## Changes required in the modeling file + +The changes required for applying `ipex.llm` optimizations for the customized LLMs are highly diverse based on their respective model architectures and implementations. +Generally speaking, the key steps would be: + +1. Analyze the model to find out the parts that are suitable for utilizing the optimizations. +2. Re-write these parts, applying the optimized `ipex.llm` operators. + +3. Some refactor of model architecture definition may be required to connect the original and optimized modules. + +## Changes required in the inference entry script + +Some key updates are required in the LLM inference entry script: + +1. Optimization for linear modules and their fusions: realized by weight prepacking with `ipex.optimize()`. + +```python +from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( + _enable_tpp, + _disable_tpp, +) + +_disable_tpp() +if args.dtype == "bfloat16": + _enable_tpp() + model = ipex.optimize(model.eval(), dtype=torch.bfloat16, inplace=True) +else: + model = ipex.optimize( + model.eval(), + dtype=torch.float32, + inplace=True, + auto_kernel_selection=True, + ) +``` + +*Note:* The example is for FP32/BF16 optimization. +Please refer to [Advanced Usage](#4-advanced-usage) part for weight only quantization enabling. + +2. Optimizations for [the huggingface text generation API](https://huggingface.co/docs/transformers/en/main_classes/text_generation): + +- Using `ipex.llm.generation` functions to get prompt sharing for first token acceleration when `num_beams > 1`. + +```python +# Taking beam search as example here, please check complete code updates in run.py +hf_beam_search = ipex.llm.generation.hf_beam_search.__get__(model, model.__class__) +setattr(model, "beam_search", hf_beam_search) ``` + +- Using PyTorch jit to further reduce dispatch overhead for first token and next tokens acceleration. + +```python +# Please create a dummy `sample_inputs` in advance +# as the example input for jit.trace() +with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): + trace_model = torch.jit.trace( + model, + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + trace_model = torch.jit.freeze(trace_model) + model = ipex._set_optimized_model_for_generation( + model, optimized_model=trace_model + ) +``` + +Please read `run.py` and the example modeling files for detail of the changes. +The key parts are highlighted with comments. + +
+ +# 4. Advanced usage + +## How to apply weight only quantization int8 + +Intelยฎ Extension for PyTorch* also provides weight only quantization for int8 precision optimization +(replace the part using `ipex.optimize()`, which is for fp32/bf16 optimization in above showcases). + +```python +from intel_extension_for_pytorch.quantization import WoqWeightDtype +from intel_extension_for_pytorch.quantization import prepare, convert +weight_dtype = WoqWeightDtype.INT8 # weight dtype is int8 +lowp_mode = ipex.quantization.WoqLowpMode.BF16 # lowest precision for computation +qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + group_size= -1, # default is -1, can be further tuned in [32, 64, 128, 256, 512] (recommend) for better accuracy if needed +) +prepared_model = prepare(model, qconfig) +with torch.no_grad(), torch.cpu.amp.autocast(enabled=True): # we recommend to use quantization with AMP for better perf + converted_model = convert(prepared_model).to(torch.bfloat16) +``` + +
+ +# 5. Miscellaneous Tips + +- For LLMs, usually the query, key and value linear operations in Attention layer can be fused into one linear as kind of concat linear optimization. (e.g., [modeling_gpt_neox](https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#L175) from transformers) + +- LLM generation tasks are based on the [assumption](https://huggingface.co/blog/how-to-generate) that the probability distribution of a word sequence can be decomposed into the product of conditional next word distributions. +Thus the model's computation of `lm_head` layer during the first token's generation can be reduced with using last token as its inputs (instead of using the full tokens from input prompt). +The showcases we provide contain such optimization (set with `lm_head_generation` flag). This is also optimized in LLM serving [text-generation-inference](https://github.com/huggingface/text-generation-inference/blob/main/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py#L419). \ No newline at end of file diff --git a/examples/cpu/inference/python/llm-modeling/run.py b/examples/cpu/inference/python/llm-modeling/run.py index 6a34ca845..d8f73ee60 100644 --- a/examples/cpu/inference/python/llm-modeling/run.py +++ b/examples/cpu/inference/python/llm-modeling/run.py @@ -117,7 +117,6 @@ def get_dummy_input(_model, return_dict=False): ) parser.add_argument("--greedy", action="store_true") parser.add_argument("--profile", action="store_true") -parser.add_argument("--use-ipex-optimize", action="store_true") parser.add_argument("--token-latency", action="store_true") parser.add_argument("--num-iter", default=100, type=int, help="num iter") parser.add_argument("--num-warmup", default=10, type=int, help="num warmup") @@ -138,7 +137,6 @@ def get_dummy_input(_model, return_dict=False): torch_dtype=amp_dtype, low_cpu_mem_usage=True, attn_implementation="eager", - # torchscript=True if args.use_ipex_optimize else False, ) tokenizer = model_class[1].from_pretrained(args.model_id, trust_remote_code=True) @@ -153,56 +151,55 @@ def get_dummy_input(_model, return_dict=False): model = model.eval() -if args.use_ipex_optimize: - if not hasattr(model.config, "use_ipex_optimize"): - model.config.use_ipex_optimize = True - # 1) using ipex weight prepack to work with IPEX linear module and their fusions - from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( - _enable_tpp, - _disable_tpp, - ) +# Adding this attribute in model.config +# as it will be used in the modeling file. +if not hasattr(model.config, "use_ipex_optimize"): + model.config.use_ipex_optimize = True +# 1) Applying IPEX weight prepacking with `ipex.optimize()` +# to accelerate linear modules and their fusions. 
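Once converted, the quantized model can run generation in the same way as the FP32/BF16 path, and the jit trace step from the previous section can still be applied on top. A minimal continuation of the snippet above (the model id and prompt are placeholders, and `converted_model` comes from the conversion step):

```python
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")  # placeholder id
inputs = tokenizer("What are we having for dinner?", return_tensors="pt")

with torch.no_grad(), torch.cpu.amp.autocast(enabled=True):
    output = converted_model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```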
+from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( + _enable_tpp, + _disable_tpp, +) - _disable_tpp() - if args.dtype == "bfloat16": - _enable_tpp() - model = ipex.optimize(model.eval(), dtype=torch.bfloat16, inplace=True) - else: - model = ipex.optimize( - model.eval(), - dtype=torch.float32, - inplace=True, - auto_kernel_selection=True, - ) +_disable_tpp() +if args.dtype == "bfloat16": + _enable_tpp() + model = ipex.optimize(model.eval(), dtype=torch.bfloat16, inplace=True) +else: + model = ipex.optimize( + model.eval(), + dtype=torch.float32, + inplace=True, + auto_kernel_selection=True, + ) - # 2) using ipex geneartion function to get prompt sharing and first token optimizations - hf_beam_search = ipex.llm.generation.hf_beam_search.__get__(model, model.__class__) - hf_greedy_search = ipex.llm.generation.hf_greedy_search.__get__( - model, model.__class__ +# 2) using `ipex.llm.generation` functions +# to get prompt sharing for first token optimization +hf_beam_search = ipex.llm.generation.hf_beam_search.__get__(model, model.__class__) +hf_greedy_search = ipex.llm.generation.hf_greedy_search.__get__(model, model.__class__) +hf_sample = ipex.llm.generation.hf_sample.__get__(model, model.__class__) +hf_beam_sample = ipex.llm.generation.hf_beam_sample.__get__(model, model.__class__) + +setattr(model, "beam_search", hf_beam_search) # noqa: B010 +setattr(model, "greedy_search", hf_greedy_search) # noqa: B010 +setattr(model, "sample", hf_sample) # noqa: B010 +setattr(model, "beam_sample", hf_beam_sample) # noqa: B010 + +if not hasattr(model.config, "lm_head_generation"): + model.config.lm_head_generation = True + +# 3) using PyTorch jit to further reduce dispatch overhead +sample_inputs = get_dummy_input(model, return_dict=True) +with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): + trace_model = torch.jit.trace( + model, + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, ) - hf_sample = ipex.llm.generation.hf_sample.__get__(model, model.__class__) - hf_beam_sample = ipex.llm.generation.hf_beam_sample.__get__(model, model.__class__) - - setattr(model, "beam_search", hf_beam_search) # noqa: B010 - setattr(model, "greedy_search", hf_greedy_search) # noqa: B010 - setattr(model, "sample", hf_sample) # noqa: B010 - setattr(model, "beam_sample", hf_beam_sample) # noqa: B010 - - if not hasattr(model.config, "lm_head_generation"): - model.config.lm_head_generation = True - - # 3) using PyTorch jit to further reduce dispatch overhead - sample_inputs = get_dummy_input(model, return_dict=True) - with torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): - trace_model = torch.jit.trace( - model, - example_kwarg_inputs=sample_inputs, - strict=False, - check_trace=False, - ) - trace_model = torch.jit.freeze(trace_model) - model = ipex._set_optimized_model_for_generation( - model, optimized_model=trace_model - ) + trace_model = torch.jit.freeze(trace_model) + model = ipex._set_optimized_model_for_generation(model, optimized_model=trace_model) if ( diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index c6ac2beb7..7123f243e 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -1,6 +1,7 @@ # 1. LLM Optimization Overview -ipex.llm provides dedicated optimization for running Large Language Models (LLM) faster, including technical points like paged attention, ROPE fusion, etc. 
And a set of data types are supported for various scenarios, including FP32, BF16, Smooth Quantization INT8, Weight Only Quantization INT8/INT4 (prototype).
+`ipex.llm` provides dedicated optimizations for running Large Language Models (LLMs) faster, including technical points like paged attention, ROPE fusion, etc.
+A set of data types is supported for various scenarios, including FP32, BF16, Smooth Quantization INT8, and Weight Only Quantization INT8/INT4 (prototype).
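The typical entry point for these optimizations is `ipex.llm.optimize()`. Below is a minimal sketch of BF16 generation with a Hugging Face causal LM; it assumes a recent IPEX build in which `ipex.llm.optimize` accepts `dtype` and `inplace` arguments, so treat it as an illustration and refer to `run.py` in this directory for the exact supported invocation.

```python
# Minimal sketch of BF16 generation with ipex.llm (argument names may differ across IPEX versions).
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
model.eval()

# Apply the LLM-specific optimizations (fused attention, ROPE fusion, weight prepacking, ...).
model = ipex.llm.optimize(model, dtype=torch.bfloat16, inplace=True)

inputs = tokenizer("What are we having for dinner?", return_tensors="pt")
with torch.no_grad(), torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```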
@@ -10,29 +11,36 @@ ipex.llm provides dedicated optimization for running Large Language Models (LLM) | MODEL FAMILY | MODEL NAME (Huggingface hub) | FP32 | BF16 | Static quantization INT8 | Weight only quantization INT8 | Weight only quantization INT4 | |:---:|:---:|:---:|:---:|:---:|:---:|:---:| -|LLAMA| meta-llama/Llama-2-7b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | -|GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|DOLLY| databricks/dolly-v2-12b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|FALCON| tiiuae/falcon-40b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | -|OPT| facebook/opt-30b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|OPT| facebook/opt-1.3b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | +|LLAMA| meta-llama/Llama-2-7b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | +|LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Meta-Llama-3-8B | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|LLAMA| meta-llama/Meta-Llama-3-70B | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|DOLLY| databricks/dolly-v2-12b | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|FALCON| tiiuae/falcon-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | +|FALCON| tiiuae/falcon-40b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|OPT| facebook/opt-30b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | +|OPT| facebook/opt-1.3b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | |Bloom| bigscience/bloom-1b7 | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | -|CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | -|Baichuan| baichuan-inc/Baichuan2-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | -|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | +|CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|Baichuan| baichuan-inc/Baichuan2-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | | +|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | |Baichuan| baichuan-inc/Baichuan-13B-Chat | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | | -|ChatGLM| THUDM/chatglm3-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | -|ChatGLM| THUDM/chatglm2-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | -|GPTBigCode| bigcode/starcoder | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | -|Mistral| mistralai/Mistral-7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | -|MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸฉ | -|Mixtral| mistralai/Mixtral-8x7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | -|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸจ | | -|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|ChatGLM| THUDM/chatglm3-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|ChatGLM| THUDM/chatglm2-6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|GPTBigCode| bigcode/starcoder | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | +|Mistral| mistralai/Mistral-7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|Mixtral| mistralai/Mixtral-8x7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | ๐ŸŸจ | +|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | ๐ŸŸจ | +|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ๐ŸŸฉ | | +|LLaVA| liuhaotian/llava-v1.5-7b | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|GIT| microsoft/git-base | 
๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸฉ | | +|Yuan| IEITYuan/Yuan2-102B-hf | ๐ŸŸฉ | ๐ŸŸฉ | | ๐ŸŸจ | | +|Phi| microsoft/phi-2 | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸฉ | ๐ŸŸจ | ## 2.2 Verified for distributed inference mode via DeepSpeed @@ -41,6 +49,8 @@ ipex.llm provides dedicated optimization for running Large Language Models (LLM) |LLAMA| meta-llama/Llama-2-7b-hf | ๐ŸŸฉ | ๐ŸŸฉ | |LLAMA| meta-llama/Llama-2-13b-hf | ๐ŸŸฉ | ๐ŸŸฉ | |LLAMA| meta-llama/Llama-2-70b-hf | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Meta-Llama-3-8B | ๐ŸŸฉ | ๐ŸŸฉ | +|LLAMA| meta-llama/Meta-Llama-3-70B | ๐ŸŸฉ | ๐ŸŸฉ | |GPT-J| EleutherAI/gpt-j-6b | ๐ŸŸจ | ๐ŸŸฉ | |GPT-NEOX| EleutherAI/gpt-neox-20b | ๐ŸŸจ | ๐ŸŸฉ | |DOLLY| databricks/dolly-v2-12b | ๐ŸŸจ | ๐ŸŸฉ | @@ -50,12 +60,15 @@ ipex.llm provides dedicated optimization for running Large Language Models (LLM) |Bloom| bigscience/bloom-1b7 | ๐ŸŸจ | ๐ŸŸฉ | |CodeGen| Salesforce/codegen-2B-multi | ๐ŸŸฉ | ๐ŸŸฉ | |Baichuan| baichuan-inc/Baichuan2-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | -|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸจ | ๐ŸŸฉ | +|Baichuan| baichuan-inc/Baichuan2-13B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | |Baichuan| baichuan-inc/Baichuan-13B-Chat | ๐ŸŸจ | ๐ŸŸฉ | |GPTBigCode| bigcode/starcoder | ๐ŸŸฉ | ๐ŸŸฉ | |T5| google/flan-t5-xl | ๐ŸŸฉ | ๐ŸŸฉ | |Mistral| mistralai/Mistral-7B-v0.1 | ๐ŸŸฉ | ๐ŸŸฉ | |MPT| mosaicml/mpt-7b | ๐ŸŸฉ | ๐ŸŸฉ | +|Stablelm| stabilityai/stablelm-2-1_6b | ๐ŸŸฉ | ๐ŸŸฉ | +|Qwen| Qwen/Qwen-7B-Chat | ๐ŸŸฉ | ๐ŸŸฉ | +|GIT| microsoft/git-base | ๐ŸŸฉ | ๐ŸŸฉ | - ๐ŸŸฉ signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32). @@ -69,8 +82,7 @@ We are working in progress to better support the models in the tables with vario # 3. Environment Setup *Note*: The instructions in this section will setup an environment with a recent PyTorch\* nightly build and **a latest source build of IPEX**. -If you would like to use stable PyTorch\* and IPEX release versions, please refer to the instructions [in the release branch](https://github.com/intel/intel-extension-for-pytorch/blob/v2.2.0%2Bcpu/examples/cpu/inference/python/llm/README.md#3-environment-setup), in which IPEX is installed via prebuilt wheels using `pip install` rather than source code building. - +If you would like to use stable PyTorch\* and IPEX release versions, please refer to the instructions [in the release branch](https://github.com/intel/intel-extension-for-pytorch/blob/v2.3.0%2Bcpu/examples/cpu/inference/python/llm/README.md#3-environment-setup), in which IPEX is installed via prebuilt wheels using `pip install` rather than source code building. ## 3.1 [Recommended] Docker-based environment setup with compilation from source @@ -119,6 +131,21 @@ source ./tools/env_activate.sh
+*Note*: The `env_setup.sh` script downloads a `prompt.json` file, which provides prompt samples with pre-defined input token lengths for benchmarking.
+For benchmarking **Llama-3 models**, users need to download a specific `prompt.json` file that overwrites the original one.
+
+```bash
+wget -O prompt.json https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt-3.json
+```
+
+The original `prompt.json` file can be restored if needed.
+
+```bash
+wget https://intel-extension-for-pytorch.s3.amazonaws.com/miscellaneous/llm/prompt.json
+```
+
+
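A separate file is needed because Llama-3 uses a different tokenizer (with a much larger vocabulary) than Llama-2, so the same text maps to a different number of input tokens. A quick, hypothetical sanity check along the lines below can confirm that a chosen prompt really matches the intended input length for your model; the `"1024"` key and the dict layout are assumptions about the file format, so inspect the file and adjust accordingly.

```python
# Hypothetical check: how many tokens does a prompt.json entry produce for your tokenizer?
# Assumes prompt.json maps input-length keys (e.g. "1024") to prompt strings; adjust if the layout differs.
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
with open("prompt.json") as f:
    prompts = json.load(f)

prompt = prompts["1024"]  # hypothetical key; check the real keys in the file
n_tokens = len(tokenizer(prompt)["input_ids"])
print(f"Prompt advertised as 1024 input tokens -> {n_tokens} tokens for this tokenizer")
```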
+ # 4. How To Run LLM with ipex.llm **ipex.llm provides a single script to facilitate running generation tasks as below:** @@ -127,7 +154,6 @@ source ./tools/env_activate.sh python run.py --help # for more detailed usages ``` - | Key args of run.py | Notes | |---|---| | generation | default: beam search (beam size = 4), "--greedy" for greedy search | @@ -229,16 +255,15 @@ cd distributed unset KMP_AFFINITY # Distributed inference in FP32 -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai # Distributed inference in BF16 -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai # Distributed inference with Weight-Only Quantization -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --tasks lambada_openai ``` - ## 4.2 Detail usage of running LLM models ### 4.2.1 Run generation with one instance @@ -622,28 +647,28 @@ unset KMP_AFFINITY - Command: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype float32 --ipex --tasks +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype float32 --ipex --tasks ``` - An example of llama2 7b model: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype float32 --ipex --tasks lambada_openai ``` #### 5.2.2.3 BF16: - Command: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype bfloat16 -ipex --tasks +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --dtype bfloat16 -ipex --tasks ``` - An example of llama2 7b model: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank 
run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --dtype bfloat16 --ipex --tasks lambada_openai ``` #### 5.2.2.4 Weight-only quantization (INT8): - Command: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks ``` Similar to script usage for performance benchmarking, we need to update some arguments of the running command specifically for some models to achieve better accuracy. @@ -661,7 +686,7 @@ Similar to script usage for performance benchmarking, we need to update some arg - An example of llama2 7b model: ```bash -deepspeed --num_gpus 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks +deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --ipex --tasks ``` ## 5.3 How to Shard model for Distributed tests with DeepSpeed (autoTP) diff --git a/examples/cpu/inference/python/llm/llm_sq_recipes.md b/examples/cpu/inference/python/llm/llm_sq_recipes.md index 009938e07..22df336e2 100644 --- a/examples/cpu/inference/python/llm/llm_sq_recipes.md +++ b/examples/cpu/inference/python/llm/llm_sq_recipes.md @@ -12,9 +12,10 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama ## Example command for model tuning with AutoTune API | Model ID | Command | |---|:---:| -| meta-llama/Llama-2-7b-hf | python run.py -m meta-llama/Llama-2-7b-hf --ipex-smooth-quant --batch-size 56 --calib-len 2048 --fallback-add --alpha auto --init-alpha 0.8 --alpha-min 0.8 --alpha-max 0.99 --alpha-step 0.01 --shared-criterion 'mean' | +| meta-llama/Llama-2-13b-hf | python run.py -m meta-llama/Llama-2-13b-hf --ipex-smooth-quant --alpha auto --init-alpha 0.8 --alpha-min 0.75 --alpha-max 0.99 --alpha-step 0.01 --shared-criterion 'max' --calib-len 1024 --calib-padding --fallback-add | | meta-llama/Llama-2-70b-hf | python run.py -m meta-llama/Llama-2-70b-hf --ipex-smooth-quant --batch-size 56 --calib-shuffle --fallback-add --alpha 0.8 | | EleutherAI/gpt-j-6b | python run.py -m EleutherAI/gpt-j-6b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --fallback-add --alpha 0.85 | +| tiiuae/falcon-7b | python run.py -m tiiuae/falcon-7b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.95 | | tiiuae/falcon-40b | python run.py -m tiiuae/falcon-40b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.9 | | facebook/opt-30b | python run.py -m facebook/opt-30b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle | | facebook/opt-1.3b | python run.py -m facebook/opt-1.3b --ipex-smooth-quant --batch-size 56 --calib-iters 100 --calib-shuffle --alpha 0.85 | diff --git a/examples/cpu/serving/torchserve/README.md b/examples/cpu/serving/torchserve/README.md index e18f302fd..8a9b30464 100644 --- a/examples/cpu/serving/torchserve/README.md +++ b/examples/cpu/serving/torchserve/README.md 
@@ -14,7 +14,7 @@ docker run \ --rm -it -u root \ --entrypoint='' \ -v $PWD:/home/model-server \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu \ + intel/intel-optimized-pytorch:2.3.0-serving-cpu \ python quantize_model.py ``` @@ -31,7 +31,7 @@ docker run \ --rm -it -u root \ --entrypoint='' \ -v $PWD:/home/model-server \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu \ + intel/intel-optimized-pytorch:2.3.0-serving-cpu \ torch-model-archiver \ --model-name ipex-resnet50 \ --version 1.0 \ @@ -43,7 +43,7 @@ docker run \ > [!NOTE] > If you are working under a corporate proxy you will need to include the following parameters in your `docker run` command: `-e http_proxy=${http_proxy} -e https_proxy=${https_proxy}`. -#### Advanced Model Archival +### Advanced Model Archival The `--handler` argument is an important component of serving as it controls the inference pipeline. Torchserve provides several default handlers [built into the application](https://pytorch.org/serve/default_handlers.html#torchserve-default-inference-handlers). that are often enough for most inference cases, but you may need to create a custom handler if your application's inference needs additional preprocessing, postprocessing or using other variables to derive a final output. To create a custom handler, first inherit `BaseHandler` or another built-in handler and override any necessary functionality. Usually, you only need to override the preprocessing and postprocessing methods to achieve an application's inference needs. @@ -88,7 +88,7 @@ docker run \ -v $PWD/model-store:/home/model-server/model-store \ -v $PWD/wf-store:/home/model-server/wf-store \ --net=host \ - intel/intel-optimized-pytorch:2.2.0-serving-cpu + intel/intel-optimized-pytorch:2.3.0-serving-cpu ``` > [!TIP] diff --git a/examples/cpu/serving/triton/Dockerfile b/examples/cpu/serving/triton/Dockerfile index 73d9fc66d..8897773f5 100644 --- a/examples/cpu/serving/triton/Dockerfile +++ b/examples/cpu/serving/triton/Dockerfile @@ -15,11 +15,11 @@ ARG TORCHAUDIO_VERSION ARG IPEX_VERSION RUN python3 -m pip install --no-cache-dir \ - torch==${PYTORCH_VERSION:-2.1.0+cpu} \ - torchaudio==${TORCHAUDIO_VERSION:-2.1.0+cpu} \ - torchvision==${TORCHVISION_VERSION:-0.16.0+cpu} \ + torch==${PYTORCH_VERSION:-2.3.0+cpu} \ + torchaudio==${TORCHAUDIO_VERSION:-2.3.0+cpu} \ + torchvision==${TORCHVISION_VERSION:-0.18.0+cpu} \ -f https://download.pytorch.org/whl/cpu/torch_stable.html \ - intel_extension_for_pytorch==${IPEX_VERSION:-2.1.0}+cpu \ + intel_extension_for_pytorch==${IPEX_VERSION:-2.3.0}+cpu \ -f https://developer.intel.com/ipex-whl-stable-cpu \ configargparse \ intel-openmp \ diff --git a/examples/cpu/serving/triton/requirements.txt b/examples/cpu/serving/triton/requirements.txt index 3dcefb4b2..dc40f4345 100644 --- a/examples/cpu/serving/triton/requirements.txt +++ b/examples/cpu/serving/triton/requirements.txt @@ -1,7 +1,7 @@ -torch==2.2.0 --index-url https://download.pytorch.org/whl/cpu -torchvision==0.17.0 --index-url https://download.pytorch.org/whl/cpu -torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cpu -intel_extension_for_pytorch==2.2.0 +torch==2.3.0 --index-url https://download.pytorch.org/whl/cpu +torchvision==0.18.0 --index-url https://download.pytorch.org/whl/cpu +torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cpu +intel_extension_for_pytorch==2.3.0 transformers==4.38.1 tritonclient[all]==2.41.1 intel-openmp==2024.0.2 diff --git a/intel_extension_for_pytorch/llm/functional/fusions.py 
b/intel_extension_for_pytorch/llm/functional/fusions.py index 04f05dc49..12aea9a7f 100644 --- a/intel_extension_for_pytorch/llm/functional/fusions.py +++ b/intel_extension_for_pytorch/llm/functional/fusions.py @@ -22,25 +22,31 @@ def rotary_embedding( ): r""" Applies RotaryEmbedding (see https://huggingface.co/papers/2104.09864) - on the `query ` or `key` before their multi-head attention computation. + on the `query ` or `key` before their multi-head attention computation. + Args: - - query, key (torch.Tensor) : inputs to be applied with position embeddings, taking shape of - [batch size, sequence length, num_head/num_kv_head, head_dim] - or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). - - sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor generated to be applied on query/key. - - rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. - - head_dim (int) : head dim from the input shape. - - rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, - so the offset is 1. - if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, - so the offset is rotary_dim/2. - - position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. the according position_ids - for the input. The shape should be [batch size, sequence length]. + query, key (torch.Tensor) : inputs to be applied with position embeddings, + taking shape of [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). + sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor + generated to be applied on query/key. + rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. + head_dim (int) : head dim from the input shape. + rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, + so the offset is 1. + + if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, + so the offset is rotary_dim/2. + + position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. + The according position_ids for the input. The shape should be [batch size, sequence length]. + Return - - query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] - or [num_tokens, num_head/num_kv_head, head_dim]. + query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim]. """ + return RotaryEmbedding.apply_function( query, key, sin, cos, rotary_dim, rotary_half, position_ids ) @@ -50,12 +56,14 @@ def rms_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float): r""" Applies RMSnorm on the input (hidden states). (see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L76) + Args: - - hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. - - weight (torch.Tensor): the weight to apply RMSnorm. - - eps (float) : the variance_epsilon to apply RMSnorm. + hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. + weight (torch.Tensor): the weight to apply RMSnorm. + eps (float) : the variance_epsilon to apply RMSnorm. 
""" + return RMSNorm.apply_function(hidden_states, weight, eps) @@ -69,12 +77,14 @@ def fast_layer_norm( r""" Applies PyTorch Layernorm (see https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) on the input (hidden states). + Args: - - hidden_states(torch.Tensor) : the input tensor to apply normalization. - - normalized_shape (int or list) or torch.Size) input shape from an expected input of size. - - weight (torch.Tensor): the weight to apply normalization. - - bias (torch.Tensor): an additive bias for normalization. - - eps (float): a value added to the denominator for numerical stability. + hidden_states(torch.Tensor) : the input tensor to apply normalization. + normalized_shape (int or list) or torch.Size) input shape from an + expected input of size. + weight (torch.Tensor): the weight to apply normalization. + bias (torch.Tensor): an additive bias for normalization. + eps (float): a value added to the denominator for numerical stability. """ @@ -105,33 +115,49 @@ def indirect_access_kv_cache_attention( buffers(key and value use different buffers) to store all key/value hidden states and beam index information. It can use beam index history to decide which beam should be used by a timestamp and this information will generate an offset to access the kv_cache buffer. + Data Format: - - The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], - the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. - All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. - - forward - - query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - scale_attn (float):scale used by the attention layer. should be the sqrt(head_size). - - layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). - key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); - value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); - beam-idx: history beam idx, shape:(max_seq, beam*batch); - seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). - - head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. - - attention_mask(torch.Tensor): Attention mask information. - - text_max_length (int) : the max length of kv cache to be used for generation (allocate the pre-cache buffer). + + The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], + the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. + All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. + + Args: + query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). + key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). + value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). + scale_attn (float):scale used by the attention layer. should be the sqrt(head_size). + layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). 
+ + - key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + + - value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + + - beam-idx: history beam idx, shape:(max_seq, beam*batch); + + - seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). + + head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. + attention_mask(torch.Tensor): Attention mask information. + text_max_length (int) : the max length of kv cache to be used for generation + (allocate the pre-cache buffer). Return: - - attn_output: weighted value which is the output of scale dot product. shape (beam*batch, seq_len, head_num, head_size). - - attn_weights: The output tensor of the first matmul in scale dot product which is not supported by kernel now. - - new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). + attn_output: weighted value which is the output of scale dot product. + shape (beam*batch, seq_len, head_num, head_size). + + attn_weights: the output tensor of the first matmul in scale dot product + which is not supported by kernel now. + + new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). Notes: - - How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model - see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model + see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + + .. highlight:: python + .. code-block:: python + def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: @@ -143,6 +169,7 @@ def _reorder_cache( return past_key_values """ + return IndirectAccessKVCacheAttention.apply_function( query, key, @@ -176,25 +203,33 @@ def varlen_attention( ): r""" Applies PyTorch scaled_dot_product_attention on the inputs of query, key and value - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), - and accept the variant (different) sequence length among the query, key and value. + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), + and accept the variant (different) sequence length among the query, key and value. + + This module does not have args for `module init`. + + `forward()` Args: - module init: this module does not have args for module init - forward: - - query (torch.Tensor): shape [query_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - key (torch.Tensor): shape [key_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - value (torch.Tensor): shape [value_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - out (torch.Tensor): buffer to get the results, the shape is the same as query. - - seqlen_q (torch.Tensor): shape [batch_size + 1], points the current query_tokens among total sequence length. - - seqlen_k (torch.Tensor): shape [batch_size + 1], points the current key_tokens among total sequence length. - - max_seqlen_q (int): max/total sequence length of query. - - max_seqlen_k (int): max/total sequence length of key. - - pdropout (float): dropout probability; if greater than 0.0, dropout is applied, default is 0.0. 
- - softmax_scale (float): scaling factor applied is prior to softmax. - - is_causal (bool): whether to apply causal attention masking, default is True. + query (torch.Tensor): shape [query_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + key (torch.Tensor): shape [key_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + value (torch.Tensor): shape [value_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + out (torch.Tensor): buffer to get the results, the shape is the same as query. + seqlen_q (torch.Tensor): shape [batch_size + 1], + points the current query_tokens among total sequence length. + seqlen_k (torch.Tensor): shape [batch_size + 1], + points the current key_tokens among total sequence length. + max_seqlen_q (int): max/total sequence length of query. + max_seqlen_k (int): max/total sequence length of key. + pdropout (float): dropout probability; if greater than 0.0, dropout is applied, default is 0.0. + softmax_scale (float): scaling factor applied is prior to softmax. + is_causal (bool): whether to apply causal attention masking, default is True. """ + return VarlenAttention.apply_function( query, key, diff --git a/intel_extension_for_pytorch/llm/modules/linear_fusion.py b/intel_extension_for_pytorch/llm/modules/linear_fusion.py index 380cf8de4..26e4c99d3 100644 --- a/intel_extension_for_pytorch/llm/modules/linear_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/linear_fusion.py @@ -53,12 +53,21 @@ def init_on_device(self, x, op_type): class LinearSilu(IPEXLinearFusion): r""" Applies a linear transformation to the `input` data, and then apply PyTorch SILU - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) on the result: + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) + on the result: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.silu(linear(input)) + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with silu. + linear (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with silu. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -66,6 +75,7 @@ class LinearSilu(IPEXLinearFusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear): @@ -80,15 +90,25 @@ def forward(self, x): class Linear2SiluMul(IPEXLinear2Fusion): r""" - Applies two linear transformation to the `input` data (`linear_s` and `linear_m`), then apply PyTorch SILU - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) on the result from `linear_s` - , and multiplies the result from `linear_m`: + Applies two linear transformation to the `input` data (`linear_s` and + `linear_m`), then apply PyTorch SILU + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) + on the result from `linear_s`, and multiplies the result from `linear_m`: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.silu(linear_s(input)) * linear_m(input) + Args: - linear_s (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with silu. - linear_m (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with mul. 
+ linear_s (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with silu. + linear_m (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with mul. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_s_module = torch.nn.Linear(4096, 4096) @@ -97,6 +117,7 @@ class Linear2SiluMul(IPEXLinear2Fusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear_s, linear_m): @@ -112,12 +133,21 @@ def forward(self, x): class LinearRelu(IPEXLinearFusion): r""" Applies a linear transformation to the `input` data, and then apply PyTorch RELU - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.relu.html) on the result: + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.relu.html) + on the result: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.relu(linear(input)) + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with relu. + linear (torch.nn.Linear module) : the original torch.nn.Linear module + to be fused with relu. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -125,6 +155,7 @@ class LinearRelu(IPEXLinearFusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear): @@ -142,11 +173,19 @@ class LinearNewGelu(IPEXLinearFusion): Applies a linear transformation to the `input` data, and then apply NewGELUActivation (see https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py#L50) on the result: + + .. highlight:: python + .. code-block:: python + result = NewGELUActivation(linear(input)) + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with new_gelu. + linear (torch.nn.Linear module) : the original torch.nn.Linear module + to be fused with new_gelu. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -154,6 +193,7 @@ class LinearNewGelu(IPEXLinearFusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear): @@ -169,12 +209,21 @@ def forward(self, x): class LinearGelu(IPEXLinearFusion): r""" Applies a linear transformation to the `input` data, and then apply PyTorch GELU - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.gelu.html) on the result: + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.gelu.html) + on the result: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.gelu(linear(input)) + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with gelu. + linear (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with gelu. + Shape: Input and output shapes are the same as torch.nn.Linear. 
+ Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -182,6 +231,7 @@ class LinearGelu(IPEXLinearFusion): >>> # module forward: >>> input = torch.randn(4096, 4096) >>> result = ipex_fusion(input) + """ def __init__(self, linear): @@ -199,12 +249,19 @@ class LinearSiluMul(IPEXLinearFusion): Applies a linear transformation to the `input` data, then apply PyTorch SILU (see https://pytorch.org/docs/stable/generated/torch.nn.functional.silu.html) on the result, and multiplies the result by `other`: + + .. highlight:: python + .. code-block:: python + result = torch.nn.functional.silu(linear(input)) * other + Args: linear (torch.nn.Linear module) : the original torch.nn.Linear module to - be fused with silu and mul. + be fused with silu and mul. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -213,6 +270,7 @@ class LinearSiluMul(IPEXLinearFusion): >>> input = torch.randn(4096, 4096) >>> other = torch.randn(4096, 4096) >>> result = ipex_fusion(input, other) + """ def __init__(self, linear): @@ -227,12 +285,21 @@ def forward(self, x, y): class LinearMul(IPEXLinearFusion): r""" - Applies a linear transformation to the `input` data, and then multiplies the result by `other`: + Applies a linear transformation to the `input` data, and then multiplies + the result by `other`: + + .. highlight:: python + .. code-block:: python + result = linear(input) * other + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with mul. + linear (torch.nn.Linear module) : the original torch.nn.Linear module + to be fused with mul. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -241,6 +308,7 @@ class LinearMul(IPEXLinearFusion): >>> input = torch.randn(4096, 4096) >>> other = torch.randn(4096, 4096) >>> result = ipex_fusion(input, other) + """ def __init__(self, linear): @@ -255,12 +323,21 @@ def forward(self, x, y): class LinearAdd(IPEXLinearFusion): r""" - Applies a linear transformation to the `input` data, and then add the result by `other`: + Applies a linear transformation to the `input` data, + and then add the result by `other`: + + .. highlight:: python + .. code-block:: python + result = linear(input) + other + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with add. + linear (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with add. + Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -269,6 +346,7 @@ class LinearAdd(IPEXLinearFusion): >>> input = torch.randn(4096, 4096) >>> other = torch.randn(4096, 4096) >>> result = ipex_fusion(input, other) + """ def __init__(self, linear): @@ -283,12 +361,21 @@ def forward(self, x, y): class LinearAddAdd(IPEXLinearFusion): r""" - Applies a linear transformation to the `input` data, and then add the result by `other_1` and `other_2`: + Applies a linear transformation to the `input` data, + and then add the result by `other_1` and `other_2`: + + .. highlight:: python + .. code-block:: python + result = linear(input) + other_1 + other_2 + Args: - linear (torch.nn.Linear module) : the original torch.nn.Linear module to be fused with add and add. + linear (torch.nn.Linear module) : the original torch.nn.Linear + module to be fused with add and add. 
+ Shape: Input and output shapes are the same as torch.nn.Linear. + Examples: >>> # module init: >>> linear_module = torch.nn.Linear(4096, 4096) @@ -298,6 +385,7 @@ class LinearAddAdd(IPEXLinearFusion): >>> other_1 = torch.randn(4096, 4096) >>> other_2 = torch.randn(4096, 4096) >>> result = ipex_fusion(input, other_1, other_2) + """ def __init__(self, linear): diff --git a/intel_extension_for_pytorch/llm/modules/mha_fusion.py b/intel_extension_for_pytorch/llm/modules/mha_fusion.py index 940fea611..1589b1444 100644 --- a/intel_extension_for_pytorch/llm/modules/mha_fusion.py +++ b/intel_extension_for_pytorch/llm/modules/mha_fusion.py @@ -7,7 +7,10 @@ class RotaryEmbedding(nn.Module): r""" [module init and forward] Applies RotaryEmbedding (see https://huggingface.co/papers/2104.09864) - on the `query ` or `key` before their multi-head attention computation. + on the ``query`` or ``key`` before their multi-head attention computation. + + `module init` + Args: max_position_embeddings (int): size (max) of the position embeddings. pos_embd_dim (int): dimension of the position embeddings. @@ -19,20 +22,22 @@ class RotaryEmbedding(nn.Module): long_factor and short_factor, see details: https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/blob/main/config.json#L23. - forward: - - input (torch.Tensor) : input to be applied with position embeddings, - taking shape of [batch size, sequence length, num_head/num_kv_head, head_dim] - (as well as the output shape). - - position_ids (torch.Tensor): the according position_ids for the input. - The shape should be [batch size, sequence length. In some cases, - there is only one element which the past_kv_length, and position id - can be constructed by past_kv_length + current_position. - - num_head (int) : head num from the input shape. - - head_dim (int) : head dim from the input shape. - - offset (int) : the offset value. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, - so the offset is 1. For llama, cos/sin is applied to the neighboring rotary_dim elements, - so the offset is rotary_dim/2. - - rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. + `forward()` + + Args: + input (torch.Tensor) : input to be applied with position embeddings, + taking shape of [batch size, sequence length, num_head/num_kv_head, head_dim] + (as well as the output shape). + position_ids (torch.Tensor): the according position_ids for the input. + The shape should be [batch size, sequence length. In some cases, + there is only one element which the past_kv_length, and position id + can be constructed by past_kv_length + current_position. + num_head (int) : head num from the input shape. + head_dim (int) : head dim from the input shape. + offset (int) : the offset value. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, + so the offset is 1. For llama, cos/sin is applied to the neighboring rotary_dim elements, + so the offset is rotary_dim/2. + rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. Examples: >>> # module init: @@ -42,25 +47,29 @@ class RotaryEmbedding(nn.Module): >>> position_ids = torch.arange(32).unsqueeze(0) >>> query_rotery = rope_module(query, position_ids, 16, 256, 1, 64) - [Direct function call] This module also provides a `.apply_function` function call to be used on query and key - at the same time without initializing the module (assume rotary embedding - sin/cos values are provided). 
+ [Direct function call] This module also provides a `.apply_function` function call + to be used on query and key at the same time without initializing the module + (assume rotary embedding sin/cos values are provided). + + `apply_function()` + Args: - - query, key (torch.Tensor) : inputs to be applied with position embeddings, taking shape of - [batch size, sequence length, num_head/num_kv_head, head_dim] - or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). - - sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor generated to be applied on query/key. - - rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. - - head_dim (int) : head dim from the input shape. - - rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, - so the offset is 1. - if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, - so the offset is rotary_dim/2. - - position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. the according position_ids - for the input. The shape should be [batch size, sequence length]. - Return - - query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] - or [num_tokens, num_head/num_kv_head, head_dim]. + query, key (torch.Tensor) : inputs to be applied with position embeddings, taking shape of + [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim] (as well as the output shape). + sin/cos (torch.Tensor): [num_tokens, rotary_dim] the sin/cos value tensor generated to be applied on query/key. + rotary_ndims (int): the rotary dimension. e.g., 64 for GPTJ. head size for LLama. + head_dim (int) : head dim from the input shape. + rotary_half (bool) : if False. e.g., GPT-J 6B/ChatGLM, cos/sin is applied to the neighboring 2 elements, + so the offset is 1. + if True, e.g., for llama, cos/sin is applied to the neighboring rotary_dim elements, + so the offset is rotary_dim/2. + position_ids (torch.Tensor): Default is None and optional if sin/cos is provided. the according position_ids + for the input. The shape should be [batch size, sequence length]. + + Return: + query, key (torch.Tensor): [batch size, sequence length, num_head/num_kv_head, head_dim] + or [num_tokens, num_head/num_kv_head, head_dim]. """ @@ -144,14 +153,9 @@ def apply_function( runtime_module = cls.runtime_ops.get_module_from_device( query.device.type, IPEXCustomOpType.ROPE, False ) - - query_, key_ = runtime_module.rotary_embedding( + query, key = runtime_module.rotary_embedding( query, key, sin, cos, rotary_dim, rotary_half, position_ids ) - - # keep the inplace context as used in TGI - query.copy_(query_) - key.copy_(key_) return query, key @@ -159,16 +163,20 @@ class FastLayerNorm(nn.Module): r""" [module init and forward] Applies PyTorch Layernorm (see https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) on the input (hidden states). + + `module init` + Args: - module init: - - normalized_shape ((int or list) or torch.Size) input shape from an expected input of size. - - eps (float): a value added to the denominator for numerical stability. - - weight (torch.Tensor): the weight of Layernorm to apply normalization. - - bias (torch.Tensor): an additive bias for normalization. + normalized_shape ((int or list) or torch.Size) input shape from an expected input of size. + eps (float): a value added to the denominator for numerical stability. 
+ weight (torch.Tensor): the weight of Layernorm to apply normalization. + bias (torch.Tensor): an additive bias for normalization. + + `forward()` - forward: - - hidden_states (torch.Tensor) : input to be applied Layernorm, usually taking shape of - [batch size, sequence length, hidden_size] (as well as the output shape). + Args: + hidden_states (torch.Tensor) : input to be applied Layernorm, usually taking shape of + [batch size, sequence length, hidden_size] (as well as the output shape). Examples: >>> # module init: @@ -179,13 +187,16 @@ class FastLayerNorm(nn.Module): >>> result = layernorm_module(input) [Direct function call] This module also provides a `.apply_function` function call to apply fast layernorm - without initializing the module. + without initializing the module. + + `apply_function()` + Args: - - hidden_states(torch.Tensor) : the input tensor to apply normalization. - - normalized_shape (int or list) or torch.Size) input shape from an expected input of size. - - weight (torch.Tensor): the weight to apply normalization. - - bias (torch.Tensor): an additive bias for normalization. - - eps (float): a value added to the denominator for numerical stability. + hidden_states(torch.Tensor) : the input tensor to apply normalization. + normalized_shape (int or list) or torch.Size) input shape from an expected input of size. + weight (torch.Tensor): the weight to apply normalization. + bias (torch.Tensor): an additive bias for normalization. + eps (float): a value added to the denominator for numerical stability. """ @@ -227,16 +238,20 @@ class RMSNorm(nn.Module): r""" [module init and forward] Applies RMSnorm on the input (hidden states). (see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L76) + + `module init` + Args: - module init: - - hidden_size (int) : the size of the hidden states. - - eps (float) : the variance_epsilon to apply RMSnorm, default using 1e-6. - - weight (torch.Tensor): the weight to apply RMSnorm, default None and will use `torch.ones(hidden_size)`. + hidden_size (int) : the size of the hidden states. + eps (float) : the variance_epsilon to apply RMSnorm, default using 1e-6. + weight (torch.Tensor): the weight to apply RMSnorm, default None + and will use `torch.ones(hidden_size)`. + + `forward()` - forward: - - hidden_states (torch.Tensor) : input to be applied RMSnorm, usually taking shape of - [batch size, sequence length, hidden_size] - (as well as the output shape). + Args: + hidden_states (torch.Tensor) : input to be applied RMSnorm, usually taking shape of + [batch size, sequence length, hidden_size] (as well as the output shape). Examples: >>> # module init: @@ -245,12 +260,15 @@ class RMSNorm(nn.Module): >>> input = torch.randn(1, 32, 4096) >>> result = rmsnorm_module(input) - [Direct function call] This module also provides a `.apply_function` function call to apply RMSNorm without - initializing the module. + [Direct function call] This module also provides a `.apply_function` function + call to apply RMSNorm without initializing the module. + + `apply_function()` + Args: - - hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. - - weight (torch.Tensor): the weight to apply RMSnorm. - - eps (float) : the variance_epsilon to apply RMSnorm. + hidden_states(torch.Tensor) : the input tensor to apply RMSNorm. + weight (torch.Tensor): the weight to apply RMSnorm. + eps (float) : the variance_epsilon to apply RMSnorm. 
""" @@ -281,23 +299,31 @@ def forward(self, x: torch.Tensor): class VarlenAttention(nn.Module): r""" [module init and forward] Applies PyTorch scaled_dot_product_attention on the inputs of query, key and value - (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), - and accept the variant (different) sequence length among the query, key and value. + (see https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html), + and accept the variant (different) sequence length among the query, key and value. + + This module does not have args for `module init`. + + `forward()` Args: - module init: this module does not have args for module init - forward: - - query (torch.Tensor): shape [query_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - key (torch.Tensor): shape [key_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - value (torch.Tensor): shape [value_tokens, num_head, head_size], where tokens is total sequence length among batch size. - - out (torch.Tensor): buffer to get the results, the shape is the same as query. - - seqlen_q (torch.Tensor): shape [batch_size + 1], points the current query_tokens among total sequence length. - - seqlen_k (torch.Tensor): shape [batch_size + 1], points the current key_tokens among total sequence length. - - max_seqlen_q (int): max/total sequence length of query. - - max_seqlen_k (int): max/total sequence length of key. - - pdropout (float): dropout probability; if greater than 0.0, dropout is applied, default is 0.0. - - softmax_scale (float): scaling factor applied is prior to softmax. - - is_causal (bool): whether to apply causal attention masking, default is True. + query (torch.Tensor): shape [query_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + key (torch.Tensor): shape [key_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + value (torch.Tensor): shape [value_tokens, num_head, head_size], + where tokens is total sequence length among batch size. + out (torch.Tensor): buffer to get the results, the shape is the same as query. + seqlen_q (torch.Tensor): shape [batch_size + 1], points the + current query_tokens among total sequence length. + seqlen_k (torch.Tensor): shape [batch_size + 1], points the + current key_tokens among total sequence length. + max_seqlen_q (int): max/total sequence length of query. + max_seqlen_k (int): max/total sequence length of key. + pdropout (float): dropout probability; if greater than 0.0, + dropout is applied, default is 0.0. + softmax_scale (float): scaling factor applied is prior to softmax. + is_causal (bool): whether to apply causal attention masking, default is True. Examples: >>> # module init: @@ -315,10 +341,10 @@ class VarlenAttention(nn.Module): >>> softmax_scale = 0.5 >>> varlenAttention_module(query, key, value, out, seqlen_q, seqlen_k, max_seqlen_q, max_seqlen_k, pdropout, softmax_scale) - [Direct function call] This module also provides a `.apply_function` function call to apply VarlenAttention without - initializing the module. - Args: - - The parameters are the same as the forward call. + [Direct function call] This module also provides a `.apply_function` + function call to apply VarlenAttention without initializing the module. + + The parameters of `apply_function()` are the same as the `forward()` call. 
""" @@ -409,58 +435,65 @@ class PagedAttention: for key/value cache. The basic logic as following figure. Firstly, The DRAM buffer which includes num_blocks are pre-allocated to store key or value cache. For every block, block_size tokens can be stored. In the forward pass, the cache manager will firstly allocate some slots from this buffer and use reshape_and_cache API to store - the key/value and then use single_query_cached_kv_attention API to do the scale-dot-product of MHA. + the key/value and then use single_query_cached_kv_attention API to do the scale-dot-product of MHA. The block is basic allocation unit of paged attention and the token intra-block are stored one-by-one. The block tables are used to map the logical block of sequence into the physical block. [class method]: reshape_and_cache - ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) + ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) This operator is used to store the key/value token states into the pre-allcated kv_cache buffers of paged attention. + Args: - - key (torch.Tensor): The keytensor. The shape should be [num_seqs, num_heads, head_size]. - - value (torch.Tensor): The value tensor. The shape should be [num_seqs, num_heads, head_size]. - - key_cache (torch.Tensor): The pre-allocated buffer to store the key cache. The shape should be - [num_blocks, block_size, num_heads, head_size]. - - value_cache (torch.Tensor): The pre-allocated buffer to store the value cache. The shape should be - [num_blocks, block_size, num_heads, head_size]. - - slot_mapping (torch.Tensor): It stores the position to store the key/value in the pre-allocated buffers. - The shape should be the number of sequences. For sequence _i_, the slot_mapping[i]//block_number - can get the block index, and the slot_mapping%block_size can get the offset of this block. + key (torch.Tensor): The keytensor. The shape should be [num_seqs, num_heads, head_size]. + value (torch.Tensor): The value tensor. The shape should be [num_seqs, num_heads, head_size]. + key_cache (torch.Tensor): The pre-allocated buffer to store the key cache. + The shape should be [num_blocks, block_size, num_heads, head_size]. + value_cache (torch.Tensor): The pre-allocated buffer to store the value cache. + The shape should be [num_blocks, block_size, num_heads, head_size]. + slot_mapping (torch.Tensor): It stores the position to store the key/value in the pre-allocated buffers. + The shape should be the number of sequences. For sequence ``i``, the ``slot_mapping[i] // block_number`` + can get the block index, and the ``slot_mapping % block_size`` can get the offset of this block. [class method]: single_query_cached_kv_attention - ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( - out, - query, - key_cache, - value_cache, - head_mapping, - scale, - block_tables, - context_lens, - block_size, - max_context_len, - alibi_slopes - ) + + .. highlight:: python + .. code-block:: python + + ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( + out, + query, + key_cache, + value_cache, + head_mapping, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes + ) This operator is used to be calculated the scale-dot-product based on the paged attention. + Args: - - out (torch.Tensor): The output tensor with shape of [num_seqs, num_heads, head_size]. where the num_seqs - is the number of the sequence in this batch. 
The num_heads means the number of query - head. head_size means the head dimension. - - query (torch.Tensor): The query tensor. The shape should be [num_seqs, num_heads, head_size]. - - key_cache (torch.Tensor): The pre-allocated buffer to store the key cache. The shape should be - [num_blocks, block_size, num_heads, head_size]. - - value_cache(torch.Tensor): The pre-allocated buffer to store the value cache. The shape should be - [num_blocks, block_size, num_heads, head_size]. - - head_mapping(torch.Tensor): The mapping from the query head to the kv head. The shape should be - the number of query heads. - - scale (float): The scale used by the scale-dot-product. In general, it is: float(1.0 / (head_size ** 0.5)). - - block_tables:(torch.Tensor): The mapping table used to mapping the logical sequence to the physical sequence. - The shape should be [num_seqs, max_num_blocks_per_seq]. - - context_lens (torch.Tensor): The sequence length for every sequence. The size is [num_seqs]. - - block_size (int): The block size which means the number of token in every block. - - max_context_len (int): The max sequence length. - - alibi_slopes (torch.Tensor, optinal): which is the alibi slope with the shape of (num_heads). + out (torch.Tensor): The output tensor with shape of [num_seqs, num_heads, head_size], + where the num_seqs is the number of the sequence in this batch. The num_heads + means the number of query head. head_size means the head dimension. + query (torch.Tensor): The query tensor. The shape should be [num_seqs, num_heads, head_size]. + key_cache (torch.Tensor): The pre-allocated buffer to store the key cache. + The shape should be [num_blocks, block_size, num_heads, head_size]. + value_cache(torch.Tensor): The pre-allocated buffer to store the value cache. + The shape should be [num_blocks, block_size, num_heads, head_size]. + head_mapping(torch.Tensor): The mapping from the query head to the kv head. + The shape should be the number of query heads. + scale (float): The scale used by the scale-dot-product. + In general, it is: ``float(1.0 / (head_size ** 0.5))``. + block_tables:(torch.Tensor): The mapping table used to mapping the logical sequence + to the physical sequence. The shape should be [num_seqs, max_num_blocks_per_seq]. + context_lens (torch.Tensor): The sequence length for every sequence. The size is [num_seqs]. + block_size (int): The block size which means the number of token in every block. + max_context_len (int): The max sequence length. + alibi_slopes (torch.Tensor, optinal): which is the alibi slope with the shape of (num_heads). """ @@ -477,13 +510,7 @@ def reshape_and_cache( ): return cls.runtime_ops.get_module_from_device( key.device.type, IPEXCustomOpType.PAGED_ATTENTION, False - ).reshape_and_cache( - key, - value, - key_cache, - value_cache, - slot_mapping.int() if slot_mapping.dtype is torch.long else slot_mapping, - ) + ).reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) @classmethod def single_query_cached_kv_attention( @@ -527,37 +554,55 @@ class IndirectAccessKVCacheAttention(nn.Module): buffers(key and value use different buffers) to store all key/value hidden states and beam index information. It can use beam index history to decide which beam should be used by a timestamp and this information will generate an offset to access the kv_cache buffer. 
+ Data Format: - - The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], - the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. - All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. - [Module init and forward] + The shape of the pre-allocated key(value) buffer is [max_seq, beam*batch, head_num, head_size], + the hidden state of key/value which is the shape of [beam*batch, head_num, head_size] is stored token by token. + All beam idx information of every timestamp is also stored in a Tensor with the shape of [max_seq, beam*batch]. + + `module init` + Args: - module init - - text_max_length (int) : the max length of kv cache to be used for generation (allocate the pre-cache buffer). - - forward - - query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). - - scale_attn (float):scale used by the attention layer. should be the sqrt(head_size). - - layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). - key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); - value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); - beam-idx: history beam idx, shape:(max_seq, beam*batch); - seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). - - head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. - - attention_mask(torch.Tensor): Attention mask information. + text_max_length (int) : the max length of kv cache to be used + for generation (allocate the pre-cache buffer). + + `forward()` + + Args: + query (torch.Tensor): Query tensor; shape: (beam*batch, seq_len, head_num, head_dim). + key (torch.Tensor): Key tensor; shape: (beam*batch, seq_len, head_num, head_dim). + value (torch.Tensor): Value tensor; shape: (beam*batch, seq_len, head_num, head_dim). + scale_attn (float):scale used by the attention layer. should be ``sqrt(head_size)``. + layer_past (tuple(torch.Tensor)): tuple(seq_info, key_cache, value_cache, beam-idx). + + - key_cache: key cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + + - value_cache: value cache tensor, shape: (max_seq, beam*batch, head_num, head_dim); + + - beam-idx: history beam idx, shape:(max_seq, beam*batch); + + - seq_info: Sequence info tensor, shape:(1, 1, max_seq, max_seq). + + head_mask (torch.Tensor): Head mask tensor which is not supported by kernel yet. + attention_mask(torch.Tensor): Attention mask information. Return: - - attn_output: weighted value which is the output of scale dot product. shape (beam*batch, seq_len, head_num, head_size). - - attn_weights: The output tensor of the first matmul in scale dot product which is not supported by kernel now. - - new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). + attn_output: Weighted value which is the output of scale dot product. + shape (beam*batch, seq_len, head_num, head_size). + + attn_weights: The output tensor of the first matmul in scale dot product + which is not supported by kernel now. + + new_layer_past: updated layer_past (seq_info, key_cache, value_cache, beam-idx). 
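Putting the Args and Return sections above together, here is a minimal single-step sketch of how this module might be wired up. The module path, the ``text_max_length`` keyword, the argument order, and the three return values follow the documentation in this patch; the concrete sizes, the dtypes of ``seq_info`` and ``beam_idx``, and passing ``None`` for ``head_mask`` and ``attention_mask`` are illustrative assumptions.

.. code-block:: python

    import math
    import torch
    import intel_extension_for_pytorch as ipex

    # Illustrative sizes (assumptions, not taken from the patch).
    beam, batch, seq_len, head_num, head_dim, max_seq = 4, 1, 1, 32, 128, 2048

    iakv_attn = ipex.llm.modules.IndirectAccessKVCacheAttention(text_max_length=max_seq)

    # Shapes follow the Args list above: (beam*batch, seq_len, head_num, head_dim).
    query = torch.randn(beam * batch, seq_len, head_num, head_dim)
    key = torch.randn(beam * batch, seq_len, head_num, head_dim)
    value = torch.randn(beam * batch, seq_len, head_num, head_dim)

    # layer_past follows the documented (seq_info, key_cache, value_cache, beam_idx) layout.
    seq_info = torch.zeros(1, 1, max_seq, max_seq, dtype=torch.long)   # dtype is an assumption
    key_cache = torch.zeros(max_seq, beam * batch, head_num, head_dim)
    value_cache = torch.zeros(max_seq, beam * batch, head_num, head_dim)
    beam_idx = torch.zeros(max_seq, beam * batch, dtype=torch.long)    # dtype is an assumption

    attn_output, attn_weights, new_layer_past = iakv_attn(
        query,
        key,
        value,
        math.sqrt(head_dim),                           # scale_attn = sqrt(head_size)
        (seq_info, key_cache, value_cache, beam_idx),  # layer_past
        None,                                          # head_mask (not supported by the kernel yet)
        None,                                          # attention_mask (assumption)
    )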
Notes: - - How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model - see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + How to reorder KV cache when using the format of IndirectAccessKVCacheAttention (e.g., on llama model + see https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L1318) + + .. highlight:: python + .. code-block:: python + def _reorder_cache( self, past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor ) -> Tuple[Tuple[torch.Tensor]]: @@ -568,10 +613,10 @@ def _reorder_cache( layer_past[3][layer_past[0].size(-2) - 1] = beam_idx return past_key_values - [Direct function call] This module also provides a `.apply_function` function call to apply IndirectAccessKVCacheAttention - without initializing the module. - Args: - - The parameters are the same as the forward call. + [Direct function call] This module also provides a `.apply_function` function call + to apply IndirectAccessKVCacheAttention without initializing the module. + + The parameters of `apply_function()` are the same as the `forward()` call. """ From 1f688511c0dfe4af8b50fed13c16f3a98bcc84bd Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Wed, 22 May 2024 08:08:36 +0800 Subject: [PATCH 076/199] Update dependency_version.yml 20240522 (#2915) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 7ffa00ac3..68d27d4e9 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240520+cpu + version: 2.4.0.dev20240521+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240520+cpu + version: 2.2.0.dev20240521+cpu torchvision: - version: 0.19.0.dev20240520+cpu + version: 0.19.0.dev20240521+cpu transformers: version: 4.38.1 From 21b50308b852f942bf4a0d645e8c1a74dd6232c8 Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Wed, 22 May 2024 10:26:31 +0800 Subject: [PATCH 077/199] Fix iakv regression (#2900) * Fix iakv regression * Remove unuse loop --- .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 250 +++++++++--------- 1 file changed, 125 insertions(+), 125 deletions(-) diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index e497942b5..205c3c31f 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -557,10 +557,14 @@ scale_dot_product_for_indirect_access_kv_cache( auto thread_numbers = omp_get_max_threads(); auto max_parallel_parts = thread_numbers * 4; + auto target_block_size = 32L; + if (bs <= 32 and seq_len < 65536) { + target_block_size = 1L; + } auto kv_block_size = bs * head_num >= max_parallel_parts ? 
seq_len : std::max(seq_len / max_parallel_parts, 1L); - kv_block_size = std::min(kv_block_size, 32L); + kv_block_size = std::min(kv_block_size, target_block_size); auto kv_block_count = (seq_len + kv_block_size - 1) / kv_block_size; auto need_update_beam_idx = offset > 0 and bs > 1; auto b_ptr = beam_idx.data_ptr(); @@ -585,37 +589,48 @@ scale_dot_product_for_indirect_access_kv_cache( for (auto hi = 0; hi < head_num; hi++) { auto k_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - k_start); + auto query_ti = 0; for (auto ti = k_start; ti < k_start + block_size; ti++) { - for (auto query_ti = 0; query_ti < cur_len; query_ti++) { - auto kv_hi = hi / group_size; // maping the query head to - // key/value head to support MGA/MQA - auto q_ptr_start = q_ptr + - (bi * cur_len + query_ti) * head_num * head_size + - hi * head_size; - auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; - auto attn_w_pos = - attn_w_ptr + attn_w_stride + query_ti * seq_len + ti; - attn_w_pos[0] = 0.0f; - auto kc_token_start = ti * kc_token_stride; - auto kc_t_beam_start = kc_token_start; - auto beam = need_update_beam_idx ? new_beam_idx[bi][ti] : 0; - if (ti > - query_ti + offset) { // only caculate the innerproduct for - // the past token and current token - attn_w_pos[0] = -10000.0f; - } else if (ti == query_ti + offset) { // caculate the innerproduct - // for the current token and - // store the key - if (cur_len > 1) { // this may occur for processing the promt - auto beam_size = beam_batch / bs; - // need to store key accross beam - kc_t_beam_start = - kc_t_beam_start + bi * beam_size * kv_head * head_size; - } else { - kc_t_beam_start = kc_t_beam_start + bi * kv_head * head_size; - } - auto kc_head_start = - k_cache_ptr + kc_t_beam_start + kv_hi * head_size; + auto kv_hi = hi / group_size; // maping the query head to + // key/value head to support MGA/MQA + auto q_ptr_start = q_ptr + + (bi * cur_len + query_ti) * head_num * head_size + + hi * head_size; + auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; + auto attn_w_pos = + attn_w_ptr + attn_w_stride + query_ti * seq_len + ti; + attn_w_pos[0] = 0.0f; + auto kc_token_start = ti * kc_token_stride; + auto kc_t_beam_start = kc_token_start; + auto beam = need_update_beam_idx ? 
new_beam_idx[bi][ti] : 0; + if (ti > query_ti + offset) { // only caculate the innerproduct for + // the past token and current token + attn_w_pos[0] = -10000.0f; + } else if (ti == query_ti + offset) { // caculate the innerproduct + // for the current token and + // store the key + if (cur_len > 1) { // this may occur for processing the promt + auto beam_size = beam_batch / bs; + // need to store key accross beam + kc_t_beam_start = + kc_t_beam_start + bi * beam_size * kv_head * head_size; + } else { + kc_t_beam_start = kc_t_beam_start + bi * kv_head * head_size; + } + auto kc_head_start = + k_cache_ptr + kc_t_beam_start + kv_hi * head_size; + auto k_ptr_start = k_ptr + + (bi * cur_len + ti - offset) * kv_head * head_size + + kv_hi * head_size; + reduce_head( + q_ptr_start, + k_ptr_start, + attn_w_pos, + head_size, + true, + kc_head_start); + } else { // caculate the innerproduct for the past token + if (ti >= offset) { auto k_ptr_start = k_ptr + (bi * cur_len + ti - offset) * kv_head * head_size + kv_hi * head_size; @@ -624,38 +639,24 @@ scale_dot_product_for_indirect_access_kv_cache( k_ptr_start, attn_w_pos, head_size, - true, - kc_head_start); - } else { // caculate the innerproduct for the past token - if (ti >= offset) { - auto k_ptr_start = k_ptr + - (bi * cur_len + ti - offset) * kv_head * head_size + - kv_hi * head_size; - reduce_head( - q_ptr_start, - k_ptr_start, - attn_w_pos, - head_size, - false, - nullptr); - } else { + false, + nullptr); + } else { + kc_t_beam_start = kc_t_beam_start + beam * kv_head * head_size; + if (cur_len > 1) { + auto beam_size = beam_batch / bs; kc_t_beam_start = - kc_t_beam_start + beam * kv_head * head_size; - if (cur_len > 1) { - auto beam_size = beam_batch / bs; - kc_t_beam_start = - kc_t_beam_start + bi * beam_size * kv_head * head_size; - } - auto kc_head_start = - k_cache_ptr + kc_t_beam_start + kv_hi * head_size; - reduce_head( - q_ptr_start, - kc_head_start, - attn_w_pos, - head_size, - false, - nullptr); + kc_t_beam_start + bi * beam_size * kv_head * head_size; } + auto kc_head_start = + k_cache_ptr + kc_t_beam_start + kv_hi * head_size; + reduce_head( + q_ptr_start, + kc_head_start, + attn_w_pos, + head_size, + false, + nullptr); } } } @@ -742,85 +743,84 @@ scale_dot_product_for_indirect_access_kv_cache( thread_id = omp_get_thread_num(); auto v_start = block_id * kv_block_size; auto block_size = std::min(kv_block_size, seq_len - v_start); + auto query_ti = 0; for (auto vi = v_start; vi < v_start + block_size; vi++) { - for (auto query_ti = 0; query_ti < cur_len; query_ti++) { - auto kv_hi = hi / group_size; // maping the query head to - // key/value head to support MGA/MQA - auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; - auto attn_w_query_start = - attn_w_ptr + attn_w_stride + query_ti * seq_len; - // calculate weighted value and store the result to attn_outs[bs, - // head_num, cur_len, head_size] - auto attn_out_head_stride = thread_id * attn_outs_stride_priv + - (bi * head_num + hi) * cur_len * head_size; - auto attn_out_start = private_attn_out_ptr + - attn_out_head_stride + query_ti * head_size; + auto kv_hi = hi / group_size; // maping the query head to + // key/value head to support MGA/MQA + auto attn_w_stride = (bi * head_num + hi) * cur_len * seq_len; + auto attn_w_query_start = + attn_w_ptr + attn_w_stride + query_ti * seq_len; + // calculate weighted value and store the result to attn_outs[bs, + // head_num, cur_len, head_size] + auto attn_out_head_stride = thread_id * attn_outs_stride_priv + + (bi * head_num + 
hi) * cur_len * head_size; + auto attn_out_start = private_attn_out_ptr + attn_out_head_stride + + query_ti * head_size; - auto vc_token_start = vi * kc_token_stride; - auto beam = need_update_beam_idx ? new_beam_idx[bi][vi] : 0; - if (vi == query_ti + offset) { // caculate the attention values - // for the current token - auto vc_t_beam_start = vc_token_start; - if (cur_len > 1) { // this may occur for processing the promt + auto vc_token_start = vi * kc_token_stride; + auto beam = need_update_beam_idx ? new_beam_idx[bi][vi] : 0; + if (vi == query_ti + offset) { // caculate the attention values + // for the current token + auto vc_t_beam_start = vc_token_start; + if (cur_len > 1) { // this may occur for processing the promt + auto beam_size = beam_batch / bs; + // removed the redundant computation, need to store key + // accross beam + vc_t_beam_start = + vc_t_beam_start + bi * beam_size * kv_head * head_size; + } else { + vc_t_beam_start = vc_t_beam_start + bi * kv_head * head_size; + } + auto v_cache_head_start = + v_cache_ptr + vc_t_beam_start + kv_hi * head_size; + auto v_ptr_start = v_ptr + + (bi * cur_len + vi - offset) * kv_head * head_size + + kv_hi * head_size; + mul_attenion_weights_and_value_of_head( + attn_w_query_start[vi], + v_ptr_start, + attn_out_start, + head_size, + true, + v_cache_head_start, + flag_access[thread_id][bi][hi]); + } else if (vi < query_ti + offset) { // caculate attention + // values for the past + // token + if (vi >= offset) { + auto v_ptr_start = v_ptr + + (bi * cur_len + vi - offset) * kv_head * head_size + + kv_hi * head_size; + mul_attenion_weights_and_value_of_head( + attn_w_query_start[vi], + v_ptr_start, + attn_out_start, + head_size, + false, + nullptr, + flag_access[thread_id][bi][hi]); + } else { + auto vc_t_beam_start = + vc_token_start + beam * kv_head * head_size; + if (cur_len > 1) { auto beam_size = beam_batch / bs; - // removed the redundant computation, need to store key - // accross beam vc_t_beam_start = vc_t_beam_start + bi * beam_size * kv_head * head_size; - } else { - vc_t_beam_start = vc_t_beam_start + bi * kv_head * head_size; } auto v_cache_head_start = v_cache_ptr + vc_t_beam_start + kv_hi * head_size; - auto v_ptr_start = v_ptr + - (bi * cur_len + vi - offset) * kv_head * head_size + - kv_hi * head_size; mul_attenion_weights_and_value_of_head( attn_w_query_start[vi], - v_ptr_start, + v_cache_head_start, attn_out_start, head_size, - true, - v_cache_head_start, + false, + nullptr, flag_access[thread_id][bi][hi]); - } else if (vi < query_ti + offset) { // caculate attention - // values for the past - // token - if (vi >= offset) { - auto v_ptr_start = v_ptr + - (bi * cur_len + vi - offset) * kv_head * head_size + - kv_hi * head_size; - mul_attenion_weights_and_value_of_head( - attn_w_query_start[vi], - v_ptr_start, - attn_out_start, - head_size, - false, - nullptr, - flag_access[thread_id][bi][hi]); - } else { - auto vc_t_beam_start = - vc_token_start + beam * kv_head * head_size; - if (cur_len > 1) { - auto beam_size = beam_batch / bs; - vc_t_beam_start = - vc_t_beam_start + bi * beam_size * kv_head * head_size; - } - auto v_cache_head_start = - v_cache_ptr + vc_t_beam_start + kv_hi * head_size; - mul_attenion_weights_and_value_of_head( - attn_w_query_start[vi], - v_cache_head_start, - attn_out_start, - head_size, - false, - nullptr, - flag_access[thread_id][bi][hi]); - } } - if (flag_access[thread_id][bi][hi] == 0) - flag_access[thread_id][bi][hi] = 1; } + if (flag_access[thread_id][bi][hi] == 0) + 
flag_access[thread_id][bi][hi] = 1; } } } From f95244a66db1d68c28536327dc5e1c12657eb1fd Mon Sep 17 00:00:00 2001 From: Xu Han Date: Wed, 22 May 2024 13:25:09 +0800 Subject: [PATCH 078/199] remove ipex cpu module's python dependency. (#2911) (#2914) * correct all_reduce schema * remove ipex cpu module's python dependency. --------- Co-authored-by: blzheng --- csrc/cpu/CMakeLists.txt | 8 -------- csrc/cpu/aten/kernels/MoEKrnl.cpp | 34 +++++++++++-------------------- tests/cpu/cpp/CMakeLists.txt | 8 -------- 3 files changed, 12 insertions(+), 38 deletions(-) diff --git a/csrc/cpu/CMakeLists.txt b/csrc/cpu/CMakeLists.txt index 74c7057d3..460621a3a 100644 --- a/csrc/cpu/CMakeLists.txt +++ b/csrc/cpu/CMakeLists.txt @@ -251,14 +251,6 @@ if(BUILD_STRIPPED_BIN) set_target_properties(${PLUGIN_NAME_CPU} PROPERTIES LINK_FLAGS_RELEASE -s) endif() -find_package(PythonLibs) -if(${PYTHONLIBS_FOUND}) - target_link_libraries(${PLUGIN_NAME_CPU} PUBLIC ${PYTHON_LIBRARIES}) -endif() - -find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib") -target_link_libraries(${PLUGIN_NAME_CPU} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) - install(TARGETS ${PLUGIN_NAME_CPU} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/csrc/cpu/aten/kernels/MoEKrnl.cpp b/csrc/cpu/aten/kernels/MoEKrnl.cpp index ea982318e..80d3ae2fa 100644 --- a/csrc/cpu/aten/kernels/MoEKrnl.cpp +++ b/csrc/cpu/aten/kernels/MoEKrnl.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "tpp/kernels/TPPGEMMKrnl.h" @@ -16,6 +15,15 @@ namespace cpu { namespace { +at::Tensor call_AllReduce(const at::Tensor& self) { + static auto op_allreduce = + c10::Dispatcher::singleton() + .findSchemaOrThrow("deepspeed_comm::all_reduce", "") + .typed(); + auto ret = op_allreduce.call(self); + return ret; +} + at::Tensor mixtral_moe_tpp_kernl_impl( const at::Tensor& hidden_states, const at::Tensor& top_x, @@ -46,13 +54,7 @@ at::Tensor mixtral_moe_tpp_kernl_impl( tpp_linear_nobias_forward_cpu(curr_state, down_wei, c10::nullopt); } if (is_distributed) { - py::gil_scoped_acquire acquire; - py::function allreduce = py::module_::import("torch") - .attr("ops") - .attr("deepspeed_comm") - .attr("all_reduce"); - allreduce(curr_state); - py::gil_scoped_release release; + call_AllReduce(curr_state); } curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); @@ -98,13 +100,7 @@ at::Tensor mixtral_moe_kernl_impl( c10::nullopt); } if (is_distributed) { - py::gil_scoped_acquire acquire; - py::function allreduce = py::module_::import("torch") - .attr("ops") - .attr("deepspeed_comm") - .attr("all_reduce"); - allreduce(curr_state); - py::gil_scoped_release release; + call_AllReduce(curr_state); } curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); @@ -130,13 +126,7 @@ at::Tensor mixtral_moe_woq_kernl_impl( down_wei); if (is_distributed) { - py::gil_scoped_acquire acquire; - py::function allreduce = py::module_::import("torch") - .attr("ops") - .attr("deepspeed_comm") - .attr("all_reduce"); - allreduce(curr_state); - py::gil_scoped_release release; + call_AllReduce(curr_state); } curr_state = curr_state * routing_w; output.index_add_(0, top_x, curr_state.squeeze(0).to(hidden_states.dtype())); diff --git a/tests/cpu/cpp/CMakeLists.txt b/tests/cpu/cpp/CMakeLists.txt index fc5dff343..cc299d0a5 100644 --- a/tests/cpu/cpp/CMakeLists.txt +++ b/tests/cpu/cpp/CMakeLists.txt @@ 
-69,13 +69,5 @@ target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC c10) # Link IPEX target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC intel-ext-pt-cpu) -find_package(PythonLibs) -if(${PYTHONLIBS_FOUND}) - target_link_libraries(${CPU_CPP_TEST_NAME} PUBLIC ${PYTHON_LIBRARIES}) -endif() - -find_library(TORCH_PYTHON_LIBRARY torch_python PATH "${TORCH_INSTALL_PREFIX}/lib") -target_link_libraries(${CPU_CPP_TEST_NAME} PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY}) - install(TARGETS ${CPU_CPP_TEST_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) From 6582a4ef6fcbf915059fa4350b71ae3bc0be49b5 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Wed, 22 May 2024 18:27:37 +0800 Subject: [PATCH 079/199] load low-precision checkpoints in safetensors format (#2917) * load low-precision checkpoints in safetensors format * Fix lint issue --- examples/cpu/inference/python/llm/run.py | 9 +++++++++ .../llm/single_instance/run_quantization.py | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 16c254520..16448e43f 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -470,6 +470,15 @@ def main(args_in: Optional[List[str]] = None) -> None: ) if args.gptq_legacy_format: quant_cmd.extend(["--gptq-legacy-format"]) + elif args.low_precision_checkpoint != "": + quant_cmd.extend( + [ + "--low-precision-checkpoint", + str(args.low_precision_checkpoint), + ] + ) + if args.gptq_legacy_format: + quant_cmd.extend(["--gptq-legacy-format"]) else: # No need to set group size if args.gptq is true # Group size is read from the checkpoint diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index e5024a201..07446a654 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -867,7 +867,21 @@ def calib_func(prepared_model): group_size=args.group_size, ) if args.low_precision_checkpoint != "": - low_precision_checkpoint = torch.load(args.low_precision_checkpoint) + if args.low_precision_checkpoint.endswith( + ".pt" + ) or args.low_precision_checkpoint.endswith(".pth"): + low_precision_checkpoint = torch.load(args.low_precision_checkpoint) + elif args.low_precision_checkpoint.endswith(".safetensors"): + try: + import safetensors + except ImportError: + print( + "Please install safetensors package to load safetensors checkpoint." 
+ ) + exit(1) + low_precision_checkpoint = safetensors.torch.load_file( + args.low_precision_checkpoint + ) if args.gptq_legacy_format: config_dict = ( ipex.utils.weight_only_quantization._legacy_lowp_checkpoint_config() From f43161d98a49ce056e55f77d73d23b9a89892a53 Mon Sep 17 00:00:00 2001 From: Cao E Date: Thu, 23 May 2024 08:49:32 +0800 Subject: [PATCH 080/199] enable ConcatLinear for fp16 LLM (#2909) --- .../transformers/models/cpu/fusions/linear_fusion.py | 4 ++-- tests/cpu/test_ipex_optimize_transformers.py | 8 +++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py index f7fe68b10..ed036dac0 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py @@ -358,9 +358,9 @@ def __init__(self, module, tpp=False, woq=False): mod, concat_scales, concat_zeros ) elif ( - self.tpp - and hasattr(module, "concat_linear") + hasattr(module, "concat_linear") and module.concat_linear is not None + and (self.tpp or module.concat_linear.weight.dtype == torch.half) ): self.concat_linear = module.concat_linear else: diff --git a/tests/cpu/test_ipex_optimize_transformers.py b/tests/cpu/test_ipex_optimize_transformers.py index b920f9c55..c11b7e93f 100644 --- a/tests/cpu/test_ipex_optimize_transformers.py +++ b/tests/cpu/test_ipex_optimize_transformers.py @@ -1,6 +1,7 @@ import unittest import torch import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch._C as core import sys import subprocess import os @@ -145,7 +146,8 @@ def model_replacement_check( with torch.no_grad(): key_hf = ref_m(**input_dict) with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.float16] else False, + dtype=dtype, ): key_ipex = ipex_m(**input_dict) error_message = f"model={m.name}, deployment_mode={deployment_mode}, torchcompile={torchcompile}, return_dict={return_dict}" @@ -160,6 +162,8 @@ def model_replacement_check( def test_model_replacement(self): dtypes = [torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) enable_torchcompile = [False, True] deployment_mode = [True, False] return_dict = [False, True] @@ -168,6 +172,8 @@ def test_model_replacement(self): ): if torchcompile and deployment_mode: continue + if dtype == torch.float16: + _disable_tpp() self.model_replacement_check(m, dtype, jit, torchcompile, return_dict) _disable_tpp() From c6a1de1119e74d807cd9fbb41936b09f1d7e4eb8 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Thu, 23 May 2024 11:23:17 +0800 Subject: [PATCH 081/199] Update dependency_version.yml 20240523 (#2919) --- dependency_version.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 68d27d4e9..e518c3829 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240521+cpu + version: 2.4.0.dev20240522+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240521+cpu + version: 2.2.0.dev20240522+cpu torchvision: - version: 0.19.0.dev20240521+cpu + version: 0.19.0.dev20240522+cpu transformers: version: 4.38.1 From 
91e920692fe81994dd41b5823f06530e65a63dc2 Mon Sep 17 00:00:00 2001 From: zhuhaozhe Date: Thu, 23 May 2024 14:13:41 +0800 Subject: [PATCH 082/199] check diffusers version in test_stable_diffuser (#2918) --- tests/cpu/test_fx_optimization.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/cpu/test_fx_optimization.py b/tests/cpu/test_fx_optimization.py index 29ec7cb33..bea11e588 100644 --- a/tests/cpu/test_fx_optimization.py +++ b/tests/cpu/test_fx_optimization.py @@ -25,12 +25,15 @@ try: import diffusers - HAS_DIFFUSERS = True + if diffusers.__version__ > "0.25.0": + HAS_DIFFUSERS = False + else: + HAS_DIFFUSERS = True except ImportError: HAS_DIFFUSERS = False except RuntimeError: HAS_DIFFUSERS = False -skipIfNoDIFFUSERS = unittest.skipIf(not HAS_DIFFUSERS, "no diffusers") +skipIfNoDIFFUSERS = unittest.skipIf(not HAS_DIFFUSERS, "no expected diffusers version") class MultipleLinear(torch.nn.Module): From b29f8aa25221e12251d3ead2ce11f56217674141 Mon Sep 17 00:00:00 2001 From: blzheng Date: Thu, 23 May 2024 16:42:38 +0800 Subject: [PATCH 083/199] enable optimized whisper (#2923) --- .../run_accuracy_with_deepspeed.py | 408 +++++++++++++++++- .../run_generation_with_deepspeed.py | 29 +- examples/cpu/inference/python/llm/run.py | 16 +- .../llm/single_instance/run_accuracy.py | 296 ++++++++++++- .../llm/single_instance/run_generation.py | 29 +- .../llm/single_instance/run_quantization.py | 132 +++++- .../python/llm/utils/create_shard_model.py | 2 + .../python/llm/utils/model_class/llm.py | 1 + .../python/llm/utils/model_class/whisper.py | 49 +++ .../transformers/generation/beam_sample.py | 47 ++ .../transformers/generation/beam_search.py | 47 ++ .../transformers/generation/greedy_search.py | 48 +++ .../transformers/generation/sample.py | 48 +++ .../models/cpu/modules/attentions.py | 1 + .../models/cpu/modules/decoder.py | 18 + .../transformers/models/reference/models.py | 206 ++++++++- .../models/reference/modules/attentions.py | 106 +++++ .../models/reference/modules/decoder.py | 152 +++++++ .../transformers/optimize.py | 96 ++++- tests/cpu/hf_configs/whisper/config.json | 144 +++++++ ...test_ipex_optimize_transformers_nightly.py | 13 + 21 files changed, 1856 insertions(+), 32 deletions(-) create mode 100644 examples/cpu/inference/python/llm/utils/model_class/whisper.py create mode 100644 tests/cpu/hf_configs/whisper/config.json diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index b07ce9933..f608de80a 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -19,14 +19,24 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, ) - +from datasets import load_dataset +from torch.utils.data import DataLoader import sys sys.path.append(sys.path[0] + "/../../") try: + import lmms_eval + from lmms_eval.api.instance import Instance + from lmms_eval.api.model import lmms + from lmms_eval.api.registry import register_model + from lmms_eval import evaluator as lmms_evaluator + from lmms_eval import utils as lmms_utils + from lmms_eval.api.registry import ALL_TASKS + from lmms_eval.tasks import initialize_tasks from llava.model.language_model.llava_llama import ( # noqa F401 LlavaLlamaForCausalLM, ) @@ -43,14 +53,6 @@ DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, ) - import 
lmms_eval - from lmms_eval.api.instance import Instance - from lmms_eval.api.model import lmms - from lmms_eval.api.registry import register_model - from lmms_eval import evaluator as lmms_evaluator - from lmms_eval import utils as lmms_utils - from lmms_eval.api.registry import ALL_TASKS - from lmms_eval.tasks import initialize_tasks except ImportError: def register_model(name): @@ -85,6 +87,7 @@ def decorator(func): "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -1645,8 +1648,377 @@ def _collate(x): return res +class LibriSpeech: + _DEFAULT_MAX_LENGTH = 2048 + + def __init__( + self, + pretrained: str, + device="cpu", + with_ipex=True, + with_jit=True, + with_greedy=False, + batch_size=1, + max_length=None, + dtype: Optional[Union[str, torch.dtype]] = "auto", + tp_number=1, + config=None, + add_special_tokens=True, + ): + model_id = pretrained + self._device = device + self._batch_size = batch_size + self._with_jit = with_jit + self._with_ipex = with_ipex + self._with_greedy = with_greedy + self._max_length = max_length + self._dtype = dtype + self._tp_number = tp_number + self.add_special_tokens = add_special_tokens + + load_dtype = torch.float32 + infer_dtype = torch.float32 + if args.quant_with_amp or dtype == "bfloat16": + load_dtype = torch.bfloat16 + infer_dtype = torch.bfloat16 + else: + if dtype == "float16": + load_dtype = torch.half + infer_dtype = torch.half + elif dtype == "int8": + load_dtype = torch.float32 + infer_dtype = torch.int8 + self.amp_dtype = ( + torch.bfloat16 + if args.quant_with_amp or self._dtype == "bfloat16" + else torch.float32 + ) + model_type = next( + (x for x in MODEL_CLASSES.keys() if x in model_id.lower()), "auto" + ) + model_class = MODEL_CLASSES[model_type] + + self.tokenizer = model_class[1].from_pretrained( + model_id, trust_remote_code=True + ) + self.config = AutoConfig.from_pretrained( + model_id if config is None else config, + torchscript=with_jit, + trust_remote_code=True, + ) + + # For now, Falcon, baichuan and gptbigcode have accuracy issue with from_config with deepspeed meta device load. 
+ # TODO: we will change the scope once deepspeed providing the support + if world_size == 1 or model_type in [ + "whisper", + ]: + self.model = model_class[0].from_pretrained( + model_id, + config=self.config, + low_cpu_mem_usage=True, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + else: + with deepspeed.OnDevice(dtype=load_dtype, device="meta"): + if model_class[0] == AutoModelForCausalLM: + self.model = ( + model_class[0] + .from_config(self.config, trust_remote_code=True) + .to(load_dtype) + ) + else: + self.model = model_class[0].from_pretrained( + model_id, + low_cpu_mem_usage=True, + config=self.config, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + + self.model = self.model.eval() + + checkpoints_json = "checkpoints.json" + + def print_rank0(*msg): + if local_rank != 0: + return + print(*msg) + + def get_repo_root(model_name_or_path): + if os.path.exists(model_name_or_path): + # local path + # use absolute path here to avoid path error in deepspeed + model_name_or_path = os.path.abspath(model_name_or_path) + return model_name_or_path + # checks if online or not + if is_offline_mode(): + print_rank0("Offline mode: forcing local_files_only=True") + # download only on first process + allow_patterns = ["*.bin", "*.model", "*.json", "*.txt", "*.py", "*LICENSE"] + if local_rank == 0: + snapshot_download( + model_name_or_path, + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + allow_patterns=allow_patterns, + # ignore_patterns=["*.safetensors"], + ) + + dist.barrier() + + return snapshot_download( + model_name_or_path, + local_files_only=is_offline_mode(), + cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + allow_patterns=allow_patterns, + # ignore_patterns=["*.safetensors"], + ) + + def get_checkpoint_files(model_name_or_path): + cached_repo_dir = get_repo_root(model_name_or_path) + + # extensions: .bin | .pt + # creates a list of paths from all downloaded files in cache dir + file_list = [ + str(entry) + for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") + if entry.is_file() + ] + return file_list + + def write_checkpoints_json(): + checkpoint_files = get_checkpoint_files(model_id) + if local_rank == 0: + # model.config.model_type.upper() + data = { + "type": "BLOOM", + "checkpoints": checkpoint_files, + "version": 1.0, + } + json.dump(data, open(checkpoints_json, "w")) + + repo_root = get_repo_root(model_id) + write_checkpoints_json() + dist.barrier() + self.model = deepspeed.init_inference( + self.model, + mp_size=tp_number, + base_dir=repo_root, + dtype=infer_dtype, + checkpoint=checkpoints_json, + ) + + self.model = self.model.module + + if self._with_ipex: + ipex_woq_enabled = args.ipex_weight_only_quantization + if ipex_woq_enabled: + from intel_extension_for_pytorch.quantization import WoqWeightDtype + + if args.weight_dtype == "INT8": + weight_dtype = WoqWeightDtype.INT8 + elif args.weight_dtype == "INT4": + weight_dtype = WoqWeightDtype.INT4 + else: + assert args.weight_dtype == "NF4" + weight_dtype = WoqWeightDtype.NF4 + + if args.lowp_mode == "INT8": + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + elif args.lowp_mode == "FP32": + lowp_mode = ipex.quantization.WoqLowpMode.NONE + elif args.lowp_mode == "FP16": + lowp_mode = ipex.quantization.WoqLowpMode.FP16 + elif args.lowp_mode == "BF16": + lowp_mode = ipex.quantization.WoqLowpMode.BF16 + else: # AUTO + if weight_dtype == WoqWeightDtype.INT4: + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + else: + lowp_mode = ipex.quantization.WoqLowpMode.BF16 + + 
act_quant_mode_dict = { + "PER_TENSOR": ipex.quantization.WoqActQuantMode.PER_TENSOR, + "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, + "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, + "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + } + qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + act_quant_mode=act_quant_mode_dict[args.act_quant_mode], + group_size=args.group_size, + ) + self.model = ipex.llm.optimize( + self.model.eval(), + dtype=infer_dtype, + quantization_config=qconfig if ipex_woq_enabled else None, + inplace=True, + deployment_mode=False, + ) + + self.base_model = self.model + + self.num_beams = 1 if with_greedy else 4 + self.iter = 0 + + if self._with_jit: + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros( + [ + 1, + 32, + self.model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + self.model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ], + dtype=self.amp_dtype, + ).contiguous(), + torch.zeros( + [ + 1, + 32, + self.model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + self.model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ], + dtype=self.amp_dtype, + ).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(self.config.num_hidden_layers) + ] + ) + last_hidden_state = torch.rand([1, 32, 1280]).to(self.amp_dtype) + sample_inputs = { + "decoder_input_ids": torch.ones(4).to(torch.long).unsqueeze(0), + "past_key_values": past_key_values, + "encoder_outputs": (last_hidden_state,), + } + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + if self._dtype != "int8": + traced_model = torch.jit.trace( + self.model.eval(), + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + traced_model = torch.jit.freeze(traced_model.eval()) + else: + traced_model = torch.jit.load(args.quantized_model_path) + traced_model = torch.jit.freeze(traced_model.eval()) + + traced_model(**sample_inputs) + traced_model(**sample_inputs) + ipex._set_optimized_model_for_generation( + self.model, optimized_model=traced_model + ) + self.dataset = load_dataset("librispeech_asr", split="test.clean") + self.dataloader = DataLoader( + self.dataset, + batch_size=1, + shuffle=False, + ) + + def _levenshtein(self, a: List, b: List) -> int: + """Calculates the Levenshtein distance between a and b.""" + n, m = len(a), len(b) + if n > m: + # Make sure n <= m, to use O(min(n,m)) space + a, b = b, a + n, m = m, n + + current = list(range(n + 1)) + for i in range(1, m + 1): + previous, current = current, [i] + [0] * n + for j in range(1, n + 1): + add, delete = previous[j] + 1, current[j - 1] + 1 + change = previous[j - 1] + if a[j - 1] != b[i - 1]: + change = change + 1 + current[j] = min(add, delete, change) + + return current[n] + + def word_error_rate(self, hypotheses: List[str], references: List[str]) -> float: + """ + Computes Average Word Error rate between two texts represented as + corresponding lists of string. Hypotheses and references must have same length. 
+ + Args: + hypotheses: list of hypotheses + references: list of references + + Returns: + (float) average word error rate + """ + scores = 0 + words = 0 + if len(hypotheses) != len(references): + raise ValueError( + "In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references)) + ) + for h, r in zip(hypotheses, references): + h_list = h.split() + r_list = r.split() + words += len(r_list) + scores += self._levenshtein(h_list, r_list) + if words != 0: + wer = 1.0 * scores / words + else: + wer = float("inf") + return wer, scores, words + + def evaluate(self): + results = [] + references = [] + for batch_ndx, sample in enumerate(self.dataloader): + inputs = sample["audio"]["array"].squeeze(0) + model_inputs = self.tokenizer( + inputs, sampling_rate=16000, return_tensors="pt" + ).input_features + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + output = self.model.generate( + model_inputs, + do_sample=False, + temperature=0.9, + num_beams=self.num_beams, + ) + gen_text = self.tokenizer.batch_decode(output, skip_special_tokens=True) + if len(results) == 0: + results = gen_text + references = sample["text"] + else: + results += gen_text + references += sample["text"] + references = [r.capitalize() for r in references] + wer, scores, words = self.word_error_rate(results, references) + return wer, scores, words + + lm_tasks = [] lmms_tasks = [] +other_tasks = [] lm_all_tasks = lm_eval.tasks.ALL_TASKS try: initialize_tasks() @@ -1657,6 +2029,8 @@ def _collate(x): lm_tasks.append(task) elif task in ALL_TASKS: lmms_tasks.append(task) + elif task in ["librispeech_asr"]: + other_tasks.append(task) else: print(f"Task {task} in not supported by lm_eval and lmms_eval") exit(0) @@ -1730,3 +2104,19 @@ def _collate(x): cli_args=args, ) print(lmms_evaluator.make_table(results)) +elif len(other_tasks) != 0: + if "librispeech_asr" in other_tasks: + evaluator = LibriSpeech( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + config=args.config_file, + add_special_tokens=True, + with_greedy=False, + ) + wer, scores, num_words = evaluator.evaluate() + print("Evaluation WER: {0}".format(wer)) + print("Accuracy: {:.15f} ".format(1 - wer)) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 4ec30d387..58e428d72 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -18,6 +18,7 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, TextStreamer, ) @@ -56,6 +57,7 @@ "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -141,6 +143,12 @@ type=str, help="image url for image-to-text task", ) +parser.add_argument( + "--audio", + default="example.flac", + type=str, + help="audio file for speech-to-text task", +) parser.add_argument("--print-memory", action="store_true") 
parser.add_argument("--token-latency", action="store_true") parser.add_argument( @@ -336,6 +344,8 @@ def get_checkpoint_files(model_name_or_path): config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens) if model_type == "mpt" and args.prompt is None: config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) +if model_type == "whisper": + config.text_max_length = config.max_source_positions + config.max_target_positions if model_type == "llava": config.use_cache = True @@ -374,6 +384,7 @@ def get_checkpoint_files(model_name_or_path): "git", "qwen", "yuan", + "whisper", ]: model = model_class[0].from_pretrained( model_name, @@ -586,6 +597,13 @@ def load_image(image_file): conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() inputs = [prompt] * args.batch_size +elif model_type == "whisper": + import librosa + + sample = librosa.load(args.audio, sr=16000) + prompt = sample[0] + inputs = [prompt] * args.batch_size + generate_kwargs.pop("min_new_tokens", None) else: # input tokens input_sentences = [] @@ -644,6 +662,9 @@ def generate(): for img in image ] input_tokens = {"input_ids": input_ids, "images": image_tensor} + elif model_type == "whisper": + input_tokens = tokenizer(inputs, sampling_rate=16000, return_tensors="pt") + input_ids = input_tokens.input_features else: input_tokens = tokenizer.batch_encode_plus( inputs, return_token_type_ids=False, return_tensors="pt" @@ -654,15 +675,17 @@ def generate(): input_tokens[t] = input_tokens[t].to( get_accelerator().current_device_name() ) - - outputs = model.generate(**input_tokens, **generate_kwargs) + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if infer_dtype == torch.bfloat16 else False + ): + outputs = model.generate(**input_tokens, **generate_kwargs) gen_ids = outputs[0] if args.token_latency else outputs input_tokens_lengths = [x.shape[0] for x in input_ids] output_tokens_lengths = [x.shape[0] for x in gen_ids] total_new_tokens = [ - o - i if model.config.model_type != "t5" else o + o if model.config.model_type in ["t5", "whisper"] else o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths) ] gen_text = tokenizer.batch_decode( diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 16448e43f..e29689b30 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -233,7 +233,12 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument( "--image-url", default=None, type=str, help="image url for image-to-text task" ) - + parser.add_argument( + "--audio", + default=None, + type=str, + help="audio file for speech-to-text task", + ) # deepspeed inference related arguments. 
parser.add_argument("--autotp", action="store_true") parser.add_argument("--shard-model", action="store_true") @@ -293,6 +298,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--config-file", str(args.config_file)]) if args.image_url is not None: infer_cmd.extend(["--image-url", str(args.image_url)]) + if args.audio is not None: + infer_cmd.extend(["--audio", str(args.audio)]) print("LLM RUNTIME INFO: running model geneartion...") result = subprocess.run(infer_cmd) @@ -422,6 +429,8 @@ def main(args_in: Optional[List[str]] = None) -> None: quant_cmd.extend(["--greedy"]) if args.image_url is not None: quant_cmd.extend(["--image-url", str(args.image_url)]) + if args.audio is not None: + quant_cmd.extend(["--audio", str(args.audio)]) if args.ipex_weight_only_quantization: quant_cmd.extend(["--ipex-weight-only-quantization"]) quant_cmd.extend(["--weight-dtype", str(args.weight_dtype)]) @@ -551,6 +560,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--token-latency"]) if args.image_url is not None: infer_cmd.extend(["--image-url", str(args.image_url)]) + if args.audio is not None: + infer_cmd.extend(["--audio", str(args.audio)]) if args.prompt is not None: infer_cmd.extend(["--prompt", str(args.prompt)]) @@ -594,6 +605,7 @@ def main(args_in: Optional[List[str]] = None) -> None: "yuan": ("/yuan_local_shard"), "phi-3": ("/phi-3_local_shard"), "phi": ("/phi_local_shard"), + "whisper": ("/whisper_local_shard"), } model_type = next( ( @@ -657,6 +669,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--token-latency"]) if args.image_url is not None: infer_cmd.extend(["--image-url", str(args.image_url)]) + if args.audio is not None: + infer_cmd.extend(["--audio", str(args.audio)]) if args.prompt is not None: infer_cmd.extend(["--prompt", str(args.prompt)]) diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 5dbf4f249..2cdc8d563 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -7,6 +7,8 @@ import math import torch.nn.functional as F import re +from datasets import load_dataset +from torch.utils.data import DataLoader sys.path.append(sys.path[0] + "/../../") from transformers import ( @@ -14,6 +16,7 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, ) @@ -38,6 +41,7 @@ "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -100,6 +104,14 @@ import transformers try: + import lmms_eval + from lmms_eval.api.instance import Instance + from lmms_eval.api.model import lmms + from lmms_eval.api.registry import register_model + from lmms_eval import evaluator as lmms_evaluator + from lmms_eval import utils as lmms_utils + from lmms_eval.api.registry import ALL_TASKS + from lmms_eval.tasks import initialize_tasks from llava.model.language_model.llava_llama import ( # noqa F401 LlavaLlamaForCausalLM, ) @@ -116,14 +128,6 @@ DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, ) - import lmms_eval - from lmms_eval.api.instance import Instance - from lmms_eval.api.model import lmms - from lmms_eval.api.registry import register_model - from lmms_eval import evaluator as 
lmms_evaluator - from lmms_eval import utils as lmms_utils - from lmms_eval.api.registry import ALL_TASKS - from lmms_eval.tasks import initialize_tasks except ImportError: def register_model(name): @@ -1338,8 +1342,266 @@ def _collate(x): return res +class LibriSpeech: + def __init__( + self, + pretrained: str, + device: Optional[str] = "cpu", + with_ipex=True, + with_jit=True, + with_greedy=False, + batch_size=1, + dtype: Optional[Union[str, torch.dtype]] = "auto", + config=None, + add_special_tokens=True, + ) -> None: + model_id = pretrained + self.device = torch.device(device) + self.batch_size = int(batch_size) + self.with_jit = with_jit + self.with_ipex = with_ipex + self.with_greedy = with_greedy + self.dtype = dtype + self.add_special_tokens = add_special_tokens + load_dtype = torch.float32 + infer_dtype = torch.float32 + if dtype == "float16": + load_dtype = torch.half + infer_dtype = torch.half + elif dtype == "bfloat16": + load_dtype = torch.bfloat16 + infer_dtype = torch.bfloat16 + elif dtype in ["int8", "int4", "nf4"]: + load_dtype = torch.float32 + infer_dtype = torch.int8 + self.amp_dtype = ( + torch.bfloat16 + if args.quant_with_amp or self.dtype == "bfloat16" + else torch.float32 + ) + + model_type = next( + (x for x in MODEL_CLASSES.keys() if x in model_id.lower()), "auto" + ) + model_class = MODEL_CLASSES[model_type] + self.tokenizer = model_class[1].from_pretrained( + model_id, trust_remote_code=True + ) + self.config = AutoConfig.from_pretrained( + model_id if config is None else config, + torchscript=with_jit, + trust_remote_code=True, + ) + self.config.torchscript = self.with_jit + if self.dtype in ("int8", "int4", "nf4"): + try: + with ipex.OnDevice(dtype=torch.float, device="meta"): + self.model = model_class[0].from_config( + self.config, trust_remote_code=True + ) + except (RuntimeError, AttributeError) as e: + print("Warning: Loading model to meta device failed:", e) + self.model = model_class[0].from_pretrained( + model_id, + low_cpu_mem_usage=True, + config=self.config, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + else: + self.model = model_class[0].from_pretrained( + model_id, + low_cpu_mem_usage=True, + config=self.config, + torch_dtype=load_dtype, + trust_remote_code=True, + ) + + self.model = self.model.eval() + if with_ipex and dtype not in ["int8", "int4", "nf4"]: + self.model = ipex.llm.optimize( + self.model.eval(), + dtype=infer_dtype, + inplace=True, + deployment_mode=False, + ) + + if args.torch_compile: + if dtype in ["int8", "int4", "nf4"]: + raise SystemExit( + "[ERROR] Currently this script does not support torch.compile with int8/int4/nf4 datatype," + " please set dtype to float32 or bfloat16 if want to use torch.compile." + ) + if with_jit: + raise SystemExit( + "[ERROR] JIT cannot co-work with torch.compile, please set jit to False if want to use" + " torch.compile." 
+ ) + self.model.forward = torch.compile( + self.model.forward, dynamic=True, backend=args.backend + ) + + self.base_model = self.model + + self.iter = 0 + self.num_beams = 1 if with_greedy else 4 + self.tp_number = 1 + if self.with_jit: + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros( + [ + 1, + 32, + self.model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + self.model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ], + dtype=self.amp_dtype, + ).contiguous(), + torch.zeros( + [ + 1, + 32, + self.model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + self.model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ], + dtype=self.amp_dtype, + ).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(self.config.num_hidden_layers) + ] + ) + last_hidden_state = torch.rand([1, 32, 1280]).to(self.amp_dtype) + sample_inputs = { + "decoder_input_ids": torch.ones(4).to(torch.long).unsqueeze(0), + "past_key_values": past_key_values, + "encoder_outputs": (last_hidden_state,), + } + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + if self.dtype != "int8": + traced_model = torch.jit.trace( + self.model.eval(), + example_kwarg_inputs=sample_inputs, + strict=False, + check_trace=False, + ) + traced_model = torch.jit.freeze(traced_model.eval()) + else: + traced_model = torch.jit.load(args.quantized_model_path) + traced_model = torch.jit.freeze(traced_model.eval()) + + traced_model(**sample_inputs) + traced_model(**sample_inputs) + ipex._set_optimized_model_for_generation( + self.model, optimized_model=traced_model + ) + self.dataset = load_dataset("librispeech_asr", split="test.clean") + self.dataloader = DataLoader( + self.dataset, + batch_size=1, + shuffle=False, + ) + + def _levenshtein(self, a: List, b: List) -> int: + """Calculates the Levenshtein distance between a and b.""" + n, m = len(a), len(b) + if n > m: + # Make sure n <= m, to use O(min(n,m)) space + a, b = b, a + n, m = m, n + + current = list(range(n + 1)) + for i in range(1, m + 1): + previous, current = current, [i] + [0] * n + for j in range(1, n + 1): + add, delete = previous[j] + 1, current[j - 1] + 1 + change = previous[j - 1] + if a[j - 1] != b[i - 1]: + change = change + 1 + current[j] = min(add, delete, change) + + return current[n] + + def word_error_rate(self, hypotheses: List[str], references: List[str]) -> float: + """ + Computes Average Word Error rate between two texts represented as + corresponding lists of string. Hypotheses and references must have same length. + + Args: + hypotheses: list of hypotheses + references: list of references + + Returns: + (float) average word error rate + """ + scores = 0 + words = 0 + if len(hypotheses) != len(references): + raise ValueError( + "In word error rate calculation, hypotheses and reference" + " lists must have the same number of elements. 
But I got:" + "{0} and {1} correspondingly".format(len(hypotheses), len(references)) + ) + for h, r in zip(hypotheses, references): + h_list = h.split() + r_list = r.split() + words += len(r_list) + scores += self._levenshtein(h_list, r_list) + if words != 0: + wer = 1.0 * scores / words + else: + wer = float("inf") + return wer, scores, words + + def evaluate(self): + results = [] + references = [] + for batch_ndx, sample in enumerate(self.dataloader): + inputs = sample["audio"]["array"].squeeze(0) + model_inputs = self.tokenizer( + inputs, sampling_rate=16000, return_tensors="pt" + ).input_features + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if self.amp_dtype == torch.bfloat16 else False, + ): + output = self.model.generate( + model_inputs, + do_sample=False, + temperature=0.9, + num_beams=self.num_beams, + ) + gen_text = self.tokenizer.batch_decode(output, skip_special_tokens=True) + if len(results) == 0: + results = gen_text + references = sample["text"] + else: + results += gen_text + references += sample["text"] + references = [r.capitalize() for r in references] + wer, scores, words = self.word_error_rate(results, references) + return wer, scores, words + + lm_tasks = [] lmms_tasks = [] +other_tasks = [] lm_all_tasks = lm_eval.tasks.ALL_TASKS try: initialize_tasks() @@ -1350,6 +1612,8 @@ def _collate(x): lm_tasks.append(task) elif task in ALL_TASKS: lmms_tasks.append(task) + elif task in ["librispeech_asr"]: + other_tasks.append(task) else: print(f"Task {task} in not supported by lm_eval and lmms_eval") exit(0) @@ -1420,3 +1684,19 @@ def _collate(x): cli_args=args, ) print(lmms_evaluator.make_table(results)) +elif len(other_tasks) != 0: + if "librispeech_asr" in other_tasks: + evaluator = LibriSpeech( + pretrained=args.model, + device="cpu", + batch_size=args.batch_size, + with_ipex=args.ipex, + with_jit=not args.disable_jit, + dtype=args.dtype, + config=args.config_file, + add_special_tokens=True, + with_greedy=False, + ) + wer, scores, num_words = evaluator.evaluate() + print("Evaluation WER: {0}".format(wer)) + print("Accuracy: {:.15f} ".format(1 - wer)) diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index 1c4c04a0f..a747adabe 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -10,6 +10,7 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, ) @@ -46,6 +47,7 @@ "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } @@ -104,6 +106,12 @@ type=str, help="image url for image-to-text task", ) +parser.add_argument( + "--audio", + default="example.flac", + type=str, + help="audio file for speech-to-text task", +) parser.add_argument( "--config-file", default=None, type=str, help="specific configuration file" ) @@ -170,6 +178,8 @@ config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens) if model_type == "mpt" and args.prompt is None: config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) +if model_type == "whisper": + config.text_max_length = config.max_source_positions + config.max_target_positions if not hasattr(config, "lm_head_generation"): 
config.lm_head_generation = True @@ -246,6 +256,10 @@ def load_image(image_file): roles = conv.roles if re.search("yuan", model.config.architectures[0], re.IGNORECASE): model.config.batch_size = int(args.batch_size) * num_beams +if re.search("whisper", model.config.architectures[0], re.IGNORECASE): + import librosa + + sample = librosa.load(args.audio, sr=16000) def trace_handler(prof): @@ -297,6 +311,9 @@ def trace_handler(prof): conv.append_message(conv.roles[0], prompt) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() + elif model_type == "whisper": + prompt = sample[0] + generate_kwargs.pop("min_new_tokens", None) else: # input prompt current_path = pathlib.Path(__file__).parent.resolve() @@ -351,6 +368,11 @@ def trace_handler(prof): elif model_type == "git": input_ids = tokenizer(images=prompt, return_tensors="pt").pixel_values output = model.generate(pixel_values=input_ids, **generate_kwargs) + elif model_type == "whisper": + input_ids = tokenizer( + prompt, sampling_rate=16000, return_tensors="pt" + ).input_features + output = model.generate(input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) @@ -364,7 +386,7 @@ def trace_handler(prof): input_tokens_lengths = [x.shape[0] for x in input_ids] output_tokens_lengths = [x.shape[0] for x in gen_ids] total_new_tokens = [ - o - i if model.config.model_type != "t5" else o + o if model.config.model_type in ["t5", "whisper"] else o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths) ] print(gen_text, total_new_tokens, flush=True) @@ -409,6 +431,11 @@ def trace_handler(prof): output = model.generate( pixel_values=input_ids, **generate_kwargs ) + elif model_type == "whisper": + input_ids = tokenizer( + prompt, sampling_rate=16000, return_tensors="pt" + ).input_features + output = model.generate(input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = model.generate(input_ids, **generate_kwargs) diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 07446a654..c31160757 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -38,6 +38,7 @@ from llm.utils.model_class.phi import PhiConfig from llm.utils.model_class.phi import Phi3Config from llm.utils.model_class.yuan import YuanConfig +from llm.utils.model_class.whisper import WhisperConfig parser = argparse.ArgumentParser("LLM generation script (int8 path)", add_help=False) parser.add_argument( @@ -72,6 +73,12 @@ type=str, help="image url for image-to-text task", ) +parser.add_argument( + "--audio", + default="example.flac", + type=str, + help="audio file for speech-to-text task", +) parser.add_argument( "--qconfig-summary-file", default="", help="qconfig for static quantization" ) @@ -349,6 +356,10 @@ def load_image(image_file): model = PhiConfig(args.model_id) elif re.search("yuan", config.architectures[0], re.IGNORECASE): model = YuanConfig(args.model_id) +elif re.search("whisper", config.architectures[0], re.IGNORECASE): + import librosa + + model = WhisperConfig(args.model_id) else: raise AssertionError("Not support %s." 
% (args.model_id)) @@ -359,6 +370,8 @@ def load_image(image_file): config.max_seq_len = int(args.input_tokens) + int(args.max_new_tokens) if model.name in ["git", "llava"]: config.batch_size = int(args.batch_size) * num_beams +if model.name == "whisper": + config.text_max_length = config.max_source_positions + config.max_target_positions user_model = model.get_user_model(config, args.benchmark) @@ -529,6 +542,32 @@ def get_example_inputs(model): torch.ones((batch_size, 1), dtype=torch.long), tuple(past_key_value), ) + elif model.example_inputs_mode == EXAMPLE_INPUTS_MODE.KV_ENC: + past_key_value = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros( + [1, 32, n_heads, head_dim], dtype=amp_dtype + ).contiguous(), + torch.zeros( + [1, 32, n_heads, head_dim], dtype=amp_dtype + ).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(n_layers) + ] + ) + last_hidden_state = torch.rand([1, 32, 1280]).to(amp_dtype) + example_inputs = ( + torch.ones(4).to(torch.long).unsqueeze(0), + past_key_value, + (last_hidden_state,), + ) else: raise RuntimeError( "Your model does not match existing example inputs used in ipex quantization, exiting..." @@ -573,6 +612,12 @@ def __init__( def tokenize_function(self, examples): if "prompt" in examples: example = self.tokenizer(examples["prompt"]) + elif "audio" in examples: + inputs = [d["array"] for d in examples["audio"]] + example = self.tokenizer( + inputs, sampling_rate=16000, return_tensors="pt" + ) + example["input_ids"] = example["input_features"] elif "text" in examples: example = self.tokenizer(examples["text"]) elif "code" in examples: @@ -689,6 +734,66 @@ def collate_batch(self, batch): tuple(global_past_key_value), (last_hidden_state,), ) + elif model.example_inputs_mode == EXAMPLE_INPUTS_MODE.KV_ENC: + input_bs = int(args.batch_size * num_beams) + model_kwargs = {} + model_kwargs = ( + user_model._prepare_encoder_decoder_kwargs_for_generation( + torch.vstack(input_ids_padded).unsqueeze(0), + model_kwargs, + "input_features", + ) + ) + last_hidden_state = model_kwargs["encoder_outputs"][ + "last_hidden_state" + ] + global_past_key_value = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + user_model.model.decoder.layers[i] + .encoder_attn.k_proj(last_hidden_state) + .view( + int(input_bs), + -1, + user_model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + user_model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ) + .contiguous(), + user_model.model.decoder.layers[i] + .encoder_attn.v_proj(last_hidden_state) + .view( + int(input_bs), + -1, + user_model.model.decoder.layers[ + i + ].encoder_attn.num_heads, + user_model.model.decoder.layers[ + i + ].encoder_attn.head_dim, + ) + .contiguous(), + beam_idx_tmp, + ) + for i in range(n_layers) + ] + ) + decoder_input_ids = ( + torch.zeros(input_bs).to(torch.long).unsqueeze(1) + ) + model_inputs = ( + decoder_input_ids, + tuple(global_past_key_value), + (last_hidden_state,), + ) else: raise RuntimeError( "Your model does not match existing example inputs used in ipex smooth quant, exiting..." 
@@ -699,9 +804,12 @@ def collate_batch(self, batch): return (model_inputs, last_ind) - calib_dataset = load_dataset( - args.dataset if args.dataset else model.default_dataset, split="train" - ) + if model.default_dataset == "librispeech_asr": + calib_dataset = load_dataset(model.default_dataset, split="train.clean.100") + else: + calib_dataset = load_dataset( + args.dataset if args.dataset else model.default_dataset, split="train" + ) if args.calib_shuffle: calib_dataset = calib_dataset.shuffle(seed=42) user_model.eval() @@ -975,6 +1083,10 @@ def calib_func(prepared_model): conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() image_processor = model.get_image_processor() + elif model.name == "whisper": + sample = librosa.load(args.audio, sr=16000) + prompt = sample[0] + generate_kwargs.pop("min_new_tokens", None) else: # input prompt current_path = pathlib.Path(__file__).parent.resolve() @@ -1028,6 +1140,11 @@ def calib_func(prepared_model): elif model.name == "git": input_ids = tokenizer(images=prompt, return_tensors="pt").pixel_values output = user_model.generate(pixel_values=input_ids, **generate_kwargs) + elif model.name == "whisper": + input_ids = tokenizer( + prompt, sampling_rate=16000, return_tensors="pt" + ).input_features + output = user_model.generate(input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = user_model.generate(input_ids, **generate_kwargs) @@ -1040,7 +1157,7 @@ def calib_func(prepared_model): input_tokens_lengths = [x.shape[0] for x in input_ids] output_tokens_lengths = [x.shape[0] for x in gen_ids] total_new_tokens = [ - o - i if user_model.config.model_type != "t5" else o + o if user_model.config.model_type in ["t5", "whisper"] else o - i for i, o in zip(input_tokens_lengths, output_tokens_lengths) ] print(gen_text, total_new_tokens, flush=True) @@ -1061,7 +1178,7 @@ def trace_handler(prof): activities=[torch.profiler.ProfilerActivity.CPU], schedule=torch.profiler.schedule(wait=1, warmup=3, active=1), on_trace_ready=trace_handler, - ) as prof: + ) as prof, torch.no_grad(), torch.cpu.amp.autocast(enabled=amp_enabled): for i in range(5): if model.name == "llava": input_ids = torch.stack( @@ -1088,6 +1205,11 @@ def trace_handler(prof): output = user_model.generate( pixel_values=input_ids, **generate_kwargs ) + elif model.name == "whisper": + input_ids = tokenizer( + prompt, sampling_rate=16000, return_tensors="pt" + ).input_features + output = user_model.generate(input_ids, **generate_kwargs) else: input_ids = tokenizer(prompt, return_tensors="pt").input_ids output = user_model.generate(input_ids, **generate_kwargs) diff --git a/examples/cpu/inference/python/llm/utils/create_shard_model.py b/examples/cpu/inference/python/llm/utils/create_shard_model.py index fcd10d89f..ee2a4b386 100644 --- a/examples/cpu/inference/python/llm/utils/create_shard_model.py +++ b/examples/cpu/inference/python/llm/utils/create_shard_model.py @@ -5,6 +5,7 @@ AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration, + WhisperForConditionalGeneration, AutoProcessor, ) @@ -33,6 +34,7 @@ "yuan": (AutoModelForCausalLM, AutoTokenizer), "phi-3": (AutoModelForCausalLM, AutoTokenizer), "phi": (AutoModelForCausalLM, AutoTokenizer), + "whisper": (WhisperForConditionalGeneration, AutoProcessor), "auto": (AutoModelForCausalLM, AutoTokenizer), } diff --git a/examples/cpu/inference/python/llm/utils/model_class/llm.py b/examples/cpu/inference/python/llm/utils/model_class/llm.py index 6a44ec8eb..4ef26b7fa 100644 --- 
a/examples/cpu/inference/python/llm/utils/model_class/llm.py +++ b/examples/cpu/inference/python/llm/utils/model_class/llm.py @@ -15,6 +15,7 @@ class EXAMPLE_INPUTS_MODE(IntEnum): MASK_KV_ENC = 5 MASK_KV_PIXEL = 6 EMBEDS_MASK_KV = 7 + KV_ENC = 8 class LLMConfig(ABC): diff --git a/examples/cpu/inference/python/llm/utils/model_class/whisper.py b/examples/cpu/inference/python/llm/utils/model_class/whisper.py new file mode 100644 index 000000000..da03c7ffe --- /dev/null +++ b/examples/cpu/inference/python/llm/utils/model_class/whisper.py @@ -0,0 +1,49 @@ +import torch +from .llm import LLMConfig, EXAMPLE_INPUTS_MODE +from transformers import WhisperForConditionalGeneration, AutoProcessor +import intel_extension_for_pytorch as ipex + + +class WhisperConfig(LLMConfig): + def __init__(self, model_id): + self.name = "whisper" + self.model_id = model_id + self.to_channels_last = True + self.example_inputs_mode = EXAMPLE_INPUTS_MODE.KV_ENC + + # for smooth quant + self.default_dataset = "librispeech_asr" + self.use_global_past_key_value = False + self.use_ipex_autotune = True + + def get_user_model(self, config, benchmark): + if benchmark: + try: + with ipex.OnDevice(dtype=torch.float, device="meta"): + self.model = WhisperForConditionalGeneration.from_pretrained( + self.model_id, + torch_dtype=torch.float, + config=config, + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + except (RuntimeError, AttributeError): + self.model = WhisperForConditionalGeneration.from_pretrained( + self.model_id, + torch_dtype=torch.float, + config=config, + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + else: + self.model = WhisperForConditionalGeneration.from_pretrained( + self.model_id, + torch_dtype=torch.float, + config=config, + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + return self.model + + def get_tokenizer(self): + return AutoProcessor.from_pretrained(self.model_id) diff --git a/intel_extension_for_pytorch/transformers/generation/beam_sample.py b/intel_extension_for_pytorch/transformers/generation/beam_sample.py index a97fe98fc..713294dd9 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_sample.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_sample.py @@ -191,6 +191,7 @@ def _beam_sample( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ]: first_token = False if model_inputs["past_key_values"] is None: @@ -271,6 +272,46 @@ def _beam_sample( for i in range(self.config.num_hidden_layers) ] ) + elif self.model_backbone == "WhisperForConditionalGeneration": + first_token = False + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.model.decoder.layers[i] + .encoder_attn.k_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(batch_size * num_beams), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + self.model.decoder.layers[i] + .encoder_attn.v_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(batch_size * num_beams), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + 
.contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) if first_token: if hasattr(self.config, "n_layer"): num_hidden_layers = self.config.n_layer @@ -308,6 +349,12 @@ def _beam_sample( model_inputs["encoder_outputs"] = ( model_inputs["encoder_outputs"]["last_hidden_state"], ) + if self.model_backbone == "WhisperForConditionalGeneration": + model_inputs["encoder_outputs"] = ( + model_inputs["encoder_outputs"]["last_hidden_state"], + ) + model_inputs.pop("decoder_position_ids", None) + model_inputs.pop("decoder_attention_mask", None) if self.model_backbone == "LlavaLlamaForCausalLM" and hasattr( self, "prepare_inputs_labels_for_multimodal" ): diff --git a/intel_extension_for_pytorch/transformers/generation/beam_search.py b/intel_extension_for_pytorch/transformers/generation/beam_search.py index 9ce5b276e..e36e329a0 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_search.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_search.py @@ -193,6 +193,7 @@ def _beam_search( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ]: first_token = False has_position_id = model_inputs.get("position_ids", None) is not None @@ -274,6 +275,46 @@ def _beam_search( for i in range(self.config.num_hidden_layers) ] ) + elif self.model_backbone == "WhisperForConditionalGeneration": + first_token = False + beam_idx_tmp = torch.zeros( + (2048, int(batch_size * num_beams)), dtype=torch.long + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.model.decoder.layers[i] + .encoder_attn.k_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(batch_size * num_beams), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + self.model.decoder.layers[i] + .encoder_attn.v_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(batch_size * num_beams), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) if first_token and self.model_backbone != "YuanForCausalLM": if hasattr(self.config, "n_layer"): num_hidden_layers = self.config.n_layer @@ -328,6 +369,12 @@ def _beam_search( model_inputs["encoder_outputs"] = ( model_inputs["encoder_outputs"]["last_hidden_state"], ) + if self.model_backbone == "WhisperForConditionalGeneration": + model_inputs["encoder_outputs"] = ( + model_inputs["encoder_outputs"]["last_hidden_state"], + ) + model_inputs.pop("decoder_position_ids", None) + model_inputs.pop("decoder_attention_mask", None) if self.model_backbone == "LlavaLlamaForCausalLM" and hasattr( self, "prepare_inputs_labels_for_multimodal" ): diff --git a/intel_extension_for_pytorch/transformers/generation/greedy_search.py b/intel_extension_for_pytorch/transformers/generation/greedy_search.py index 85d8f2ab6..b874668bb 100644 --- a/intel_extension_for_pytorch/transformers/generation/greedy_search.py +++ b/intel_extension_for_pytorch/transformers/generation/greedy_search.py @@ -174,6 +174,7 @@ def _greedy_search( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ]: 
first_token = False input_bs = input_ids.size()[0] @@ -229,6 +230,47 @@ def _greedy_search( for i in range(self.config.num_hidden_layers) ] ) + if self.model_backbone == "WhisperForConditionalGeneration": + first_token = False + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.model.decoder.layers[i] + .encoder_attn.k_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(input_bs), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + self.model.decoder.layers[i] + .encoder_attn.v_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(input_bs), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + if first_token: if hasattr(self.config, "n_layer"): num_hidden_layers = self.config.n_layer @@ -279,6 +321,12 @@ def _greedy_search( self, "prepare_inputs_labels_for_multimodal" ): model_inputs = self.prepare_inputs_labels_for_multimodal(**model_inputs) + elif self.model_backbone == "WhisperForConditionalGeneration": + model_inputs["encoder_outputs"] = ( + model_inputs["encoder_outputs"]["last_hidden_state"], + ) + model_inputs.pop("decoder_position_ids", None) + model_inputs.pop("decoder_attention_mask", None) if first_token and self.model_backbone == "YuanForCausalLM": model_inputs.pop("past_key_values", None) if hasattr(self, "trace_graph"): diff --git a/intel_extension_for_pytorch/transformers/generation/sample.py b/intel_extension_for_pytorch/transformers/generation/sample.py index f446b412c..22f6bc5d0 100644 --- a/intel_extension_for_pytorch/transformers/generation/sample.py +++ b/intel_extension_for_pytorch/transformers/generation/sample.py @@ -180,6 +180,7 @@ def _sample( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ]: first_token = False input_bs = input_ids.size()[0] @@ -235,6 +236,47 @@ def _sample( for i in range(self.config.num_hidden_layers) ] ) + if self.model_backbone == "WhisperForConditionalGeneration": + first_token = False + beam_idx_tmp = torch.zeros( + (2048, int(input_bs)), dtype=torch.long + ).contiguous() + model_inputs["past_key_values"] = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + beam_idx_tmp, + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + self.model.decoder.layers[i] + .encoder_attn.k_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(input_bs), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + self.model.decoder.layers[i] + .encoder_attn.v_proj( + model_inputs["encoder_outputs"]["last_hidden_state"] + ) + .view( + int(input_bs), + -1, + self.model.decoder.layers[i].encoder_attn.num_heads, + self.model.decoder.layers[i].encoder_attn.head_dim, + ) + .contiguous(), + beam_idx_tmp, + ) + for i in range(self.config.num_hidden_layers) + ] + ) + if first_token: if hasattr(self.config, 
"n_layer"): num_hidden_layers = self.config.n_layer @@ -287,6 +329,12 @@ def _sample( model_inputs = self.prepare_inputs_labels_for_multimodal(**model_inputs) if first_token and self.model_backbone == "YuanForCausalLM": model_inputs.pop("past_key_values", None) + if self.model_backbone == "WhisperForConditionalGeneration": + model_inputs["encoder_outputs"] = ( + model_inputs["encoder_outputs"]["last_hidden_state"], + ) + model_inputs.pop("decoder_position_ids", None) + model_inputs.pop("decoder_attention_mask", None) if hasattr(self, "trace_graph"): model_inputs.pop("use_cache", None) model_inputs.pop("token_type_ids", None) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py index 29491dfe0..cb45433b9 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py @@ -26,6 +26,7 @@ def __init__(self, module, config, tpp=False, woq=False): "T5ForConditionalGeneration", "MptForCausalLM", "GitForCausalLM", + "WhisperForConditionalGeneration", ] or self.model_backbone == "BaichuanForCausalLM" and hasattr(module, "rotary_emb") diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py index 7d578cffb..5ed1a85d2 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py @@ -214,5 +214,23 @@ def __init__(self, module, config, tpp=False, woq=False): self.mha_linear_add = _IPEXlinearAddCPU( module.mha_linear_add.linear, tpp=tpp, woq=woq ) + elif self.model_backbone == "WhisperForConditionalGeneration": + if not self.distributed: + if hasattr(module, "mha_linear_add"): + self.mha_linear_add = _IPEXlinearAddCPU( + module.mha_linear_add.linear, tpp=tpp, woq=woq + ) + if hasattr(module, "mlp_linear_add"): + self.mlp_linear_add = _IPEXlinearAddCPU( + module.mlp_linear_add.linear, tpp=tpp, woq=woq + ) + if hasattr(module, "encoder_mha_linear_add"): + self.encoder_mha_linear_add = _IPEXlinearAddCPU( + module.encoder_mha_linear_add.linear, tpp=tpp, woq=woq + ) + if hasattr(module, "linear_gelu"): + self.linear_gelu = _IPEXlinearGeluCPU( + module.linear_gelu.linear, tpp=tpp, woq=woq + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index 4fb722d6a..c8b78fc34 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -7,9 +7,10 @@ CausalLMOutputWithCrossAttentions, BaseModelOutputWithPastAndCrossAttentions, Seq2SeqLMOutput, + Seq2SeqModelOutput, BaseModelOutput, ) - +import numpy as np from ....utils._logger import logger, WarningType import transformers @@ -26,6 +27,7 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) + from transformers.generation.configuration_utils import GenerationConfig except ImportError: pass @@ -3345,6 +3347,203 @@ def Phi3Model_forward( ) +def WhisperModel_forward( + self, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = 
None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, + decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if encoder_outputs is None: + input_features = self._mask_input_features( + input_features, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + input_features, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + position_ids=decoder_position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + tuple(encoder_outputs) + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +def WhisperForConditionalGeneration_forward( + self, + decoder_input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + input_features: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None, + 
decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]: + if labels is not None: + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_features, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + decoder_inputs_embeds=decoder_inputs_embeds, + decoder_position_ids=decoder_position_ids, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=False, + ) + + sequence_output = outputs[0] + if ( + hasattr(self, "config") + and hasattr(self.config, "lm_head_generation") + and self.config.lm_head_generation + and sequence_output.size(1) != 1 + ): + sequence_output = sequence_output[:, -1:, :] + lm_logits = self.proj_out(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # move labels to correct device to enable PP + labels = labels.to(lm_logits.device) + loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.reshape(-1)) + + output = (lm_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + +def detect_language( + self, + input_features: Optional[torch.FloatTensor] = None, + encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]] = None, + generation_config: Optional[GenerationConfig] = None, + num_segment_frames: int = 3000, +) -> torch.Tensor: + if input_features is None and encoder_outputs is None: + raise ValueError( + "You have to specify either `input_features` or `encoder_outputs`" + ) + elif input_features is not None and encoder_outputs is not None: + raise ValueError( + "Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!" 
+ ) + elif input_features is not None: + inputs = {"input_features": input_features[:, :, :num_segment_frames]} + batch_size = input_features.shape[0] + elif encoder_outputs is not None: + inputs = {"encoder_outputs": encoder_outputs} + batch_size = ( + encoder_outputs[0].shape[0] + if isinstance(encoder_outputs, BaseModelOutput) + else encoder_outputs[0] + ) + + generation_config = generation_config or self.generation_config + decoder_input_ids = ( + torch.ones((batch_size, 1), device=self.device, dtype=torch.long) + * generation_config.decoder_start_token_id + ) + + with torch.no_grad(): + outputs = self(**inputs, decoder_input_ids=decoder_input_ids) + if isinstance(outputs, tuple): + logits = outputs[0][:, -1] + else: + logits = outputs.logits[:, -1] + + non_lang_mask = torch.ones_like(logits[0], dtype=torch.bool) + non_lang_mask[list(generation_config.lang_to_id.values())] = False + + logits[:, non_lang_mask] = -np.inf + + lang_ids = logits.argmax(-1) + + return lang_ids + + def output_hook(module: torch.nn.Module, args, kwargs, outputs: Any): if module.config.use_return_dict or ( "return_dict" in kwargs and kwargs["return_dict"] @@ -3398,7 +3597,10 @@ def output_hook(module: torch.nn.Module, args, kwargs, outputs: Any): ) or module.config.output_attentions: encoder_attentions = outputs[idx] idx += 1 - if module.config.architectures[0] == "T5ForConditionalGeneration": + if module.config.architectures[0] in [ + "T5ForConditionalGeneration", + "WhisperForConditionalGeneration", + ]: return Seq2SeqLMOutput( loss=loss, logits=logits, diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 2bbd4bd50..85e0f47ae 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -2022,6 +2022,101 @@ def _Phi3Attention_forward( return attn_output, attn_weights, past_key_value +def _WhisperAttention_forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + is_cross_attention = key_value_states is not None + bsz, tgt_len, _ = hidden_states.size() + query_states = self.q_proj(hidden_states) * self.scaling + if is_cross_attention and past_key_value is not None: + key_states = past_key_value[1].contiguous() + value_states = past_key_value[2].contiguous() + elif is_cross_attention: + key_states = ( + self.k_proj(key_value_states) + .view(bsz, -1, self.num_heads, self.head_dim) + .contiguous() + ) + value_states = ( + self.v_proj(key_value_states) + .view(bsz, -1, self.num_heads, self.head_dim) + .contiguous() + ) + else: + key_states = ( + self.k_proj(hidden_states) + .view(bsz, -1, self.num_heads, self.head_dim) + .contiguous() + ) + value_states = ( + self.v_proj(hidden_states) + .view(bsz, -1, self.num_heads, self.head_dim) + .contiguous() + ) + + query_states = query_states.view( + bsz, -1, self.num_heads, self.head_dim + ).contiguous() + + src_len = key_states.size(1) + if attention_mask is None: + seq_len = ( + src_len + past_key_value[0].size(-2) + if past_key_value is not None + else src_len + ) + attention_mask = torch.zeros( + [bsz, 1, tgt_len, 
seq_len], dtype=hidden_states.dtype + ) + if key_value_states is None and self.is_decoder: + decoded_tokens = ( + torch.tensor(past_key_value[0].size(-2)) + if past_key_value is not None + else None + ) + else: + decoded_tokens = torch.zeros(1, dtype=torch.long).contiguous()[0] + + ( + attn_output, + attn_weights, + past_key_value, + ) = self._IPEXScaleDotProduct( + query_states, + key_states, + value_states, + 1, + past_key_value, + layer_head_mask, + attention_mask, + None, + False, + decoded_tokens, + ) + if is_cross_attention: + past_key_value = ( + past_key_value[0], + key_states, + value_states, + past_key_value[3], + ) + if not output_attentions: + attn_weights = None + if not self.is_decoder: + past_key_value = None + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, -1) + # attn_output = self.out_proj(attn_output) + return attn_output, attn_weights, past_key_value + + def _create_attention_mask_for_git( self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None ): @@ -2151,6 +2246,7 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): "T5ForConditionalGeneration", "MptForCausalLM", "GitForCausalLM", + "WhisperForConditionalGeneration", ] or ( self.model_backbone == "BaichuanForCausalLM" @@ -2648,6 +2744,16 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "WhisperForConditionalGeneration": + return _WhisperAttention_forward( + self, + hidden_states, + key_value_states, + past_key_value, + attention_mask, + layer_head_mask, + output_attentions, + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index 9e3a29c6e..fb56ca61b 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -1273,6 +1273,128 @@ def Phi3DecoderLayer_forward( return outputs +def WhisperEncoderLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + output_attentions: bool = False, +) -> torch.Tensor: + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + if not self.distributed: + hidden_states = self.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.out_proj(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.linear_gelu(hidden_states) + if not self.distributed: + hidden_states = self.mlp_linear_add(hidden_states, residual) + else: + hidden_states = self.fc2(hidden_states) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +def WhisperDecoderLayer_forward( + self, + hidden_states: torch.Tensor, + 
attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, +) -> torch.Tensor: + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + self_attn_past_key_value = ( + past_key_value[:4] if past_key_value is not None else None + ) + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + if not self.distributed: + hidden_states = self.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.out_proj(hidden_states) + hidden_states = residual + hidden_states + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + cross_attn_past_key_value = ( + past_key_value[4:] if past_key_value is not None else None + ) + ( + hidden_states, + cross_attn_weights, + cross_attn_present_key_value, + ) = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + if not self.distributed: + hidden_states = self.encoder_mha_linear_add(hidden_states, residual) + else: + hidden_states = self.encoder_attn.out_proj(hidden_states) + hidden_states = residual + hidden_states + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + hidden_states = self.linear_gelu(hidden_states) + if not self.distributed: + hidden_states = self.mlp_linear_add(hidden_states, residual) + else: + hidden_states = self.fc2(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + class _IPEXDecoderLayerRef(nn.Module): def __init__(self, module, config, distributed=False): super().__init__() @@ -1470,6 +1592,19 @@ def __init__(self, module, config, distributed=False): del self.__dict__["_modules"]["mlp"].down_proj self.mha_linear_add = _IPEXlinearAddRef(module.self_attn.o_proj) del self.__dict__["_modules"]["self_attn"].o_proj + elif self.model_backbone == "WhisperForConditionalGeneration": + if not self.distributed: + self.mha_linear_add = _IPEXlinearAddRef(module.self_attn.out_proj) + del self.__dict__["_modules"]["self_attn"].out_proj + self.mlp_linear_add = _IPEXlinearAddRef(module.fc2) + del self.__dict__["_modules"]["fc2"] + if hasattr(module, "encoder_attn"): + self.encoder_mha_linear_add = _IPEXlinearAddRef( + module.encoder_attn.out_proj + ) + del self.__dict__["_modules"]["encoder_attn"].out_proj + self.linear_gelu = _IPEXlinearGeluRef(module.fc1) + del self.__dict__["_modules"]["fc1"] else: AssertionError(False, "Do not support the optimization of your model yet") @@ -1710,5 +1845,22 @@ def 
forward( use_cache, past_key_value, ) + elif self.model_backbone == "WhisperForConditionalGeneration": + if encoder_hidden_states is not None: + return WhisperDecoderLayer_forward( + self, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + layer_head_mask, + cross_attn_layer_head_mask, + past_key_value, + output_attentions, + use_cache, + ) + return WhisperEncoderLayer_forward( + self, hidden_states, attention_mask, layer_head_mask, output_attentions + ) else: AssertionError(False, "Do not support the optimization of your model yet") diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 661daae5a..bbd8d4bd1 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -187,10 +187,13 @@ def model_convert_reference(_model): PhiForCausalLM_forward, PhiModel_forward, Phi3Model_forward, + WhisperForConditionalGeneration_forward, + WhisperModel_forward, prepare_inputs_for_generation, prepare_inputs_for_generation_gptbigcode, prepare_inputs_for_generation_llama, prepare_inputs_labels_for_multimodal_llavallama, + detect_language, ) if not hasattr(_model.config, "architectures"): @@ -786,6 +789,31 @@ def model_convert_reference(_model): _model.config, distributed=distributed, ) + elif _model.config.architectures[0] == "WhisperForConditionalGeneration": + convert_function(_model, "detect_language", detect_language) + convert_function(_model, "forward", WhisperForConditionalGeneration_forward) + convert_function(_model.model, "forward", WhisperModel_forward) + convert_class( + _model, + type(_model.model.encoder.layers[0]), + _IPEXDecoderLayerRef, + _model.config, + distributed=distributed, + ) + convert_class( + _model, + type(_model.model.decoder.layers[0]), + _IPEXDecoderLayerRef, + _model.config, + distributed=distributed, + ) + convert_class( + _model, + type(_model.model.encoder.layers[0].self_attn), + _IPEXAttentionRef, + _model.config, + distributed=distributed, + ) return _model @@ -888,6 +916,60 @@ def get_dummy_input(_model, return_dict=False): (last_hidden_state,), ) ) + elif _model.config.architectures[0] == "WhisperForConditionalGeneration": + dtype = ( + _model.model.decoder.layers[0].mha_linear_add.dtype + if hasattr(_model.model.decoder.layers[0], "mha_linear_add") + else _model.dtype + ) + past_key_values = tuple( + [ + ( + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros([1, 1, 1, 1]).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(), + torch.zeros( + [ + 1, + 32, + _model.model.decoder.layers[i].encoder_attn.num_heads, + _model.model.decoder.layers[i].encoder_attn.head_dim, + ], + dtype=dtype, + ).contiguous(), + torch.zeros( + [ + 1, + 32, + _model.model.decoder.layers[i].encoder_attn.num_heads, + _model.model.decoder.layers[i].encoder_attn.head_dim, + ], + dtype=dtype, + ).contiguous(), + torch.zeros(1, 4, dtype=torch.long), + ) + for i in range(model_num_layers) + ] + ) + last_hidden_state = torch.rand([1, 32, 1280]).to(dtype) + sample_inputs = ( + ( + { + "decoder_input_ids": torch.ones(4).to(torch.long).unsqueeze(0), + "past_key_values": past_key_values, + "encoder_outputs": (last_hidden_state,), + } + ) + if return_dict + else ( + torch.ones(1).to(torch.long).unsqueeze(0), + past_key_values, + (last_hidden_state,), + ) + ) + else: sample_inputs = ( { @@ -1106,6 +1188,13 @@ def 
model_convert_lowering( getattr(_model, model_name), "_use_sdpa" ): getattr(_model, model_name)._use_sdpa = False + if hasattr(_model, model_name): + cur_mod = getattr(_model, model_name) + for submodel_name in ["encoder", "decoder"]: + if hasattr(cur_mod, submodel_name) and hasattr( + getattr(cur_mod, submodel_name), "_use_sdpa" + ): + getattr(cur_mod, submodel_name)._use_sdpa = False for supported_mlp_class in [_IPEXDecoderLayerRef]: lowering_class_cpu( @@ -1217,7 +1306,7 @@ def optimize( Well supported model family with full functionalities: Llama, GPT-J, GPT-Neox, OPT, Falcon, Bloom, CodeGen, Baichuan, ChatGLM, GPTBigCode, - T5, Mistral, MPT, Mixtral, StableLM, QWen, Git, Llava, Yuan, Phi. + T5, Mistral, MPT, Mixtral, StableLM, QWen, Git, Llava, Yuan, Phi, Whisper. For the model that is not in the scope of supported model family above, will try to apply default ipex.optimize transparently to get benifits (not include quantizations, @@ -1307,6 +1396,7 @@ def optimize( "YuanForCausalLM", "PhiForCausalLM", "Phi3ForCausalLM", + "WhisperForConditionalGeneration", ] if well_supported_model: @@ -1315,8 +1405,8 @@ def optimize( if quantization_config is not None: logger.warning( "ipex.llm.optimize supports quantizations on Llama, GPT-J, GPT-Neox, Falcon, OPT, Bloom, CodeGen," - + " Baichuan, ChatGLM, GPTBigCode, T5, Mistral, Mixtral, MPT, StableLM, QWen, Git, Llava, Yuan, " - + "and Phi, fallback to origin model" + + " Baichuan, ChatGLM, GPTBigCode, T5, Mistral, Mixtral, MPT, StableLM, QWen, Git, Llava, Yuan," + + " Phi, and Whisper, fallback to origin model" ) return model diff --git a/tests/cpu/hf_configs/whisper/config.json b/tests/cpu/hf_configs/whisper/config.json new file mode 100644 index 000000000..b1d5ff58e --- /dev/null +++ b/tests/cpu/hf_configs/whisper/config.json @@ -0,0 +1,144 @@ +{ + "_name_or_path": "openai/whisper-large-v2", + "activation_dropout": 0.0, + "activation_function": "gelu", + "architectures": [ + "WhisperForConditionalGeneration" + ], + "attention_dropout": 0.0, + "begin_suppress_tokens": [ + 220, + 50257 + ], + "bos_token_id": 50257, + "d_model": 1280, + "decoder_attention_heads": 20, + "decoder_ffn_dim": 5120, + "decoder_layerdrop": 0.0, + "decoder_layers": 1, + "decoder_start_token_id": 50258, + "dropout": 0.0, + "encoder_attention_heads": 20, + "encoder_ffn_dim": 5120, + "encoder_layerdrop": 0.0, + "encoder_layers": 1, + "eos_token_id": 50257, + "forced_decoder_ids": [ + [ + 1, + 50259 + ], + [ + 2, + 50359 + ], + [ + 3, + 50363 + ] + ], + "init_std": 0.02, + "is_encoder_decoder": true, + "max_length": 448, + "max_source_positions": 1500, + "max_target_positions": 448, + "model_type": "whisper", + "num_hidden_layers": 1, + "num_mel_bins": 80, + "pad_token_id": 50257, + "scale_embedding": false, + "suppress_tokens": [ + 1, + 2, + 7, + 8, + 9, + 10, + 14, + 25, + 26, + 27, + 28, + 29, + 31, + 58, + 59, + 60, + 61, + 62, + 63, + 90, + 91, + 92, + 93, + 359, + 503, + 522, + 542, + 873, + 893, + 902, + 918, + 922, + 931, + 1350, + 1853, + 1982, + 2460, + 2627, + 3246, + 3253, + 3268, + 3536, + 3846, + 3961, + 4183, + 4667, + 6585, + 6647, + 7273, + 9061, + 9383, + 10428, + 10929, + 11938, + 12033, + 12331, + 12562, + 13793, + 14157, + 14635, + 15265, + 15618, + 16553, + 16604, + 18362, + 18956, + 20075, + 21675, + 22520, + 26130, + 26161, + 26435, + 28279, + 29464, + 31650, + 32302, + 32470, + 36865, + 42863, + 47425, + 49870, + 50254, + 50258, + 50358, + 50359, + 50360, + 50361, + 50362 + ], + "torch_dtype": "float32", + "transformers_version": "4.27.0.dev0", + 
"use_cache": true, + "vocab_size": 51865 +} \ No newline at end of file diff --git a/tests/cpu/test_ipex_optimize_transformers_nightly.py b/tests/cpu/test_ipex_optimize_transformers_nightly.py index 0f697d4be..0b34ba776 100644 --- a/tests/cpu/test_ipex_optimize_transformers_nightly.py +++ b/tests/cpu/test_ipex_optimize_transformers_nightly.py @@ -172,6 +172,13 @@ lambda m: m.model.layers[0].self_attn.__class__, lambda m: m.model.layers[0].__class__, ), + model_info( + "whisper", + transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration, + False, + lambda m: m.model.decoder.layers[0].self_attn.__class__, + lambda m: m.model.decoder.layers[0].__class__, + ), ] @@ -247,6 +254,12 @@ def model_replacement_check( input_dict["decoder_input_ids"] = decoder_input_ids.unsqueeze(0) if m.name == "git": input_dict["pixel_values"] = torch.zeros(1, 3, 224, 224) + if m.name == "whisper": + last_hidden_state = torch.rand([1, 32, 1280]) + input_dict = { + "decoder_input_ids": torch.ones(4).to(torch.long).unsqueeze(0), + "encoder_outputs": (last_hidden_state,), + } with torch.no_grad(): key_hf = ref_m(**input_dict) From 57f381b16fb5ba79ec829619488974dab211329d Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 23 May 2024 19:07:45 +0800 Subject: [PATCH 084/199] update oneDNN to 74fb846b88 on main (#2924) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 0430e47c6..7d2635dc9 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 0430e47c6b2704627977b99ab5556aa0ba6908ce +Subproject commit 7d2635dc94a53637287c4c144ff0618f3472e2c1 From 798da3b7de4095616414e60d23c62d1a493ef24c Mon Sep 17 00:00:00 2001 From: zhuhaozhe Date: Fri, 24 May 2024 13:17:38 +0800 Subject: [PATCH 085/199] fix rope for BS > 1 (#2912) * fix rope for BS > 1 * fix ut --- csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp | 10 ++++++++-- tests/cpu/test_rope.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp index ba044e2e6..0894ef23e 100644 --- a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp +++ b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp @@ -86,7 +86,12 @@ std::tuple ApplyROPEKernel( auto out_stride_kb = concat_qkv ? key.stride(0) : 0; auto out_stride_ks = concat_qkv ? key.stride(1) : 0; auto emb_pos_ptr = t_emb_pos.data_ptr(); // [MP][HR] - auto pos_ptr = t_pos.data_ptr(); // [MB][S] + auto pos_ptr = t_pos.data_ptr(); // [B][S] or [1][S] + bool t_pos_no_repeated_for_batch = false; + if (t_pos.numel() != 1 && t_pos.size(0) == 1 && B > 1) { + // we do not perform t_pos.repeat here to avoid the overhead of copying + t_pos_no_repeated_for_batch = true; + } { #pragma omp parallel for collapse(3) for (int b = 0; b < B; b++) { @@ -106,7 +111,8 @@ std::tuple ApplyROPEKernel( sin_start = emb_pos_ptr + (p + s) * HR; cos_start = emb_pos_ptr + (p + s) * HR + COFF; } else { - p = pos_ptr[b * S + s]; + auto start_idx = t_pos_no_repeated_for_batch ? 
0 : b * S; + p = pos_ptr[start_idx + s]; sin_start = emb_pos_ptr + p * HR; cos_start = emb_pos_ptr + p * HR + COFF; } diff --git a/tests/cpu/test_rope.py b/tests/cpu/test_rope.py index 3e9a8575a..70f482476 100644 --- a/tests/cpu/test_rope.py +++ b/tests/cpu/test_rope.py @@ -7,7 +7,7 @@ class FusedROPETester(TestCase): def setUp(self): - self.batch = 1 + self.batch = 2 self.seq_len = 32 self.max_seq_len = 384 self.head_size = 256 @@ -76,7 +76,10 @@ def hf_forward( query, key, position_ids, embed_positions, offset=None, rotary_dim=None ): embed_positions = _get_embed_positions(embed_positions, position_ids) - sincos = embed_positions.squeeze()[position_ids] + repeated_position_ids = position_ids.unsqueeze(-1).repeat( + 1, 1, embed_positions.shape[-1] + ) + sincos = torch.gather(embed_positions, 1, repeated_position_ids) sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1) if rotary_dim < self.head_size: From 65d26b5e511a4b7de3a9b0ccbadb40b785ecc0b1 Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Fri, 24 May 2024 15:28:36 +0800 Subject: [PATCH 086/199] Add readme for fast_bert example (#2795) (#2921) --- examples/cpu/features/fast_bert/README.md | 18 ++++++++++++++++++ .../cpu/tpp/fused_bert.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) create mode 100644 examples/cpu/features/fast_bert/README.md diff --git a/examples/cpu/features/fast_bert/README.md b/examples/cpu/features/fast_bert/README.md new file mode 100644 index 000000000..51bbcdf19 --- /dev/null +++ b/examples/cpu/features/fast_bert/README.md @@ -0,0 +1,18 @@ +# Feature Description: + +`ipex.fast_bert` proposed a technique to speed up BERT workloads. Implementation leverages the idea from [Tensor Processing Primitives](https://arxiv.org/pdf/2104.05755.pdf). + +Currently `ipex.fast_bert` API is only well optimized for training. For inference, it ensures functionality, while to get peak perf, please use `ipex.optimize` API + torchscript. + +# Prerequisite: +Transformers 4.6.0 ~ 4.38.1 + +# Usage Example: +Training: +``` +python fast_bert_training_bf16.py +``` +Inference: +``` +python fast_bert_inference_bf16.py +``` diff --git a/intel_extension_for_pytorch/cpu/tpp/fused_bert.py b/intel_extension_for_pytorch/cpu/tpp/fused_bert.py index 03a513a08..1b0b293b7 100644 --- a/intel_extension_for_pytorch/cpu/tpp/fused_bert.py +++ b/intel_extension_for_pytorch/cpu/tpp/fused_bert.py @@ -1243,7 +1243,7 @@ def fast_bert(model, dtype=torch.float, optimizer=None, unpad=False): >>> model = ... >>> model.load_state_dict(torch.load(PATH)) >>> model.eval() - >>> optimized_model = ipex.tpp_bert(model, dtype=torch.bfloat16) + >>> optimized_model = ipex.fast_bert(model, dtype=torch.bfloat16) >>> # running evaluation step. >>> # bfloat16 training case. >>> optimizer = ... 
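For reference alongside the README above: a minimal sketch of applying `ipex.fast_bert` for BF16 inference, following the `fast_bert(model, dtype=..., optimizer=None, unpad=False)` signature shown in `fused_bert.py`. The checkpoint name and input shapes below are illustrative assumptions, not part of the patch, and peak inference performance would still go through `ipex.optimize` + TorchScript as the README notes.

```python
# Minimal sketch of ipex.fast_bert BF16 inference.
# "bert-base-uncased" and the input shapes are illustrative assumptions.
import torch
import intel_extension_for_pytorch as ipex
from transformers import BertModel

model = BertModel.from_pretrained("bert-base-uncased")
model.eval()
# Replace eligible BERT blocks with the TPP-fused implementation.
model = ipex.fast_bert(model, dtype=torch.bfloat16)

batch_size, seq_len = 4, 128
input_ids = torch.randint(0, model.config.vocab_size, (batch_size, seq_len))
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
```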
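The RoPE fix for BS > 1 in `RotaryPositionEmbeddingKnl.cpp` and `tests/cpu/test_rope.py` hinges on looking up sin/cos values per batch of position ids rather than squeezing the embedding table (the kernel side avoids the copy by indexing `t_pos` in place). A standalone sketch of the reference lookup, with illustrative shapes:

```python
# Reference sin/cos lookup for RoPE with batch size > 1, mirroring the
# updated hf_forward path in tests/cpu/test_rope.py. Shapes are illustrative.
import torch

batch, seq_len, max_pos, rotary_dim = 2, 32, 384, 64
embed_positions = torch.rand(batch, max_pos, rotary_dim)            # [B, MP, HR]
position_ids = torch.arange(seq_len).unsqueeze(0).repeat(batch, 1)  # [B, S]

# embed_positions.squeeze()[position_ids] only behaves for B == 1; for B > 1,
# gather the per-batch rows of the sin/cos table along the position dimension.
repeated_position_ids = position_ids.unsqueeze(-1).repeat(1, 1, embed_positions.shape[-1])
sincos = torch.gather(embed_positions, 1, repeated_position_ids)    # [B, S, HR]
sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
```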
From 27315216181eb76ea095e13d458efe645d8db445 Mon Sep 17 00:00:00 2001 From: blzheng Date: Mon, 27 May 2024 11:35:43 +0800 Subject: [PATCH 087/199] fix compatibility issues with transformers (#2931) --- .../transformers/models/reference/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index c8b78fc34..b4f2cf1e1 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -15,6 +15,7 @@ import transformers try: + from transformers.generation.configuration_utils import GenerationConfig from transformers.modeling_attn_mask_utils import ( _prepare_4d_causal_attention_mask, ) @@ -27,7 +28,6 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) - from transformers.generation.configuration_utils import GenerationConfig except ImportError: pass From 64fb34a6cae21880e7e45b35b71aaae8aa060bf3 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Mon, 27 May 2024 13:32:17 +0900 Subject: [PATCH 088/199] update doc footer (#2933) --- docs/_static/custom.css | 4 ++++ docs/_templates/footer.html | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/_static/custom.css b/docs/_static/custom.css index b26690143..010bffac7 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -17,6 +17,10 @@ a#wap_dns { display: none; } +a#wap_nac { + display: none; +} + /* replace the copyright to eliminate the copyright symbol enforced by the ReadTheDocs theme */ div[role=contentinfo] { diff --git a/docs/_templates/footer.html b/docs/_templates/footer.html index 94c8435e1..92839d2d0 100644 --- a/docs/_templates/footer.html +++ b/docs/_templates/footer.html @@ -1,3 +1,3 @@ {% extends '!footer.html' %} {% block extrafooter %} {{super}} -

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
+

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
{% endblock %} From 560f9feb6f2e9feba8bad31afd96ae73614ab4d3 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Mon, 27 May 2024 14:34:42 +0800 Subject: [PATCH 089/199] WOQ: add relu/silu/mul post-op fusion (#2926) * WOQ: add relu/silu/mul post-op fusion * Fix lint issues * Fix clang-format issues * Fix UT failures on old platforms --- csrc/cpu/aten/Linear.cpp | 145 ++++++++++--- csrc/cpu/aten/Linear.h | 33 ++- csrc/cpu/aten/kernels/WoqTppKrnl.cpp | 203 ++++++++++++++---- csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp | 44 +--- csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h | 10 +- csrc/cpu/jit/cpu/kernels/OpContext.cpp | 18 +- csrc/cpu/jit/cpu/kernels/OpContext.h | 18 +- csrc/cpu/tpp/kernels/TPPGEMMKrnl.h | 68 +++--- .../nn/modules/weight_only_quantization.py | 5 + .../models/cpu/fusions/linear_fusion.py | 56 +++++ tests/cpu/test_quantization_default_recipe.py | 131 ++++++----- 11 files changed, 476 insertions(+), 255 deletions(-) diff --git a/csrc/cpu/aten/Linear.cpp b/csrc/cpu/aten/Linear.cpp index 3958369a3..490acc3f7 100644 --- a/csrc/cpu/aten/Linear.cpp +++ b/csrc/cpu/aten/Linear.cpp @@ -453,7 +453,7 @@ at::Tensor woq_linear_forward( ->run(input); } -at::Tensor woq_linear_eltwise_kernel( +at::Tensor woq_linear_unary_kernel( const at::Tensor& self, const at::Tensor& weight, int64_t weight_dtype, @@ -469,10 +469,14 @@ at::Tensor woq_linear_eltwise_kernel( int64_t post_op_fusion_type = WOQ_FUSE_NONE; if (post_op == "gelu") { if (algorithm == "none") { - post_op_fusion_type = WOQ_FUSE_GELU; + post_op_fusion_type = WOQ_FUSE_GELU_ERF; } else if (algorithm == "tanh") { - post_op_fusion_type = WOQ_FUSE_NEW_GELU; + post_op_fusion_type = WOQ_FUSE_GELU_TANH; } + } else if (post_op == "relu") { + post_op_fusion_type = WOQ_FUSE_RELU; + } else if (post_op == "silu") { + post_op_fusion_type = WOQ_FUSE_SILU; } int64_t quant_w_mode = group_size > 0 ? 1 : 0; return woq_tpp_gemm_kernel_stub( @@ -496,7 +500,7 @@ at::Tensor woq_linear_gelu_forward( const at::Tensor& op_context) { return reinterpret_cast( op_context.data_ptr()[0]) - ->run_eltwise( + ->run_unary( input, "gelu", torch::List>(), "none"); } @@ -505,39 +509,27 @@ at::Tensor woq_linear_new_gelu_forward( const at::Tensor& op_context) { return reinterpret_cast( op_context.data_ptr()[0]) - ->run_eltwise( + ->run_unary( input, "gelu", torch::List>(), "tanh"); } -at::Tensor woq_linear_add_kernel( - const at::Tensor& self, - const at::Tensor& weight, - int64_t weight_dtype, - const std::vector& scales_list, - const std::vector& zps_list, - const std::vector& bias_list, - int64_t group_size, - int64_t lowp_mode, - const std::vector& others, - int64_t act_quant_mode) { - int64_t quant_w_mode = group_size > 0 ? 
1 : 0; - return woq_tpp_gemm_kernel_stub( - kCPU, - self, - weight, - scales_list, - zps_list, - bias_list, - weight_dtype, - lowp_mode, - WOQ_FUSE_ADD, // post op add - others, - act_quant_mode, - quant_w_mode, - group_size); +at::Tensor woq_linear_relu_forward( + const at::Tensor& input, + const at::Tensor& op_context) { + return reinterpret_cast( + op_context.data_ptr()[0]) + ->run_unary(input, "relu", torch::List>(), ""); } -at::Tensor woq_linear_add_add_kernel( +at::Tensor woq_linear_silu_forward( + const at::Tensor& input, + const at::Tensor& op_context) { + return reinterpret_cast( + op_context.data_ptr()[0]) + ->run_unary(input, "silu", torch::List>(), ""); +} + +at::Tensor woq_linear_binary_kernel( const at::Tensor& self, const at::Tensor& weight, int64_t weight_dtype, @@ -546,8 +538,17 @@ at::Tensor woq_linear_add_add_kernel( const std::vector& bias_list, int64_t group_size, int64_t lowp_mode, + const c10::string_view& post_op, const std::vector& others, int64_t act_quant_mode) { + int64_t post_op_fusion_type = WOQ_FUSE_NONE; + if (post_op == "add") { + post_op_fusion_type = WOQ_FUSE_ADD; + } else if (post_op == "add_add") { + post_op_fusion_type = WOQ_FUSE_ADD_ADD; + } else if (post_op == "mul") { + post_op_fusion_type = WOQ_FUSE_MUL; + } int64_t quant_w_mode = group_size > 0 ? 1 : 0; return woq_tpp_gemm_kernel_stub( kCPU, @@ -558,7 +559,7 @@ at::Tensor woq_linear_add_add_kernel( bias_list, weight_dtype, lowp_mode, - WOQ_FUSE_ADD_ADD, // post op add-add + post_op_fusion_type, others, act_quant_mode, quant_w_mode, @@ -571,7 +572,7 @@ at::Tensor woq_linear_add_forward( const std::vector& others) { return reinterpret_cast( op_context.data_ptr()[0]) - ->run_add(input, others); + ->run_binary(input, "add", others); } at::Tensor woq_linear_add_add_forward( @@ -580,7 +581,16 @@ at::Tensor woq_linear_add_add_forward( const std::vector& others) { return reinterpret_cast( op_context.data_ptr()[0]) - ->run_add_add(input, others); + ->run_binary(input, "add_add", others); +} + +at::Tensor woq_linear_mul_forward( + const at::Tensor& input, + const at::Tensor& op_context, + const std::vector& others) { + return reinterpret_cast( + op_context.data_ptr()[0]) + ->run_binary(input, "mul", others); } #endif @@ -745,6 +755,28 @@ at::Tensor woq_linear_new_gelu_forward( return op.call(cpu_cached_cast(target_type, input), op_context); } +at::Tensor woq_linear_relu_forward( + const at::Tensor& input, + const at::Tensor& op_context) { + c10::impl::ExcludeDispatchKeyGuard no_autocastCPU(DispatchKey::AutocastCPU); + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("torch_ipex::woq_linear_relu", "") + .typed(); + auto target_type = get_autocast_dtype(); + return op.call(cpu_cached_cast(target_type, input), op_context); +} + +at::Tensor woq_linear_silu_forward( + const at::Tensor& input, + const at::Tensor& op_context) { + c10::impl::ExcludeDispatchKeyGuard no_autocastCPU(DispatchKey::AutocastCPU); + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("torch_ipex::woq_linear_silu", "") + .typed(); + auto target_type = get_autocast_dtype(); + return op.call(cpu_cached_cast(target_type, input), op_context); +} + at::Tensor woq_linear_add_forward( const at::Tensor& input, const at::Tensor& op_context, @@ -774,6 +806,21 @@ at::Tensor woq_linear_add_add_forward( op_context, cpu_cached_cast(target_type, others)); } + +at::Tensor woq_linear_mul_forward( + const at::Tensor& input, + const at::Tensor& op_context, + const std::vector& others) { + c10::impl::ExcludeDispatchKeyGuard 
no_autocastCPU(DispatchKey::AutocastCPU); + static auto op = torch::Dispatcher::singleton() + .findSchemaOrThrow("torch_ipex::woq_linear_mul", "") + .typed(); + auto target_type = get_autocast_dtype(); + return op.call( + cpu_cached_cast(target_type, input), + op_context, + cpu_cached_cast(target_type, others)); +} #endif at::Tensor matmul_i8i8i32(const at::Tensor& input, const at::Tensor& weight) { @@ -829,6 +876,24 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { "woq_linear_new_gelu", c10::DispatchKey::AutocastCPU, torch_ipex::autocast::woq_linear_new_gelu_forward); + m.def("woq_linear_relu(Tensor input, Tensor W_prepack) -> Tensor"); + m.impl( + "woq_linear_relu", + c10::DispatchKey::CPU, + torch_ipex::cpu::woq_linear_relu_forward); + m.impl( + "woq_linear_relu", + c10::DispatchKey::AutocastCPU, + torch_ipex::autocast::woq_linear_relu_forward); + m.def("woq_linear_silu(Tensor input, Tensor W_prepack) -> Tensor"); + m.impl( + "woq_linear_silu", + c10::DispatchKey::CPU, + torch_ipex::cpu::woq_linear_silu_forward); + m.impl( + "woq_linear_silu", + c10::DispatchKey::AutocastCPU, + torch_ipex::autocast::woq_linear_silu_forward); m.def( "woq_linear_add(Tensor input, Tensor W_prepack, Tensor[] others) -> Tensor"); m.impl( @@ -849,6 +914,16 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { "woq_linear_add_add", c10::DispatchKey::AutocastCPU, torch_ipex::autocast::woq_linear_add_add_forward); + m.def( + "woq_linear_mul(Tensor input, Tensor W_prepack, Tensor[] others) -> Tensor"); + m.impl( + "woq_linear_mul", + c10::DispatchKey::CPU, + torch_ipex::cpu::woq_linear_mul_forward); + m.impl( + "woq_linear_mul", + c10::DispatchKey::AutocastCPU, + torch_ipex::autocast::woq_linear_mul_forward); #endif // fuse eltwise m.def( diff --git a/csrc/cpu/aten/Linear.h b/csrc/cpu/aten/Linear.h index 3d00e26bd..c037516f5 100644 --- a/csrc/cpu/aten/Linear.h +++ b/csrc/cpu/aten/Linear.h @@ -107,7 +107,7 @@ at::Tensor woq_linear_kernel( int64_t lowp_mode, int64_t act_quant_mode); -at::Tensor woq_linear_eltwise_kernel( +at::Tensor woq_linear_unary_kernel( const at::Tensor& self, const at::Tensor& weight, int64_t weight_dtype, @@ -121,19 +121,7 @@ at::Tensor woq_linear_eltwise_kernel( int64_t lowp_mode, int64_t act_quant_mode); -at::Tensor woq_linear_add_kernel( - const at::Tensor& self, - const at::Tensor& weight, - int64_t weight_dtype, - const std::vector& scales_list, - const std::vector& zps_list, - const std::vector& bias_list, - int64_t group_size, - int64_t lowp_mode, - const std::vector& others, - int64_t act_quant_mode); - -at::Tensor woq_linear_add_add_kernel( +at::Tensor woq_linear_binary_kernel( const at::Tensor& self, const at::Tensor& weight, int64_t weight_dtype, @@ -142,6 +130,7 @@ at::Tensor woq_linear_add_add_kernel( const std::vector& bias_list, int64_t group_size, int64_t lowp_mode, + const c10::string_view& post_op, const std::vector& others, int64_t act_quant_mode); @@ -230,11 +219,17 @@ IPEX_DECLARE_DISPATCH(woq_tpp_gemm_kernel_fn, woq_tpp_gemm_kernel_stub); IPEX_DECLARE_DISPATCH(woq_tpp_gemm_packB_fn, woq_tpp_gemm_packB_stub); IPEX_DECLARE_DISPATCH(woq_tpp_gemm_unpackB_fn, woq_tpp_gemm_unpackB_stub); -#define WOQ_FUSE_NONE 0 -#define WOQ_FUSE_GELU 1 -#define WOQ_FUSE_ADD 2 -#define WOQ_FUSE_ADD_ADD 3 -#define WOQ_FUSE_NEW_GELU 4 +// Fusion types +#define WOQ_FUSE_NONE 0x0 +// Unary post ops +#define WOQ_FUSE_GELU_ERF 0x1 +#define WOQ_FUSE_GELU_TANH 0x2 +#define WOQ_FUSE_RELU 0x3 +#define WOQ_FUSE_SILU 0x4 +// Binary post ops +#define WOQ_FUSE_ADD 0x10 +#define WOQ_FUSE_ADD_ADD 0x20 +#define WOQ_FUSE_MUL 
0x30 #endif diff --git a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp index f6782c937..6bb2a03d5 100644 --- a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp @@ -22,10 +22,10 @@ namespace { using namespace tpp; using TensorList = std::vector; -#define FUSE_GELU_ERF 1 -#define FUSE_ADD 2 -#define FUSE_ADD_ADD 3 -#define FUSE_GELU_TANH 4 +// #define FUSE_GELU_ERF 1 +// #define FUSE_ADD 2 +// #define FUSE_ADD_ADD 3 +// #define FUSE_GELU_TANH 4 #define LOWP_MODE_NONE 0 #define LOWP_MODE_FP16 1 @@ -1797,16 +1797,16 @@ void qlinear_woq_affine_dequant_upfront_impl( Nc /= 2; Nb *= 2; } - auto gelu_fwd_tpp_ptr = fusion_type == FUSE_GELU_ERF + auto gelu_fwd_tpp_ptr = fusion_type == WOQ_FUSE_GELU_ERF ? std::make_shared>( GeluFwdTPP(block_m, Nb, ldy, ldy)) : nullptr; - auto gelu_fwd_tpp_rem_ptr = fusion_type == FUSE_GELU_ERF + auto gelu_fwd_tpp_rem_ptr = fusion_type == WOQ_FUSE_GELU_ERF ? std::make_shared>( GeluFwdTPP(rem, Nb, ldy, ldy)) : nullptr; bool has_add_post_op = - fusion_type == FUSE_ADD || fusion_type == FUSE_ADD_ADD; + fusion_type == WOQ_FUSE_ADD || fusion_type == WOQ_FUSE_ADD_ADD; auto add_tpp_ptr = has_add_post_op ? std::make_shared>( AddTPP(block_m, Nb, ldy, ldy)) @@ -1815,14 +1815,36 @@ void qlinear_woq_affine_dequant_upfront_impl( ? std::make_shared>( AddTPP(rem, Nb, ldy, ldy)) : nullptr; - auto gelu_tanh_fwd_tpp_ptr = fusion_type == FUSE_GELU_TANH + auto gelu_tanh_fwd_tpp_ptr = fusion_type == WOQ_FUSE_GELU_TANH ? std::make_shared>( GeluTanhFwdTPP(block_m, Nb, ldy, ldy)) : nullptr; - auto gelu_tanh_fwd_tpp_rem_ptr = fusion_type == FUSE_GELU_TANH + auto gelu_tanh_fwd_tpp_rem_ptr = fusion_type == WOQ_FUSE_GELU_TANH ? std::make_shared>( GeluTanhFwdTPP(rem, Nb, ldy, ldy)) : nullptr; + auto relu_fwd_tpp_ptr = fusion_type == WOQ_FUSE_RELU + ? std::make_shared>( + ReLUFwdTPP(block_m, Nb, ldy, ldy, false)) + : nullptr; + auto relu_fwd_tpp_rem_ptr = fusion_type == WOQ_FUSE_RELU + ? std::make_shared>( + ReLUFwdTPP(rem, Nb, ldy, ldy, false)) + : nullptr; + auto silu_fwd_tpp_ptr = fusion_type == WOQ_FUSE_SILU + ? std::make_shared>( + SiLUFwdTPP(block_m, Nb, ldy, ldy)) + : nullptr; + auto silu_fwd_tpp_rem_ptr = fusion_type == WOQ_FUSE_SILU + ? std::make_shared>( + SiLUFwdTPP(rem, Nb, ldy, ldy)) + : nullptr; + auto mul_tpp_ptr = fusion_type == WOQ_FUSE_MUL + ? std::make_shared>(MulTPP(block_m, Nb, ldy, ldy)) + : nullptr; + auto mul_tpp_rem_ptr = fusion_type == WOQ_FUSE_MUL + ? 
std::make_shared>(MulTPP(rem, Nb, ldy, ldy)) + : nullptr; auto in0_ptr = GetVLAPtr(tin0, {Nc, Nb}); auto in1_ptr = GetVLAPtr(tin1, {Nc, Nb}); @@ -1830,14 +1852,21 @@ void qlinear_woq_affine_dequant_upfront_impl( auto tpp_linear_with_post_op = [&](at::Tensor& in, at::Tensor& out, int fuse_type = 0) { - if (fuse_type == FUSE_GELU_ERF) { + if (fuse_type == WOQ_FUSE_GELU_ERF) { tpp_linear_gelu(in, dqw, b, out); - } else if (fuse_type == FUSE_ADD || fuse_type == FUSE_ADD_ADD) { + } else if (fuse_type == WOQ_FUSE_GELU_TANH) { + tpp_linear_gelu_tanh(in, dqw, b, out); + } else if (fuse_type == WOQ_FUSE_RELU) { + tpp_linear_relu(in, dqw, b, out); + } else if (fuse_type == WOQ_FUSE_SILU) { + tpp_linear_silu(in, dqw, b, out); + } else if (fuse_type == WOQ_FUSE_MUL) { + tpp_linear_mul(in, tin0, dqw, b, out); + } else if ( + fuse_type == WOQ_FUSE_ADD || fuse_type == WOQ_FUSE_ADD_ADD) { TLA_ASSERT( false, "fuse_type should not be ADD or ADD_ADD since it's slower than aten add"); - } else if (fuse_type == FUSE_GELU_TANH) { - tpp_linear_gelu_tanh(in, dqw, b, out); } else { tpp_linear_bias(in, dqw, b, out); } @@ -1890,7 +1919,7 @@ void qlinear_woq_affine_dequant_upfront_impl( auto in_ptr = GetVLAPtr(y_gemm, {Nc, Nb}); auto out_ptr = GetVLAPtr(y, {Nc, Nb}); // Convert y to T and handle post ops - if (fusion_type == 0) { + if (fusion_type == WOQ_FUSE_NONE) { post_loop([&](int* ind) { int m = ind[0], nc = ind[1]; if (m + block_m <= M) { @@ -1899,7 +1928,7 @@ void qlinear_woq_affine_dequant_upfront_impl( cvt_y_rem_tpp(in_ptr[m][nc], out_ptr[m][nc]); } }); - } else if (fusion_type == FUSE_GELU_ERF) { + } else if (fusion_type == WOQ_FUSE_GELU_ERF) { post_loop([&](int* ind) { int m = ind[0], nc = ind[1]; if (m + block_m <= M) { @@ -1910,7 +1939,7 @@ void qlinear_woq_affine_dequant_upfront_impl( (*gelu_fwd_tpp_rem_ptr)(out_ptr[m][nc], out_ptr[m][nc]); } }); - } else if (fusion_type == FUSE_GELU_TANH) { + } else if (fusion_type == WOQ_FUSE_GELU_TANH) { post_loop([&](int* ind) { int m = ind[0], nc = ind[1]; if (m + block_m <= M) { @@ -1922,7 +1951,29 @@ void qlinear_woq_affine_dequant_upfront_impl( out_ptr[m][nc], out_ptr[m][nc]); } }); - } else if (fusion_type == FUSE_ADD) { + } else if (fusion_type == WOQ_FUSE_RELU) { + post_loop([&](int* ind) { + int m = ind[0], nc = ind[1]; + if (m + block_m <= M) { + cvt_y_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*relu_fwd_tpp_ptr)(out_ptr[m][nc], out_ptr[m][nc]); + } else { + cvt_y_rem_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*relu_fwd_tpp_rem_ptr)(out_ptr[m][nc], out_ptr[m][nc]); + } + }); + } else if (fusion_type == WOQ_FUSE_SILU) { + post_loop([&](int* ind) { + int m = ind[0], nc = ind[1]; + if (m + block_m <= M) { + cvt_y_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*silu_fwd_tpp_ptr)(out_ptr[m][nc], out_ptr[m][nc]); + } else { + cvt_y_rem_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*silu_fwd_tpp_rem_ptr)(out_ptr[m][nc], out_ptr[m][nc]); + } + }); + } else if (fusion_type == WOQ_FUSE_ADD) { post_loop([&](int* ind) { int m = ind[0], nc = ind[1]; if (m + block_m <= M) { @@ -1935,7 +1986,7 @@ void qlinear_woq_affine_dequant_upfront_impl( out_ptr[m][nc], in0_ptr[m][nc], out_ptr[m][nc]); } }); - } else if (fusion_type == FUSE_ADD_ADD) { + } else if (fusion_type == WOQ_FUSE_ADD_ADD) { post_loop([&](int* ind) { int m = ind[0], nc = ind[1]; if (m + block_m <= M) { @@ -1952,10 +2003,24 @@ void qlinear_woq_affine_dequant_upfront_impl( out_ptr[m][nc], in1_ptr[m][nc], out_ptr[m][nc]); } }); + } else if (fusion_type == WOQ_FUSE_MUL) { + post_loop([&](int* ind) { + int m = ind[0], nc = ind[1]; + if (m 
+ block_m <= M) { + cvt_y_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*mul_tpp_ptr)( + out_ptr[m][nc], in0_ptr[m][nc], out_ptr[m][nc]); + } else { + cvt_y_rem_tpp(in_ptr[m][nc], out_ptr[m][nc]); + (*mul_tpp_rem_ptr)( + out_ptr[m][nc], in0_ptr[m][nc], out_ptr[m][nc]); + } + }); } } else { // Tout == TGemmOut // For add/add_add, using aten add is faster than TPP fused kernel - if (fusion_type == FUSE_ADD || fusion_type == FUSE_ADD_ADD) { + if (fusion_type == WOQ_FUSE_ADD || + fusion_type == WOQ_FUSE_ADD_ADD) { maybe_cvt_x_and_compute(y, 0); for (auto& tin : others_list) { y.add_(tin.view(y.sizes())); @@ -2106,40 +2171,58 @@ void qlinear_woq_affine_impl( auto gelu_erf_fwd_rem_tpp = GeluFwdTPP(BLOCK_M_rem, Nb, ldy, ldy); auto gelu_tanh_fwd_tpp = GeluTanhFwdTPP(BLOCK_M, Nb, ldy, ldy); auto gelu_tanh_fwd_rem_tpp = GeluTanhFwdTPP(BLOCK_M_rem, Nb, ldy, ldy); + auto relu_fwd_tpp = ReLUFwdTPP(BLOCK_M, Nb, ldy, ldy, false); + auto relu_fwd_rem_tpp = ReLUFwdTPP(BLOCK_M_rem, Nb, ldy, ldy, false); + auto silu_fwd_tpp = SiLUFwdTPP(BLOCK_M, Nb, ldy, ldy); + auto silu_fwd_rem_tpp = SiLUFwdTPP(BLOCK_M_rem, Nb, ldy, ldy); auto add_tpp = AddTPP(BLOCK_M, Nb, ldy, ldy); auto add_rem_tpp = AddTPP(BLOCK_M_rem, Nb, ldy, ldy); - bool is_fusion_type_addrelated = - fusion_type == FUSE_ADD || fusion_type == FUSE_ADD_ADD; + auto mul_tpp = MulTPP(BLOCK_M, Nb, ldy, ldy); + auto mul_rem_tpp = MulTPP(BLOCK_M_rem, Nb, ldy, ldy); + bool has_extra_input = fusion_type == WOQ_FUSE_ADD || + fusion_type == WOQ_FUSE_ADD_ADD || fusion_type == WOQ_FUSE_MUL; auto post_ops_fn = [&](int m, int nc) { Tout* y_ptr = (Tout*)py[m][nc]; - Tout* tin0_ptr = is_fusion_type_addrelated ? (Tout*)pin0[m][nc] : nullptr; - Tout* tin1_ptr = fusion_type == FUSE_ADD_ADD ? (Tout*)pin1[m][nc] : nullptr; - if (fusion_type == FUSE_GELU_ERF) { + Tout* tin0_ptr = has_extra_input ? (Tout*)pin0[m][nc] : nullptr; + Tout* tin1_ptr = + fusion_type == WOQ_FUSE_ADD_ADD ? (Tout*)pin1[m][nc] : nullptr; + if (fusion_type == WOQ_FUSE_GELU_ERF) { gelu_erf_fwd_tpp(y_ptr, y_ptr); - } else if (fusion_type == FUSE_ADD) { + } else if (fusion_type == WOQ_FUSE_GELU_TANH) { + gelu_tanh_fwd_tpp(y_ptr, y_ptr); + } else if (fusion_type == WOQ_FUSE_RELU) { + relu_fwd_tpp(y_ptr, y_ptr); + } else if (fusion_type == WOQ_FUSE_SILU) { + silu_fwd_tpp(y_ptr, y_ptr); + } else if (fusion_type == WOQ_FUSE_ADD) { add_tpp(y_ptr, tin0_ptr, y_ptr); - } else if (fusion_type == FUSE_ADD_ADD) { + } else if (fusion_type == WOQ_FUSE_ADD_ADD) { add_tpp(y_ptr, tin0_ptr, y_ptr); add_tpp(y_ptr, tin1_ptr, y_ptr); - } else if (fusion_type == FUSE_GELU_TANH) { - gelu_tanh_fwd_tpp(y_ptr, y_ptr); + } else if (fusion_type == WOQ_FUSE_MUL) { + mul_tpp(y_ptr, tin0_ptr, y_ptr); } }; auto post_ops_rem_fn = [&](int m, int nc) { Tout* y_ptr = (Tout*)py[m][nc]; - Tout* tin0_ptr = (fusion_type == FUSE_ADD || fusion_type == FUSE_ADD_ADD) - ? (Tout*)pin0[m][nc] - : nullptr; - Tout* tin1_ptr = fusion_type == FUSE_ADD_ADD ? (Tout*)pin1[m][nc] : nullptr; - if (fusion_type == FUSE_GELU_ERF) { + Tout* tin0_ptr = has_extra_input ? (Tout*)pin0[m][nc] : nullptr; + Tout* tin1_ptr = + fusion_type == WOQ_FUSE_ADD_ADD ? 
(Tout*)pin1[m][nc] : nullptr; + if (fusion_type == WOQ_FUSE_GELU_ERF) { gelu_erf_fwd_rem_tpp(y_ptr, y_ptr); - } else if (fusion_type == FUSE_ADD) { + } else if (fusion_type == WOQ_FUSE_GELU_TANH) { + gelu_tanh_fwd_rem_tpp(y_ptr, y_ptr); + } else if (fusion_type == WOQ_FUSE_RELU) { + relu_fwd_rem_tpp(y_ptr, y_ptr); + } else if (fusion_type == WOQ_FUSE_SILU) { + silu_fwd_rem_tpp(y_ptr, y_ptr); + } else if (fusion_type == WOQ_FUSE_ADD) { add_rem_tpp(y_ptr, tin0_ptr, y_ptr); - } else if (fusion_type == FUSE_ADD_ADD) { + } else if (fusion_type == WOQ_FUSE_ADD_ADD) { add_rem_tpp(y_ptr, tin0_ptr, y_ptr); add_rem_tpp(y_ptr, tin1_ptr, y_ptr); - } else if (fusion_type == FUSE_GELU_TANH) { - gelu_tanh_fwd_rem_tpp(y_ptr, y_ptr); + } else if (fusion_type == WOQ_FUSE_MUL) { + mul_rem_tpp(y_ptr, tin0_ptr, y_ptr); } }; @@ -4115,14 +4198,27 @@ at::Tensor qlinear_woq_affine( : bf16_idx; y = at::add(y, biases[b_index]); } - if (fusion_type == FUSE_GELU_ERF) { - y = at::gelu(y); - } else if (fusion_type == FUSE_ADD || fusion_type == FUSE_ADD_ADD) { + if (fusion_type == WOQ_FUSE_GELU_ERF) { + at::gelu_(y); + } else if (fusion_type == WOQ_FUSE_GELU_TANH) { + at::gelu_(y, "tanh"); + } else if (fusion_type == WOQ_FUSE_RELU) { + at::relu_(y); + } else if (fusion_type == WOQ_FUSE_SILU) { + at::silu_(y); + } else if (fusion_type == WOQ_FUSE_ADD || fusion_type == WOQ_FUSE_ADD_ADD) { for (auto& tin : others_list) { y = at::add(y, tin.view(y.sizes())); } - } else if (fusion_type == FUSE_GELU_TANH) { - y = at::gelu(y, "tanh"); + } else if (fusion_type == WOQ_FUSE_MUL) { + for (auto& tin : others_list) { + y = at::mul(y, tin.view(y.sizes())); + } + } else { + TORCH_CHECK( + fusion_type == WOQ_FUSE_NONE, + "WOQ: Unexpected fusion type: ", + fusion_type); } auto out_sizes = x.sizes().vec(); out_sizes.back() = N; @@ -4243,14 +4339,27 @@ at::Tensor qlinear_woq_affine( : bf16_idx; y = at::add(y, biases[b_index]); } - if (fusion_type == FUSE_GELU_ERF) { - y = at::gelu(y); - } else if (fusion_type == FUSE_ADD || fusion_type == FUSE_ADD_ADD) { + if (fusion_type == WOQ_FUSE_GELU_ERF) { + at::gelu_(y); + } else if (fusion_type == WOQ_FUSE_GELU_TANH) { + at::gelu_(y, "tanh"); + } else if (fusion_type == WOQ_FUSE_RELU) { + at::relu_(y); + } else if (fusion_type == WOQ_FUSE_SILU) { + at::silu_(y); + } else if (fusion_type == WOQ_FUSE_ADD || fusion_type == WOQ_FUSE_ADD_ADD) { for (auto& tin : others_list) { y = at::add(y, tin.view(y.sizes())); } - } else if (fusion_type == FUSE_GELU_TANH) { - y = at::gelu(y, "tanh"); + } else if (fusion_type == WOQ_FUSE_MUL) { + for (auto& tin : others_list) { + y = at::mul(y, tin.view(y.sizes())); + } + } else { + TORCH_CHECK( + fusion_type == WOQ_FUSE_NONE, + "WOQ: Unexpected fusion type: ", + fusion_type); } auto out_sizes = x.sizes().vec(); out_sizes.back() = N; diff --git a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp index 11e8787bc..627c38fce 100644 --- a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp +++ b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp @@ -311,8 +311,8 @@ at::Tensor run(ContextLinearWoq& context, const at::Tensor& input) { return res; } -// Called by IpexWoqLinearOpContext::run_eltwise -at::Tensor run_eltwise( +// Called by IpexWoqLinearOpContext::run_unary +at::Tensor run_unary( ContextLinearWoq& context, const at::Tensor& input, const c10::string_view& post_op, @@ -330,7 +330,7 @@ at::Tensor run_eltwise( auto input_ = input.contiguous(); // handle GPTQ with act-order input_ = _shuffle_input_channels_if_needed(context, input_); - 
return woq_linear_eltwise_kernel( + return woq_linear_unary_kernel( input_, context.at_weight_, context.weight_dtype_, @@ -345,40 +345,11 @@ at::Tensor run_eltwise( context.act_quant_mode_); } -// Called by IpexWoqLinearOpContext::run_add -at::Tensor run_add( - ContextLinearWoq& context, - const at::Tensor& input, - const std::vector& others) { - // TPP kernel packs weight to 4d (Nc, Kc, block_k, block_n) - auto w_k = context.weight_shape_[1]; - TORCH_CHECK( - input.size(input.dim() - 1) == w_k, - "WOQ linear: input and weight shapes do not match, got k = ", - input.size(input.dim() - 1), - " and ", - w_k, - " respectively."); - auto input_ = input.contiguous(); - // handle GPTQ with act-order - input_ = _shuffle_input_channels_if_needed(context, input_); - return woq_linear_add_kernel( - input_, - context.at_weight_, - context.weight_dtype_, - context.scales_list_, - context.zero_points_list_, - context.bias_list_, - context.group_size_, - context.lowp_mode_, - others, - context.act_quant_mode_); -} - -// Called by IpexWoqLinearOpContext::run_add_add -at::Tensor run_add_add( +// Called by IpexWoqLinearOpContext::run_binary +at::Tensor run_binary( ContextLinearWoq& context, const at::Tensor& input, + const c10::string_view& post_op, const std::vector& others) { // TPP kernel packs weight to 4d (Nc, Kc, block_k, block_n) auto w_k = context.weight_shape_[1]; @@ -392,7 +363,7 @@ at::Tensor run_add_add( auto input_ = input.contiguous(); // handle GPTQ with act-order input_ = _shuffle_input_channels_if_needed(context, input_); - return woq_linear_add_add_kernel( + return woq_linear_binary_kernel( input_, context.at_weight_, context.weight_dtype_, @@ -401,6 +372,7 @@ at::Tensor run_add_add( context.bias_list_, context.group_size_, context.lowp_mode_, + post_op, others, context.act_quant_mode_); } diff --git a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h index c68aa6e11..f03ded598 100644 --- a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h +++ b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h @@ -53,21 +53,17 @@ ContextLinearWoq create( at::Tensor run(ContextLinearWoq& context, const at::Tensor& input); -at::Tensor run_eltwise( +at::Tensor run_unary( ContextLinearWoq& context, const at::Tensor& input, const c10::string_view& post_op, const torch::List>& scalars, const c10::optional& algorithm); -at::Tensor run_add( - ContextLinearWoq& context, - const at::Tensor& input, - const std::vector& others); - -at::Tensor run_add_add( +at::Tensor run_binary( ContextLinearWoq& context, const at::Tensor& input, + const c10::string_view& post_op, const std::vector& others); at::Tensor pack(ContextLinearWoq& context, const at::Tensor& tensor); diff --git a/csrc/cpu/jit/cpu/kernels/OpContext.cpp b/csrc/cpu/jit/cpu/kernels/OpContext.cpp index b2bd4cf8c..3dfeeee14 100644 --- a/csrc/cpu/jit/cpu/kernels/OpContext.cpp +++ b/csrc/cpu/jit/cpu/kernels/OpContext.cpp @@ -399,27 +399,21 @@ at::Tensor IpexWoqLinearOpContext::run(const at::Tensor& input) { return torch_ipex::cpu::detail::woq_linear::run(op_context_, input); } -at::Tensor IpexWoqLinearOpContext::run_eltwise( +at::Tensor IpexWoqLinearOpContext::run_unary( const at::Tensor& input, const c10::string_view& post_op, const torch::List>& scalars, const c10::optional& algorithm) { - return torch_ipex::cpu::detail::woq_linear::run_eltwise( + return torch_ipex::cpu::detail::woq_linear::run_unary( op_context_, input, post_op, scalars, algorithm); } -at::Tensor IpexWoqLinearOpContext::run_add( - const at::Tensor& input, - const 
std::vector& others) { - return torch_ipex::cpu::detail::woq_linear::run_add( - op_context_, input, others); -} - -at::Tensor IpexWoqLinearOpContext::run_add_add( +at::Tensor IpexWoqLinearOpContext::run_binary( const at::Tensor& input, + const c10::string_view& post_op, const std::vector& others) { - return torch_ipex::cpu::detail::woq_linear::run_add_add( - op_context_, input, others); + return torch_ipex::cpu::detail::woq_linear::run_binary( + op_context_, input, post_op, others); } at::Tensor IpexWoqLinearOpContext::to_public(const at::Tensor& tensor) { diff --git a/csrc/cpu/jit/cpu/kernels/OpContext.h b/csrc/cpu/jit/cpu/kernels/OpContext.h index 2d5779213..a5a141b9c 100644 --- a/csrc/cpu/jit/cpu/kernels/OpContext.h +++ b/csrc/cpu/jit/cpu/kernels/OpContext.h @@ -404,18 +404,15 @@ class WoqLinearOpContext : public torch::jit::CustomClassHolder { virtual at::Tensor run(const at::Tensor& input) = 0; - virtual at::Tensor run_eltwise( + virtual at::Tensor run_unary( const at::Tensor& input, const c10::string_view& post_op, const torch::List>& scalars, const c10::optional& algorithm) = 0; - virtual at::Tensor run_add( - const at::Tensor& input, - const std::vector& others) = 0; - - virtual at::Tensor run_add_add( + virtual at::Tensor run_binary( const at::Tensor& input, + const c10::string_view& post_op, const std::vector& others) = 0; virtual at::Tensor to_public(const at::Tensor& tensor) = 0; @@ -462,18 +459,15 @@ class IpexWoqLinearOpContext final : public WoqLinearOpContext { virtual at::Tensor run(const at::Tensor& input) override; - virtual at::Tensor run_eltwise( + virtual at::Tensor run_unary( const at::Tensor& input, const c10::string_view& post_op, const torch::List>& scalars, const c10::optional& algorithm) override; - virtual at::Tensor run_add( - const at::Tensor& input, - const std::vector& others) override; - - virtual at::Tensor run_add_add( + virtual at::Tensor run_binary( const at::Tensor& input, + const c10::string_view& post_op, const std::vector& others) override; virtual at::Tensor to_public(const at::Tensor& tensor) override; diff --git a/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h b/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h index 3ab141406..f5bb644fe 100644 --- a/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h +++ b/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h @@ -250,7 +250,7 @@ inline void tpp_linear_no_bias( } } -template +template inline void tpp_linear_mul( const at::Tensor t_in, const at::Tensor t_in1, @@ -278,10 +278,10 @@ inline void tpp_linear_mul( auto t_wt_V = torch_ipex::tpp::wt_tensor_for_fwd(Nk, Hk, Nc, Hc, t_wt_); auto in = GetVLAPtr(t_in, {Nc, Hc}); - auto in1 = GetVLAPtr(t_in1, {Nk, Hk}); + auto in1 = GetVLAPtr(t_in1, {Nk, Hk}); auto wt_V = GetVLAPtr(t_wt_V, {Nc, Hc * Hk}); - auto bias = GetVLAPtr(t_bias, {Hk}); - auto out = GetVLAPtr(t_out, {Nk, Hk}); + auto bias = GetVLAPtr(t_bias, {Hk}); + auto out = GetVLAPtr(t_out, {Nk, Hk}); auto Ncb = Nc; auto BSb = 64L; @@ -290,16 +290,16 @@ inline void tpp_linear_mul( Ncb = NCB_BLOCK_SIZE; bool with_bias = (t_bias.numel() > 0); - auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); - auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); - auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); - auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); + auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); + auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); + auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); + auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); auto brgemm_tpp = 
SCOPEITGEMM( - (BrgemmTPP(BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); + (BrgemmTPP(BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); auto brgemm_tpp_rem = SCOPEITGEMM( - (BrgemmTPP(rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); - auto mul_tpp = SCOPEIT((MulTPP(BSb, Hk, K, K)), EW_MUL); - auto mul_tpp_rem = SCOPEIT((MulTPP(rem, Hk, K, K)), EW_MUL); + (BrgemmTPP(rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); + auto mul_tpp = SCOPEIT((MulTPP(BSb, Hk, K, K)), EW_MUL); + auto mul_tpp_rem = SCOPEIT((MulTPP(rem, Hk, K, K)), EW_MUL); { RECORD_SCOPE(tpp_linear_mul_krnl, {t_in, t_wt_V}); @@ -859,7 +859,7 @@ inline void tpp_linear_add( } } -template +template inline void tpp_linear_silu( const at::Tensor t_in, const at::Tensor t_wt, @@ -887,8 +887,8 @@ inline void tpp_linear_silu( auto in = GetVLAPtr(t_in, {Nc, Hc}); auto wt_V = GetVLAPtr(t_wt_V, {Nc, Hc * Hk}); - auto bias = GetVLAPtr(t_bias, {Hk}); - auto out = GetVLAPtr(t_out, {Nk, Hk}); + auto bias = GetVLAPtr(t_bias, {Hk}); + auto out = GetVLAPtr(t_out, {Nk, Hk}); auto Ncb = Nc; auto BSb = 64L; @@ -897,16 +897,16 @@ inline void tpp_linear_silu( Ncb = NCB_BLOCK_SIZE; bool with_bias = (t_bias.numel() > 0); - auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); - auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); - auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); - auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); + auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); + auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); + auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); + auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); auto brgemm_tpp = SCOPEITGEMM( - (BrgemmTPP(BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); + (BrgemmTPP(BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); auto brgemm_tpp_rem = SCOPEITGEMM( - (BrgemmTPP(rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); - auto silu_fwd_tpp = SCOPEIT(SiLUFwdTPP(BSb, Hk, K, K), ACT); - auto silu_fwd_tpp_rem = SCOPEIT(SiLUFwdTPP(rem, Hk, K, K), ACT); + (BrgemmTPP(rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); + auto silu_fwd_tpp = SCOPEIT(SiLUFwdTPP(BSb, Hk, K, K), ACT); + auto silu_fwd_tpp_rem = SCOPEIT(SiLUFwdTPP(rem, Hk, K, K), ACT); { RECORD_SCOPE(tpp_linear_silu_krnl, {t_in, t_wt_V}); @@ -951,7 +951,7 @@ inline void tpp_linear_silu( } } -template +template inline void tpp_linear_relu( const at::Tensor t_in, const at::Tensor t_wt, @@ -979,8 +979,8 @@ inline void tpp_linear_relu( auto in = GetVLAPtr(t_in, {Nc, Hc}); auto wt_V = GetVLAPtr(t_wt_V, {Nc, Hc * Hk}); - auto bias = GetVLAPtr(t_bias, {Hk}); - auto out = GetVLAPtr(t_out, {Nk, Hk}); + auto bias = GetVLAPtr(t_bias, {Hk}); + auto out = GetVLAPtr(t_out, {Nk, Hk}); auto Ncb = Nc; auto BSb = 64L; @@ -989,16 +989,16 @@ inline void tpp_linear_relu( Ncb = NCB_BLOCK_SIZE; bool with_bias = (t_bias.numel() > 0); - auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); - auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); - auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); - auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); + auto copy_bias_tpp = SCOPEIT(CpyBiasTPP(BSb, Hk, K), BIAS); + auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); + auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); + auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); auto brgemm_tpp = SCOPEITGEMM( - (BrgemmTPP(BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); + (BrgemmTPP(BSb, Hk, Hc, Hc, Hk * Hc, 
C, Hk, K, 1.0, 0, Ncb))); auto brgemm_tpp_rem = SCOPEITGEMM( - (BrgemmTPP(rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); - auto relu_fwd_tpp = SCOPEIT(ReLUFwdTPP(BSb, Hk, K, K, false), ACT); - auto relu_fwd_tpp_rem = SCOPEIT(ReLUFwdTPP(rem, Hk, K, K, false), ACT); + (BrgemmTPP(rem, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb))); + auto relu_fwd_tpp = SCOPEIT(ReLUFwdTPP(BSb, Hk, K, K, false), ACT); + auto relu_fwd_tpp_rem = SCOPEIT(ReLUFwdTPP(rem, Hk, K, K, false), ACT); { RECORD_SCOPE(tpp_linear_relu_krnl, {t_in, t_wt_V}); diff --git a/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py b/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py index a5fc248a0..ce3b2eff2 100644 --- a/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py +++ b/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py @@ -271,6 +271,7 @@ def _init_from_mod(cls, mod, dtype): mod.out_features, mod.mp_group, mod.bias, # save the original bias value + mod.bias is not None, dtype=dtype, ) @@ -303,6 +304,9 @@ def _init_cls( act_quant_mode, ) qlinear.weight = qlinear._op_context.get_weight() + qlinear._lowp_mode = lowp_mode + qlinear._act_quant_mode = act_quant_mode + qlinear._group_size = group_size return qlinear @@ -337,6 +341,7 @@ def _init_from_mod(cls, mod, dtype): mod.rank, mod.world_size, mod.bias, # save the original bias value + mod.bias is not None, dtype=dtype, ) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py index ed036dac0..542455735 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py @@ -41,6 +41,15 @@ def forward(self, x): ), self.linear.out_features, ) + elif ( + self.woq + and hasattr(self.linear, "_op_context") + and self.linear._op_context is not None + ): + return torch.ops.torch_ipex.woq_linear_silu( + x, + self.linear._op_context.get_data_handle(), + ) else: # fallback path return nn.functional.silu(self.linear(x)) @@ -63,6 +72,15 @@ def forward(self, x): ), self.linear.out_features, ) + elif ( + self.woq + and hasattr(self.linear, "_op_context") + and self.linear._op_context is not None + ): + return torch.ops.torch_ipex.woq_linear_relu( + x, + self.linear._op_context.get_data_handle(), + ) else: # fallback path return nn.functional.relu(self.linear(x)) @@ -87,6 +105,16 @@ def forward(self, x, y): ), self.linear.out_features, ) + elif ( + self.woq + and hasattr(self.linear, "_op_context") + and self.linear._op_context is not None + ): + return torch.ops.torch_ipex.woq_linear_mul( + x, + self.linear._op_context.get_data_handle(), + [y], + ) else: # fallback path return self.linear(x) * y @@ -412,6 +440,22 @@ def forward(self, x): else x.new_empty(0) ), ) + elif ( + self.woq + and hasattr(self.linear_s, "_op_context") + and self.linear_s._op_context is not None + and hasattr(self.linear_m, "_op_context") + and self.linear_m._op_context is not None + ): + y = torch.ops.torch_ipex.woq_linear_silu( + x, + self.linear_s._op_context.get_data_handle(), + ) + return torch.ops.torch_ipex.woq_linear_mul( + x, + self.linear_m._op_context.get_data_handle(), + [y], + ) else: # fallback path return nn.functional.silu(self.linear_s(x)) * self.linear_m(x) @@ -438,5 +482,17 @@ def forward(self, x, y): self.linear.out_features, ) return x1 * y + elif ( + self.woq + and hasattr(self.linear, "_op_context") + and 
self.linear._op_context is not None + ): + return ( + torch.ops.torch_ipex.woq_linear_silu( + x, + self.linear._op_context.get_data_handle(), + ) + * y + ) else: # fallback path return nn.functional.silu(self.linear(x)) * y diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py index 8643f630a..d6b0befd4 100644 --- a/tests/cpu/test_quantization_default_recipe.py +++ b/tests/cpu/test_quantization_default_recipe.py @@ -1033,28 +1033,39 @@ def test(feature, has_bias): for shape, use_bias in cases: test(shape, use_bias) - def test_weight_only_quantization_gelu_fused_op(self): + def _test_weight_only_quantization_unary_fused_op_helper( + self, + post_op_module, + fused_op, + ): class Mod(nn.Module): def __init__(self, bias): super().__init__() self.linear = nn.Linear(64, 64, bias=bias) - self.gelu = nn.GELU() + self.post_op = post_op_module def forward(self, x): - return self.gelu(self.linear(x)) + return self.post_op(self.linear(x)) + weight_dtype_list = [ + WoqWeightDtype.INT8, + WoqWeightDtype.INT4, + WoqWeightDtype.NF4, + ] bias_list = [False, True] bf16_list = [False, True] batch_size_list = [4, 1024] - cases = itertools.product(bias_list, bf16_list, batch_size_list) - for bias, bf16, bs in cases: + cases = itertools.product( + weight_dtype_list, bias_list, bf16_list, batch_size_list + ) + for w_dtype, bias, bf16, bs in cases: with torch.cpu.amp.autocast( enabled=bf16, dtype=torch.bfloat16 if bf16 else None ): model = Mod(bias).eval() data = torch.rand(bs, 64) qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( - lowp_mode=2 + weight_dtype=w_dtype, lowp_mode=2 ) prepared_model = prepare( model, qconfig, example_inputs=data, inplace=False @@ -1062,50 +1073,39 @@ def forward(self, x): with torch.no_grad(): woq_model = convert(prepared_model) output1 = woq_model(data) - output2 = torch.ops.torch_ipex.woq_linear_gelu( + output2 = fused_op( data, woq_model.linear._op_context.get_data_handle() ) torch.testing.assert_close( output1, output2.to(output1.dtype), atol=1e-2, rtol=1e-4 ) + def test_weight_only_quantization_gelu_fused_op(self): + self._test_weight_only_quantization_unary_fused_op_helper( + nn.GELU(), torch.ops.torch_ipex.woq_linear_gelu + ) + def test_weight_only_quantization_new_gelu_fused_op(self): - class Mod(nn.Module): - def __init__(self, bias): - super().__init__() - self.linear = nn.Linear(64, 64, bias=bias) - self.gelu = nn.GELU(approximate="tanh") + self._test_weight_only_quantization_unary_fused_op_helper( + nn.GELU(approximate="tanh"), torch.ops.torch_ipex.woq_linear_new_gelu + ) - def forward(self, x): - return self.gelu(self.linear(x)) + def test_weight_only_quantization_relu_fused_op(self): + self._test_weight_only_quantization_unary_fused_op_helper( + nn.ReLU(), torch.ops.torch_ipex.woq_linear_relu + ) - bias_list = [False, True] - bf16_list = [False, True] - batch_size_list = [4, 1024] - cases = itertools.product(bias_list, bf16_list, batch_size_list) - for bias, bf16, bs in cases: - with torch.cpu.amp.autocast( - enabled=bf16, dtype=torch.bfloat16 if bf16 else None - ): - model = Mod(bias).eval() - data = torch.rand(bs, 64) - qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( - lowp_mode=2 - ) - prepared_model = prepare( - model, qconfig, example_inputs=data, inplace=False - ) - with torch.no_grad(): - woq_model = convert(prepared_model) - output1 = woq_model(data) - output2 = torch.ops.torch_ipex.woq_linear_new_gelu( - data, woq_model.linear._op_context.get_data_handle() - ) - 
torch.testing.assert_close( - output1, output2.to(output1.dtype), atol=1e-2, rtol=1e-4 - ) + def test_weight_only_quantization_silu_fused_op(self): + self._test_weight_only_quantization_unary_fused_op_helper( + nn.SiLU(), torch.ops.torch_ipex.woq_linear_silu + ) - def test_weight_only_quantization_add_fused_op(self): + def _test_weight_only_quantization_binary_fused_op_helper( + self, + num_extra_inputs, + post_op, + fused_op, + ): class Mod(nn.Module): def __init__(self, bias): super().__init__() @@ -1114,44 +1114,69 @@ def __init__(self, bias): def forward(self, x, others): y = self.linear(x) for o in others: - y = torch.add(y, o) + y = post_op(y, o) return y + weight_dtype_list = [ + WoqWeightDtype.INT8, + WoqWeightDtype.INT4, + WoqWeightDtype.NF4, + ] bias_list = [False, True] bf16_list = [False, True] - others_len_list = [1, 2] batch_size_list = [4, 1024] cases = itertools.product( - bias_list, bf16_list, others_len_list, batch_size_list + weight_dtype_list, bias_list, bf16_list, batch_size_list ) - for bias, bf16, others_len, bs in cases: + for w_dtype, bias, bf16, bs in cases: with torch.cpu.amp.autocast( enabled=bf16, dtype=torch.bfloat16 if bf16 else None ): model = Mod(bias).eval() data = torch.rand(bs, 64) - others = [torch.rand(bs, 64)] * others_len - fused_op = ( - torch.ops.torch_ipex.woq_linear_add - if others_len == 1 - else torch.ops.torch_ipex.woq_linear_add_add - ) + extra_inputs = [torch.rand(bs, 64) for _ in range(num_extra_inputs)] qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( - lowp_mode=2 + weight_dtype=w_dtype, lowp_mode=2 ) prepared_model = prepare( model, qconfig, example_inputs=data, inplace=False ) with torch.no_grad(): woq_model = convert(prepared_model) - output1 = woq_model(data, others) + output1 = woq_model(data, extra_inputs) output2 = fused_op( - data, woq_model.linear._op_context.get_data_handle(), others + data, + woq_model.linear._op_context.get_data_handle(), + extra_inputs, ) torch.testing.assert_close( output1, output2.to(output1.dtype), atol=1.5e-2, rtol=1e-3 ) + def test_weight_only_quantization_add_fused_op(self): + # linear - add + num_extra_inputs = 1 + self._test_weight_only_quantization_binary_fused_op_helper( + num_extra_inputs, + torch.add, + torch.ops.torch_ipex.woq_linear_add, + ) + # linear - add - add + num_extra_inputs = 2 + self._test_weight_only_quantization_binary_fused_op_helper( + num_extra_inputs, + torch.add, + torch.ops.torch_ipex.woq_linear_add_add, + ) + + def test_weight_only_quantization_mul_fused_op(self): + num_extra_inputs = 1 + self._test_weight_only_quantization_binary_fused_op_helper( + num_extra_inputs, + torch.mul, + torch.ops.torch_ipex.woq_linear_mul, + ) + def test_weight_only_quantization_lowp_mode_functionality(self): from intel_extension_for_pytorch.quantization import WoqLowpMode From f3d9d39ff1d90ffa9fd6dd3e9dda5449a178834f Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Mon, 27 May 2024 20:35:55 +0900 Subject: [PATCH 090/199] update dockerfile to use native python env (#2886) --- docker/Dockerfile.compile | 37 ++++------ examples/cpu/inference/python/llm/Dockerfile | 67 ++++++++++--------- .../python/llm/tools/env_activate.sh | 2 +- .../inference/python/llm/tools/env_setup.sh | 33 ++++++--- scripts/compile_bundle.sh | 15 +++-- 5 files changed, 86 insertions(+), 68 deletions(-) diff --git a/docker/Dockerfile.compile b/docker/Dockerfile.compile index b272765a0..cc699b4d5 100644 --- a/docker/Dockerfile.compile +++ b/docker/Dockerfile.compile @@ -13,7 +13,6 @@ RUN if [ -f 
/etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy. RUN apt update && \ apt full-upgrade -y && \ DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -y \ - sudo \ ca-certificates \ git \ curl \ @@ -22,32 +21,27 @@ RUN apt update && \ numactl \ gcc-12 \ g++-12 \ + python3 \ + python3-dev \ + python3-pip \ make RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \ update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \ - update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 + update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3 100 RUN apt clean && \ rm -rf /var/lib/apt/lists/* && \ if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi -RUN useradd -m ubuntu && \ - echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers -USER ubuntu -WORKDIR /home/ubuntu - -RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash miniconda.sh -b -p ./miniconda3 && \ - rm miniconda.sh && \ - echo "source ~/miniconda3/bin/activate" >> ./.bashrc +WORKDIR /root +ENV PATH=/root/.local/bin:${PATH} FROM base AS dev -COPY --chown=ubuntu:ubuntu . ./intel-extension-for-pytorch +COPY . ./intel-extension-for-pytorch RUN cp ./intel-extension-for-pytorch/scripts/compile_bundle.sh ./ && \ sed -i "s/VER_IPEX=.*/VER_IPEX=/" compile_bundle.sh -RUN . ./miniconda3/bin/activate && \ - conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \ - bash compile_bundle.sh && \ +RUN CC=gcc CXX=g++ bash compile_bundle.sh && \ cd intel-extension-for-pytorch && \ python -m pip install pyyaml && \ VER_TORCH=$(python tools/yaml_utils.py -f dependency_version.yml -d pytorch -k version) && \ @@ -58,18 +52,11 @@ RUN . ./miniconda3/bin/activate && \ echo ${VER_TORCH} | grep "dev" > /dev/null; TORCH_DEV=$?; URL_NIGHTLY=""; if [ ${TORCH_DEV} -eq 0 ]; then URL_NIGHTLY="nightly/"; fi; echo "#!/bin/bash\npython -m pip install torch==${VER_TORCH} torchvision==${VER_TORCHVISION} torchaudio==${VER_TORCHAUDIO} --index-url https://download.pytorch.org/whl/${URL_NIGHTLY}cpu" > torch_install.sh FROM base AS deploy -COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/dist ./wheels -COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh . -COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/torch_install.sh . -RUN . ./miniconda3/bin/activate && \ - conda create -y -n py310 python=3.10 && conda activate py310 && \ - bash ./torch_install.sh && rm ./torch_install.sh && \ +COPY --from=dev /root/intel-extension-for-pytorch/dist ./wheels +COPY --from=dev /root/torch_install.sh . 
+RUN bash ./torch_install.sh && rm ./torch_install.sh && \ python -m pip install ./wheels/*.whl && \ python -m pip install intel-openmp && \ - conda install -y jemalloc gperftools -c conda-forge && \ python -m pip cache purge && \ - conda clean -a -y && \ rm -rf ./wheels && \ - echo "conda activate py310" >> ./.bashrc && \ - ldpreload=$(bash get_libstdcpp_lib.sh) && echo "export LD_PRELOAD=${ldpreload}" >> ./.bashrc && rm get_libstdcpp_lib.sh && \ echo "echo \"**Note:** For better performance, please consider to launch workloads with command 'ipexrun'.\"" >> ./.bashrc diff --git a/examples/cpu/inference/python/llm/Dockerfile b/examples/cpu/inference/python/llm/Dockerfile index 41e29c42f..9f7dd58ec 100644 --- a/examples/cpu/inference/python/llm/Dockerfile +++ b/examples/cpu/inference/python/llm/Dockerfile @@ -13,55 +13,62 @@ RUN if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy. RUN apt update && \ apt full-upgrade -y && \ DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -y \ - sudo \ ca-certificates \ git \ curl \ wget \ vim \ + python3 \ + python3-dev \ + python3-pip \ numactl \ gcc-12 \ g++-12 \ - make + make \ + cmake RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \ update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 && \ update-alternatives --install /usr/bin/cc cc /usr/bin/gcc 100 && \ - update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 -RUN apt clean && \ - rm -rf /var/lib/apt/lists/* && \ - if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi + update-alternatives --install /usr/bin/c++ c++ /usr/bin/g++ 100 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python3 100 -RUN useradd -m ubuntu && \ - echo 'ubuntu ALL=(ALL) NOPASSWD: ALL' >> /etc/sudoers -USER ubuntu -WORKDIR /home/ubuntu - -RUN curl -fsSL -v -o miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash miniconda.sh -b -p ./miniconda3 && \ - rm miniconda.sh && \ - echo "source ~/miniconda3/bin/activate" >> ./.bashrc +WORKDIR /root +ENV PATH=/root/.local/bin:${PATH} # --build-arg COMPILE=ON to compile from source FROM base AS dev ARG COMPILE -COPY --chown=ubuntu:ubuntu . ./intel-extension-for-pytorch -RUN . ./miniconda3/bin/activate && \ - conda create -y -n compile_py310 python=3.10 && conda activate compile_py310 && \ - cd intel-extension-for-pytorch/examples/cpu/inference/python/llm && \ - if [ -z ${COMPILE} ]; then bash tools/env_setup.sh 6; else bash tools/env_setup.sh 2; fi +COPY . ./intel-extension-for-pytorch +RUN cd intel-extension-for-pytorch/examples/cpu/inference/python/llm && \ + export CC=gcc && export CXX=g++ && \ + if [ -z ${COMPILE} ]; then bash tools/env_setup.sh 6; else bash tools/env_setup.sh 2; fi && \ + unset CC && unset CXX FROM base AS deploy -COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/examples/cpu/inference/python/llm ./llm -RUN rm ./llm/tools/get_libstdcpp_lib.sh -COPY --from=dev --chown=ubuntu:ubuntu /home/ubuntu/intel-extension-for-pytorch/examples/cpu/inference/python/llm/tools/get_libstdcpp_lib.sh ./llm/tools/get_libstdcpp_lib.sh -RUN . 
./miniconda3/bin/activate && \ - conda create -y -n py310 python=3.10 && conda activate py310 && \ - echo "conda activate py310" >> ./.bashrc && \ +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive apt install --no-install-recommends -y \ + google-perftools \ + openssh-server \ + net-tools && \ + apt clean && \ + rm -rf /var/lib/apt/lists/* && \ + if [ -f /etc/apt/apt.conf.d/proxy.conf ]; then rm /etc/apt/apt.conf.d/proxy.conf; fi +COPY --from=dev /root/intel-extension-for-pytorch/examples/cpu/inference/python/llm ./llm +COPY --from=dev /root/intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh ./llm/tools +RUN cd /usr/lib/x86_64-linux-gnu/ && ln -s libtcmalloc.so.4 libtcmalloc.so && cd && \ echo "echo \"**Note:** For better performance, please consider to launch workloads with command 'ipexrun'.\"" >> ./.bashrc && \ cd ./llm && \ bash tools/env_setup.sh 1 && \ python -m pip cache purge && \ - conda clean -a -y && \ - sudo mv ./oneCCL_release /opt/oneCCL && \ - sudo chown -R root:root /opt/oneCCL && \ - sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ./tools/env_activate.sh + mv ./oneCCL_release /opt/oneCCL && \ + chown -R root:root /opt/oneCCL && \ + sed -i "s|ONECCL_PATH=.*|ONECCL_PATH=/opt/oneCCL|" ./tools/env_activate.sh && \ + LN=$(grep "Conda environment is not available." -n ./tools/env_activate.sh | cut -d ":" -f 1) && sed -i "${LN}s|.*| export LD_PRELOAD=\${LD_PRELOAD}:/usr/lib/x86_64-linux-gnu/libtcmalloc.so:/usr/local/lib/libiomp5.so|" ./tools/env_activate.sh +ARG PORT_SSH=22 +RUN mkdir /var/run/sshd && \ + sed -i "s/#Port.*/Port ${PORT_SSH}/" /etc/ssh/sshd_config && \ + echo "service ssh start" >> /root/.bashrc && \ + ssh-keygen -b 4096 -f /root/.ssh/id_rsa -N "" && \ + mv /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \ + echo "Host *\n Port ${PORT_SSH}\n IdentityFile /root/.ssh/id_rsa\n StrictHostKeyChecking no" > /root/.ssh/config +EXPOSE ${PORT_SSH} diff --git a/examples/cpu/inference/python/llm/tools/env_activate.sh b/examples/cpu/inference/python/llm/tools/env_activate.sh index 2ab5c5342..759c008f7 100644 --- a/examples/cpu/inference/python/llm/tools/env_activate.sh +++ b/examples/cpu/inference/python/llm/tools/env_activate.sh @@ -16,7 +16,7 @@ if [ $? -eq 0 ]; then # Tcmalloc is a recommended malloc implementation that emphasizes fragmentation avoidance and scalable concurrency support. export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libtcmalloc.so else - echo "Conda environment is not available. You need to set environment variable LD_PRELOAD to dynamic libraries of Intel OpenMP and TcMalloc manually." + echo "Conda environment is not available. You need to set environment variable LD_PRELOAD to dynamic libraries of Intel OpenMP and TcMalloc manually if they are not in library search paths." fi ONECCL_PATH=${BASEFOLDER}/../oneCCL_release diff --git a/examples/cpu/inference/python/llm/tools/env_setup.sh b/examples/cpu/inference/python/llm/tools/env_setup.sh index aa08caebb..5e568c3e6 100644 --- a/examples/cpu/inference/python/llm/tools/env_setup.sh +++ b/examples/cpu/inference/python/llm/tools/env_setup.sh @@ -32,7 +32,7 @@ if [ ! 
-f ${WHEELFOLDER}/lm_eval*.whl ] || fi # Check existance of required Linux commands -for CMD in conda gcc g++; do +for CMD in gcc g++; do command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" is required."; exit 1;) done @@ -131,12 +131,21 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then echo " Found GCC version $(gcc -dumpfullversion)" echo " Installing gcc and g++ 12.3 with conda" echo "" - conda install -y sysroot_linux-64 - conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge - if [ -z ${CONDA_BUILD_SYSROOT} ]; then - source ${CONDA_PREFIX}/etc/conda/activate.d/activate-gcc_linux-64.sh - source ${CONDA_PREFIX}/etc/conda/activate.d/activate-gxx_linux-64.sh - source ${CONDA_PREFIX}/etc/conda/activate.d/activate-binutils_linux-64.sh + set +e + command -v conda > /dev/null + EXIST_CONDA=$? + set -e + if [ ${EXIST_CONDA} -gt 0 ]; then + echo "[Error] Command \"conda\" is not available." + exit 5 + else + conda install -y sysroot_linux-64 + conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge + if [ -z ${CONDA_BUILD_SYSROOT} ]; then + source ${CONDA_PREFIX}/etc/conda/activate.d/activate-gcc_linux-64.sh + source ${CONDA_PREFIX}/etc/conda/activate.d/activate-gxx_linux-64.sh + source ${CONDA_PREFIX}/etc/conda/activate.d/activate-binutils_linux-64.sh + fi fi fi @@ -206,7 +215,15 @@ if [ $((${MODE} & 0x02)) -ne 0 ]; then cd intel-extension-for-pytorch/examples/cpu/inference/python/llm fi if [ $((${MODE} & 0x01)) -ne 0 ]; then - conda install -y gperftools -c conda-forge + set +e + command -v conda > /dev/null + EXIST_CONDA=$? + set -e + if [ ${EXIST_CONDA} -gt 0 ]; then + echo "[WARNING] Command \"conda\" is not available. Please install tcmalloc manually." + else + conda install -y gperftools -c conda-forge + fi bash ${AUX_INSTALL_SCRIPT} python -m pip install ${WHEELFOLDER}/*.whl rm -rf ${WHEELFOLDER} diff --git a/scripts/compile_bundle.sh b/scripts/compile_bundle.sh index b507ad683..b4ded6057 100644 --- a/scripts/compile_bundle.sh +++ b/scripts/compile_bundle.sh @@ -23,7 +23,7 @@ if [ $# -gt 0 ]; then fi # Check existance of required Linux commands -for CMD in conda git nproc; do +for CMD in git nproc; do command -v ${CMD} > /dev/null || (echo "Error: Command \"${CMD}\" not found." ; exit 1) done @@ -97,8 +97,11 @@ function ver_compare() { fi echo ${RET} } -GCC_CONDA=0 set +e +command -v conda > /dev/null +EXIST_CONDA=$? + +GCC_CONDA=0 command -v gcc > /dev/null EXIST_CC=$? command -v g++ > /dev/null @@ -121,7 +124,7 @@ else GCC_CONDA=1 else DIR_GCC=$(which gcc) - if [[ ${DIR_GCC} =~ ${CONDA_PREFIX} ]]; then + if [ ! -z ${CONDA_PREFIX} ] && [[ ${DIR_GCC} =~ ${CONDA_PREFIX} ]]; then GCC_CONDA=2 fi fi @@ -216,6 +219,10 @@ ABI=$(python -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))") # Compile individual component if [ ${GCC_CONDA} -eq 1 ]; then + if [ ${EXIST_CONDA} -gt 0 ]; then + echo "Command \"conda\" not found. Exit." + exit 2 + fi conda install -y sysroot_linux-64 conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge fi @@ -240,7 +247,7 @@ fi set +e command -v make > /dev/null if [ $? -gt 0 ]; then - conda install -y make -c conda-forge + python -m pip install make fi set -e From 9089e1730b03df6296f1bf733d177f24524caa6c Mon Sep 17 00:00:00 2001 From: Xu Han Date: Tue, 28 May 2024 08:48:39 +0800 Subject: [PATCH 091/199] rename USE_CCL build option. 
(#2935) (#2936) --- cmake/cpu/Options.cmake | 4 ++-- csrc/cpu/CMakeLists.txt | 12 ++++++------ csrc/cpu/aten/CollectiveCommunicationPrimitive.cpp | 2 +- csrc/cpu/aten/CollectiveCommunicationPrimitive.h | 2 +- csrc/cpu/aten/ShmAllReduceAdd.cpp | 2 +- csrc/cpu/aten/ShmAllReduceAdd.h | 2 +- .../kernels/CollectiveCommunicationPrimitiveKrnl.cpp | 2 +- csrc/cpu/aten/kernels/SHMAllreduceAddKrnl.cpp | 2 +- csrc/cpu/comm/comm.cpp | 12 ++++++------ csrc/cpu/comm/messager.h | 2 +- setup.py | 2 +- 11 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cmake/cpu/Options.cmake b/cmake/cpu/Options.cmake index fffe745ff..99cae2bbd 100644 --- a/cmake/cpu/Options.cmake +++ b/cmake/cpu/Options.cmake @@ -15,7 +15,7 @@ endif() if(WIN32) set(USE_SHM OFF) - set(USE_CCL OFF) + set(BUILD_CPU_WITH_ONECCL OFF) endif() @@ -53,7 +53,7 @@ function (print_cpu_config_summary) message(STATUS " IPEX_DISP_OP : ${IPEX_DISP_OP}") message(STATUS " BUILD_XSMM_VIA_CMAKE : ${BUILD_LIBXSMM_VIA_CMAKE}") message(STATUS " USE_LIBXSMM : ${USE_LIBXSMM}") - message(STATUS " USE_CCL : ${USE_CCL}") + message(STATUS " BUILD_CPU_WITH_ONECCL : ${BUILD_CPU_WITH_ONECCL}") message(STATUS " USE_SHM : ${USE_SHM}") message(STATUS "") message(STATUS "********************************") diff --git a/csrc/cpu/CMakeLists.txt b/csrc/cpu/CMakeLists.txt index 460621a3a..7a11f25bd 100644 --- a/csrc/cpu/CMakeLists.txt +++ b/csrc/cpu/CMakeLists.txt @@ -13,7 +13,7 @@ set(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) #find_package(TorchCCL REQUIRED) # Find OneCCL Lib set(DEPENDS_LIB) -if(USE_CCL) +if(BUILD_CPU_WITH_ONECCL) include(${IPEX_ROOT_DIR}/cmake/Modules/FindoneCCL.cmake) # Find OneCCL Lib link_directories(${IPEX_CPU_CPP_THIRD_PARTY_ROOT}/oneCCL/deps/mpi/lib) @@ -44,9 +44,9 @@ if(USE_LIBXSMM) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_LIBXSMM") endif(USE_LIBXSMM) -if(USE_CCL) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_CCL") -endif(USE_CCL) +if(BUILD_CPU_WITH_ONECCL) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DBUILD_CPU_WITH_ONECCL") +endif(BUILD_CPU_WITH_ONECCL) if(USE_SHM) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_SHM") @@ -144,9 +144,9 @@ target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${ONEDNN_GENERATED_INCLUDE} target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${IPEX_CPU_CPP_THIRD_PARTY_ROOT}/ideep/include) target_include_directories(${PLUGIN_NAME_CPU} PUBLIC ${PYTHON_INCLUDE_DIR}) -if(USE_CCL) +if(BUILD_CPU_WITH_ONECCL) target_link_libraries(${PLUGIN_NAME_CPU} PUBLIC ${DEPENDS_LIB}) -endif(USE_CCL) +endif(BUILD_CPU_WITH_ONECCL) include(${IPEX_ROOT_DIR}/cmake/ClangFormat.cmake) if(CLANG_FORMAT) diff --git a/csrc/cpu/aten/CollectiveCommunicationPrimitive.cpp b/csrc/cpu/aten/CollectiveCommunicationPrimitive.cpp index 757f54c4c..3021a12e2 100644 --- a/csrc/cpu/aten/CollectiveCommunicationPrimitive.cpp +++ b/csrc/cpu/aten/CollectiveCommunicationPrimitive.cpp @@ -1,4 +1,4 @@ -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL #include "CollectiveCommunicationPrimitive.h" #include #include diff --git a/csrc/cpu/aten/CollectiveCommunicationPrimitive.h b/csrc/cpu/aten/CollectiveCommunicationPrimitive.h index ef3904d5c..5498216ff 100644 --- a/csrc/cpu/aten/CollectiveCommunicationPrimitive.h +++ b/csrc/cpu/aten/CollectiveCommunicationPrimitive.h @@ -1,5 +1,5 @@ #pragma once -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL #include #include diff --git a/csrc/cpu/aten/ShmAllReduceAdd.cpp b/csrc/cpu/aten/ShmAllReduceAdd.cpp index a66cfbb2f..29d417dfd 100644 --- a/csrc/cpu/aten/ShmAllReduceAdd.cpp +++ b/csrc/cpu/aten/ShmAllReduceAdd.cpp @@ 
-1,4 +1,4 @@ -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL #include "ShmAllReduceAdd.h" #include #include diff --git a/csrc/cpu/aten/ShmAllReduceAdd.h b/csrc/cpu/aten/ShmAllReduceAdd.h index ec0ae1d00..f4151aa03 100644 --- a/csrc/cpu/aten/ShmAllReduceAdd.h +++ b/csrc/cpu/aten/ShmAllReduceAdd.h @@ -1,6 +1,6 @@ #pragma once -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL #include #include diff --git a/csrc/cpu/aten/kernels/CollectiveCommunicationPrimitiveKrnl.cpp b/csrc/cpu/aten/kernels/CollectiveCommunicationPrimitiveKrnl.cpp index 825376c76..73f62070c 100644 --- a/csrc/cpu/aten/kernels/CollectiveCommunicationPrimitiveKrnl.cpp +++ b/csrc/cpu/aten/kernels/CollectiveCommunicationPrimitiveKrnl.cpp @@ -1,4 +1,4 @@ -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL #include #include #include diff --git a/csrc/cpu/aten/kernels/SHMAllreduceAddKrnl.cpp b/csrc/cpu/aten/kernels/SHMAllreduceAddKrnl.cpp index 7151448a0..06f3a58e5 100644 --- a/csrc/cpu/aten/kernels/SHMAllreduceAddKrnl.cpp +++ b/csrc/cpu/aten/kernels/SHMAllreduceAddKrnl.cpp @@ -1,4 +1,4 @@ -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL #include #include #include diff --git a/csrc/cpu/comm/comm.cpp b/csrc/cpu/comm/comm.cpp index c9d69a769..5f92279b7 100644 --- a/csrc/cpu/comm/comm.cpp +++ b/csrc/cpu/comm/comm.cpp @@ -5,28 +5,28 @@ namespace torch_ipex { namespace cpu { int get_rank() { -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL return Messenger::getInstance().getRank(); #else - TORCH_CHECK(false, "USE_CCL is not enabled."); + TORCH_CHECK(false, "BUILD_CPU_WITH_ONECCL is not enabled."); return 0; #endif } int get_world_size() { -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL return Messenger::getInstance().getSize(); #else - TORCH_CHECK(false, "USE_CCL is not enabled."); + TORCH_CHECK(false, "BUILD_CPU_WITH_ONECCL is not enabled."); return 0; #endif } void barrier() { -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL Messenger::getInstance().barrier(); #else - TORCH_CHECK(false, "USE_CCL is not enabled."); + TORCH_CHECK(false, "BUILD_CPU_WITH_ONECCL is not enabled."); return; #endif } diff --git a/csrc/cpu/comm/messager.h b/csrc/cpu/comm/messager.h index 0e9e16af9..210af46ce 100644 --- a/csrc/cpu/comm/messager.h +++ b/csrc/cpu/comm/messager.h @@ -1,5 +1,5 @@ #pragma once -#ifdef USE_CCL +#ifdef BUILD_CPU_WITH_ONECCL #include #include diff --git a/setup.py b/setup.py index 22eefef8d..7363c9e6f 100644 --- a/setup.py +++ b/setup.py @@ -802,7 +802,7 @@ def run(self): else: build_option_cpu = {**build_option_common, "BUILD_MODULE_TYPE": "CPU"} if _get_build_target() in ["develop", "python"]: - build_option_cpu["USE_CCL"] = "ON" + build_option_cpu["BUILD_CPU_WITH_ONECCL"] = "ON" build_option_cpu["USE_SHM"] = "ON" build_option_cpu["ENABLE_MPI_TESTS"] = "OFF" build_option_cpu["BUILD_REG_TESTS"] = "OFF" From 40bcbb6b7ddd1d8dbe49778f194d94063579880e Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Tue, 28 May 2024 13:24:36 +0800 Subject: [PATCH 092/199] fix rope inplace and pagedatten input (#2938) --- .../models/cpu/fusions/mha_fusion.py | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py index 2de1e0ddb..df3d32a10 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py @@ -80,8 +80,11 @@ def rotary_embedding( if query.dim() == 3: input_3d = True - query = 
query.unsqueeze(0) - key = key.unsqueeze(0) + query_ = query.unsqueeze(0) + key_ = key.unsqueeze(0) + else: + query_ = query + key_ = key if rotary_half: offset = rotary_dim // 2 @@ -111,8 +114,8 @@ def rotary_embedding( .view(-1, head_dim) ) - query, _, _ = torch.ops.torch_ipex.rotary_position_embedding( - query, + query_, _, _ = torch.ops.torch_ipex.rotary_position_embedding( + query_, sin_cos, position_ids, num_head, @@ -121,8 +124,8 @@ def rotary_embedding( rotary_dim, ) - key, _, _ = torch.ops.torch_ipex.rotary_position_embedding( - key, + key_, _, _ = torch.ops.torch_ipex.rotary_position_embedding( + key_, sin_cos, position_ids, num_kv_head, @@ -131,9 +134,11 @@ def rotary_embedding( rotary_dim, ) if input_3d: - query = query.view([-1, num_head, head_dim]) - key = key.view([-1, num_kv_head, head_dim]) - + query_ = query_.view([-1, num_head, head_dim]) + key_ = key_.view([-1, num_kv_head, head_dim]) + # keep the inplace context as used in TGI + query.copy_(query_) + key.copy_(key_) return query, key @@ -338,7 +343,11 @@ class _IPEXPagedAttentionCPU: @classmethod def reshape_and_cache(cls, key, value, key_cache, value_cache, slot_mapping): torch.ops.torch_ipex.reshape_and_cache( - key, value, key_cache, value_cache, slot_mapping + key, + value, + key_cache, + value_cache, + slot_mapping.int() if slot_mapping.dtype is torch.long else slot_mapping, ) @classmethod From 48c4e5c52904468abb1efe3f7226aeb8dac0030e Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 30 May 2024 11:44:14 +0800 Subject: [PATCH 093/199] update oneDNN to 7444492423 on rls-v3.5 (#2944) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 7d2635dc9..6dde4eee3 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 7d2635dc94a53637287c4c144ff0618f3472e2c1 +Subproject commit 6dde4eee367757e838a32d9a68ae0227c02c83ad From cd1f0ce34748336e221e9d89a05fe325482948da Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Thu, 30 May 2024 15:59:05 +0800 Subject: [PATCH 094/199] [Paged Attention]Eanble flash decoding and use a new data layout (#2929) * Eanble flash decoding and use a new data layout to get better data locality --- csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp | 464 ++++++++++++------- tests/cpu/test_paged_attention.py | 39 +- 2 files changed, 321 insertions(+), 182 deletions(-) diff --git a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp index 0696c9d10..964af1590 100644 --- a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp @@ -1,10 +1,14 @@ #include +#include +#include #include #include #include #include #include "vec/vec.h" +#define PARTITION_SIZE 512 + namespace torch_ipex { namespace cpu { @@ -56,6 +60,95 @@ inline void mul_attenion_weights_and_value_of_head( #endif } +// 1) out = exp(a - val) +// 2) val = sum(out) +template +inline void _exp_reduce_sum_fusion_kernel( + T1* a, + const int& size, + T2* out, + T1& val) { + auto vec_size = at::vec::Vectorized::size(); + auto vec_max = at::vec::Vectorized(val); + T1 tmp_sum = 0; + auto vec_tmp_sum = at::vec::Vectorized(tmp_sum); + long i = 0; + for (; i < vec_size * (size / vec_size); i += vec_size) { + auto tmp0 = at::vec::Vectorized::loadu(a + i); + auto tmp1 = tmp0 - vec_max; + auto tmp2 = tmp1.exp_u20(); + vec_tmp_sum += tmp2; + tmp2.store(out + i); + } + tmp_sum = at::vec::vec_reduce_all( + [](at::vec::Vectorized& x, at::vec::Vectorized& y) { + return x + y; 
+ }, + vec_tmp_sum); + for (; i < size; i++) { + auto tmp0 = a[i]; + auto tmp1 = tmp0 - val; + auto tmp2 = exp(tmp1); + tmp_sum += tmp2; + out[i] = tmp2; + } + val = tmp_sum; +} + +// 1) out = a * scale + alibi_mask +// 2) max = max(out) +template +inline void _mul_alibi_reduce_max_fusion_kernel( + scalar_t* a, + const scalar_t& scale, + const int& size, + scalar_t* out, + scalar_t& max, + const int& token_start, + const int& context_len, + const scalar_t& alibi_slope) { + for (auto i = 0; i < size; i++) { + a[i] = a[i] * scale; + auto alibi_slopes_val = alibi_slope * (i + token_start + 1 - context_len); + a[i] += alibi_slopes_val; + max = std::max(max, a[i]); + } +} + +// 1) out = a * scale +// 2) max = max(out) +template +inline void _mul_reduce_max_fusion_kernel( + scalar_t* a, + const scalar_t& scale, + const int& size, + scalar_t* out, + scalar_t& max) { + auto vec_size = at::vec::Vectorized::size(); + auto vec_scale = at::vec::Vectorized(scale); + scalar_t tmp_max = -std::numeric_limits::infinity(); + auto vec_tmp_max = at::vec::Vectorized(tmp_max); + long i = 0; + for (; i < vec_size * (size / vec_size); i += vec_size) { + auto tmp0 = at::vec::Vectorized::loadu(a + i); + auto tmp1 = tmp0 * vec_scale; + vec_tmp_max = at::vec::maximum(vec_tmp_max, tmp1); + tmp1.store(out + i); + } + tmp_max = at::vec::vec_reduce_all( + [](at::vec::Vectorized& x, at::vec::Vectorized& y) { + return at::vec::maximum(x, y); + }, + vec_tmp_max); + for (; i < size; i++) { + auto tmp0 = a[i]; + auto tmp1 = tmp0 * scale; + tmp_max = std::max(tmp_max, tmp1); + out[i] = tmp1; + } + max = tmp_max; +} + /** * Performs scale-dot-product for the next token based on cached key-value * attention. @@ -72,9 +165,6 @@ inline void mul_attenion_weights_and_value_of_head( * shape should be [num_blocks, block_size, num_heads, head_size]. * @param value_cache The pre-allocated buffer to store the value cache. The * shape should be [num_blocks, block_size, num_heads, head_size]. - * @param head_mapping Head mapping tensor [num_heads]. The mapping from the - * query head to the kv head to support GQA/MQA. The shape should be the number - * of query heads. * @param scale Scaling factor for attention weights. In general, it is: * float(1.0 / (head_size ** 0.5)). * @param block_tables Block tables tensor [num_seqs, max_num_blocks_per_seq]. 
@@ -91,7 +181,6 @@ void single_query_cached_kv_attention_kernel( at::Tensor& query, at::Tensor& key_cache, at::Tensor& value_cache, - at::Tensor& head_mapping, const double scale, at::Tensor& block_tables, at::Tensor& context_lens, @@ -102,7 +191,6 @@ void single_query_cached_kv_attention_kernel( auto query_ptr = query.data_ptr(); auto key_cache_ptr = key_cache.data_ptr(); auto value_cache_ptr = value_cache.data_ptr(); - auto head_mapping_ptr = head_mapping.data_ptr(); auto block_tables_ptr = block_tables.data_ptr(); auto context_lens_ptr = context_lens.data_ptr(); auto alibi_slopes_ptr = alibi_slopes.has_value() @@ -111,187 +199,235 @@ void single_query_cached_kv_attention_kernel( auto num_seqs = query.size(0); auto num_heads = query.size(1); auto head_size = query.size(2); - auto num_kv_heads = key_cache.size(2); + auto num_kv_heads = key_cache.size(1); + auto kv_head_group_size = num_heads / num_kv_heads; auto max_num_blocks_per_seq = block_tables.size(1); - auto attn_weights = at::empty( - {num_seqs, num_heads, max_context_len}, - query.options().dtype(at::ScalarType::Float)); - auto attn_weights_ptr = attn_weights.data_ptr(); + auto kv_block_strideN = key_cache.stride(0); - auto kv_block_strideP = key_cache.stride(1); - auto kv_block_strideH = key_cache.stride(2); + auto kv_block_strideP = key_cache.stride(2); + auto kv_block_strideH = key_cache.stride(1); + + auto out_strideN = out.stride(0); + auto out_strideH = out.stride(1); auto q_strideN = query.stride(0); auto q_strideH = query.stride(1); - auto attn_weights_strideN = attn_weights.stride(0); - auto attn_weights_strideH = attn_weights.stride(1); + auto max_num_partitions = + (max_context_len + PARTITION_SIZE - 1) / PARTITION_SIZE; + + auto max_logits = at::empty( + {num_seqs, num_heads, max_num_partitions + 1}, + query.options().dtype(at::ScalarType::Float)); + + auto exp_sum = at::empty( + {num_seqs, num_heads, max_num_partitions + 1}, + query.options().dtype(at::ScalarType::Float)); + + auto tmp_out = at::empty( + {num_seqs, num_heads, max_num_partitions, head_size}, + query.options().dtype(at::ScalarType::Float)); + + auto tmp_out_ptr = tmp_out.data_ptr(); + auto max_logits_ptr = max_logits.data_ptr(); + auto exp_sum_ptr = exp_sum.data_ptr(); + + auto max_logits_strideN = max_logits.stride(0); + auto max_logits_strideH = max_logits.stride(1); + auto exp_sum_strideN = exp_sum.stride(0); + auto exp_sum_strideH = exp_sum.stride(1); + auto tmp_out_strideN = tmp_out.stride(0); + auto tmp_out_strideH = tmp_out.stride(1); + auto tmp_out_strideS = tmp_out.stride(2); + + auto max_logic_blocks = (max_context_len + block_size - 1) / block_size; + + auto thread_numbers = omp_get_max_threads(); + auto max_parallel_parts = thread_numbers * 4; if (alibi_slopes.has_value()) { auto alibi_slopes_size = alibi_slopes.value().size(0); TORCH_CHECK( alibi_slopes_size == num_heads, "alibi_slopes size is not equal to num_heads"); } - -#pragma omp parallel for collapse(3) +#pragma omp parallel for collapse(3) schedule(static, 1) for (auto seq_id = 0; seq_id < num_seqs; seq_id++) { - for (auto head_id = 0; head_id < num_heads; head_id++) { - for (auto token_id = 0; token_id < max_context_len; token_id++) { + for (auto partition_id = 0; partition_id < max_num_partitions; + partition_id++) { + for (auto head_id = 0; head_id < num_heads; head_id++) { auto context_len = context_lens_ptr[seq_id]; - if (token_id >= context_len) + auto partition_start = partition_id * PARTITION_SIZE; + if (partition_start >= context_len) continue; - auto attn_w_pos = 
attn_weights_ptr + seq_id * attn_weights_strideN + - head_id * attn_weights_strideH + token_id; + auto partition_end = + std::min(partition_start + PARTITION_SIZE, context_len); + auto token_num = partition_end - partition_start; + auto block_num = (token_num + block_size - 1) / block_size; + auto logical_block_start = partition_start / block_size; + auto logical_block_end = logical_block_start + block_num; + auto need_update = block_num > 1; + auto kv_head_id = head_id / kv_head_group_size; auto q_ptr_start = query_ptr + seq_id * q_strideN + head_id * q_strideH; - auto block_id = block_tables_ptr - [seq_id * max_num_blocks_per_seq + token_id / block_size]; - auto block_offset = token_id % block_size; - auto k_cache_start = key_cache_ptr + block_id * kv_block_strideN + - block_offset * kv_block_strideP + - head_mapping_ptr[head_id] * kv_block_strideH; - reduce_head( - q_ptr_start, k_cache_start, attn_w_pos, head_size); + auto max_logits_offset = seq_id * max_logits_strideN + + head_id * max_logits_strideH + partition_id; + auto exp_sum_offset = + seq_id * exp_sum_strideN + head_id * exp_sum_strideH + partition_id; + auto tmp_out_start = tmp_out_ptr + seq_id * tmp_out_strideN + + head_id * tmp_out_strideH + partition_id * tmp_out_strideS; + float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0}; + auto logits_position = 0; + // 1)calculate the matmul(query, key) for this partition + for (auto logical_block_id = logical_block_start; + logical_block_id < logical_block_end; + logical_block_id++) { + auto physical_block_id = block_tables_ptr + [seq_id * max_num_blocks_per_seq + logical_block_id]; + auto tokens_in_block = + std::min(block_size, context_len - logical_block_id * block_size); + auto token_start = logical_block_id * block_size; + auto token_end = token_start + tokens_in_block; + for (auto token_id = token_start; token_id < token_end; token_id++) { + auto block_offset = token_id - token_start; + auto k_cache_start = key_cache_ptr + + physical_block_id * kv_block_strideN + + block_offset * kv_block_strideP + kv_head_id * kv_block_strideH; + reduce_head( + q_ptr_start, + k_cache_start, + &(logits[logits_position]), + head_size); + logits_position++; + } + } + // 2) calculate the max and exp_sum for this partition + auto partition_max = -std::numeric_limits::infinity(); + if (alibi_slopes_ptr != nullptr) { + _mul_alibi_reduce_max_fusion_kernel( + logits, + scale, + token_num, + logits, + partition_max, + partition_start, + context_len, + alibi_slopes_ptr[head_id]); + } else { + _mul_reduce_max_fusion_kernel( + logits, scale, token_num, logits, partition_max); + } + max_logits_ptr[max_logits_offset] = partition_max; + _exp_reduce_sum_fusion_kernel( + logits, token_num, logits, partition_max); + exp_sum_ptr[exp_sum_offset] = partition_max; + + // 3) calculate the matmul(exp(logits-partition_max), value) for this + // partition, need to divide the global exp_sum in the final result. 
+ logits_position = 0; + for (auto logical_block_id = logical_block_start; + logical_block_id < logical_block_end; + logical_block_id++) { + auto physical_block_id = block_tables_ptr + [seq_id * max_num_blocks_per_seq + logical_block_id]; + auto tokens_in_block = + std::min(block_size, context_len - logical_block_id * block_size); + auto token_start = logical_block_id * block_size; + auto token_end = token_start + tokens_in_block; + for (auto token_id = token_start; token_id < token_end; token_id++) { + auto block_offset = token_id - token_start; + auto v_cache_start = value_cache_ptr + + physical_block_id * kv_block_strideN + + block_offset * kv_block_strideP + kv_head_id * kv_block_strideH; + auto accumulated = logits_position > 0; + mul_attenion_weights_and_value_of_head( + logits[logits_position], + v_cache_start, + tmp_out_start, + head_size, + accumulated); + logits_position++; + } + } } } } -// div+add+softmax +// calculate the final output #pragma omp parallel for collapse(2) for (auto seq_id = 0; seq_id < num_seqs; seq_id++) { for (auto head_id = 0; head_id < num_heads; head_id++) { - auto max_val = -10000.0f; - float sum = 0.0f; + auto global_max = -std::numeric_limits::infinity(); + auto global_exp_sum = 0.0; auto context_len = context_lens_ptr[seq_id]; - auto attn_w_start = attn_weights_ptr + seq_id * attn_weights_strideN + - head_id * attn_weights_strideH; -#if defined(CPU_CAPABILITY_AVX512) - if (alibi_slopes_ptr != nullptr) { - auto alibi_slope = alibi_slopes_ptr[head_id]; - torch_ipex::cpu::kernel:: - _dil_div_add_alibi_and_reduce_max_fusion_kernel( - attn_w_start, - scale, - context_len, - attn_w_start, - max_val, - alibi_slope, - true); - } else { - torch_ipex::cpu::kernel:: - _dil_div_add_alibi_and_reduce_max_fusion_kernel( - attn_w_start, - scale, - context_len, - attn_w_start, - max_val, - 1, - false); + auto partition_num = (context_len + PARTITION_SIZE - 1) / PARTITION_SIZE; + // calculate the global max and exp_sum for this head + for (auto partition_id = 0; partition_id < max_num_partitions; + partition_id++) { + if (partition_id >= partition_num) + break; + auto max_logit = max_logits_ptr + [seq_id * max_logits_strideN + head_id * max_logits_strideH + + partition_id]; + global_max = std::max(global_max, max_logit); } - torch_ipex::cpu::kernel::_dil_exp_reduce_sum_fusion_kernel( - attn_w_start, context_len, attn_w_start, max_val); - torch_ipex::cpu::kernel::_dil_normalization_kernel( - attn_w_start, max_val, context_len, attn_w_start); + // update the partition 0 result with the global max + auto partition0_out_start = + tmp_out_ptr + seq_id * tmp_out_strideN + head_id * tmp_out_strideH; + auto max_logit0 = max_logits_ptr + [seq_id * max_logits_strideN + head_id * max_logits_strideH]; + float exp_val = expf(max_logit0 - global_max); + global_exp_sum += + exp_sum_ptr[seq_id * exp_sum_strideN + head_id * exp_sum_strideH] * + exp_val; + at::vec::Vectorized exp_val_vec0(exp_val); + at::vec::map( + [&](auto a) { return a * exp_val_vec0; }, + partition0_out_start, + partition0_out_start, + head_size); -#else - // div+add+softmax - for (auto token_id = 0; token_id < context_lens_ptr[seq_id]; token_id++) { - attn_w_start[token_id] = attn_w_start[token_id] * scale; - if (alibi_slopes_ptr != nullptr) { - auto alibi_slope = alibi_slopes_ptr[head_id]; - auto alibi_slopes_val = - alibi_slope * (token_id + 1 - context_lens_ptr[seq_id]); - attn_w_start[token_id] = attn_w_start[token_id] + alibi_slopes_val; - } - if (attn_w_start[token_id] > max_val) { - max_val = 
attn_w_start[token_id]; + // accumulate the partition 1 to partition n result into partition 0 + if (partition_num > 1) { + for (auto partition_id = 1; partition_id < partition_num; + partition_id++) { + if (partition_id * PARTITION_SIZE >= context_len) + break; + auto tmp_out_start = tmp_out_ptr + seq_id * tmp_out_strideN + + head_id * tmp_out_strideH + partition_id * tmp_out_strideS; + auto max_logit = max_logits_ptr + [seq_id * max_logits_strideN + head_id * max_logits_strideH + + partition_id]; + auto exp_sum = exp_sum_ptr + [seq_id * exp_sum_strideN + head_id * exp_sum_strideH + + partition_id]; + exp_val = expf(max_logit - global_max); + global_exp_sum += exp_sum * exp_val; + at::vec::Vectorized exp_val_vec(exp_val); + at::vec::map2( + [&](auto a, auto b) { return a + exp_val_vec * b; }, + partition0_out_start, + partition0_out_start, + tmp_out_start, + head_size); } } - // exp and sum - for (auto token_id = 0; token_id < context_lens_ptr[seq_id]; token_id++) { - attn_w_start[token_id] = exp(attn_w_start[token_id] - max_val); - sum += attn_w_start[token_id]; - } - // normalize - for (auto token_id = 0; token_id < context_lens_ptr[seq_id]; token_id++) { - attn_w_start[token_id] = attn_w_start[token_id] / sum; - } -#endif - } - } - - auto thread_numbers = omp_get_max_threads(); - auto private_attn_outs = - at::empty({thread_numbers, num_seqs, num_heads, head_size}, at::kFloat); - auto private_attn_out_flag = - at::zeros({thread_numbers, num_seqs, num_heads}, at::kByte); - auto flag_access = private_attn_out_flag.accessor(); - auto private_attn_out_ptr = private_attn_outs.data_ptr(); - auto private_attn_out_strideT = private_attn_outs.stride(0); - auto private_attn_out_strideN = private_attn_outs.stride(1); - auto private_attn_out_strideH = private_attn_outs.stride(2); - auto attn_out_strideN = out.stride(0); - auto attn_out_strideH = out.stride(1); -// mul and accumulate -#pragma omp parallel for collapse(3) - for (auto seq_id = 0; seq_id < num_seqs; seq_id++) { - for (auto head_id = 0; head_id < num_heads; head_id++) { - for (auto token_id = 0; token_id < max_context_len; token_id++) { - auto context_len = context_lens_ptr[seq_id]; - auto thread_id = omp_get_thread_num(); - if (token_id >= context_len) - continue; - auto attn_w = attn_weights_ptr - [seq_id * attn_weights_strideN + head_id * attn_weights_strideH + - token_id]; - auto block_id = block_tables_ptr - [seq_id * max_num_blocks_per_seq + token_id / block_size]; - auto block_offset = token_id % block_size; - auto v_cache_start = value_cache_ptr + block_id * kv_block_strideN + - block_offset * kv_block_strideP + - head_mapping_ptr[head_id] * kv_block_strideH; - auto attn_out_start = private_attn_out_ptr + - thread_id * private_attn_out_strideT + seq_id * attn_out_strideN + - head_id * attn_out_strideH; - mul_attenion_weights_and_value_of_head( - attn_w, - v_cache_start, - attn_out_start, - head_size, - flag_access[thread_id][seq_id][head_id]); - if (flag_access[thread_id][seq_id][head_id] == 0) { - flag_access[thread_id][seq_id][head_id] = 1; - } - } // for token_id - } // for head_id - } // for seq_id - { - RECORD_FUNCTION( - "ipex::single_query_cached_kv_attention::reduction_private_result", - c10::ArrayRef({})); -#pragma omp parallel for collapse(2) - for (auto seq_id = 0; seq_id < num_seqs; seq_id++) { - for (auto hi = 0; hi < num_heads; hi++) { - auto thr0_head_start = private_attn_out_ptr + - seq_id * private_attn_out_strideN + hi * private_attn_out_strideH; - if (flag_access[0][seq_id][hi] == 0) { - 
torch_ipex::cpu::kernel::zero_ker(thr0_head_start, head_size); - } - for (auto thread_id = 1; thread_id < thread_numbers; thread_id++) { - if (flag_access[thread_id][seq_id][hi] == 0) { - continue; - } - auto attn_out_head_offset = thread_id * private_attn_out_strideT + - seq_id * private_attn_out_strideN + hi * private_attn_out_strideH; - auto private_attn_out_start = - private_attn_out_ptr + attn_out_head_offset; - torch_ipex::cpu::kernel::add_ker( - thr0_head_start, private_attn_out_start, head_size); - } - auto out_start = out_ptr + (seq_id * num_heads + hi) * head_size; - torch_ipex::cpu::kernel::move_ker( - out_start, thr0_head_start, head_size); - } + // copy the partition 0 result into attn_outs + auto attn_out_start = + out_ptr + seq_id * out_strideN + head_id * out_strideH; + float inverse_global_sum = 1.0 / (global_exp_sum + 1e-8); + at::vec::Vectorized inverse_global_sum_vec(inverse_global_sum); + // rescale the partition 0 result with global exp_sum + at::vec::map( + [&](auto a) { return a * inverse_global_sum_vec; }, + partition0_out_start, + partition0_out_start, + head_size); + // copy the partition 0 result into attn_outs + at::vec::map( + [&](auto a) { return a; }, + attn_out_start, + partition0_out_start, + head_size); } } @@ -330,7 +466,7 @@ void reshape_and_cache_kernel( auto num_tokens = key.size(0); auto head_num = key.size(1); auto head_size = key.size(2); - auto block_size = key_cache.size(1); + auto block_size = key_cache.size(2); auto hidden_size = head_num * head_size; auto key_cache_ptr = key_cache.data_ptr(); auto key_ptr = key.data_ptr(); @@ -338,16 +474,16 @@ void reshape_and_cache_kernel( auto value_ptr = value.data_ptr(); auto slot_mapping_ptr = slot_mapping.data_ptr(); auto cache_strideN = key_cache.stride(0); - auto cache_strideP = key_cache.stride(1); - auto cache_strideH = key_cache.stride(2); + auto cache_strideP = key_cache.stride(2); + auto cache_strideH = key_cache.stride(1); auto state_strideN = key.stride(0); auto state_strideH = key.stride(1); #pragma omp parallel for collapse(2) for (auto ti = 0; ti < num_tokens; ti++) { for (auto hi = 0; hi < head_num; hi++) { - auto block_id = slot_mapping_ptr[ti] / block_size; + auto physical_block_id = slot_mapping_ptr[ti] / block_size; auto block_offset = slot_mapping_ptr[ti] % block_size; - auto cache_offset = block_id * cache_strideN + + auto cache_offset = physical_block_id * cache_strideN + block_offset * cache_strideP + hi * cache_strideH; auto state_offset = ti * state_strideN + hi * state_strideH; auto key_cache_start = key_cache_ptr + cache_offset; @@ -384,7 +520,6 @@ void single_query_cached_kv_attention_kernel_impl( query, key_cache, value_cache, - head_mapping, scale, block_tables, context_lens, @@ -397,7 +532,6 @@ void single_query_cached_kv_attention_kernel_impl( query, key_cache, value_cache, - head_mapping, scale, block_tables, context_lens, @@ -449,4 +583,4 @@ IPEX_REGISTER_DISPATCH( &reshape_and_cache_cpu_kernel_impl); } // namespace cpu -} // namespace torch_ipex +} // namespace torch_ipex \ No newline at end of file diff --git a/tests/cpu/test_paged_attention.py b/tests/cpu/test_paged_attention.py index cefe7f48c..25b3a8b19 100644 --- a/tests/cpu/test_paged_attention.py +++ b/tests/cpu/test_paged_attention.py @@ -21,14 +21,14 @@ def create_kv_caches( torch.manual_seed(seed) scale = head_size**-0.5 - key_cache_shape = (num_blocks, block_size, num_head, head_size) + key_cache_shape = (num_blocks, num_head, block_size, head_size) key_caches = [] for _ in range(num_layer): key_cache = 
torch.empty(size=key_cache_shape, dtype=dtype) key_cache.uniform_(-scale, scale) key_caches.append(key_cache) - value_cache_shape = (num_blocks, block_size, num_head, head_size) + value_cache_shape = (num_blocks, num_head, block_size, head_size) value_caches = [] for _ in range(num_layer): value_cache = torch.empty(size=value_cache_shape, dtype=dtype) @@ -64,9 +64,9 @@ def ref_single_query_cached_kv_attention( alibi_slopes: Optional[torch.Tensor], ) -> None: num_query_heads = query.shape[1] - num_kv_head = value_cache.shape[2] + num_kv_head = value_cache.shape[1] head_size = value_cache.shape[3] - block_size = value_cache.shape[1] + block_size = value_cache.shape[2] num_seqs = query.shape[0] block_tables = block_tables.cpu().tolist() @@ -79,15 +79,19 @@ def ref_single_query_cached_kv_attention( keys = [] values = [] for j in range(context_len): - block_number = int(block_table[j // block_size]) - block_offset = j % block_size - - k = key_cache[block_number, block_offset, :, :] - k = k.reshape(num_kv_head, head_size) - keys.append(k) - - v = value_cache[block_number, block_offset, :, :] - values.append(v) + key = torch.empty( + num_kv_head, head_size, dtype=query.dtype, device="cpu" + ) + value = torch.empty( + num_kv_head, head_size, dtype=query.dtype, device="cpu" + ) + for k in range(num_kv_head): + block_number = int(block_table[j // block_size]) + block_offset = j % block_size + key[k, :] = key_cache[block_number, k, block_offset, :] + value[k, :] = value_cache[block_number, k, block_offset, :] + keys.append(key) + values.append(value) keys = torch.stack(keys, dim=0) values = torch.stack(values, dim=0) if num_queries_per_kv > 1: @@ -281,10 +285,11 @@ def _test_reshape_and_cache_func( block_offsets = slot_mapping % block_size block_offsets = block_offsets.cpu().tolist() for i in range(num_token): - block_idx = block_indicies[i] - block_offset = block_offsets[i] - cloned_key_cache[block_idx, block_offset, :, :] = key[i] - cloned_value_cache[block_idx, block_offset, :, :] = value[i] + for j in range(num_head): + block_idx = block_indicies[i] + block_offset = block_offsets[i] + cloned_key_cache[block_idx, j, block_offset, :] = key[i][j] + cloned_value_cache[block_idx, j, block_offset, :] = value[i][j] assert torch.allclose(key_cache, cloned_key_cache) assert torch.allclose(value_cache, cloned_value_cache) From b44a1f19c0285da105970a5a97c10fd5f6b74e3b Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Thu, 30 May 2024 22:13:35 +0800 Subject: [PATCH 095/199] update .bom list with the whole folder coverage (#2937) --- .bom | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.bom b/.bom index bdbbe7cbf..7fd9dfb71 100644 --- a/.bom +++ b/.bom @@ -73,7 +73,6 @@ psutil python-idna scipy TorchDynamo -gloo gperftools numpy PyBind11 @@ -130,3 +129,22 @@ Python python3-distutils typing_extension sudo +caffe2 +datasets +diffusers +evaluate +fire +flash-attn +huggingface-hub +librosa +lm-eval +lmms-eval +mypy +nltk +peft +safetensors +scikit-learn +tqdm +transformers-stream_generator +typing-extensions +unidiff \ No newline at end of file From 18873438cd50878bd4320fa1b464cf8fe16cf757 Mon Sep 17 00:00:00 2001 From: Xu Han Date: Fri, 31 May 2024 22:51:53 +0800 Subject: [PATCH 096/199] add cpuid mcdt_on (#2945) --- csrc/cpu/isa/cpu_feature.cpp | 8 ++++++++ csrc/cpu/isa/cpu_feature.hpp | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/csrc/cpu/isa/cpu_feature.cpp b/csrc/cpu/isa/cpu_feature.cpp index c32d2e26d..2d0ebdb9a 
100644 --- a/csrc/cpu/isa/cpu_feature.cpp +++ b/csrc/cpu/isa/cpu_feature.cpp @@ -108,6 +108,12 @@ void CPUFeature::detect_intel_cpu_feature() { MICRO_CLASS_MEMBER(amx_fp16) = check_reg_bit(eax, 21); } + + if (max_sub_leaf >= 2) { + read_cpuidex(0x00000007, 2, &eax, &ebx, &ecx, &edx); + + MICRO_CLASS_MEMBER(mcdt_on) = check_reg_bit(edx, 5); + } } if (max_extend_id >= 0x80000001) { @@ -435,6 +441,8 @@ void CPUFeature::show_features() { MICRO_CLASS_PRINT_BOOL_STATUS(prefetchw); MICRO_CLASS_PRINT_BOOL_STATUS(prefetchwt1); + + MICRO_CLASS_PRINT_BOOL_STATUS(mcdt_on); #endif } } // namespace cpu diff --git a/csrc/cpu/isa/cpu_feature.hpp b/csrc/cpu/isa/cpu_feature.hpp index bb79f2ed3..598ad9cdf 100644 --- a/csrc/cpu/isa/cpu_feature.hpp +++ b/csrc/cpu/isa/cpu_feature.hpp @@ -87,6 +87,8 @@ class CPUFeature { MICRO_CLASS_MEMBER_DECL(avx512_bf16); MICRO_CLASS_MEMBER_DECL(avx512_vp2intersect); + MICRO_CLASS_MEMBER_DECL(mcdt_on); + public: MICRO_CLASS_CHECK_FUNC(avx512_f); MICRO_CLASS_CHECK_FUNC(avx512_cd); @@ -108,6 +110,8 @@ class CPUFeature { MICRO_CLASS_CHECK_FUNC(avx512_bf16); MICRO_CLASS_CHECK_FUNC(avx512_vp2intersect); + MICRO_CLASS_CHECK_FUNC(mcdt_on); + // AMX private: MICRO_CLASS_MEMBER_DECL(amx_bf16); From c238d37edf3fc1aa9781dde55b273739e403f72e Mon Sep 17 00:00:00 2001 From: Xu Han Date: Sat, 1 Jun 2024 19:41:54 +0800 Subject: [PATCH 097/199] delete old lint from doc. (#2946) --- CONTRIBUTING.md | 39 --------------------------------------- 1 file changed, 39 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d888a816c..23418d006 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -127,45 +127,6 @@ pytest test/cpu/test_nn.py -k Loss -v The above is an example of testing a change to all Loss functions: this command runs tests such as `TestNN.test_BCELoss` and `TestNN.test_MSELoss` and can be useful to save keystrokes. -### Local linting - -You can run the same linting steps that are used in CI locally via `make`: - -```bash -# Lint all files -make lint -j 6 # run lint (using 6 parallel jobs) - -# Lint only the files you have changed -make quicklint -j 6 -``` - -These jobs may require extra dependencies that aren't dependencies of Intelยฎ Extension for PyTorch\* itself, so you can install them via this command, which you should only have to run once: - -```bash -make setup_lint -``` - -To run a specific linting step, use one of these targets or see the [`Makefile`](Makefile) for a complete list of options. - -```bash -# Check for tabs, trailing newlines, etc. -make quick_checks - -make flake8 - -make mypy - -make cmakelint - -make clang-tidy -``` - -To run a lint only on changes, add the `CHANGED_ONLY` option: - -```bash -make CHANGED_ONLY=--changed-only -``` - ### C++ Unit Testing Intelยฎ Extension for PyTorch\* offers tests located in the `test/cpp` folder. These tests are written in C++ and use the Google Test testing framework. After compiling Intelยฎ Extension for PyTorch\* from source, the test runner binaries will be written to the `build/bin` folder. The command to run one of these tests is `./build/bin/FILENAME --gtest_filter=TESTSUITE.TESTNAME`, where `TESTNAME` is the name of the test you'd like to run and `TESTSUITE` is the suite that test is defined in. 
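Note on the flash-decoding change in PATCH 094 above: the new paged-attention kernel splits each sequence's context into PARTITION_SIZE chunks, keeps an unnormalized partial output plus the local max and exp-sum per chunk (max_logits, exp_sum, tmp_out), and only normalizes once all chunks are reduced. A minimal sketch of that reduction step, assuming NumPy; the helper name merge_partitions and its argument layout are illustrative only and do not appear in the patch:

    import numpy as np

    def merge_partitions(partials):
        # partials: list of (m_p, s_p, o_p) per partition, where
        #   m_p = max of the scaled logits in the partition
        #   s_p = sum(exp(logits - m_p))
        #   o_p = sum(exp(logits - m_p) * values)  (unnormalized partial output vector)
        M = max(m for m, _, _ in partials)                     # global max
        S = sum(s * np.exp(m - M) for m, s, _ in partials)     # global exp-sum
        acc = sum(o * np.exp(m - M) for m, _, o in partials)   # rescaled partial outputs
        return acc / S                                         # normalized attention output

This mirrors the final loop of the kernel, which rescales partition 0 by expf(max_logit0 - global_max), accumulates the remaining partitions with the same factor, and divides by global_exp_sum at the end.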
From 0ae7aa3cf03d9a760566d2f88da7d56ae8f54230 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Mon, 3 Jun 2024 10:59:49 +0800 Subject: [PATCH 098/199] update oneDNN to a7ed0bcbde on rls-v3.5 (#2949) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index 6dde4eee3..e2a639872 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 6dde4eee367757e838a32d9a68ae0227c02c83ad +Subproject commit e2a6398723651c8e09a387d63475c82fbcb0dcf5 From 1f0f4ec51c437a41309f538ae4e0c6b653c88375 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Mon, 3 Jun 2024 14:01:46 +0800 Subject: [PATCH 099/199] Add fused add_rmsnorm kernel (#2951) * Add add_rmsnorm fused op * Support add_back --- csrc/cpu/aten/RMSNorm.cpp | 13 ++ csrc/cpu/aten/RMSNorm.h | 16 ++- csrc/cpu/aten/kernels/RMSNormKrnl.cpp | 119 ++++++++++++++++++ csrc/cpu/vec/vec512/perf_kernel/rmsnorm.h | 103 +++++++++++++++ .../models/cpu/fusions/mha_fusion.py | 10 +- tests/cpu/test_rmsnorm.py | 92 +++++++++++++- 6 files changed, 340 insertions(+), 13 deletions(-) diff --git a/csrc/cpu/aten/RMSNorm.cpp b/csrc/cpu/aten/RMSNorm.cpp index bedb06497..791b6fa74 100644 --- a/csrc/cpu/aten/RMSNorm.cpp +++ b/csrc/cpu/aten/RMSNorm.cpp @@ -6,6 +6,7 @@ namespace torch_ipex { namespace cpu { IPEX_DEFINE_DISPATCH(rmsnorm_kernel_stub); +IPEX_DEFINE_DISPATCH(add_rmsnorm_kernel_stub); at::Tensor dil_RMSNorm( const at::Tensor& input, @@ -16,6 +17,15 @@ at::Tensor dil_RMSNorm( return rmsnorm_kernel_stub(kCPU, input, b, eps); } +at::Tensor add_RMSNorm( + const at::Tensor& input, + at::Tensor& input1, + const at::Tensor& b, + double eps, + bool add_back) { + return add_rmsnorm_kernel_stub(kCPU, input, input1, b, eps, add_back); +} + } // namespace cpu } // namespace torch_ipex @@ -24,5 +34,8 @@ namespace { TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { m.def("rmsnorm(Tensor input, Tensor weight, float eps) -> Tensor"); m.impl("rmsnorm", c10::DispatchKey::CPU, torch_ipex::cpu::dil_RMSNorm); + m.def( + "add_rmsnorm(Tensor input, Tensor input1, Tensor weight, float eps, bool add_back) -> Tensor"); + m.impl("add_rmsnorm", c10::DispatchKey::CPU, torch_ipex::cpu::add_RMSNorm); } } // namespace \ No newline at end of file diff --git a/csrc/cpu/aten/RMSNorm.h b/csrc/cpu/aten/RMSNorm.h index 10ede6a7d..ee081abe3 100644 --- a/csrc/cpu/aten/RMSNorm.h +++ b/csrc/cpu/aten/RMSNorm.h @@ -17,12 +17,26 @@ at::Tensor rmsnorm_kernel_impl( const at::Tensor& input, const at::Tensor& b, float eps); -} + +at::Tensor add_rmsnorm_kernel_impl( + const at::Tensor& input, + at::Tensor& input1, + const at::Tensor& b, + float eps, + bool add_back); // if true, store sum in input1 +} // namespace using rms_norm_kernel_fn = at::Tensor (*)(const at::Tensor&, const at::Tensor&, float); +using add_rms_norm_kernel_fn = at::Tensor (*)( + const at::Tensor&, + at::Tensor&, + const at::Tensor&, + float, + bool); IPEX_DECLARE_DISPATCH(rms_norm_kernel_fn, rmsnorm_kernel_stub); +IPEX_DECLARE_DISPATCH(add_rms_norm_kernel_fn, add_rmsnorm_kernel_stub); } // namespace cpu } // namespace torch_ipex diff --git a/csrc/cpu/aten/kernels/RMSNormKrnl.cpp b/csrc/cpu/aten/kernels/RMSNormKrnl.cpp index 8475d46dd..cdcd259f5 100644 --- a/csrc/cpu/aten/kernels/RMSNormKrnl.cpp +++ b/csrc/cpu/aten/kernels/RMSNormKrnl.cpp @@ -30,6 +30,38 @@ void RMSNormKernelImpl( } }); } + +template +void AddRMSNormKernelImpl( + const at::Tensor& a, + at::Tensor& b, + const at::Tensor& gamma, + int64_t M, + int64_t N, + float eps, + bool add_back, + at::Tensor& 
Y) { + DCHECK(a.numel() == M * N); + DCHECK(!gamma.defined() || gamma.numel() == M * N); + const T* a_data = a.data_ptr(); + T* b_data = b.data_ptr(); + const T1* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + T* Y_data = Y.data_ptr(); + at::parallel_for(0, M, 1, [&](int64_t start, int64_t end) { + for (const auto i : c10::irange(start, end)) { + const T* a_ptr = a_data + i * N; + T* b_ptr = b_data + i * N; + T* Y_ptr = Y_data + i * N; + if (add_back) { + kernel::_add_back_and_compute_rmsnorm( + a_ptr, b_ptr, N, eps, gamma_data, Y_ptr); + } else { + kernel::_add_and_compute_rmsnorm( + a_ptr, b_ptr, N, eps, gamma_data, Y_ptr); + } + } + }); +} #endif at::Tensor rmsnorm_kernel_impl( @@ -99,8 +131,95 @@ at::Tensor rmsnorm_kernel_impl( #endif } +at::Tensor add_rmsnorm_kernel_impl( + const at::Tensor& input, + at::Tensor& input1, + const at::Tensor& b, + float eps, + bool add_back) { + DCHECK(input.sizes() == input1.sizes()); +#if defined(CPU_CAPABILITY_AVX512) + const auto input_shape = input.sizes(); + const auto input_ndim = input.dim(); + const int axis = input_ndim - 1; + const int64_t M = + c10::multiply_integers(input_shape.cbegin(), input_shape.cbegin() + axis); + const int64_t N = + c10::multiply_integers(input_shape.cbegin() + axis, input_shape.cend()); + auto X = input.contiguous(); + if (add_back) { + DCHECK(input1.is_contiguous()); + } + auto X1 = add_back ? input1 : input1.contiguous(); + at::Tensor Y = at::native::empty_like( + X, + c10::nullopt /* dtype */, + c10::nullopt /* layout */, + c10::nullopt /* device */, + c10::nullopt /* pin_memory */, + at::MemoryFormat::Contiguous); + if (input.scalar_type() == at::ScalarType::Float && + b.scalar_type() == at::ScalarType::Float) { + AddRMSNormKernelImpl(X, X1, b, M, N, eps, add_back, Y); + } else if ( + input.scalar_type() == at::ScalarType::Float && + b.scalar_type() == at::ScalarType::BFloat16) { + AddRMSNormKernelImpl(X, X1, b, M, N, eps, add_back, Y); + } else if ( + input.scalar_type() == at::ScalarType::Float && + b.scalar_type() == at::ScalarType::Half) { + AddRMSNormKernelImpl(X, X1, b, M, N, eps, add_back, Y); + } else if ( + input.scalar_type() == at::ScalarType::BFloat16 && + b.scalar_type() == at::ScalarType::Float) { + AddRMSNormKernelImpl(X, X1, b, M, N, eps, add_back, Y); + } else if ( + input.scalar_type() == at::ScalarType::BFloat16 && + b.scalar_type() == at::ScalarType::BFloat16) { + AddRMSNormKernelImpl( + X, X1, b, M, N, eps, add_back, Y); + } else if ( + input.scalar_type() == at::ScalarType::BFloat16 && + b.scalar_type() == at::ScalarType::Half) { + AddRMSNormKernelImpl( + X, X1, b, M, N, eps, add_back, Y); + } else if ( + input.scalar_type() == at::ScalarType::Half && + b.scalar_type() == at::ScalarType::Half) { + AddRMSNormKernelImpl(X, X1, b, M, N, eps, add_back, Y); + } else if ( + input.scalar_type() == at::ScalarType::Half && + b.scalar_type() == at::ScalarType::BFloat16) { + AddRMSNormKernelImpl( + X, X1, b, M, N, eps, add_back, Y); + } else if ( + input.scalar_type() == at::ScalarType::Half && + b.scalar_type() == at::ScalarType::Float) { + AddRMSNormKernelImpl(X, X1, b, M, N, eps, add_back, Y); + } else { + TORCH_CHECK(false, "Unsupported input type"); + } + return Y; +#else + if (add_back) { + input1.add_(input); + auto X = input1.to(at::kFloat); + auto variance = at::mean(at::pow(X, 2), -1, true); + auto hidden_states = at::rsqrt(at::add(variance, eps)); + return at::mul(b, at::mul(X, hidden_states)).to(input.scalar_type()); + } + auto X = input.to(at::kFloat); + auto X1 = 
input1.to(at::kFloat); + auto X2 = X + X1; + auto variance = at::mean(at::pow(X2, 2), -1, true); + auto hidden_states = at::rsqrt(at::add(variance, eps)); + return at::mul(b, at::mul(X2, hidden_states)).to(input.scalar_type()); +#endif +} + } // namespace IPEX_REGISTER_DISPATCH(rmsnorm_kernel_stub, &rmsnorm_kernel_impl); +IPEX_REGISTER_DISPATCH(add_rmsnorm_kernel_stub, &add_rmsnorm_kernel_impl); } // namespace cpu } // namespace torch_ipex diff --git a/csrc/cpu/vec/vec512/perf_kernel/rmsnorm.h b/csrc/cpu/vec/vec512/perf_kernel/rmsnorm.h index 44415456c..995042931 100644 --- a/csrc/cpu/vec/vec512/perf_kernel/rmsnorm.h +++ b/csrc/cpu/vec/vec512/perf_kernel/rmsnorm.h @@ -55,6 +55,109 @@ void _compute_rmsnorm( } } +template +void _add_and_compute_rmsnorm( + const T* a_ptr, + const T* b_ptr, + const int& size, + float eps, + const T1* gamma_ptr, + T* out_ptr) { + auto vec_acc_pow = _mm512_set1_ps(0.0); + int i; + for (i = 0; i <= size - 16; i += 16) { + auto vec_a = _loadu(a_ptr + i); + auto vec_b = _loadu(b_ptr + i); + vec_a = vec_a + vec_b; + auto s = vec_a * vec_a; + vec_acc_pow += s; + } + if (i < size) { + __mmask16 mask = (1 << (size - i)) - 1; + auto vec_a = _maskz_loadu(a_ptr + i, mask); + auto vec_b = _maskz_loadu(b_ptr + i, mask); + vec_a = vec_a + vec_b; + auto s = vec_a * vec_a; + vec_acc_pow += s; + } + float var_val = _mm512_reduce_add_ps(vec_acc_pow) / static_cast(size); + float scale = float(1.0) / std::sqrt(var_val + eps); + auto vec_scale = _mm512_set1_ps(scale); + for (i = 0; i <= size - 16; i += 16) { + auto vec_input = _loadu(a_ptr + i); + auto vec_input1 = _loadu(b_ptr + i); + vec_input = vec_input + vec_input1; + auto vec_gamma = _mm512_set1_ps(1.0); + if (gamma_ptr) { + vec_gamma = _loadu(gamma_ptr + i); + } + auto vec_res = vec_input * vec_scale * vec_gamma; + _storeu(out_ptr + i, vec_res); + } + if (i < size) { + __mmask16 mask = (1 << (size - i)) - 1; + auto vec_input = _maskz_loadu(a_ptr + i, mask); + auto vec_input1 = _maskz_loadu(b_ptr + i, mask); + vec_input = vec_input + vec_input1; + auto vec_gamma = _mm512_set1_ps(1.0); + if (gamma_ptr) { + vec_gamma = _maskz_loadu(gamma_ptr + i, mask); + } + auto vec_res = vec_input * vec_scale * vec_gamma; + _mask_storeu(out_ptr + i, vec_res, mask); + } +} + +template +void _add_back_and_compute_rmsnorm( + const T* a_ptr, + T* b_ptr, + const int& size, + float eps, + const T1* gamma_ptr, + T* out_ptr) { + auto vec_acc_pow = _mm512_set1_ps(0.0); + int i; + for (i = 0; i <= size - 16; i += 16) { + auto vec_a = _loadu(a_ptr + i); + auto vec_b = _loadu(b_ptr + i); + vec_a = vec_a + vec_b; + _store_data(b_ptr + i, vec_a); + auto s = vec_a * vec_a; + vec_acc_pow += s; + } + if (i < size) { + __mmask16 mask = (1 << (size - i)) - 1; + auto vec_a = _maskz_loadu(a_ptr + i, mask); + auto vec_b = _maskz_loadu(b_ptr + i, mask); + vec_a = vec_a + vec_b; + _mask_store_data(b_ptr + i, vec_a, mask); + auto s = vec_a * vec_a; + vec_acc_pow += s; + } + float var_val = _mm512_reduce_add_ps(vec_acc_pow) / static_cast(size); + float scale = float(1.0) / std::sqrt(var_val + eps); + auto vec_scale = _mm512_set1_ps(scale); + for (i = 0; i <= size - 16; i += 16) { + auto vec_input = _loadu(b_ptr + i); + auto vec_gamma = _mm512_set1_ps(1.0); + if (gamma_ptr) { + vec_gamma = _loadu(gamma_ptr + i); + } + auto vec_res = vec_input * vec_scale * vec_gamma; + _storeu(out_ptr + i, vec_res); + } + if (i < size) { + __mmask16 mask = (1 << (size - i)) - 1; + auto vec_input = _maskz_loadu(b_ptr + i, mask); + auto vec_gamma = _mm512_set1_ps(1.0); + if (gamma_ptr) 
{ + vec_gamma = _maskz_loadu(gamma_ptr + i, mask); + } + auto vec_res = vec_input * vec_scale * vec_gamma; + _mask_storeu(out_ptr + i, vec_res, mask); + } +} } // namespace kernel } // namespace cpu } // namespace torch_ipex diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py index df3d32a10..a656e674d 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py @@ -509,15 +509,9 @@ def add_rms_norm_cpu( ): assert bias is None, "bias is not supported in add_rmsnorm yet" if add is not None: - if add_back: - add.add_(x) - input = add - else: - input = add + x + return torch.ops.torch_ipex.add_rmsnorm(x, add, weight, eps, add_back) else: - input = x - - return torch.ops.torch_ipex.rmsnorm(input, weight, eps) + return torch.ops.torch_ipex.rmsnorm(x, weight, eps) def add_layer_norm_cpu( diff --git a/tests/cpu/test_rmsnorm.py b/tests/cpu/test_rmsnorm.py index 3bb6adb8c..54feece72 100644 --- a/tests/cpu/test_rmsnorm.py +++ b/tests/cpu/test_rmsnorm.py @@ -2,6 +2,7 @@ import torch.nn as nn from common_utils import TestCase import unittest +import itertools class RMSNorm(nn.Module): @@ -10,12 +11,29 @@ def __init__(self, hidden_size, eps=1e-6, dtype=torch.float): self.weight = nn.Parameter(torch.ones(hidden_size, dtype=dtype)) self.variance_epsilon = eps - def forward(self, hidden_states, fused_rmsnorm=False): + def forward( + self, hidden_states, fused_rmsnorm=False, extra_input=None, add_back=False + ): if fused_rmsnorm: - return torch.ops.torch_ipex.rmsnorm( - hidden_states, self.weight, self.variance_epsilon - ) + if extra_input is None: + return torch.ops.torch_ipex.rmsnorm( + hidden_states, self.weight, self.variance_epsilon + ) + else: + return torch.ops.torch_ipex.add_rmsnorm( + hidden_states, + extra_input, + self.weight, + self.variance_epsilon, + add_back, + ) else: + if extra_input is not None: + if add_back: + extra_input.add_(hidden_states) + hidden_states = extra_input + else: + hidden_states = hidden_states + extra_input input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) variance = hidden_states.pow(2).mean(-1, keepdim=True) @@ -72,6 +90,72 @@ def test_RMSNorm(self): fused_y1_fp16 = model(x_fp16, fused_rmsnorm=True) self.assertEqual(y1_fp16, fused_y1_fp16, prec=1e-2) + def test_add_RMSNorm(self): + add_back_list = [False, True] + dim_list = [2, 3, 4, 5] + cases = itertools.product(add_back_list, dim_list) + for add_back, dim in cases: + # RMSNorm input is fp32 + for weight_dtype in [torch.float32, torch.half, torch.bfloat16]: + with torch.no_grad(): + input_size = [ + 3, + ] + for _ in range(dim - 1): + input_size.append(20) + x = torch.randn(input_size) + x1 = torch.randn(input_size) + model = RMSNorm(input_size, dtype=weight_dtype).eval() + x2 = x1.clone() + y1_fp32 = model(x, extra_input=x1, add_back=add_back) + fused_y1_fp32 = model( + x, fused_rmsnorm=True, extra_input=x2, add_back=add_back + ) + self.assertEqual(y1_fp32, fused_y1_fp32) + self.assertEqual(x1, x2) + # RMSNorm input is bf16 + for weight_dtype in [torch.float32, torch.half, torch.bfloat16]: + with torch.no_grad(): + input_size = [ + 3, + ] + for _ in range(dim - 1): + input_size.append(20) + x_bf16 = torch.randn(input_size, dtype=torch.bfloat16) + x1_bf16 = torch.randn(input_size, dtype=torch.bfloat16) + model = RMSNorm(input_size, dtype=weight_dtype).eval() + x2_bf16 = 
x1_bf16.clone() + y1_bf16 = model(x_bf16, extra_input=x1_bf16, add_back=add_back) + fused_y1_bf16 = model( + x_bf16, + fused_rmsnorm=True, + extra_input=x2_bf16, + add_back=add_back, + ) + self.assertEqual(y1_bf16, fused_y1_bf16, prec=2e-2) + self.assertEqual(x1_bf16, x2_bf16) + # RMSNorm input is fp16 + for weight_dtype in [torch.float32, torch.half, torch.bfloat16]: + with torch.no_grad(): + input_size = [ + 3, + ] + for _ in range(dim - 1): + input_size.append(20) + x_fp16 = torch.randn(input_size, dtype=torch.half) + x1_fp16 = torch.randn(input_size, dtype=torch.half) + model = RMSNorm(input_size, dtype=weight_dtype).eval() + x2_fp16 = x1_fp16.clone() + y1_fp16 = model(x_fp16, extra_input=x1_fp16, add_back=add_back) + fused_y1_fp16 = model( + x_fp16, + fused_rmsnorm=True, + extra_input=x2_fp16, + add_back=add_back, + ) + self.assertEqual(y1_fp16, fused_y1_fp16, prec=1e-2) + self.assertEqual(x1_fp16, x2_fp16) + if __name__ == "__main__": test = unittest.main() From ec2ededbfaea2cf12757d4e89f87cbc2cde14e9a Mon Sep 17 00:00:00 2001 From: Xu Han Date: Mon, 3 Jun 2024 19:38:27 +0800 Subject: [PATCH 100/199] Xu sync back code (#2955) * rename IPEX_OP_REGISTER_DISPATCH * sync cmake files. --- cmake/BuildFlags.cmake | 8 ++++---- cmake/Modules/FindoneMKL.cmake | 10 ++++++++-- cmake/Options.cmake | 1 + csrc/cpu/aten/Cast.cpp | 4 ++-- csrc/cpu/aten/Converter.cpp | 4 ++-- csrc/cpu/aten/FP8Linear.cpp | 2 +- csrc/cpu/aten/optimizer/AdamFusedStep.cpp | 2 +- csrc/cpu/aten/optimizer/LambFusedStep.cpp | 2 +- csrc/cpu/aten/optimizer/SgdFusedStep.cpp | 2 +- csrc/cpu/aten/optimizer/SplitSgdStep.cpp | 4 ++-- csrc/utils/CustomOperatorRegistration.h | 12 ++++++------ 11 files changed, 29 insertions(+), 22 deletions(-) diff --git a/cmake/BuildFlags.cmake b/cmake/BuildFlags.cmake index 2c59d6cee..f9c31e091 100644 --- a/cmake/BuildFlags.cmake +++ b/cmake/BuildFlags.cmake @@ -14,10 +14,10 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}") if(NOT WINDOWS) - string(REGEX MATCH "-D_GLIBCXX_USE_CXX11_ABI=([0-9]+)" torch_cxx11 ${TORCH_CXX_FLAGS}) - set(GLIBCXX_USE_CXX11_ABI ${CMAKE_MATCH_1}) - if(BUILD_WITH_XPU AND NOT GLIBCXX_USE_CXX11_ABI) - message(FATAL_ERROR "Must set _GLIBCXX_USE_CXX11_ABI=1 for XPU build, but not is ${GLIBCXX_USE_CXX11_ABI}!") + string(REGEX MATCH "-D_GLIBCXX_USE_CXX11_ABI=([0-9]+)" torch_cxx11 ${TORCH_CXX_FLAGS}) + set(GLIBCXX_USE_CXX11_ABI ${CMAKE_MATCH_1}) + if(BUILD_WITH_XPU AND NOT GLIBCXX_USE_CXX11_ABI) + message(FATAL_ERROR "Must set _GLIBCXX_USE_CXX11_ABI=1 for XPU build, but not is ${GLIBCXX_USE_CXX11_ABI}!") endif() endif() diff --git a/cmake/Modules/FindoneMKL.cmake b/cmake/Modules/FindoneMKL.cmake index 0b397a3a9..1290a22ba 100644 --- a/cmake/Modules/FindoneMKL.cmake +++ b/cmake/Modules/FindoneMKL.cmake @@ -34,6 +34,7 @@ set(mkl_root_hint) # install mkl-include and mkl-static for CPU build function (install_mkl_packages) + message(STATUS "Download and install mkl-include and mkl-static for IPEX CPU build automatically.") set(REQ_MKL_VERSION 2021.0.0) execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install "mkl-include>=${REQ_MKL_VERSION}" RESULT_VARIABLE mkl_iret COMMAND_ERROR_IS_FATAL ANY) @@ -74,10 +75,15 @@ if(BUILD_MODULE_TYPE STREQUAL "GPU") get_mkl_from_env_var() else() if(BUILD_WITH_XPU) - get_mkl_from_env_var() + if(NOT USE_ONEMKL) + # IPEX CPU must use onemkl. If setup "USE_ONEMKL=OFF", force ipex cpu roll back to static link mkl. 
+ set(BUILD_STATIC_ONEMKL ON) + install_mkl_packages() + else() + get_mkl_from_env_var() + endif() else() if(BUILD_STATIC_ONEMKL) - message(STATUS "Download and install mkl-include and mkl-static for IPEX CPU build automatically.") install_mkl_packages() else() get_mkl_from_env_var() diff --git a/cmake/Options.cmake b/cmake/Options.cmake index fa040e308..81381674d 100644 --- a/cmake/Options.cmake +++ b/cmake/Options.cmake @@ -46,6 +46,7 @@ function (print_config_summary) message(STATUS "Options:") message(STATUS " BUILD_WITH_CPU : ${BUILD_WITH_CPU}") message(STATUS " BUILD_WITH_XPU : ${BUILD_WITH_XPU}") + message(STATUS " BUILD_STATIC_ONEMKL : ${BUILD_STATIC_ONEMKL}") message(STATUS " BUILD_NO_CLANGFORMAT : ${BUILD_NO_CLANGFORMAT}") message(STATUS " BUILD_STATS : ${BUILD_STATS}") message(STATUS " BUILD_STRIPPED_BIN : ${BUILD_STRIPPED_BIN}") diff --git a/csrc/cpu/aten/Cast.cpp b/csrc/cpu/aten/Cast.cpp index 8e2c9ce49..d41b39734 100644 --- a/csrc/cpu/aten/Cast.cpp +++ b/csrc/cpu/aten/Cast.cpp @@ -173,9 +173,9 @@ at::Tensor cast_from_fp8( namespace { IPEX_LIBRARY_FRAGMENT() { - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "cast_to_fp8", torch_ipex::cpu::cast_to_fp8, c10::DispatchKey::CPU); - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "cast_from_fp8", torch_ipex::cpu::cast_from_fp8, c10::DispatchKey::CPU); } diff --git a/csrc/cpu/aten/Converter.cpp b/csrc/cpu/aten/Converter.cpp index c1ed977dc..ac503e21b 100644 --- a/csrc/cpu/aten/Converter.cpp +++ b/csrc/cpu/aten/Converter.cpp @@ -49,11 +49,11 @@ std::tuple split_float_bfloat16( namespace { IPEX_LIBRARY_FRAGMENT() { - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "split_float_bfloat16", torch_ipex::cpu::bf16::converter::split_float_bfloat16, c10::DispatchKey::CPU); - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "cat_bfloat16_float", torch_ipex::cpu::bf16::converter::cat_bfloat16_float, c10::DispatchKey::CPU); diff --git a/csrc/cpu/aten/FP8Linear.cpp b/csrc/cpu/aten/FP8Linear.cpp index 30335f57f..11c096366 100644 --- a/csrc/cpu/aten/FP8Linear.cpp +++ b/csrc/cpu/aten/FP8Linear.cpp @@ -152,7 +152,7 @@ at::Tensor fp8_linear( namespace { IPEX_LIBRARY_FRAGMENT() { - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "fp8_linear", torch_ipex::cpu::fp8_linear, c10::DispatchKey::CPU); } diff --git a/csrc/cpu/aten/optimizer/AdamFusedStep.cpp b/csrc/cpu/aten/optimizer/AdamFusedStep.cpp index ff500c9a7..28e3fe739 100644 --- a/csrc/cpu/aten/optimizer/AdamFusedStep.cpp +++ b/csrc/cpu/aten/optimizer/AdamFusedStep.cpp @@ -106,7 +106,7 @@ void adam_fused_step( namespace { IPEX_LIBRARY_FRAGMENT() { - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "adam_fused_step", torch_ipex::cpu::adam_fused_step, at::DispatchKey::CPU); diff --git a/csrc/cpu/aten/optimizer/LambFusedStep.cpp b/csrc/cpu/aten/optimizer/LambFusedStep.cpp index 2577ba44c..bd4c78bd8 100644 --- a/csrc/cpu/aten/optimizer/LambFusedStep.cpp +++ b/csrc/cpu/aten/optimizer/LambFusedStep.cpp @@ -92,7 +92,7 @@ std::tuple lamb_fused_step( namespace { IPEX_LIBRARY_FRAGMENT() { - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "lamb_fused_step", torch_ipex::cpu::lamb_fused_step, at::DispatchKey::CPU); diff --git a/csrc/cpu/aten/optimizer/SgdFusedStep.cpp b/csrc/cpu/aten/optimizer/SgdFusedStep.cpp index bdc646405..ed6071f3d 100644 --- a/csrc/cpu/aten/optimizer/SgdFusedStep.cpp +++ b/csrc/cpu/aten/optimizer/SgdFusedStep.cpp @@ -89,7 +89,7 @@ c10::optional sgd_fused_step( namespace { IPEX_LIBRARY_FRAGMENT() { - 
IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "sgd_fused_step", torch_ipex::cpu::sgd_fused_step, at::DispatchKey::CPU); } } // namespace diff --git a/csrc/cpu/aten/optimizer/SplitSgdStep.cpp b/csrc/cpu/aten/optimizer/SplitSgdStep.cpp index 80b12ccca..3c219db2b 100644 --- a/csrc/cpu/aten/optimizer/SplitSgdStep.cpp +++ b/csrc/cpu/aten/optimizer/SplitSgdStep.cpp @@ -25,9 +25,9 @@ at::Tensor packed_add( namespace { IPEX_LIBRARY_FRAGMENT() { - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "packed_add", torch_ipex::cpu::packed_add, at::DispatchKey::CPU); - IPEX_OP_IPEX_REGISTER_DISPATCH( + IPEX_OP_REGISTER_DISPATCH( "packed_add", torch_ipex::cpu::packed_add, at::DispatchKey::SparseCPU); } diff --git a/csrc/utils/CustomOperatorRegistration.h b/csrc/utils/CustomOperatorRegistration.h index 3801032dd..bfc726c2a 100644 --- a/csrc/utils/CustomOperatorRegistration.h +++ b/csrc/utils/CustomOperatorRegistration.h @@ -133,14 +133,14 @@ operator will automatically convert to normal tensor layout when execution. "torch_ipex::" NAME, Func, m); \ m.impl(TORCH_SELECTIVE_NAME("torch_ipex::" NAME), Func); -#define IPEX_OP_IPEX_REGISTER_DISPATCH(NAME, Func, DispatchKey) \ - torch_ipex::construct_function_schema_and_register( \ - "torch_ipex::" NAME, Func, m); \ +#define IPEX_OP_REGISTER_DISPATCH(NAME, Func, DispatchKey) \ + torch_ipex::construct_function_schema_and_register( \ + "torch_ipex::" NAME, Func, m); \ m.impl(TORCH_SELECTIVE_NAME("torch_ipex::" NAME), DispatchKey, Func); -#define IPEX_OP_IPEX_REGISTER_DISPATCH_NEED_PLAIN(NAME, Func, DispatchKey) \ - torch_ipex::construct_function_schema_and_register( \ - "torch_ipex::" NAME, Func, m); \ +#define IPEX_OP_REGISTER_DISPATCH_NEED_PLAIN(NAME, Func, DispatchKey) \ + torch_ipex::construct_function_schema_and_register( \ + "torch_ipex::" NAME, Func, m); \ m.impl(TORCH_SELECTIVE_NAME("torch_ipex::" NAME), DispatchKey, Func); #define IPEX_LIBRARY_FRAGMENT() TORCH_LIBRARY_FRAGMENT(torch_ipex, m) From 61ff58f74e3f3704655195d85de8e6d97d65dfad Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Tue, 4 Jun 2024 08:53:21 +0800 Subject: [PATCH 101/199] Add fast path for beam idx update in the iakv (#2950) * Eanble flash decoding and use a new data layout to get better data locality * Reduce the beam_idx update overhead * Tune the block size 96->93ms --- .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 34 +++++++++++++++---- csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp | 2 +- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index 205c3c31f..0bff47b20 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -559,7 +559,7 @@ scale_dot_product_for_indirect_access_kv_cache( auto target_block_size = 32L; if (bs <= 32 and seq_len < 65536) { - target_block_size = 1L; + target_block_size = 8L; } auto kv_block_size = bs * head_num >= max_parallel_parts ? 
seq_len @@ -568,7 +568,11 @@ scale_dot_product_for_indirect_access_kv_cache( auto kv_block_count = (seq_len + kv_block_size - 1) / kv_block_size; auto need_update_beam_idx = offset > 0 and bs > 1; auto b_ptr = beam_idx.data_ptr(); - long new_beam_idx[beam_batch][offset + query.size(1) + 1]; + auto max_cache_size = beam_idx.size(0); + long new_beam_idx[beam_batch][offset + query.size(1) + 1] = {}; + auto prompt_len = b_ptr[(max_cache_size - 2) * beam_batch]; + auto prompt_bs = b_ptr[(max_cache_size - 1) * beam_batch]; + auto beam_size = beam_batch / prompt_bs; if (need_update_beam_idx) { // according to the last decoded token to get the target beam for the past // token @@ -576,6 +580,8 @@ scale_dot_product_for_indirect_access_kv_cache( new_beam_idx[i][offset - 1] = b_ptr[(offset - 1) * bs + i]; for (int j = offset - 2; j >= 0; j--) { // for the token of input, the target beam is alwarys 0 + if (j < prompt_len - 1 && bs == beam_size) + break; // fast path for latency mode new_beam_idx[i][j] = b_ptr[j * bs + new_beam_idx[i][j + 1]]; } } @@ -910,9 +916,13 @@ scale_dot_product_for_indirect_access_kv_cache_half( auto attn_out_ptr = attn_outs.data_ptr(); // torch_ipex::cpu::kernel::zero_ker(attn_out_ptr, attn_outs.numel()); auto attn_w_ptr = attn_weights.data_ptr(); - long new_beam_idx[beam_batch][offset + query.size(1) + 1]; auto b_ptr = beam_idx.data_ptr(); - auto need_update_beam_idx = offset > 0 && bs > 1; + auto max_cache_size = beam_idx.size(0); + long new_beam_idx[beam_batch][offset + query.size(1) + 1] = {}; + auto prompt_len = b_ptr[(max_cache_size - 2) * beam_batch]; + auto prompt_bs = b_ptr[(max_cache_size - 1) * beam_batch]; + auto beam_size = beam_batch / prompt_bs; + auto need_update_beam_idx = offset > 0 and bs > 1; if (need_update_beam_idx) { // according to the last decoded token to get the target beam for the past // token @@ -920,6 +930,8 @@ scale_dot_product_for_indirect_access_kv_cache_half( new_beam_idx[i][offset - 1] = b_ptr[(offset - 1) * bs + i]; for (int j = offset - 2; j >= 0; j--) { // for the token of input, the target beam is alwarys 0 + if (j < prompt_len - 1 && bs == beam_size) + break; // fast path for latency mode new_beam_idx[i][j] = b_ptr[j * bs + new_beam_idx[i][j + 1]]; } } @@ -1415,7 +1427,7 @@ masked_multihead_self_attention_kernel_impl( value_cache = at::empty( {max_positions, beam_batch, value.size(2), value.size(3)}, value.options()); - beam_idx = at::empty({max_positions, beam_batch}, beam_idx.options()); + beam_idx = at::zeros({max_positions + 2, beam_batch}, beam_idx.options()); auto beam_idx_access = beam_idx.accessor(); #pragma omp parallel for collapse(2) for (auto i = 0; i < max_positions; i++) { @@ -1428,15 +1440,19 @@ masked_multihead_self_attention_kernel_impl( } } } + beam_idx_access[max_positions][0] = cur_len; // record the prompt token len + beam_idx_access[max_positions + 1][0] = + query.size(0); // record the promt bs info + } else if (offset > 0 && offset + cur_len > cache_size) { - auto new_cache_size = cache_size * 2; + auto new_cache_size = cache_size * 2 + 2; auto new_key_cache = at::empty( {new_cache_size, beam_batch, key.size(2), key.size(3)}, key.options()); auto new_value_cache = at::empty( {new_cache_size, beam_batch, value.size(2), value.size(3)}, value.options()); auto new_beam_idx = - at::empty({new_cache_size, beam_batch}, beam_idx.options()); + at::zeros({new_cache_size, beam_batch}, beam_idx.options()); new_key_cache.slice(0, 0, cache_size).copy_(key_cache); new_value_cache.slice(0, 0, cache_size).copy_(value_cache); 
new_beam_idx.slice(0, 0, cache_size).copy_(beam_idx); @@ -1447,6 +1463,10 @@ masked_multihead_self_attention_kernel_impl( new_beam_idx_access[i][j] = beam_idx_access[0][j]; } } + new_beam_idx_access[new_cache_size - 2][0] = + beam_idx_access[cache_size - 2][0]; + new_beam_idx_access[new_cache_size - 1][0] = + beam_idx_access[cache_size - 1][0]; key_cache = new_key_cache; value_cache = new_value_cache; beam_idx = new_beam_idx; diff --git a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp index 964af1590..f620b90f9 100644 --- a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp @@ -583,4 +583,4 @@ IPEX_REGISTER_DISPATCH( &reshape_and_cache_cpu_kernel_impl); } // namespace cpu -} // namespace torch_ipex \ No newline at end of file +} // namespace torch_ipex From 9469f8ccc6959f1ab5c955aa07d0c2bfc57a315c Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Tue, 4 Jun 2024 10:16:32 +0900 Subject: [PATCH 102/199] Move oneAPI IPEX samples back to IPEX repo (#2943) * restruct example directories * add jupyter notebook of IntelPytorch Inference AMX BF16 and INT8 * mv 2 examples from onesample (#2787) * mv 2 examples from onesample * fix license format * add jupyter notebook readme * move oneAPI IPEX inference sample optimize (#2798) * clear output of notebook * Update example. Add example 'complete flag' * update readme, remove aikit and refer ipex installation guide * remove installation part in jupyter notebook * remove installation part in jupyter notebook and add kernel select * each sample use conda env seperately * Update cpu example jupyter nootbook README * rm install jupyter and refer to readme, fix table format * Create IPEX_Getting_Started.ipynb * Create IntelPytorch_Quantization.ipynb * remove training examples --------- Co-authored-by: Zheng, Zhaoqiong Co-authored-by: Neo Zhang Jianyu Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com> Co-authored-by: Wang, Xigui Co-authored-by: yqiu-intel <113460727+YuningQiu@users.noreply.github.com> --- docs/tutorials/examples.md | 45 - .../python/jupyter-notebooks/.gitkeep | 0 .../IPEX_Getting_Started.ipynb | 367 ++++++++ ...InferenceOptimizations_AMX_BF16_INT8.ipynb | 589 ++++++++++++ ...ytorch_Interactive_Chat_Quantization.ipynb | 521 ++++++++++ .../IntelPytorch_Quantization.ipynb | 347 +++++++ .../python/jupyter-notebooks/README.md | 61 ++ .../optimize_pytorch_models_with_ipex.ipynb | 888 ++++++++++++++++++ .../python/pytorch_inference_vnni.py | 156 +++ .../jupyter-notebooks/python/resnet50.py | 60 ++ .../python/{ => python-scripts}/README.md | 2 +- .../bert_eager_mode_inference_bf16.py | 0 .../bert_eager_mode_inference_fp32.py | 0 .../bert_general_inference_script.py | 0 .../bert_torchdynamo_mode_inference_bf16.py | 0 .../bert_torchdynamo_mode_inference_fp32.py | 0 .../bert_torchscript_mode_inference_bf16.py | 0 .../bert_torchscript_mode_inference_fp32.py | 0 .../{ => python-scripts}/int8_deployment.py | 0 .../int8_quantization_dynamic.py | 0 .../int8_quantization_static.py | 0 .../resnet50_eager_mode_inference_bf16.py | 0 .../resnet50_eager_mode_inference_fp32.py | 0 .../resnet50_general_inference_script.py | 0 ...esnet50_torchdynamo_mode_inference_bf16.py | 0 ...esnet50_torchdynamo_mode_inference_fp32.py | 0 ...esnet50_torchscript_mode_inference_bf16.py | 0 ...esnet50_torchscript_mode_inference_fp32.py | 0 .../training/{ => python-scripts}/README.md | 18 +- .../distributed_data_parallel_training.py | 0 .../training/single_instance_training_bf16.py | 
51 - .../training/single_instance_training_fp32.py | 49 - scripts/build_doc.sh | 70 +- 33 files changed, 3025 insertions(+), 199 deletions(-) create mode 100644 examples/cpu/inference/python/jupyter-notebooks/.gitkeep create mode 100644 examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/README.md create mode 100644 examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py create mode 100644 examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py rename examples/cpu/inference/python/{ => python-scripts}/README.md (97%) rename examples/cpu/inference/python/{ => python-scripts}/bert_eager_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_eager_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_general_inference_script.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_torchdynamo_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_torchdynamo_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_torchscript_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_torchscript_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/int8_deployment.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/int8_quantization_dynamic.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/int8_quantization_static.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_eager_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_eager_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_general_inference_script.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_torchdynamo_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_torchdynamo_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_torchscript_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_torchscript_mode_inference_fp32.py (100%) rename examples/cpu/training/{ => python-scripts}/README.md (74%) rename examples/cpu/training/{ => python-scripts}/distributed_data_parallel_training.py (100%) delete mode 100644 examples/cpu/training/single_instance_training_bf16.py delete mode 100644 examples/cpu/training/single_instance_training_fp32.py diff --git a/docs/tutorials/examples.md b/docs/tutorials/examples.md index f90505a2d..3c9eacaa6 100644 --- a/docs/tutorials/examples.md +++ b/docs/tutorials/examples.md @@ -25,51 +25,6 @@ Before running these examples, please note the following: ### Training -#### Single-instance Training - -To use Intelยฎ Extension for PyTorch\* on training, you need to make the following changes in your code: - -1. 
Import `intel_extension_for_pytorch` as `ipex`. -2. Invoke the `ipex.optimize` function to apply optimizations against the model and optimizer objects, as shown below: - - -```python -... -import torch -import intel_extension_for_pytorch as ipex -... -model = Model() -criterion = ... -optimizer = ... -model.train() -# For Float32 -model, optimizer = ipex.optimize(model, optimizer=optimizer) -# For BFloat16 -model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16) -# Invoke the code below to enable beta feature torch.compile -model = torch.compile(model, backend="ipex") -... -optimizer.zero_grad() -output = model(data) -... -``` - -Below you can find complete code examples demonstrating how to use the extension on training for different data types: - -##### Float32 - -**Note:** You need to install `torchvision` Python package to run the following example. - -[//]: # (marker_train_single_fp32_complete) -[//]: # (marker_train_single_fp32_complete) - -##### BFloat16 - -**Note:** You need to install `torchvision` Python package to run the following example. - -[//]: # (marker_train_single_bf16_complete) -[//]: # (marker_train_single_bf16_complete) - #### Distributed Training Distributed training with PyTorch DDP is accelerated by oneAPI Collective Communications Library Bindings for Pytorch\* (oneCCL Bindings for Pytorch\*). The extension supports FP32 and BF16 data types. More detailed information and examples are available at the [Github repo](https://github.com/intel/torch-ccl). diff --git a/examples/cpu/inference/python/jupyter-notebooks/.gitkeep b/examples/cpu/inference/python/jupyter-notebooks/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb new file mode 100644 index 000000000..10c934360 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started with Intelยฎ Extension for PyTorch (IPEX)\n", + "This code sample will guide users how to run a PyTorch inference workload on CPU by using oneAPI AI Analytics Toolkit and also analyze the CPU usage via oneDNN verbose logs." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resnet50 Inference on CPU\n", + "***\n", + "This section shows users how to run resnet50 inference on CPU." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### prerequisites" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ignore all warning messages\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import os" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the installation path of your oneAPI AI Analytics toolkit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env ONEAPI_INSTALL=/opt/intel/oneapi" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the resnet50 inference sample from Intelยฎ Extension for PyTorch (IPEX) github repository" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/intel/intel-extension-for-pytorch/master/examples/cpu/inference/python/resnet50_general_inference_script.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check PyTorch and Intelยฎ Extension for PyTorch (IPEX) verson in current ipython kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run ../../version_check.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run resnet50 on CPU" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Run on CPU via Intelยฎ Extension for PyTorch (IPEX)\n", + "There is a PyTorch conda environment with Intelยฎ Extension for PyTorch (IPEX) installation in current AI Kit installation.\n", + "Users could run resnet50_general_inference_script.py on Intel CPU on this PyTorch conda environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile run.sh\n", + "#!/bin/bash\n", + "source $ONEAPI_INSTALL/setvars.sh --force > /dev/null 2>&1\n", + "source activate pytorch\n", + "echo \"########## Executing the run\"\n", + "DNNL_VERBOSE=1 python resnet50_general_inference_script.py > infer_rn50_cpu.csv\n", + "echo \"########## Done with the run\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Submitting build.sh and run.sh to the job queue\n", + "\n", + "Now we can submit build.sh and run.sh to the job queue.\n", + "\n", + "NOTE - it is possible to execute any of the build and run commands in local environments.\n", + "To enable users to run their scripts either on the Intel DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command qsub. If the check fails, it is assumed that build/run will be local." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! chmod 755 ../../q; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ../../q run.sh; else ./run.sh; fi" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze Verbose Logs\n", + "***\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download profile_utils.py to parse oneDNN verbose logs from previous section." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/oneapi-src/oneAPI-samples/master/Libraries/oneDNN/tutorials/profiling/profile_utils.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: List out all oneDNN verbose logs\n", + "Users should see the verbose log listed in the table below.\n", + "\n", + "|Log File Name | Description |\n", + "|:-----|:----|\n", + "|infer_rn50_cpu.csv| log for cpu run |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "filenames= os.listdir (\".\") \n", + "result = []\n", + "keyword = \".csv\"\n", + "for filename in filenames: \n", + " #if os.path.isdir(os.path.join(os.path.abspath(\".\"), filename)): \n", + " if filename.find(keyword) != -1:\n", + " result.append(filename)\n", + "result.sort()\n", + "\n", + "index =0 \n", + "for folder in result:\n", + " print(\" %d : %s \" %(index, folder))\n", + " index+=1" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Pick a verbose log by putting its index value below\n", + "Users can pick the cpu log for analysis. \n", + "Once users finish Step 2 to Step 7 for one log file, they can go back to Step 2 and select another log file for analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FdIndex=0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Parse verbose log and get the data back\n", + "> Users will also get a oneDNN.json file with timeline information for oneDNN primitives. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logfile = result[FdIndex]\n", + "print(logfile)\n", + "from profile_utils import oneDNNUtils, oneDNNLog\n", + "onednn = oneDNNUtils()\n", + "log1 = oneDNNLog()\n", + "log1.load_log(logfile)\n", + "data = log1.data\n", + "exec_data = log1.exec_data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: Time breakdown for exec type\n", + "The exec type includes exec and create. \n", + "\n", + "|exec type | Description | \n", + "|:-----|:----| \n", + "|exec | Time for primitives execution. Better to spend most of the time on primitives execution. | \n", + "|create| Time for primitives creation. Primitives creation happens once. Better to spend less time on primitive creation. | " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5: Time breakdown for architecture type\n", + "The supported architecture only includes CPU, \n", + "so users should see 100% CPU time. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onednn.breakdown(exec_data,\"arch\",\"time\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6: Time breakdown for primitives type\n", + "The primitives type includes convolution, reorder, sum, etc. \n", + "For this simple convolution net example, convolution and inner product primitives are expected to take most of the time. \n", + "However, the exact time percentage of different primitives may vary among different architectures. 
\n", + "Users can easily identify top hotpots of primitives executions with this time breakdown. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onednn.breakdown(exec_data,\"type\",\"time\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7: Time breakdown for JIT kernel type\n", + "oneDNN uses just-in-time compilation (JIT) to generate optimal code for some functions based on input parameters and instruction set supported by the system. \n", + "Therefore, users can see different JIT kernel type among different CPU architectures. \n", + "For example, users can see avx_core_vnni JIT kernel if the workload uses VNNI instruction on Cascake Lake platform. \n", + "Moreover, users can identify the top hotspots of JIT kernel executions with this time breakdown. \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onednn.breakdown(exec_data,\"jit\",\"time\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output(both stdout and stderr) is displayed on the command line console" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "py39" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb new file mode 100644 index 000000000..c4bca3199 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb @@ -0,0 +1,589 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "75f9200e-7830-4ee5-8637-e67b5df57eac", + "metadata": {}, + "source": [ + "# PyTorch Inference Optimizations with Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) Bfloat16 Integer8" + ] + }, + { + "cell_type": "markdown", + "id": "48eb565f-ef03-40cb-9182-5b2b752331e8", + "metadata": {}, + "source": [ + "The `PyTorch* Inference Optimizations with Advanced Matrix Extensions Bfloat16 Integer8` sample demonstrates how to perform inference using the ResNet50 and BERT models using the Intelยฎ Extension for PyTorch (IPEX).\n", + "\n", + "The Intelยฎ Extension for PyTorch (IPEX) extends PyTorch* with optimizations for extra performance boost on Intelยฎ hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intelยฎ hardware. 
For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX).\n", + "\n", + "| Area | Description\n", + "|:--- |:---\n", + "| What you will learn | Inference performance improvements using Intelยฎ Extension for PyTorch (IPEX) with Intelยฎ AMX BF16/INT8\n", + "| Time to complete | 5 minutes\n", + "| Category | Code Optimization\n", + "\n", + "## Purpose\n", + "\n", + "The Intelยฎ Extension for PyTorch (IPEX) allows you to speed up inference on Intelยฎ Xeon Scalable processors with lower precision data formats and specialized computer instructions. The bfloat16 (BF16) data format uses half the bit width of floating-point-32 (FP32), which lessens the amount of memory needed and execution time to process. Likewise, the integer8 (INT8) data format uses half the bit width of BF16. You should notice performance optimization with the Intelยฎ AMX instruction set when compared to Intelยฎ Vector Neural Network Instructions (Intelยฎ VNNI).\n", + "\n", + "## Prerequisites\n", + "\n", + "| Optimized for | Description\n", + "|:--- |:---\n", + "| OS | Ubuntu* 18.04 or newer\n", + "| Hardware | 4th Gen Intelยฎ Xeonยฎ Scalable Processors or newer\n", + "| Software | Intelยฎ Extension for PyTorch (IPEX)\n", + "\n", + "## Key Implementation Details\n", + "\n", + "This code sample will perform inference on the ResNet50 and BERT models while using Intelยฎ Extension for PyTorch (IPEX). For each pretrained model, there will be a warm up of 20 samples before running inference on the specified number of samples (i.e. 1000) to record the time. Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) is supported on BF16 and INT8 data types starting with the 4th Generation of Xeon Scalable Processors. The inference time will be compared, showcasing the speedup over FP32 when using AVX-512, Intelยฎ AMX, BF16, and INT8. The following run cases are executed: \n", + "\n", + "1. FP32 (baseline)\n", + "2. BF16 using AVX512_CORE_AMX\n", + "3. INT8 using AVX512_CORE_VNNI\n", + "4. INT8 using AVX512_CORE_AMX\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "4c254afc", + "metadata": {}, + "source": [ + "## Installation of required packages\n", + "\n", + "Ensure the kernel is set to Pytorch-CPU before running the follwing code." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa457cee-5b1e-4ec9-b03a-2a7b2a8b464e", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install matplotlib transformers py-cpuinfo sentencepiece sacremoses " + ] + }, + { + "cell_type": "markdown", + "id": "4e41ce52-c94c-4bdf-a528-0e0200fd5501", + "metadata": {}, + "source": [ + "## Imports, Constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e4eedf0-5c7c-49d3-be15-f46b4988d9ff", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from time import time\n", + "import matplotlib.pyplot as plt\n", + "import torch\n", + "import intel_extension_for_pytorch as ipex\n", + "from intel_extension_for_pytorch.quantization import prepare, convert\n", + "import torchvision\n", + "from torchvision import models\n", + "from transformers import BertModel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17246f67-0059-4b5f-afe8-a105d767b139", + "metadata": {}, + "outputs": [], + "source": [ + "SUPPORTED_MODELS = [\"resnet50\", \"bert\"] # models supported by this code sample\n", + "\n", + "# ResNet sample data parameters\n", + "RESNET_BATCH_SIZE = 64\n", + "\n", + "# BERT sample data parameters\n", + "BERT_BATCH_SIZE = 64\n", + "BERT_SEQ_LENGTH = 512" + ] + }, + { + "cell_type": "markdown", + "id": "9771f165", + "metadata": {}, + "source": [ + "## Identify Supported ISA \n", + "We identify the underlying supported ISA to determine whether Intelยฎ AMX is supported. The 4th Gen Intelยฎ Xeonยฎ Scalable Processor (codenamed Sapphire Rapids) or newer must be used to run this sample. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25c339a4", + "metadata": {}, + "outputs": [], + "source": [ + "# Check if hardware supports Intelยฎ AMX\n", + "import sys\n", + "sys.path.append('../../')\n", + "from cpuinfo import get_cpu_info\n", + "info = get_cpu_info()\n", + "flags = info['flags']\n", + "amx_supported = False\n", + "for flag in flags:\n", + " if \"amx\" in flag:\n", + " amx_supported = True\n", + " break\n", + "if not amx_supported:\n", + " print(\"Intelยฎ AMX is not supported on current hardware. Code sample cannot be run.\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "7b3f461d", + "metadata": {}, + "source": [ + "If the message \"Intelยฎ AMX is not supported on current hardware. Code sample cannot be run.\" is printed above, the hardware being used does not support Intelยฎ AMX. Therefore, this code sample cannot proceed." + ] + }, + { + "cell_type": "markdown", + "id": "6ccd66ee-aac5-4a60-8f66-417612d4d3af", + "metadata": {}, + "source": [ + "## Running Inference\n", + "The function runInference() will perform inference on the selected model, precision, and whether Intelยฎ AMX is to be enabled. The environment variable `ONEDNN_MAX_CPU_ISA` is used to enable or disable Intelยฎ AMX. **Note that this environment variable is only initialized once.** This means to run with Intelยฎ AMX and Intelยฎ VNNI, there will need to be separate processes. The best practice is to set this environment variable before running your script. For more information, refer to the [oneDNN documentation on CPU Dispatcher Control](https://www.intel.com/content/www/us/en/develop/documentation/onednn-developer-guide-and-reference/top/performance-profiling-and-inspection/cpu-dispatcher-control.html). \n", + "\n", + "To use BF16 in operations, use the `torch.cpu.amp.autocast()` function to perform forward pass. 
For INT8, the quantization feature from Intelยฎ Extension for PyTorch (IPEX) is used to quantize the FP32 model to INT8 before running inference.\n", + "\n", + "Torchscript is also utilized to deploy the model in graph mode instead of imperative mode for faster runtime." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f08d718", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"ONEDNN_MAX_CPU_ISA\"] = \"AVX512_CORE_AMX\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b8e21c9-aaa5-4f75-b00a-0d875cc0bfba", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Function to perform inference on Resnet50 and BERT\n", + "\"\"\"\n", + "def runInference(model, data, modelName=\"resnet50\", dataType=\"FP32\", amx=True):\n", + " \"\"\"\n", + " Input parameters\n", + " model: the PyTorch model object used for inference\n", + " data: a sample input into the model\n", + " modelName: str representing the name of the model, supported values - resnet50, bert\n", + " dataType: str representing the data type for model parameters, supported values - FP32, BF16, INT8\n", + " amx: set to False to disable Intelยฎ AMX on BF16, Default: True\n", + " Return value\n", + " inference_time: the time in seconds it takes to perform inference with the model\n", + " \"\"\"\n", + " \n", + " # Display run case\n", + " if amx:\n", + " isa_text = \"AVX512_CORE_AMX\"\n", + " else:\n", + " isa_text = \"AVX512_CORE_VNNI\"\n", + " print(\"%s %s inference with %s\" %(modelName, dataType, isa_text))\n", + "\n", + " # Special variables for specific models\n", + " batch_size = None\n", + " if \"resnet50\" == modelName:\n", + " batch_size = RESNET_BATCH_SIZE\n", + " elif \"bert\" == modelName:\n", + " d = torch.randint(model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH]) # sample data input for torchscript and inference\n", + " batch_size = BERT_BATCH_SIZE\n", + " else:\n", + " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", + "\n", + " # Prepare model for inference based on precision (FP32, BF16, INT8)\n", + " if \"INT8\" == dataType:\n", + " # Quantize model to INT8 if needed (one time)\n", + " model_filename = \"quantized_model_%s.pt\" %modelName\n", + " if not os.path.exists(model_filename):\n", + " qconfig = ipex.quantization.default_static_qconfig\n", + " prepared_model = prepare(model, qconfig, example_inputs=data, inplace=False)\n", + " converted_model = convert(prepared_model)\n", + " with torch.no_grad():\n", + " if \"resnet50\" == modelName:\n", + " traced_model = torch.jit.trace(converted_model, data)\n", + " elif \"bert\" == modelName:\n", + " traced_model = torch.jit.trace(converted_model, (d,), check_trace=False, strict=False)\n", + " else:\n", + " raise Exception(\"ERROR: modelName %s is not supported. 
Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", + " traced_model = torch.jit.freeze(traced_model)\n", + " traced_model.save(model_filename)\n", + "\n", + " # Load INT8 model for inference\n", + " model = torch.jit.load(model_filename)\n", + " model.eval()\n", + " model = torch.jit.freeze(model)\n", + " elif \"BF16\" == dataType:\n", + " model = ipex.optimize(model, dtype=torch.bfloat16)\n", + " with torch.no_grad():\n", + " with torch.cpu.amp.autocast():\n", + " if \"resnet50\" == modelName:\n", + " model = torch.jit.trace(model, data)\n", + " elif \"bert\" == modelName:\n", + " model = torch.jit.trace(model, (d,), check_trace=False, strict=False)\n", + " else:\n", + " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", + " model = torch.jit.freeze(model)\n", + " else: # FP32\n", + " with torch.no_grad():\n", + " if \"resnet50\" == modelName:\n", + " model = torch.jit.trace(model, data)\n", + " elif \"bert\" == modelName:\n", + " model = torch.jit.trace(model, (d,), check_trace=False, strict=False)\n", + " else:\n", + " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", + " model = torch.jit.freeze(model)\n", + "\n", + " # Run inference\n", + " with torch.no_grad():\n", + " if \"BF16\" == dataType:\n", + " with torch.cpu.amp.autocast():\n", + " # Warm up\n", + " for i in range(5):\n", + " model(data)\n", + " \n", + " # Measure latency\n", + " start_time = time()\n", + " model(data)\n", + " end_time = time()\n", + " else:\n", + " # Warm up\n", + " for i in range(5):\n", + " model(data)\n", + " \n", + " # Measure latency\n", + " start_time = time()\n", + " model(data)\n", + " end_time = time()\n", + " inference_time = end_time - start_time\n", + " print(\"Inference on batch size %d took %.3f seconds\" %(batch_size, inference_time))\n", + "\n", + " return inference_time" + ] + }, + { + "cell_type": "markdown", + "id": "1dad2dae", + "metadata": {}, + "source": [ + "The function summarizeResults() displays the inference times and generates one graph for comparing the inference times and another graph for comparing the speedup using FP32 as the baseline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cf736a2", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Prints out results and displays figures summarizing output.\n", + "\"\"\"\n", + "def summarizeResults(modelName=\"\", results=None, batch_size=1):\n", + " \"\"\"\n", + " Input parameters\n", + " modelName: a str representing the name of the model\n", + " results: a dict with the run case and its corresponding time in seconds\n", + " batch_size: an integer for the batch size\n", + " Return value\n", + " None\n", + " \"\"\"\n", + "\n", + " # Inference time results\n", + " print(\"\\nSummary for %s (Batch Size = %d)\" %(modelName, batch_size))\n", + " for key in results.keys():\n", + " print(\"%s inference time: %.3f seconds\" %(key, results[key]))\n", + "\n", + " # Create bar chart with inference time results\n", + " plt.figure()\n", + " plt.title(\"%s Inference Time (Batch Size = %d)\" %(modelName, batch_size))\n", + " plt.xlabel(\"Run Case\")\n", + " plt.ylabel(\"Inference Time (seconds)\")\n", + " plt.bar(results.keys(), results.values())\n", + "\n", + " # Calculate speedup when using Intelยฎ AMX\n", + " print(\"\\n\")\n", + " bf16_with_amx_speedup = results[\"FP32\"] / results[\"BF16_with_AMX\"]\n", + " print(\"BF16 with Intelยฎ AMX is %.2fX faster than FP32\" %bf16_with_amx_speedup)\n", + " int8_with_vnni_speedup = results[\"FP32\"] / results[\"INT8_with_VNNI\"]\n", + " print(\"INT8 without Intelยฎ AMX is %.2fX faster than FP32\" %int8_with_vnni_speedup)\n", + " int8_with_amx_speedup = results[\"FP32\"] / results[\"INT8_with_AMX\"]\n", + " print(\"INT8 with Intelยฎ AMX is %.2fX faster than FP32\" %int8_with_amx_speedup)\n", + " print(\"\\n\\n\")\n", + "\n", + " # Create bar chart with speedup results\n", + " plt.figure()\n", + " plt.title(\"%s Intelยฎ AMX BF16/INT8 Speedup over FP32\" %modelName)\n", + " plt.xlabel(\"Run Case\")\n", + " plt.ylabel(\"Speedup\")\n", + " plt.bar(results.keys(), \n", + " [1, bf16_with_amx_speedup, int8_with_vnni_speedup, int8_with_amx_speedup]\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "9e42672a", + "metadata": {}, + "source": [ + "### VNNI: ResNet50 and BERT\n", + "Since ONEDNN_MAX_CPU_ISA is initialized ONCE when a workload is being run, another process must be used to run with a different setting. \n", + "In other words, changing ONEDNN_MAX_CPU_ISA during runtime in the same process will not have any effect.\n", + "Thus, to run with VNNI, a separate script is run." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "555ec5a9", + "metadata": {}, + "outputs": [], + "source": [ + "!python python/pytorch_inference_vnni.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d194fa7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Record the inference times for INT8 using AVX-512\n", + "int8_with_vnni_resnet_inference_time = 0.033 #TODO: enter the inference time\n", + "int8_with_vnni_bert_inference_time = 0.691 #TODO: enter the inference time" + ] + }, + { + "cell_type": "markdown", + "id": "c61288e7", + "metadata": {}, + "source": [ + "### Intel® AMX: ResNet50" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4a6a84c", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up ResNet50 model and sample data\n", + "resnet_model = models.resnet50(pretrained=True)\n", + "resnet_data = torch.rand(RESNET_BATCH_SIZE, 3, 224, 224)\n", + "resnet_model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b26789b9", + "metadata": {}, + "outputs": [], + "source": [ + "# FP32 (baseline)\n", + "fp32_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"FP32\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ad0c512", + "metadata": {}, + "outputs": [], + "source": [ + "# BF16 using Intel® AMX\n", + "bf16_amx_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"BF16\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cd9f1bd", + "metadata": {}, + "outputs": [], + "source": [ + "# INT8 using Intel® AMX\n", + "int8_amx_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"INT8\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59fcbbe2", + "metadata": {}, + "outputs": [], + "source": [ + "# Summarize and display results\n", + "results_resnet = {\n", + " \"FP32\": fp32_resnet_inference_time,\n", + " \"BF16_with_AMX\": bf16_amx_resnet_inference_time,\n", + " \"INT8_with_VNNI\": int8_with_vnni_resnet_inference_time,\n", + " \"INT8_with_AMX\": int8_amx_resnet_inference_time\n", + " }\n", + "summarizeResults(\"ResNet50\", results_resnet, RESNET_BATCH_SIZE)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "75a62b72", + "metadata": {}, + "source": [ + "The first graph displays the inference times on the specified number of samples. In general, the times should be decreasing from left to right because using lower precision and Intel® AMX accelerates the computations. The second graph displays the relative speedup of each run case compared to that of FP32. In general, the speedup should be increasing from left to right." 
+ ] + }, + { + "cell_type": "markdown", + "id": "b36fa4b3", + "metadata": {}, + "source": [ + "### BERT" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27f173e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up BERT model and sample data\n", + "bert_model = BertModel.from_pretrained(\"bert-base-uncased\")\n", + "bert_data = torch.randint(bert_model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH])\n", + "bert_model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a5847c1", + "metadata": {}, + "outputs": [], + "source": [ + "# FP32 (baseline)\n", + "fp32_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"FP32\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d35fc58e", + "metadata": {}, + "outputs": [], + "source": [ + "# BF16 using Intel® AMX\n", + "bf16_amx_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"BF16\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b3d2ccd", + "metadata": {}, + "outputs": [], + "source": [ + "# INT8 using Intel® AMX\n", + "int8_amx_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"INT8\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3721e698", + "metadata": {}, + "outputs": [], + "source": [ + "# Summarize and display results\n", + "results_bert = {\n", + " \"FP32\": fp32_bert_inference_time,\n", + " \"BF16_with_AMX\": bf16_amx_bert_inference_time,\n", + " \"INT8_with_VNNI\": int8_with_vnni_bert_inference_time,\n", + " \"INT8_with_AMX\": int8_amx_bert_inference_time\n", + " }\n", + "summarizeResults(\"BERT\", results_bert, BERT_BATCH_SIZE)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "03e63f93", + "metadata": {}, + "source": [ + "The first graph displays the inference times on the specified number of samples. In general, the times should be decreasing from left to right because using lower precision and Intel® AMX accelerates the computations. The second graph displays the relative speedup of each run case compared to that of FP32. In general, the speedup should be increasing from left to right." + ] + }, + { + "cell_type": "markdown", + "id": "b559aeb8", + "metadata": {}, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "0da073a6", + "metadata": {}, + "source": [ + "This code sample shows how to enable and disable Intel® AMX during runtime, as well as the performance improvements using BF16 and INT8 for inference on the ResNet50 and BERT models. Performance will vary based on your hardware and software versions. To see a larger performance gap between VNNI and Intel® AMX, increase the batch size. For even more speedup, consider using the Intel® Extension for PyTorch (IPEX) [Launch Script](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/launch_script.html). 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa0877d6-e045-4091-b5e4-4dfcb6d04f7d", + "metadata": {}, + "outputs": [], + "source": [ + "print('[CODE_SAMPLE_COMPLETED_SUCCESSFULLY]')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "vscode": { + "interpreter": { + "hash": "ed6ae0d06e7bec0fef5f1fb38f177ceea45508ce95c68ed2f49461dd6a888a39" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb new file mode 100644 index 000000000..7e3f0a889 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5b88f590-e457-4052-9dbd-74d7be597dc1", + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================\n", + "# Copyright © 2023 Intel Corporation\n", + "# \n", + "# SPDX-License-Identifier: MIT\n", + "# =============================================================" + ] + }, + { + "cell_type": "markdown", + "id": "6f25b97a-56f7-4309-87fa-d9626baecf5e", + "metadata": {}, + "source": [ + "# Interactive Chat Based on DialoGPT Model Using Intel® Extension for PyTorch* Quantization\n", + "\n", + "This code sample shows usage of the DialoGPT model as an interactive chat with Intel® Extension for PyTorch* INT8 quantization.\n", + "\n", + "## DialoGPT\n", + "\n", + "DialoGPT is a model based on the GPT-2 architecture, proposed by Microsoft in 2019. Its goal was to create open-domain chatbots capable of producing natural responses to a variety of conversational topics." + ] + }, + { + "cell_type": "markdown", + "id": "f7c87090-2f40-4c29-b70b-c9d413bd3bff", + "metadata": {}, + "source": [ + "The `Interactive chat based on DialoGPT model using Intel® Extension for PyTorch* Quantization` sample demonstrates how to create an interactive chat based on a pre-trained DialoGPT model and add Intel® Extension for PyTorch* quantization to it.\n", + "\n", + "| Area | Description|\n", + "|-----------------------|------------|\n", + "| What you will learn | How to create an interactive chat and add INT8 dynamic quantization from Intel® Extension for PyTorch*|\n", + "| Time to complete | 10 minutes|\n", + "| Category | Concepts and Functionality|\n", + "\n", + "The Intel® Extension for PyTorch* extends PyTorch* with optimizations for extra performance boost on Intel® hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intel® hardware. For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intel® Advanced Matrix Extensions (Intel® AMX).\n", + "\n", + "## Purpose\n", + "\n", + "This sample shows how to create an interactive chat based on the pre-trained DialoGPT model from HuggingFace and how to add INT8 dynamic quantization to it. 
The Intelยฎ Extension for PyTorch* gives users the ability to speed up operations on processors with INT8 data format and specialized computer instructions. The INT8 data format uses quarter the bit width of floating-point-32 (FP32), lowering the amount of memory needed and execution time to process with minimum to zero accuracy loss.\n", + "\n", + "## Prerequisites\n", + "\n", + "| Optimized for | Description|\n", + "|-------------------------|------------|\n", + "| OS | Ubuntu* 20.04 or newer|\n", + "| Hardware | Intelยฎ Xeonยฎ Scalable Processor family|\n", + "| Software | Intelยฎ Extension for PyTorch*|" + ] + }, + { + "cell_type": "markdown", + "id": "0174e7dd-58ae-47ea-8f11-fa3d1ee8c317", + "metadata": {}, + "source": [ + "## Environment Setup" + ] + }, + { + "cell_type": "markdown", + "id": "cc24bdae-fcb7-40a5-8bb8-76472598730b", + "metadata": {}, + "source": [ + "### Install Jupyter notebook by Conda\n", + "\n", + "Please refer to the guide in README.md to setup running environment:\n", + "\n", + "1. Create Conda running environment.\n", + "2. Install Jupyter notebook.\n", + "3. Install Intelยฎ Extension for PyTorch* for CPU packages.\n", + "4. Startup Jupyter notebook service and open by web browser.\n", + "\n", + "\n", + "#### Set Kernel to PyTorch-CPU\n", + "\n", + "In Jupyter notebook menu, change kernel \"PyTorch-CPU\" by Kernel->Change Kernel." + ] + }, + { + "cell_type": "markdown", + "id": "d2d9847c-2d72-4dfc-a4a9-b87c987ff363", + "metadata": {}, + "source": [ + "### Install other python packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9e7f549-65dd-4286-9e04-8d13a766c0e3", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install transformers matplotlib" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8c1ae1f1-4878-4dc6-bbb3-5a0f17fbbd00", + "metadata": {}, + "source": [ + "Let's start with importing all necessary packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45146b66-e41e-400e-8a1b-5e680bbb7575", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "import torch\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "id": "2e158dd4-e2a7-44ca-af2d-052f88247e97", + "metadata": {}, + "source": [ + "## Model and tokenizer loading\n", + "\n", + "The first implemented function is loading tokenizer and model. \n", + "\n", + "Function input is link to the pre-trained model. In this sample we are using `microsoft/DialoGPT-large` from HuggingFace. This is also default parameter for this function. Of course, you can use also `microsoft/DialoGPT-medium` or `microsoft/DialoGPT-samll` models. Especially if you have limited resources. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6142753-eab1-4167-9818-4b40c900473c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def load_tokenizer_and_model(model=\"microsoft/DialoGPT-large\"):\n", + " \"\"\"\n", + " Load tokenizer and model instance for some specific DialoGPT model.\n", + " \"\"\"\n", + " # Initialize tokenizer and model\n", + " print(\"Loading model...\")\n", + " tokenizer = AutoTokenizer.from_pretrained(model, padding_side='left')\n", + " model = AutoModelForCausalLM.from_pretrained(model)\n", + " \n", + " # Return tokenizer and model\n", + " return tokenizer, model" + ] + }, + { + "cell_type": "markdown", + "id": "a4e150e6-4976-4998-93be-5f5f9ddcbb5b", + "metadata": { + "tags": [] + }, + "source": [ + "## INT8 Dynamic Quantization\n", + "\n", + "**Quantization** is a systematic reduction of the precision of all or several layers within the model. This means that we turn a higher-precision type, such as the FP32 (32 bits) most commonly used in Deep Learning, into a lower-precision type, such as FP16 (16 bits) or INT8 (8 bits). \n", + "\n", + "With type reduction, it is possible to effectively reduce the size of the model and also faster inference. That means:\n", + "\n", + "* lower memory bandwidth, \n", + "* lower storage, \n", + "* higher performance with minimum to zero accuracy loss. \n", + "\n", + "This is especially important, with large models such as those based on the Transformers architecture, like BERT or used in this sample GPT. \n", + "\n", + "We can distinguish 2 types of quantization:\n", + "\n", + "* static - requires an additional pass over a dataset to work, only activations do calibration,\n", + "* dynamic - multiplies input values by the scale factor, then rounds the result to the nearest, the scale factor for activations is determined dynamically based on the data range observed in runtime.\n", + "\n", + "In this sample we are using **the dynamic quantization**." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cca006fa-6fce-4e5f-81c0-240d12757493", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from intel_extension_for_pytorch.quantization import prepare, convert\n", + "import intel_extension_for_pytorch as ipex\n", + "\n", + "def quantize_model(tokenizer, model):\n", + " \"\"\"\n", + " Adding Intelยฎ Extension for PyTorch* dynamic quantization to the model\n", + " \"\"\"\n", + " # Evaluate model\n", + " model.eval()\n", + " \n", + " print(\"Quantization in progress...\")\n", + " \n", + " # Prepare example outputs for the model\n", + " question, text = \"What is SYCL?\", \"SYCL is an industry-driven standard, developed by Kronos Group and announced in March 2014.\"\n", + " inputs = tokenizer(question, text, return_tensors=\"pt\")\n", + " jit_inputs = tuple((inputs['input_ids']))\n", + " \n", + " # Create configuration for dynamic quantization\n", + " qconfig = ipex.quantization.default_dynamic_qconfig\n", + " \n", + " # Optimize model\n", + " model = ipex.optimize(model)\n", + " \n", + " # Prepare model for quantization using previously prepared parameters\n", + " prepared_model = prepare(model, qconfig, example_inputs=jit_inputs, inplace=False)\n", + " \n", + " # Convert types in model\n", + " converted_model = convert(prepared_model)\n", + " \n", + " return tokenizer, converted_model" + ] + }, + { + "cell_type": "markdown", + "id": "0efd690e-96bd-49aa-8a6b-863d4de3cdfa", + "metadata": {}, + "source": [ + "## Response generation \n", + "\n", + "Response generation in DialoGPT architecture based on **encoder-decoder** model. It means that first we need to *encode input sentence*, to later on be able to *decode it* generating response.\n", + "\n", + "As the model based on transformers architecture they have known issue of copying things. To avoid repetition in chat responses we used Top-K sampling and Top-p sampling.\n", + "\n", + "**Top-K sampling** filters the K most likely next words and redistributes the probability mass among only those K next words. **Top-p sampling**, rather than selecting only the most likely K words, selects the smallest possible set of words whose cumulative probability exceeds the probability p. The probability mass is then redistributed among the words in this set. As a result, the size of the set of words can be dynamically increased and decreased based on the probability distribution of the next word." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d90bd2c3-ff9c-4e52-994d-341792e3e035", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def generate_response(tokenizer, model, chat_round, chat_history_ids):\n", + " \"\"\"\n", + " Generate a response to some user input.\n", + " \"\"\"\n", + " # Encode user input and End-of-String (EOS) token\n", + " new_input_ids = tokenizer.encode(input(\">> You:\") + tokenizer.eos_token, return_tensors='pt')\n", + " \n", + " # Append tokens to chat history\n", + " bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_round > 0 else new_input_ids\n", + " \n", + " # Generate response given maximum chat length history of 2000 tokens\n", + " chat_history_ids = model.generate(\n", + " bot_input_ids,\n", + " do_sample=True, \n", + " max_length=2000,\n", + " top_k=50, \n", + " top_p=0.95,\n", + " pad_token_id=tokenizer.eos_token_id\n", + " )\n", + " \n", + " # Print response\n", + " print(\"DialoGPT: {}\".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))\n", + " \n", + " # Return the chat history ids\n", + " return chat_history_ids" + ] + }, + { + "cell_type": "markdown", + "id": "db1b079b-476c-47da-8a6a-3d42fccc32d4", + "metadata": {}, + "source": [ + "The next step is to prepare a function that allows interactive conversation for `n` rounds. This means that we will use the previously prepared `generate_response` function n-times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28968553-b051-442d-abc2-92d8ac34415a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def chat_for_n_rounds(tokenizer, model, n=5):\n", + " \"\"\"\n", + " Chat with chatbot for n rounds (n = 5 by default)\n", + " \"\"\"\n", + "\n", + " # Initialize history variable\n", + " chat_history_ids = None\n", + "\n", + " # Chat for n rounds\n", + " for chat_round in range(n):\n", + " chat_history_ids = generate_response(tokenizer, model, chat_round, chat_history_ids)" + ] + }, + { + "cell_type": "markdown", + "id": "41b0f86f-2b17-41cd-911e-9ac9a92be4a0", + "metadata": {}, + "source": [ + "Now, it is time to use implemented functions - initializing the model and adding INT8 dynamic quantization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1749c7a-4bba-4731-bbc6-da560edcfed2", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize tokenizer and model\n", + "tokenizer, model = load_tokenizer_and_model()\n", + "\n", + "# Adding ipex quantization to the model\n", + "tokenizer, model = quantize_model(tokenizer, model)" + ] + }, + { + "cell_type": "markdown", + "id": "31bae96c-276c-463f-8085-2cd8e97b5f30", + "metadata": {}, + "source": [ + "Let's play with the model by 5 rounds. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d79fa9d7-5713-4ceb-b489-90f1a4f6a4cf", + "metadata": {}, + "outputs": [], + "source": [ + "chat_for_n_rounds(tokenizer, model, 5)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "16402940-0779-44a1-98b5-3f23c5784bd4", + "metadata": {}, + "source": [ + "## Performance comparison\n", + "\n", + "Now that we know that the DialoGPT model still performs well as a chat bot after quantization, let's compare the model's performance before and after applying INT8 dynamic quantization.\n", + "\n", + "Let's start with defining function that will measure time that model needs for inference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c8b4677-4635-4abd-8eca-9ed43b9b6624", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from time import time\n", + "def test_inference(model, data, warmup=5 , iters=25):\n", + " print(\"Warmup...\")\n", + " for i in range(warmup):\n", + " out = model(data)\n", + "\n", + " print(\"Inference...\")\n", + " inference_time = 0\n", + " for i in range(iters):\n", + " start_time = time()\n", + " out = model(data)\n", + " end_time = time()\n", + " inference_time = inference_time + (end_time - start_time)\n", + "\n", + " inference_time = inference_time / iters\n", + " return inference_time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a41f2a40-2176-4f04-b277-e3622df90430", + "metadata": {}, + "source": [ + "First, let's measure average time of inference for original model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cb034fe-9b8b-4ee9-9975-8a6a03ce79a4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Inference with FP32\")\n", + "tokenizer_fp32, model_fp32 = load_tokenizer_and_model()\n", + "data = torch.randint(model_fp32.config.vocab_size, size=[1, 512])\n", + "fp32_inference_time = test_inference(model_fp32, data = data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6c58546c-5c60-482c-9782-ac901855ddce", + "metadata": { + "tags": [] + }, + "source": [ + "Then, the average inference time of model after INT8 dynamic quantization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05fcd18c-0674-4715-a606-ce5ce9e42560", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Inference with Dynamic INT8\")\n", + "tokenizer_int8, model_int8 = load_tokenizer_and_model()\n", + "tokenizer_int8, model_int8 = quantize_model(tokenizer_int8, model_int8)\n", + "data = torch.randint(model_int8.config.vocab_size, size=[1, 512])\n", + "int8_inference_time = test_inference(model_int8, data = data)" + ] + }, + { + "cell_type": "markdown", + "id": "2ef0648b-c926-42ac-8367-e1a3edb067ea", + "metadata": {}, + "source": [ + "Now, it's time to show nup the results on the bar chart using `matplotlib` library." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d492e5d-e188-489b-a18d-aa32cca0a1b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Create bar chart with training time results\n", + "plt.figure(figsize=(4,3))\n", + "plt.title(\"DialoGPT Inference Time\")\n", + "plt.ylabel(\"Inference Time (seconds)\")\n", + "plt.bar([\"FP32\", \"INT8 dynamic\"], [fp32_inference_time, int8_inference_time])" + ] + }, + { + "cell_type": "markdown", + "id": "d2c31e73-2d6d-4323-9609-f04191f8863d", + "metadata": {}, + "source": [ + "DialoGPT by Microsoft is another conversational chatbot that everyone can use. \n", + "\n", + "Based on this architecture, we created an interactive chat in this sample. The use of top-k and top-p allowed us to avoid some of the repetition in the chat answers. Furthermore, the addition of dynamic INT8 quantization reduced memory usage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b56ff32-34d0-4866-9050-df1bdf7ad736", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89043271-f3dc-4d4d-a630-40c570c53d98", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb new file mode 100644 index 000000000..03020685e --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimize PyTorch Models using Intelยฎ Extension for PyTorch (IPEX) Quantization\n", + "This code sample will quantize a ResNet50 model while using Intel's Extension for PyTorch (IPEX). The model will run inference with FP32 and INT8 precision, including static INT8 quantization and dynamic INT8 quantization. During Static Quantization, the model calibrated with the CIFAR10 dataset. The inference time will be compared, showcasing the speedup of INT8 Quantization.\n", + "\n", + "## Environment Setup\n", + "Ensure the PyTorch kernel is activated before running this notebook.\n", + "\n", + "## Imports, Dataset, Hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torchvision\n", + "from time import time\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "import intel_extension_for_pytorch as ipex\n", + "from intel_extension_for_pytorch.quantization import prepare, convert\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hyperparameters and constants\n", + "LR = 0.001\n", + "DOWNLOAD = True\n", + "DATA = 'datasets/cifar10/'\n", + "WARMUP = 3\n", + "ITERS = 100\n", + "transform = torchvision.transforms.Compose([\n", + "torchvision.transforms.Resize((224, 224)),\n", + "torchvision.transforms.ToTensor(),\n", + "torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n", + "])\n", + "test_dataset = torchvision.datasets.CIFAR10(\n", + " root=DATA,\n", + " train=False,\n", + " transform=transform,\n", + " download=DOWNLOAD,\n", + ")\n", + "calibration_data_loader = torch.utils.data.DataLoader(\n", + " dataset=test_dataset,\n", + " batch_size=128\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get model from torchvision" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = torch.rand(1, 3, 224, 224)\n", + "model_fp32 = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)\n", + "model_fp32.eval()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference 
with FP32 model\n", + "\n", + "The function below will test the inference time with input model and return the average inference time for 1 iteration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def inference(model, WARMUP, ITERS, data):\n", + " print(\"Warmup before benchmark ...\")\n", + " for i in range(WARMUP):\n", + " out = model(data)\n", + "\n", + " print(\"Inference ...\")\n", + " inference_time = 0\n", + " for i in range(ITERS):\n", + " start_time = time()\n", + " out = model(data)\n", + " end_time = time()\n", + " inference_time = inference_time + (end_time - start_time)\n", + "\n", + " inference_time = inference_time / ITERS\n", + " print(\"Inference Time Avg: \", inference_time)\n", + " return inference_time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Static Quantization \n", + "The function below staticQuantize will calibrate the fp32 model with calibration dataloader and return the quantized static int8 model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def staticQuantize(model_fp32, data, calibration_data_loader):\n", + " # Acquire inference times for static quantization INT8 model \n", + " qconfig_static = ipex.quantization.default_static_qconfig\n", + " # # Alternatively, define your own qconfig:\n", + " # from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig\n", + " # qconfig = QConfig(activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),\n", + " # weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric))\n", + " prepared_model_static = prepare(model_fp32, qconfig_static, example_inputs=data, inplace=False)\n", + " print(\"Calibration with Static Quantization ...\")\n", + " for batch_idx, (data, target) in enumerate(calibration_data_loader):\n", + " prepared_model_static(data)\n", + " if batch_idx % 10 == 0:\n", + " print(\"Batch %d/%d complete, continue ...\" %(batch_idx+1, len(calibration_data_loader)))\n", + " print(\"Calibration Done\")\n", + "\n", + " converted_model_static = convert(prepared_model_static)\n", + " with torch.no_grad():\n", + " traced_model_static = torch.jit.trace(converted_model_static, data)\n", + " traced_model_static = torch.jit.freeze(traced_model_static)\n", + "\n", + " # save the quantized static model \n", + " traced_model_static.save(\"quantized_model_static.pt\")\n", + " return traced_model_static\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dynamic Quantization \n", + "The function below dynamicQuantize will quantize the fp32 model with dynamic quantization and return the quantized dynamic int8 model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def dynamicQuantize(model_fp32, data):\n", + " # Acquire inference times for dynamic quantization INT8 model\n", + " qconfig_dynamic = ipex.quantization.default_dynamic_qconfig\n", + " print(\"Quantize Model with Dynamic Quantization ...\")\n", + "\n", + " prepared_model_dynamic = prepare(model_fp32, qconfig_dynamic, example_inputs=data, inplace=False)\n", + "\n", + " converted_model_dynamic = convert(prepared_model_dynamic)\n", + " with torch.no_grad():\n", + " traced_model_dynamic = torch.jit.trace(converted_model_dynamic, data)\n", + " traced_model_dynamic = torch.jit.freeze(traced_model_dynamic)\n", + "\n", + " # save the quantized dynamic model \n", + " traced_model_dynamic.save(\"quantized_model_dynamic.pt\")\n", + " return traced_model_dynamic\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantize the FP32 Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists('quantized_model_static.pt'):\n", + " # Static Quantizaton & Save Model to quantized_model_static.pt\n", + " print('quantize the model with static quantization')\n", + " staticQuantize(model_fp32, data, calibration_data_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists('quantized_model_dynamic.pt'):\n", + " # Dynamic Quantization & Save Model to quantized_model_dynamic.pt\n", + " print('quantize the model with dynamic quantization')\n", + " dynamicQuantize(model_fp32, data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference With FP32 Model, Static INT8 Model and Dynamic INT8 Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Inference with FP32\")\n", + "fp32_inference_time = inference(model_fp32, WARMUP, ITERS, data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Inference with Static INT8\")\n", + "traced_model_static = torch.jit.load('quantized_model_static.pt')\n", + "traced_model_static.eval()\n", + "traced_model_static = torch.jit.freeze(traced_model_static)\n", + "int8_inference_time_static = inference(traced_model_static, WARMUP, ITERS, data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Inference with Dynamic INT8\")\n", + "traced_model_dynamic = torch.jit.load('quantized_model_dynamic.pt')\n", + "traced_model_dynamic.eval()\n", + "traced_model_dynamic = torch.jit.freeze(traced_model_dynamic)\n", + "int8_inference_time_dynamic = inference(traced_model_dynamic, WARMUP, ITERS, data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary of Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference time results\n", + "print(\"Summary\")\n", + "print(\"FP32 inference time: %.3f\" %fp32_inference_time)\n", + "print(\"INT8 static quantization inference time: %.3f\" %int8_inference_time_static)\n", + "print(\"INT8 dynamic quantization inference time: %.3f\" %int8_inference_time_dynamic)\n", + "\n", + "# Create bar chart with training time results\n", + 
"plt.figure(figsize=(4,3))\n", + "plt.title(\"ResNet Inference Time\")\n", + "plt.xlabel(\"Test Case\")\n", + "plt.ylabel(\"Inference Time (seconds)\")\n", + "plt.bar([\"FP32\", \"INT8 static\", \"INT8 dynamic\"], [fp32_inference_time, int8_inference_time_static, int8_inference_time_dynamic])\n", + "\n", + "# Calculate speedup when using quantization\n", + "speedup_from_fp32_static = fp32_inference_time / int8_inference_time_static\n", + "print(\"Staic INT8 %.2fX faster than FP32\" %speedup_from_fp32_static)\n", + "speedup_from_fp32_dynamic = fp32_inference_time / int8_inference_time_dynamic\n", + "print(\"Dynamic INT8 %.2fX faster than FP32\" %speedup_from_fp32_dynamic)\n", + "\n", + "\n", + "# Create bar chart with speedup results\n", + "plt.figure(figsize=(4,3))\n", + "plt.title(\"Quantization Speedup\")\n", + "plt.xlabel(\"Test Case\")\n", + "plt.ylabel(\"Speedup\")\n", + "plt.bar([\"FP32\",\"Static INT8\", \"Dynamic INT8\"], [1, speedup_from_fp32_static, speedup_from_fp32_dynamic])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + }, + "vscode": { + "interpreter": { + "hash": "4678fb2792a22465205165c52aab2f7cff7494375a364749bf16e0ac11f2a502" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/README.md b/examples/cpu/inference/python/jupyter-notebooks/README.md new file mode 100644 index 000000000..2c9dfc91a --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/README.md @@ -0,0 +1,61 @@ +# Environment Setup for Jupyter Notebook with Intel Pytorch CPU + +The Intelยฎ Extension for PyTorch (IPEX) extends PyTorch* with optimizations for extra performance boost on Intelยฎ hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intelยฎ hardware. For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX). + +## Prerequisites + +| Optimized for | Description +|:--- |:--- +| OS | Ubuntu* 18.04 or newer +| Hardware | 4th Gen Intelยฎ Xeonยฎ Scalable Processors or newer +| Software | Intelยฎ Extension for PyTorch (IPEX) + +## For Local Development Environments + +- **Install Jupyter Notebook with Conda** + +Python 3.8,3.9,3.10,3.11 are supported. +Please create a **new conda environment** for each sample. + +``` +conda create -n ipex_cpu python=3.10 -y +conda activate ipex_cpu +pip install notebook ipykernel +``` + +If encounter any issue for jupyter notebook, please refer to [*Installing Jupyter*](https://jupyter.org/install) for detailed installation instructions. 
+
+
+- **Install Intelยฎ Extension for PyTorch* with Conda**
+
+Follow these instructions to install the latest released Intelยฎ Extension for PyTorch*:
+
+```
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+pip install intel-extension-for-pytorch
+```
+
+If a specific version is needed, please follow the Installation Section and Sanity Check Section in the [installation guide](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu) to install a specific version of Intelยฎ Extension for PyTorch*.
+
+- **Register ipykernel with Conda**
+
+```
+python -m ipykernel install --user --name=PyTorch-CPU
+```
+
+- **Running the Jupyter Notebook**
+
+1. Change to the sample directory.
+2. Launch Jupyter Notebook.
+```
+jupyter notebook --ip=0.0.0.0 --port 8888 --allow-root
+```
+3. Follow the instructions to open the URL with the token in your browser.
+4. Locate and select the Notebook.
+5. Change your Jupyter Notebook kernel to **PyTorch-CPU**.
+6. Run every cell in the Notebook in sequence.
+
+## Example Output
+
+If successful, the sample displays `[CODE_SAMPLE_COMPLETED_SUCCESSFULLY]`. Additionally, the sample generates performance and analysis diagrams for comparison.
+
diff --git a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
new file mode 100644
index 000000000..fe1fb8b52
--- /dev/null
+++ b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
@@ -0,0 +1,888 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "1110103c-70d0-4ac0-8208-6a678b88deae",
+ "metadata": {},
+ "source": [
+ "# Optimize PyTorch Models using Intelยฎ Extension for PyTorch* (IPEX)\n",
+ "\n",
+ "This notebook guides you through the process of extending your PyTorch* code with Intelยฎ Extension for PyTorch* (IPEX) optimizations to achieve performance boosts on Intelยฎ hardware.\n",
+ "\n",
+ "\n",
+ "| Area | Description\n",
+ "|:--- |:---\n",
+ "| What you will learn | Applying Intelยฎ Extension for PyTorch* (IPEX) Optimizations to a PyTorch workload in a step-by-step manner to gain a performance boost\n",
+ "| Time to complete | 30 minutes\n",
+ "| Category | Code Optimization\n",
+ "\n",
+ "## Purpose\n",
+ "\n",
+ "This sample notebook shows how to get started with Intelยฎ Extension for PyTorch* (IPEX) for sample Computer Vision and NLP workloads.\n",
+ "\n",
+ "The sample starts by loading two models from the PyTorch hub: **Faster-RCNN** (Faster R-CNN) and **distilbert** (DistilBERT). After loading the models, the sample applies sequential optimizations from Intelยฎ Extension for PyTorch* (IPEX) and examines performance gains for each incremental change.\n",
+ "You can make code changes quickly on top of existing PyTorch code to obtain the performance speedups for model inference.\n",
+ "\n",
+ "We will be generating synthetic data to be used for inference with sample computer vision and NLP workloads. We will first use stock PyTorch models to generate predictions. Then, with minimal code changes using Intelยฎ Extension for PyTorch* (IPEX), we will see how speedups can be gained over stock PyTorch on Intelยฎ hardware. 
We will also see how quantization features from Intelยฎ Extension for PyTorch* (IPEX) can be used to reduce the inference time of a model.\n", + "\n", + "## Prerequisites\n", + "\n", + "\n", + "| Optimized for | Description\n", + "|:--- |:---\n", + "| OS | Ubuntu* 20.04 or newer\n", + "| Hardware | Intelยฎ Xeonยฎ Scalable processor family\n", + "| Software | Intelยฎ Extension for PyTorch*\n" + ] + }, + { + "cell_type": "markdown", + "id": "431d988d-40f1-4f98-96fd-2e17b4126eb4", + "metadata": {}, + "source": [ + "# Key Takeaways" + ] + }, + { + "cell_type": "markdown", + "id": "7438fa45-81e6-4d42-847b-fbe895ae8eed", + "metadata": {}, + "source": [ + "- Get started with Intelยฎ Extension for PyTorch* (IPEX) for drop-in acceleration\n", + "- Learn how to use the *optimize* method from Intelยฎ Extension for PyTorch* (IPEX) to apply optimizations at Python frontend to the given model (nn.Module)\n", + "- Learn how to use Quantization features from Intelยฎ Extension for PyTorch* (IPEX) to convert model to INT8\n", + "- Learn how to use Intelยฎ Extension for PyTorch* (IPEX) Launch Script module to set additional configurations on top of the previously mentioned optimizations to boost performance" + ] + }, + { + "cell_type": "markdown", + "id": "d72174e0", + "metadata": {}, + "source": [ + "# Samples" + ] + }, + { + "cell_type": "markdown", + "id": "06a26381", + "metadata": {}, + "source": [ + "## Install Intelยฎ Extension for PyTorch* for CPU and dependency packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ccfb9a4", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install transformers matplotlib" + ] + }, + { + "cell_type": "markdown", + "id": "b7eb6281-5db9-4a4f-9c6a-3f9c132f30f9", + "metadata": { + "tags": [] + }, + "source": [ + "## Computer Vision Workload - Faster R-CNN, Resnet50 Backbone" + ] + }, + { + "cell_type": "markdown", + "id": "d911257c-c9b0-4365-a308-95a4b3aea487", + "metadata": {}, + "source": [ + "Faster R-CNN is a convolutional neural network used for object detection. We are going to use the **optimize** method from Intelยฎ Extension for PyTorch* (IPEX) to apply optimizations. Following this, we will also use TorchScript to obtain performance gains." 
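Before timing anything, it can help to see what a detection forward pass actually returns. The short sketch below is optional and not part of the sample's timed flow; it assumes the same torchvision Faster R-CNN checkpoint used in this notebook and simply prints the shapes of the `boxes`, `labels`, and `scores` tensors that the model returns for each input image in eval mode.

```
import torch
import torchvision

# Same torchvision detection model that the notebook benchmarks below.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    weights=torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT
).eval()

# In eval mode the model returns one dict per input image.
with torch.no_grad():
    predictions = model(torch.randn(1, 3, 1200, 1200))

for key, value in predictions[0].items():
    print(key, tuple(value.shape))  # boxes -> (N, 4), labels -> (N,), scores -> (N,)
```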
+ ] + }, + { + "cell_type": "markdown", + "id": "96966a7b-b036-4f1c-8dd8-3b90b76f98c9", + "metadata": {}, + "source": [ + "Let's start by importing all the necessary packages and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a4cb03a-f6b4-465b-9363-b435b00336c8", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import torch\n", + "import torchvision\n", + "import os\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "5f613973-ffb0-482a-bc56-c295bce3c088", + "metadata": {}, + "source": [ + "**Prepare Sample Data**" + ] + }, + { + "cell_type": "markdown", + "id": "22eeae78-534e-4d44-a0f3-c48a4213ac3f", + "metadata": {}, + "source": [ + "Let's generate a random image using torch to test performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e75c62d-dbbb-47da-aa07-717e9719d86a", + "metadata": {}, + "outputs": [], + "source": [ + "# set the device to cpu\n", + "device = 'cpu'\n", + "# generate a random image to observe speedup on\n", + "image = torch.randn(1, 3, 1200, 1200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "678f5795-dcd8-439d-8d98-f4197a2417e4", + "metadata": {}, + "outputs": [], + "source": [ + "# explore image shape\n", + "\n", + "print(image.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "32a54de2-76e7-42f9-9506-5574f5bb95a4", + "metadata": {}, + "source": [ + "**Helper Functions**" + ] + }, + { + "cell_type": "markdown", + "id": "43d0d63d-1d2b-4edc-8d36-0e2dd4ff3780", + "metadata": {}, + "source": [ + "Some functions to help us with loading the model and summarizing the optimizations. The functions below will help us record the time taken to run and, plot comparison charts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c265eb1f-ef85-4ed1-941d-158d2ec16af6", + "metadata": {}, + "outputs": [], + "source": [ + "def load_model_eval_mode():\n", + " \"\"\"\n", + " Loads model and returns it in eval mode\n", + " \"\"\"\n", + " model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=weights, progress=True,\n", + " num_classes=91, weights_backbone=weights_backbone).to(device)\n", + " model = model.eval()\n", + " \n", + " return model\n", + "\n", + "def get_average_inference_time(model, image):\n", + " \"\"\"\n", + " does a model warm up and times the model runtime\n", + " \"\"\"\n", + " with torch.no_grad():\n", + " # warm up\n", + " for _ in range(25):\n", + " model(image)\n", + "\n", + " # measure\n", + " import time\n", + " start = time.time()\n", + " for _ in range(25):\n", + " output = model(image)\n", + " end = time.time()\n", + " average_inference_time = (end-start)/25*1000\n", + " \n", + " return average_inference_time\n", + "\n", + "def plot_speedup(inference_time_stock, inference_time_optimized):\n", + " \"\"\"\n", + " Plots a bar chart comparing the time taken by stock PyTorch model and the time taken by\n", + " the model optimized by Intelยฎ Extension for PyTorch* (IPEX)\n", + " \"\"\"\n", + " data = {'stock_pytorch_time': inference_time_stock, 'optimized_time': inference_time_optimized}\n", + " model_type = list(data.keys())\n", + " times = list(data.values())\n", + "\n", + " fig = plt.figure(figsize = (10, 5))\n", + "\n", + " # creating the bar plot\n", + " plt.bar(model_type, times, color ='blue',\n", + " width = 0.4)\n", + "\n", + " plt.ylabel(\"Runtime (ms)\")\n", + " plt.title(f\"Speedup acheived - {inference_time_stock/inference_time_optimized:.2f}x\")\n", + " 
plt.show()\n",
+ " \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "46eb1386-203a-4667-aa9c-9b26adcd02c4",
+ "metadata": {},
+ "source": [
+ "**Baseline PyTorch Model**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c441b52d-597b-422c-bd01-524a687f8024",
+ "metadata": {},
+ "source": [
+ "A baseline model is the simplest version of the model that can be loaded from the PyTorch hub. Let's load the baseline Faster R-CNN model and get predictions."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d4efa17-9abe-4595-bcc1-8b9c3b966211",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# model configs\n",
+ "weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT\n",
+ "weights_backbone = torchvision.models.ResNet50_Weights.DEFAULT"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "858fb895-8617-4997-a8ec-8974d00e4606",
+ "metadata": {},
+ "source": [
+ "**Input Image Memory Format**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "43a1fa1e-2e89-4c43-b1e4-7682686c2a4a",
+ "metadata": {},
+ "source": [
+ "There are two ways to represent the image data that is input to a CNN model: Channels-First and Channels-Last. In Channels-First, the channels dimension comes first, followed by height and width. For example - (3, 224, 224) or NCHW, where N is batch size, C is channels, H is height, and W is width. In Channels-Last, the channels dimension comes last. For example - (224, 224, 3) or NHWC."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a12280f4-b8a1-4c9a-89d3-2f092396f431",
+ "metadata": {},
+ "source": [
+ "**Channels-First**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "59f09f4f-de2c-4006-83e0-d201b58097e6",
+ "metadata": {},
+ "source": [
+ "PyTorch uses channels-first by default."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f054f770-c8d6-4fe7-baed-7ed77c510a13",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# send the input to the device and pass it through the network to\n",
+ "# get the detections and predictions\n",
+ "\n",
+ "model = load_model_eval_mode()\n",
+ "\n",
+ "inference_time_stock = get_average_inference_time(model, image)\n",
+ "\n",
+ "print(f\"time taken for forward pass: {inference_time_stock} ms\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3285b34b-03a2-4654-8061-a4dd9f66d1e9",
+ "metadata": {},
+ "source": [
+ "**Channels-Last**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31439e30-344e-455c-816a-a88eb868ede4",
+ "metadata": {},
+ "source": [
+ "The Channels-Last memory format is a different way of ordering NCHW tensors in memory, which allows us to benefit from Channels-Last optimizations on Intelยฎ hardware."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0afc270-0d24-4e8d-a1a9-271dd66fc350",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = load_model_eval_mode()\n",
+ "model = model.to(memory_format=torch.channels_last)\n",
+ "image_channels_last = image.to(memory_format=torch.channels_last)\n",
+ "\n",
+ "inference_time_stock = get_average_inference_time(model, image_channels_last)\n",
+ "\n",
+ "print(f\"time taken for forward pass: {inference_time_stock} ms\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9cfff9fc-0e9d-4703-8369-da1db780d15e",
+ "metadata": {},
+ "source": [
+ "Now that we have timed the stock PyTorch model, let's add minimal code changes from Intelยฎ Extension for PyTorch* (IPEX) to obtain speedups. 
The minimal code changes are highlighted in the following cell" + ] + }, + { + "cell_type": "markdown", + "id": "aef650d5-02db-40a6-9688-90a03fee7da2", + "metadata": {}, + "source": [ + "**Intelยฎ Extension for PyTorch* (IPEX)**" + ] + }, + { + "cell_type": "markdown", + "id": "05237e85-1039-4fb5-8651-bb815824a1d9", + "metadata": {}, + "source": [ + "As described above, Intelยฎ Extension for PyTorch* (IPEX) provides us with the ability to make minimal code changes to apply optimizations over stock PyTorch models using Intelยฎ hardware. The simple code changes are indicated below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6356714f-c2b3-46c7-8dab-103d40054eb0", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "model = model.to(memory_format=torch.channels_last)\n", + "image_channels_last = image.to(memory_format=torch.channels_last)\n", + "#################### code changes ####################\n", + "import intel_extension_for_pytorch as ipex\n", + "model = ipex.optimize(model)\n", + "######################################################" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bdd609b-13ec-4bb7-93ec-b2f8a6dd0f68", + "metadata": {}, + "outputs": [], + "source": [ + "inference_time_optimized = get_average_inference_time(model, image_channels_last)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5843fe0f-52c3-49a5-b231-f4ca514395a6", + "metadata": {}, + "outputs": [], + "source": [ + "# plot performance gain bar chart\n", + "\n", + "plot_speedup(inference_time_stock, inference_time_optimized)" + ] + }, + { + "cell_type": "markdown", + "id": "d263abf8-e0b6-4d99-a14a-67e46c197c3d", + "metadata": {}, + "source": [ + "> **_NOTE:_** If a below par performance is observed, please restart the notebook kernel." + ] + }, + { + "cell_type": "markdown", + "id": "6d465fcc-8115-46de-9542-2c4c8d7c1771", + "metadata": {}, + "source": [ + "**TorchScript**" + ] + }, + { + "cell_type": "markdown", + "id": "e96cbe96-92c2-4429-b90b-4e78a4bac04d", + "metadata": {}, + "source": [ + "TorchScript is a way to create serializable and optimizable models from PyTorch code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87254f28-ce41-4066-81c1-bcd646a0b871", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "model = model.to(memory_format=torch.channels_last)\n", + "with torch.no_grad():\n", + " model.backbone = torch.jit.trace(model.backbone, image_channels_last, strict=False)\n", + " model.backbone = torch.jit.freeze(model.backbone)\n", + " inference_time_optimized = get_average_inference_time(model, image_channels_last)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ad01762-ded5-4380-b649-7a3db30c3b34", + "metadata": {}, + "outputs": [], + "source": [ + "# plot performance gain bar chart\n", + "\n", + "plot_speedup(inference_time_stock, inference_time_optimized)" + ] + }, + { + "cell_type": "markdown", + "id": "11b2888e-1171-400d-a5f9-783e6c52f01e", + "metadata": {}, + "source": [ + "## NLP Workload - DistilBERT Base Uncased" + ] + }, + { + "cell_type": "markdown", + "id": "62666465-5647-4ccb-a53c-d389d3261629", + "metadata": {}, + "source": [ + "DistilBERT is a transformer model, smaller and faster than BERT. 
We will use the Quantization feature from Intelยฎ Extension for PyTorch* (IPEX) to convert the model into INT8 for faster inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e10eba7b-3d6e-46e8-8baf-8eddcc5f9d2a", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import DistilBertTokenizer, DistilBertModel, logging\n", + "logging.set_verbosity_error()" + ] + }, + { + "cell_type": "markdown", + "id": "8b7899a8-205d-4ebc-b304-e9ee65ad3643", + "metadata": {}, + "source": [ + "**Helper Functions**" + ] + }, + { + "cell_type": "markdown", + "id": "797de740-3a76-48cc-838c-ef8e981cd41b", + "metadata": {}, + "source": [ + "Similar functions as before to help us load the model and summarize the optimizations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0b68ba1-9215-4408-b3ee-5d6d610cffd7", + "metadata": {}, + "outputs": [], + "source": [ + "def load_model_eval_mode():\n", + " \"\"\"\n", + " Loads model and returns it in eval mode\n", + " \"\"\"\n", + " model = DistilBertModel.from_pretrained('distilbert-base-uncased-distilled-squad')\n", + " model.eval()\n", + " \n", + " return model\n", + "\n", + "def get_average_inference_time(model, inputs):\n", + " \"\"\"\n", + " does a model warm up and times the model runtime\n", + " \"\"\"\n", + " with torch.no_grad():\n", + " # warm up\n", + " for _ in range(25):\n", + " model(**inputs)\n", + "\n", + " # measure\n", + " import time\n", + " start = time.time()\n", + " for _ in range(25):\n", + " outputs = model(**inputs)\n", + " end = time.time()\n", + " average_inference_time = (end-start)/25*1000\n", + " \n", + " return average_inference_time" + ] + }, + { + "cell_type": "markdown", + "id": "81ec7560-5a3d-4450-badd-7ff3e5d6ae9b", + "metadata": {}, + "source": [ + "Generate sample text and tokenize using the transformers tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d94e1ea-03f8-4f2b-9c05-ae4a97d7d7d1", + "metadata": {}, + "outputs": [], + "source": [ + "# tokenizer for distilbert\n", + "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')\n", + "\n", + "# sample data\n", + "question, text = \"Who was Jim Henson?\", \"Jim Henson was a nice puppet\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e146748d-7b31-4e62-82fb-92e3cd231f0e", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "\n", + "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", + "\n", + "inference_time_stock = get_average_inference_time(model, inputs)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_stock} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "fd79e93e-bc11-48b2-8a92-b74f29e4d2bf", + "metadata": {}, + "source": [ + "**Quantization**" + ] + }, + { + "cell_type": "markdown", + "id": "1cfbb3fe-d99f-4e74-abd0-04bdbbe6e632", + "metadata": {}, + "source": [ + "Quantization allows us to perform operations and store tensors at a lower precision than FP32, like INT8 for example. This compact model and data representation results in a lower memory requirement." 
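To make the storage argument concrete, the small sketch below compares the per-element size of an FP32 tensor with an INT8-quantized copy of it. The scale and zero-point values are arbitrary and chosen only for illustration; this snippet is an aside and not part of the DistilBERT workload.

```
import torch

x_fp32 = torch.randn(1024, 1024)
# Arbitrary quantization parameters, purely to illustrate storage size.
x_int8 = torch.quantize_per_tensor(x_fp32, scale=0.1, zero_point=0, dtype=torch.qint8)

print("FP32 bytes per element:", x_fp32.element_size())  # 4
print("INT8 bytes per element:", x_int8.element_size())  # 1
print("FP32 tensor: %.1f MB" % (x_fp32.numel() * x_fp32.element_size() / 1e6))
print("INT8 tensor: %.1f MB" % (x_int8.numel() * x_int8.element_size() / 1e6))
```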
+ ] + }, + { + "cell_type": "markdown", + "id": "1c8ed525-7075-4a88-80d2-dfb739468994", + "metadata": {}, + "source": [ + "Let's import the quantization modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "342873f5-72a3-4105-a2a5-68778d0599c8", + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_pytorch.quantization import prepare, convert\n", + "import intel_extension_for_pytorch as ipex" + ] + }, + { + "cell_type": "markdown", + "id": "b71e64d0-21f7-497f-86c3-d1afcaeefd6c", + "metadata": {}, + "source": [ + "**Static Quantization** \n", + " Static quantization quantizes the weights and activations of the model. It fuses activations into preceding layers where possible. It requires calibration with a representative dataset to determine optimal quantization parameters for activations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ef972ab-a272-45dd-bd55-fbd9f7db5017", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "\n", + "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", + "\n", + "jit_inputs = tuple((inputs['input_ids'], inputs['attention_mask']))\n", + "\n", + "qconfig_mapping = ipex.quantization.default_static_qconfig_mapping # for static quantization\n", + "prepared_model = ipex.quantization.prepare(model, qconfig_mapping, example_inputs=jit_inputs, inplace=False)\n", + "\n", + "for i in range(2):\n", + " calibration_output = prepared_model(**inputs)\n", + "\n", + "model = convert(prepared_model)\n", + "with torch.no_grad():\n", + " model = torch.jit.trace(model, jit_inputs, strict=False)\n", + " model = torch.jit.freeze(model)\n", + " y = model(**inputs)\n", + " y = model(**inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7983dfa3-8bb2-4f9e-994c-24cbd556b94d", + "metadata": {}, + "outputs": [], + "source": [ + "inference_time_optimized = get_average_inference_time(model, inputs)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54e19218-dd71-4711-a25a-8e7c8a787d19", + "metadata": {}, + "outputs": [], + "source": [ + "# plot performance gain bar chart\n", + "\n", + "plot_speedup(inference_time_stock, inference_time_optimized)" + ] + }, + { + "cell_type": "markdown", + "id": "5e79ec28-6063-4d21-95d5-32be78bd49af", + "metadata": {}, + "source": [ + "**Dynamic Quantization** \n", + " In dynamic quantization the weights are quantized ahead of time but the activations are dynamically quantized during inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bef3494-b5c5-4acd-8431-71e3199a9764", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "\n", + "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", + "\n", + "jit_inputs = tuple((inputs['input_ids'], inputs['attention_mask']))\n", + "\n", + "\n", + "qconfig_mapping = ipex.quantization.default_dynamic_qconfig_mapping # for dynamic quantization\n", + "prepared_model = ipex.quantization.prepare(model, qconfig_mapping, example_inputs=jit_inputs, inplace=False)\n", + "model = convert(prepared_model)\n", + "with torch.no_grad():\n", + " model = torch.jit.trace(model, jit_inputs, strict=False)\n", + " model = torch.jit.freeze(model)\n", + " y = model(**inputs)\n", + " y = model(**inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cc311f2-6215-41ff-a7ed-e1942e0cbbde", + 
"metadata": {}, + "outputs": [], + "source": [ + "inference_time_optimized = get_average_inference_time(model, inputs)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb75733c-91d9-4d38-97a8-a5bb4b3063a8", + "metadata": {}, + "outputs": [], + "source": [ + "# plot performance gain bar chart\n", + "\n", + "plot_speedup(inference_time_stock, inference_time_optimized)" + ] + }, + { + "cell_type": "markdown", + "id": "bff70b14-26d9-4ef8-894a-6d0a40973324", + "metadata": {}, + "source": [ + "## Intelยฎ Extension for PyTorch* (IPEX) Launch Script" + ] + }, + { + "cell_type": "markdown", + "id": "63f71568-902f-4e9f-a130-cc5fae1961db", + "metadata": {}, + "source": [ + "Default primitives of PyTorch and Intelยฎ Extension for PyTorch* (IPEX) are highly optimized, there are things users can do improve performance. Setting configuration options properly contributes to a performance boost. However, there is no unified configuration that is optimal to all topologies. Users need to try different combinations by themselves." + ] + }, + { + "cell_type": "markdown", + "id": "f826ac48-17e2-48b4-ab2c-0a47620790a7", + "metadata": {}, + "source": [ + "**Single instance for inference**" + ] + }, + { + "cell_type": "markdown", + "id": "f300e009-f7cb-4403-8229-938bf89b0920", + "metadata": {}, + "source": [ + "The launch script is provided as a module of Intelยฎ Extension for PyTorch* (IPEX). Below are some of those configurations that can be set using the launch script for a single instance. The launch script can be run as a shell command from a Jupyter notebook or from the shell itself." + ] + }, + { + "cell_type": "markdown", + "id": "35ed3f44-c019-4997-a73d-a043ddfa12ee", + "metadata": {}, + "source": [ + "To explore the features of the launch script module, we will be using a ResNet-50 model, which is a a convolutional neural network that is 50 layers deep.The model script is present in the scripts folder" + ] + }, + { + "cell_type": "markdown", + "id": "edbd3506-dc86-4f18-b666-dfa9e9e3705a", + "metadata": {}, + "source": [ + "It is recommended that the user check the output of [htop](https://htop.dev/) in an accompanying terminal to check the usage of cores while running the cells below. The output from htop looks as shown below." + ] + }, + { + "cell_type": "markdown", + "id": "b5015d8d-6ac6-41c4-a10e-509069b699ee", + "metadata": {}, + "source": [ + "![htop](https://intel.github.io/intel-extension-for-pytorch/latest/_images/1ins_phy.gif)" + ] + }, + { + "cell_type": "markdown", + "id": "91737a0e-1a14-4582-a9c4-63b24e963ff7", + "metadata": {}, + "source": [ + "By running the below command, One main worker thread will be launched, then it will launch threads on 2 other physical cores." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76265fe7-52cd-4574-9657-18efa5c25514", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncore_per_instance 3 --log_path ./logs ./python/resnet50.py" + ] + }, + { + "cell_type": "markdown", + "id": "6fa2dd27-cf4f-4e1d-808a-271c35cd508b", + "metadata": {}, + "source": [ + "Similarly by increasing the number of cores, we can see an improvement in the inference time as shown below " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c522568-db0f-4438-ae1f-66623e07c96f", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncore_per_instance 6 --log_path ./logs ./python/resnet50.py" + ] + }, + { + "cell_type": "markdown", + "id": "a3a00650-d15f-4aa7-b526-a21c6823d50a", + "metadata": {}, + "source": [ + "We saw a small example usage of the launch script module. This [documentation](https://intel.github.io/intel-extension-for-pytorch/cpu/1.12.100+cpu/tutorials/performance_tuning/launch_script.html) provides many more examples to use the launch script. As mentioned earlier, each deep learning topology can benefit from custom tuning to achieve the best performance on top of the optimizations we have discussed so far." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd6e13ed-0c0e-4e97-acaa-7a8dfba259c4", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "nbTranslate": { + "displayLangs": [ + "*" + ], + "hotkey": "alt-t", + "langInMainMenu": true, + "sourceLang": "en", + "targetLang": "fr", + "useGoogleTranslate": true + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py b/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py new file mode 100644 index 000000000..6bc1e29a8 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +# encoding: utf-8 + +''' +============================================================== + Copyright ยฉ 2023 Intel Corporation + + SPDX-License-Identifier: MIT +============================================================== +''' + +import os +from time import time +import matplotlib.pyplot as plt +import torch +import intel_extension_for_pytorch as ipex +from intel_extension_for_pytorch.quantization import prepare, convert +import torchvision +from torchvision import models +from transformers import BertModel + +SUPPORTED_MODELS = ["resnet50", "bert"] # models supported by this code sample + +# ResNet sample data parameters +RESNET_BATCH_SIZE = 64 + +# BERT sample data parameters 
+BERT_BATCH_SIZE = 64 +BERT_SEQ_LENGTH = 512 + +os.environ["ONEDNN_MAX_CPU_ISA"] = "AVX512_CORE_VNNI" + +""" +Function to perform inference on Resnet50 and BERT +""" +def runInference(model, data, modelName="resnet50", dataType="FP32", amx=True): + """ + Input parameters + model: the PyTorch model object used for inference + data: a sample input into the model + modelName: str representing the name of the model, supported values - resnet50, bert + dataType: str representing the data type for model parameters, supported values - FP32, BF16, INT8 + amx: set to False to disable AMX on BF16, Default: True + Return value + inference_time: the time in seconds it takes to perform inference with the model + """ + + # Display run case + if amx: + isa_text = "AVX512_CORE_AMX" + else: + isa_text = "AVX512_CORE_VNNI" + print("%s %s inference with %s" %(modelName, dataType, isa_text)) + + # Special variables for specific models + batch_size = None + if "resnet50" == modelName: + batch_size = RESNET_BATCH_SIZE + elif "bert" == modelName: + d = torch.randint(model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH]) # sample data input for torchscript and inference + batch_size = BERT_BATCH_SIZE + else: + raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS)) + + # Prepare model for inference based on precision (FP32, BF16, INT8) + if "INT8" == dataType: + # Quantize model to INT8 if needed (one time) + model_filename = "quantized_model_%s.pt" %modelName + if not os.path.exists(model_filename): + qconfig = ipex.quantization.default_static_qconfig + prepared_model = prepare(model, qconfig, example_inputs=data, inplace=False) + converted_model = convert(prepared_model) + with torch.no_grad(): + if "resnet50" == modelName: + traced_model = torch.jit.trace(converted_model, data) + elif "bert" == modelName: + traced_model = torch.jit.trace(converted_model, (d,), check_trace=False, strict=False) + else: + raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS)) + traced_model = torch.jit.freeze(traced_model) + traced_model.save(model_filename) + + # Load INT8 model for inference + model = torch.jit.load(model_filename) + model.eval() + model = torch.jit.freeze(model) + elif "BF16" == dataType: + model = ipex.optimize(model, dtype=torch.bfloat16) + with torch.no_grad(): + with torch.cpu.amp.autocast(): + if "resnet50" == modelName: + model = torch.jit.trace(model, data) + elif "bert" == modelName: + model = torch.jit.trace(model, (d,), check_trace=False, strict=False) + else: + raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS)) + model = torch.jit.freeze(model) + else: # FP32 + with torch.no_grad(): + if "resnet50" == modelName: + model = torch.jit.trace(model, data) + elif "bert" == modelName: + model = torch.jit.trace(model, (d,), check_trace=False, strict=False) + else: + raise Exception("ERROR: modelName %s is not supported. 
Choose from %s" %(modelName, SUPPORTED_MODELS)) + model = torch.jit.freeze(model) + + # Run inference + with torch.no_grad(): + if "BF16" == dataType: + with torch.cpu.amp.autocast(): + # Warm up + for i in range(5): + model(data) + + # Measure latency + start_time = time() + model(data) + end_time = time() + else: + # Warm up + for i in range(5): + model(data) + + # Measure latency + start_time = time() + model(data) + end_time = time() + inference_time = end_time - start_time + print("Inference on batch size %d took %.3f seconds" %(batch_size, inference_time)) + + return inference_time + + +""" +Perform all types of inference in main function + +Inference run cases for both Resnet50 and BERT +1) INT8 using AVX512_CORE_VNNI +""" +def main(): + # ResNet50 + resnet_model = models.resnet50(pretrained=True) + resnet_data = torch.rand(RESNET_BATCH_SIZE, 3, 224, 224) + resnet_model.eval() + int8_with_vnni_resnet_inference_time = runInference(resnet_model, resnet_data, modelName="resnet50", dataType="INT8", amx=False) + + # BERT + bert_model = BertModel.from_pretrained("bert-base-uncased") +#torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') + bert_data = torch.randint(bert_model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH]) + bert_model.eval() + int8_with_vnni_bert_inference_time = runInference(bert_model, bert_data, modelName="bert", dataType="INT8", amx=False) + +if __name__ == '__main__': + main() diff --git a/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py b/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py new file mode 100644 index 000000000..dae594af2 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py @@ -0,0 +1,60 @@ +import torch +import torchvision.models as models + +def inference(model, data): + with torch.no_grad(): + # warm up + for _ in range(100): + model(data) + + # measure + import time + start = time.time() + for _ in range(100): + output = model(data) + end = time.time() + print('Inference took {:.2f} ms in average'.format((end-start)/100*1000)) + +def main(args): + model = models.resnet50(pretrained=False) + model.eval() + + data = torch.rand(1, 3, 224, 224) + + import intel_extension_for_pytorch as ipex + + model = model.to(memory_format=torch.channels_last) + data = data.to(memory_format=torch.channels_last) + + if args.dtype == 'float32': + model = ipex.optimize(model, dtype=torch.float32) + elif args.dtype == 'bfloat16': + model = ipex.optimize(model, dtype=torch.bfloat16) + else: # int8 + from intel_extension_for_pytorch.quantization import prepare, convert + + qconfig = ipex.quantization.default_static_qconfig + model = prepare(model, qconfig, example_inputs=data, inplace=False) + + # calibration + n_iter = 100 + for i in range(n_iter): + model(data) + + model = convert(model) + + with torch.cpu.amp.autocast(enabled=args.dtype=='bfloat16'): + if args.torchscript: + with torch.no_grad(): + model = torch.jit.trace(model, data) + model = torch.jit.freeze(model) + + inference(model, data) + +if __name__ == '__main__': + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--dtype', default='float32', choices=['float32', 'bfloat16', 'int8']) + parser.add_argument("--torchscript", default=False, action="store_true") + + main(parser.parse_args()) diff --git a/examples/cpu/inference/python/README.md b/examples/cpu/inference/python/python-scripts/README.md similarity index 97% rename from examples/cpu/inference/python/README.md rename to 
examples/cpu/inference/python/python-scripts/README.md index 044dc73e2..b20cf38c4 100644 --- a/examples/cpu/inference/python/README.md +++ b/examples/cpu/inference/python/python-scripts/README.md @@ -1,7 +1,7 @@ ๏ปฟ# Model Inference with Intelยฎ Extension for PyTorch\* Optimizations We provided examples about how to use Intelยฎ Extension for PyTorch\* to accelerate model inference. -The `ipex.optimize` function of Intelยฎ Extension for PyTorch* applies optimizations to the model, bringing additional performance boosts. +The `ipex.optimize` function of Intelยฎ Extension for PyTorch\* applies optimizations to the model, bringing additional performance boosts. For both computer vision workloads and NLP workloads, we recommend applying the `ipex.optimize` function against the model object. ## Environment Setup diff --git a/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/bert_eager_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/bert_eager_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/bert_eager_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/bert_general_inference_script.py b/examples/cpu/inference/python/python-scripts/bert_general_inference_script.py similarity index 100% rename from examples/cpu/inference/python/bert_general_inference_script.py rename to examples/cpu/inference/python/python-scripts/bert_general_inference_script.py diff --git a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/int8_deployment.py b/examples/cpu/inference/python/python-scripts/int8_deployment.py similarity index 100% rename from examples/cpu/inference/python/int8_deployment.py rename to 
examples/cpu/inference/python/python-scripts/int8_deployment.py diff --git a/examples/cpu/inference/python/int8_quantization_dynamic.py b/examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py similarity index 100% rename from examples/cpu/inference/python/int8_quantization_dynamic.py rename to examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py diff --git a/examples/cpu/inference/python/int8_quantization_static.py b/examples/cpu/inference/python/python-scripts/int8_quantization_static.py similarity index 100% rename from examples/cpu/inference/python/int8_quantization_static.py rename to examples/cpu/inference/python/python-scripts/int8_quantization_static.py diff --git a/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/resnet50_general_inference_script.py b/examples/cpu/inference/python/python-scripts/resnet50_general_inference_script.py similarity index 100% rename from examples/cpu/inference/python/resnet50_general_inference_script.py rename to examples/cpu/inference/python/python-scripts/resnet50_general_inference_script.py diff --git a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py diff --git a/examples/cpu/training/README.md b/examples/cpu/training/python-scripts/README.md similarity index 74% rename from examples/cpu/training/README.md rename to examples/cpu/training/python-scripts/README.md index 
2169e8df9..066425954 100644 --- a/examples/cpu/training/README.md +++ b/examples/cpu/training/python-scripts/README.md @@ -23,22 +23,6 @@ git clone https://github.com/intel/intel-extension-for-pytorch.git cd intel-extension-for-pytorch/examples/cpu/training ``` -Running ResNet50 Float32 single precision training example: - -```bash -python single_instance_training_fp32.py -``` - -We provided support for BFloat16 half precision training. -Please refer to [Automatic Mixed Precision (AMP) introduction](https://pytorch.org/docs/stable/amp.html) for more details. -BFloat16 calculations are further accelerated on the processors supporting [Intelยฎ Advanced Matrix Extensions (AMX)](https://en.wikipedia.org/wiki/Advanced_Matrix_Extensions) instructions. - -Running ResNet50 BFloat16 half precision training example: - -```bash -python single_instance_training_bf16.py -``` - Running ResNet50 distributed training example: ```bash @@ -49,4 +33,4 @@ ipexrun --nnodes 1 distributed_data_parallel_training.py Please check [the training examples in Intelยฎ Extension for PyTorch\* online document](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/examples.html#training) for more details. -For more information and examples about distributed training via PyTorch\* DDP, please visit [oneAPI Collective Communications Library Bindings for Pytorch\* Github repository](https://github.com/intel/torch-ccl). \ No newline at end of file +For more information and examples about distributed training via PyTorch\* DDP, please visit [oneAPI Collective Communications Library Bindings for Pytorch\* Github repository](https://github.com/intel/torch-ccl). diff --git a/examples/cpu/training/distributed_data_parallel_training.py b/examples/cpu/training/python-scripts/distributed_data_parallel_training.py similarity index 100% rename from examples/cpu/training/distributed_data_parallel_training.py rename to examples/cpu/training/python-scripts/distributed_data_parallel_training.py diff --git a/examples/cpu/training/single_instance_training_bf16.py b/examples/cpu/training/single_instance_training_bf16.py deleted file mode 100644 index fa596e686..000000000 --- a/examples/cpu/training/single_instance_training_bf16.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -import torchvision -import intel_extension_for_pytorch as ipex - -LR = 0.001 -DOWNLOAD = True -DATA = "datasets/cifar10/" - -transform = torchvision.transforms.Compose( - [ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), - ] -) -train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, -) -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128) - -model = torchvision.models.resnet50() -criterion = torch.nn.CrossEntropyLoss() -optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9) -model.train() - -model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16) -# Uncomment the code below to enable beta feature `torch.compile` -# model = torch.compile(model, backend="ipex") - -for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - # Note: bf16 training requires amp.autocast() context # noqa F401 - with torch.cpu.amp.autocast(): - output = model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - print(batch_idx) - -torch.save( - { - "model_state_dict": model.state_dict(), - 
"optimizer_state_dict": optimizer.state_dict(), - }, - "checkpoint.pth", -) - -print("Execution finished") diff --git a/examples/cpu/training/single_instance_training_fp32.py b/examples/cpu/training/single_instance_training_fp32.py deleted file mode 100644 index ae2b970ad..000000000 --- a/examples/cpu/training/single_instance_training_fp32.py +++ /dev/null @@ -1,49 +0,0 @@ -import torch -import torchvision -import intel_extension_for_pytorch as ipex - -LR = 0.001 -DOWNLOAD = True -DATA = "datasets/cifar10/" - -transform = torchvision.transforms.Compose( - [ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), - ] -) -train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, -) -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128) - -model = torchvision.models.resnet50() -criterion = torch.nn.CrossEntropyLoss() -optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9) -model.train() - -model, optimizer = ipex.optimize(model, optimizer=optimizer) -# Uncomment the code below to enable beta feature `torch.compile` -# model = torch.compile(model, backend="ipex") - -for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - output = model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - print(batch_idx) - -torch.save( - { - "model_state_dict": model.state_dict(), - "optimizer_state_dict": optimizer.state_dict(), - }, - "checkpoint.pth", -) - -print("Execution finished") diff --git a/scripts/build_doc.sh b/scripts/build_doc.sh index a34293801..8bc92cdba 100644 --- a/scripts/build_doc.sh +++ b/scripts/build_doc.sh @@ -173,25 +173,23 @@ parse_example() { cp ${MDEXAMPLE} tutorials/examples.md.bk if [[ ${DEVICE} == "cpu" ]]; then - parse_example "../examples/cpu/training/single_instance_training_fp32.py" ${MDEXAMPLE} "(marker_train_single_fp32_complete)" "python" - parse_example "../examples/cpu/training/single_instance_training_bf16.py" ${MDEXAMPLE} "(marker_train_single_bf16_complete)" "python" - parse_example "../examples/cpu/training/distributed_data_parallel_training.py" ${MDEXAMPLE} "(marker_train_ddp_complete)" "python" - parse_example "../examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" - parse_example "../examples/cpu/inference/python/bert_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" - parse_example "../examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" - parse_example "../examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" - parse_example "../examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_fp32)" "python" - parse_example "../examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_fp32)" "python" - parse_example "../examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" - parse_example "../examples/cpu/inference/python/bert_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" - parse_example "../examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} 
"(marker_inf_rn50_ts_bf16)" "python" - parse_example "../examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" - parse_example "../examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_bf16)" "python" - parse_example "../examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_bf16)" "python" + parse_example "../examples/cpu/training/python-scripts/distributed_data_parallel_training.py" ${MDEXAMPLE} "(marker_train_ddp_complete)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_bf16)" "python" parse_example "../examples/cpu/features/fast_bert/fast_bert_inference_bf16.py" ${MDEXAMPLE} "(marker_feature_fastbert_bf16)" "python" - parse_example "../examples/cpu/inference/python/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" - parse_example "../examples/cpu/inference/python/int8_quantization_dynamic.py" ${MDEXAMPLE} "(marker_int8_dynamic)" "python" - parse_example "../examples/cpu/inference/python/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py" ${MDEXAMPLE} "(marker_int8_dynamic)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" parse_example "../examples/cpu/features/llm/llm_optimize.py" ${MDEXAMPLE} "(marker_llm_optimize)" "python" parse_example 
"../examples/cpu/features/llm/llm_optimize_smoothquant.py" ${MDEXAMPLE} "(marker_llm_optimize_sq)" "python" parse_example "../examples/cpu/features/llm/llm_optimize_woq.py" ${MDEXAMPLE} "(marker_llm_optimize_woq)" "python" @@ -207,24 +205,24 @@ if [[ ${DEVICE} == "cpu" ]]; then parse_example "../examples/cpu/features/graph_optimization/int8.py" tutorials/features/graph_optimization.md "(marker_feature_graph_optimization_int8)" "python" parse_example "../examples/cpu/features/graph_optimization/folding.py" tutorials/features/graph_optimization.md "(marker_feature_graph_optimization_folding)" "python" elif [[ ${DEVICE} == "gpu" ]]; then - parse_example "../examples/gpu/training/single_instance_training_fp32.py" ${MDEXAMPLE} "(marker_train_single_fp32_complete)" "python" - parse_example "../examples/gpu/training/single_instance_training_bf16.py" ${MDEXAMPLE} "(marker_train_single_bf16_complete)" "python" - parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" - parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" - parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" - parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" - parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" - parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" - parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" - parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" - parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp16)" "python" - parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp16)" "python" - parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp16)" "python" - parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp16)" "python" - parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp32_alt.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32_alt)" "python" - # parse_example "../examples/gpu/inference/python/int8_calibration_static_imperative.py" ${MDEXAMPLE} "(marker_int8_static_imperative)" "python" - parse_example "../examples/gpu/inference/python/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" - # parse_example "../examples/gpu/inference/python/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" + parse_example "../examples/gpu/training/python-scripts/single_instance_training_fp32.py" ${MDEXAMPLE} "(marker_train_single_fp32_complete)" "python" + parse_example "../examples/gpu/training/python-scripts/single_instance_training_bf16.py" ${MDEXAMPLE} "(marker_train_single_bf16_complete)" "python" + parse_example 
"../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp32_alt.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32_alt)" "python" + # parse_example "../examples/gpu/inference/python/python-scripts/int8_calibration_static_imperative.py" ${MDEXAMPLE} "(marker_int8_static_imperative)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" + # parse_example "../examples/gpu/inference/python/python-scripts/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" parse_example "../examples/gpu/inference/cpp/example-app/example-app.cpp" ${MDEXAMPLE} "(marker_cppsdk_sample_app)" "cpp" parse_example "../examples/gpu/inference/cpp/example-app/CMakeLists.txt" ${MDEXAMPLE} "(marker_cppsdk_cmake_app)" "cmake" parse_example "../examples/gpu/inference/cpp/example-usm/example-usm.cpp" ${MDEXAMPLE} "(marker_cppsdk_sample_usm)" "cpp" From 57321eb54aa86ba449c6798927138c668117e17e Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Tue, 4 Jun 2024 12:11:48 +0900 Subject: [PATCH 103/199] Revert "Move oneAPI IPEX samples back to IPEX repo (#2943)" (#2957) This reverts commit 9469f8ccc6959f1ab5c955aa07d0c2bfc57a315c. 
--- docs/tutorials/examples.md | 45 + .../python/{python-scripts => }/README.md | 2 +- .../bert_eager_mode_inference_bf16.py | 0 .../bert_eager_mode_inference_fp32.py | 0 .../bert_general_inference_script.py | 0 .../bert_torchdynamo_mode_inference_bf16.py | 0 .../bert_torchdynamo_mode_inference_fp32.py | 0 .../bert_torchscript_mode_inference_bf16.py | 0 .../bert_torchscript_mode_inference_fp32.py | 0 .../{python-scripts => }/int8_deployment.py | 0 .../int8_quantization_dynamic.py | 0 .../int8_quantization_static.py | 0 .../python/jupyter-notebooks/.gitkeep | 0 .../IPEX_Getting_Started.ipynb | 367 -------- ...InferenceOptimizations_AMX_BF16_INT8.ipynb | 589 ------------ ...ytorch_Interactive_Chat_Quantization.ipynb | 521 ---------- .../IntelPytorch_Quantization.ipynb | 347 ------- .../python/jupyter-notebooks/README.md | 61 -- .../optimize_pytorch_models_with_ipex.ipynb | 888 ------------------ .../python/pytorch_inference_vnni.py | 156 --- .../jupyter-notebooks/python/resnet50.py | 60 -- .../resnet50_eager_mode_inference_bf16.py | 0 .../resnet50_eager_mode_inference_fp32.py | 0 .../resnet50_general_inference_script.py | 0 ...esnet50_torchdynamo_mode_inference_bf16.py | 0 ...esnet50_torchdynamo_mode_inference_fp32.py | 0 ...esnet50_torchscript_mode_inference_bf16.py | 0 ...esnet50_torchscript_mode_inference_fp32.py | 0 .../training/{python-scripts => }/README.md | 18 +- .../distributed_data_parallel_training.py | 0 .../training/single_instance_training_bf16.py | 51 + .../training/single_instance_training_fp32.py | 49 + scripts/build_doc.sh | 70 +- 33 files changed, 199 insertions(+), 3025 deletions(-) rename examples/cpu/inference/python/{python-scripts => }/README.md (97%) rename examples/cpu/inference/python/{python-scripts => }/bert_eager_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{python-scripts => }/bert_eager_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{python-scripts => }/bert_general_inference_script.py (100%) rename examples/cpu/inference/python/{python-scripts => }/bert_torchdynamo_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{python-scripts => }/bert_torchdynamo_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{python-scripts => }/bert_torchscript_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{python-scripts => }/bert_torchscript_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{python-scripts => }/int8_deployment.py (100%) rename examples/cpu/inference/python/{python-scripts => }/int8_quantization_dynamic.py (100%) rename examples/cpu/inference/python/{python-scripts => }/int8_quantization_static.py (100%) delete mode 100644 examples/cpu/inference/python/jupyter-notebooks/.gitkeep delete mode 100644 examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb delete mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb delete mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb delete mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb delete mode 100644 examples/cpu/inference/python/jupyter-notebooks/README.md delete mode 100644 examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb delete mode 100644 examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py delete mode 100644 
examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py rename examples/cpu/inference/python/{python-scripts => }/resnet50_eager_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{python-scripts => }/resnet50_eager_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{python-scripts => }/resnet50_general_inference_script.py (100%) rename examples/cpu/inference/python/{python-scripts => }/resnet50_torchdynamo_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{python-scripts => }/resnet50_torchdynamo_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{python-scripts => }/resnet50_torchscript_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{python-scripts => }/resnet50_torchscript_mode_inference_fp32.py (100%) rename examples/cpu/training/{python-scripts => }/README.md (74%) rename examples/cpu/training/{python-scripts => }/distributed_data_parallel_training.py (100%) create mode 100644 examples/cpu/training/single_instance_training_bf16.py create mode 100644 examples/cpu/training/single_instance_training_fp32.py diff --git a/docs/tutorials/examples.md b/docs/tutorials/examples.md index 3c9eacaa6..f90505a2d 100644 --- a/docs/tutorials/examples.md +++ b/docs/tutorials/examples.md @@ -25,6 +25,51 @@ Before running these examples, please note the following: ### Training +#### Single-instance Training + +To use Intelยฎ Extension for PyTorch\* on training, you need to make the following changes in your code: + +1. Import `intel_extension_for_pytorch` as `ipex`. +2. Invoke the `ipex.optimize` function to apply optimizations against the model and optimizer objects, as shown below: + + +```python +... +import torch +import intel_extension_for_pytorch as ipex +... +model = Model() +criterion = ... +optimizer = ... +model.train() +# For Float32 +model, optimizer = ipex.optimize(model, optimizer=optimizer) +# For BFloat16 +model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16) +# Invoke the code below to enable beta feature torch.compile +model = torch.compile(model, backend="ipex") +... +optimizer.zero_grad() +output = model(data) +... +``` + +Below you can find complete code examples demonstrating how to use the extension on training for different data types: + +##### Float32 + +**Note:** You need to install `torchvision` Python package to run the following example. + +[//]: # (marker_train_single_fp32_complete) +[//]: # (marker_train_single_fp32_complete) + +##### BFloat16 + +**Note:** You need to install `torchvision` Python package to run the following example. + +[//]: # (marker_train_single_bf16_complete) +[//]: # (marker_train_single_bf16_complete) + #### Distributed Training Distributed training with PyTorch DDP is accelerated by oneAPI Collective Communications Library Bindings for Pytorch\* (oneCCL Bindings for Pytorch\*). The extension supports FP32 and BF16 data types. More detailed information and examples are available at the [Github repo](https://github.com/intel/torch-ccl). 
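For orientation, the DDP flow described above reduces to initializing the `ccl` process group through oneCCL Bindings for PyTorch\* and calling `ipex.optimize` before wrapping the model in `DistributedDataParallel`. The following is only a minimal sketch, not part of the patch: the import name `oneccl_bindings_for_pytorch`, the environment-variable defaults, and the synthetic batch are assumptions made for illustration.

```python
# Minimal single-node DDP sketch with oneCCL Bindings for PyTorch* (assumed
# to be installed as `oneccl_bindings_for_pytorch`). A real launcher such as
# ipexrun/mpirun would normally set the rendezvous variables; defaults below
# only make a 1-process dry run possible.
import os
import torch
import torch.distributed as dist
import torchvision
import oneccl_bindings_for_pytorch  # noqa: F401 - registers the "ccl" backend
import intel_extension_for_pytorch as ipex

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")
dist.init_process_group(backend="ccl")

model = torchvision.models.resnet50()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
model.train()

# Apply IPEX optimizations first, then wrap the optimized model with DDP.
model, optimizer = ipex.optimize(model, optimizer=optimizer)
model = torch.nn.parallel.DistributedDataParallel(model)

data = torch.rand(16, 3, 224, 224)        # synthetic batch for illustration
target = torch.randint(0, 1000, (16,))
optimizer.zero_grad()
loss = criterion(model(data), target)
loss.backward()
optimizer.step()
dist.destroy_process_group()
```

When launched with `ipexrun` as in the training README, the launcher is expected to populate the rank/world-size variables for each process, so the `setdefault` lines become no-ops.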
diff --git a/examples/cpu/inference/python/python-scripts/README.md b/examples/cpu/inference/python/README.md similarity index 97% rename from examples/cpu/inference/python/python-scripts/README.md rename to examples/cpu/inference/python/README.md index b20cf38c4..044dc73e2 100644 --- a/examples/cpu/inference/python/python-scripts/README.md +++ b/examples/cpu/inference/python/README.md @@ -1,7 +1,7 @@ ๏ปฟ# Model Inference with Intelยฎ Extension for PyTorch\* Optimizations We provided examples about how to use Intelยฎ Extension for PyTorch\* to accelerate model inference. -The `ipex.optimize` function of Intelยฎ Extension for PyTorch\* applies optimizations to the model, bringing additional performance boosts. +The `ipex.optimize` function of Intelยฎ Extension for PyTorch* applies optimizations to the model, bringing additional performance boosts. For both computer vision workloads and NLP workloads, we recommend applying the `ipex.optimize` function against the model object. ## Environment Setup diff --git a/examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py b/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py rename to examples/cpu/inference/python/bert_eager_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py b/examples/cpu/inference/python/bert_eager_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py rename to examples/cpu/inference/python/bert_eager_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/python-scripts/bert_general_inference_script.py b/examples/cpu/inference/python/bert_general_inference_script.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/bert_general_inference_script.py rename to examples/cpu/inference/python/bert_general_inference_script.py diff --git a/examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py rename to examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py b/examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py rename to examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py rename to examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py b/examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py rename to examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py diff --git 
a/examples/cpu/inference/python/python-scripts/int8_deployment.py b/examples/cpu/inference/python/int8_deployment.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/int8_deployment.py rename to examples/cpu/inference/python/int8_deployment.py diff --git a/examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py b/examples/cpu/inference/python/int8_quantization_dynamic.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py rename to examples/cpu/inference/python/int8_quantization_dynamic.py diff --git a/examples/cpu/inference/python/python-scripts/int8_quantization_static.py b/examples/cpu/inference/python/int8_quantization_static.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/int8_quantization_static.py rename to examples/cpu/inference/python/int8_quantization_static.py diff --git a/examples/cpu/inference/python/jupyter-notebooks/.gitkeep b/examples/cpu/inference/python/jupyter-notebooks/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb deleted file mode 100644 index 10c934360..000000000 --- a/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb +++ /dev/null @@ -1,367 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Getting Started with Intelยฎ Extension for PyTorch (IPEX)\n", - "This code sample will guide users how to run a PyTorch inference workload on CPU by using oneAPI AI Analytics Toolkit and also analyze the CPU usage via oneDNN verbose logs." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Resnet50 Inference on CPU\n", - "***\n", - "This section shows users how to run resnet50 inference on CPU." 
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### prerequisites" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ignore all warning messages\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "import os" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set the installation path of your oneAPI AI Analytics toolkit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%env ONEAPI_INSTALL=/opt/intel/oneapi" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Download the resnet50 inference sample from Intelยฎ Extension for PyTorch (IPEX) github repository" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget https://raw.githubusercontent.com/intel/intel-extension-for-pytorch/master/examples/cpu/inference/python/resnet50_general_inference_script.py" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check PyTorch and Intelยฎ Extension for PyTorch (IPEX) verson in current ipython kernel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run ../../version_check.py" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run resnet50 on CPU" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Run on CPU via Intelยฎ Extension for PyTorch (IPEX)\n", - "There is a PyTorch conda environment with Intelยฎ Extension for PyTorch (IPEX) installation in current AI Kit installation.\n", - "Users could run resnet50_general_inference_script.py on Intel CPU on this PyTorch conda environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile run.sh\n", - "#!/bin/bash\n", - "source $ONEAPI_INSTALL/setvars.sh --force > /dev/null 2>&1\n", - "source activate pytorch\n", - "echo \"########## Executing the run\"\n", - "DNNL_VERBOSE=1 python resnet50_general_inference_script.py > infer_rn50_cpu.csv\n", - "echo \"########## Done with the run\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Submitting build.sh and run.sh to the job queue\n", - "\n", - "Now we can submit build.sh and run.sh to the job queue.\n", - "\n", - "NOTE - it is possible to execute any of the build and run commands in local environments.\n", - "To enable users to run their scripts either on the Intel DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command qsub. If the check fails, it is assumed that build/run will be local." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! chmod 755 ../../q; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ../../q run.sh; else ./run.sh; fi" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Analyze Verbose Logs\n", - "***\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Download profile_utils.py to parse oneDNN verbose logs from previous section." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!wget https://raw.githubusercontent.com/oneapi-src/oneAPI-samples/master/Libraries/oneDNN/tutorials/profiling/profile_utils.py" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 1: List out all oneDNN verbose logs\n", - "users should see the verbose log listed in the table below.\n", - "\n", - "|Log File Name | Description |\n", - "|:-----|:----|\n", - "|infer_rn50_cpu.csv| log for cpu run |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "filenames= os.listdir (\".\") \n", - "result = []\n", - "keyword = \".csv\"\n", - "for filename in filenames: \n", - " #if os.path.isdir(os.path.join(os.path.abspath(\".\"), filename)): \n", - " if filename.find(keyword) != -1:\n", - " result.append(filename)\n", - "result.sort()\n", - "\n", - "index =0 \n", - "for folder in result:\n", - " print(\" %d : %s \" %(index, folder))\n", - " index+=1" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 2: Pick a verbose log by putting its index value below\n", - "Users can pick cpu log for analysis. \n", - "Once users finish Step 2 to Step 7 for one log file, they can go back to step 2 and select another log file for analysis." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "FdIndex=0" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 3: Parse verbose log and get the data back\n", - "> Users will also get a oneDNN.json file with timeline information for oneDNN primitives. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "logfile = result[FdIndex]\n", - "print(logfile)\n", - "from profile_utils import oneDNNUtils, oneDNNLog\n", - "onednn = oneDNNUtils()\n", - "log1 = oneDNNLog()\n", - "log1.load_log(logfile)\n", - "data = log1.data\n", - "exec_data = log1.exec_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 4: Time breakdown for exec type\n", - "The exec type includes exec and create. \n", - "\n", - "|exec type | Description | \n", - "|:-----|:----| \n", - "|exec | Time for primitives exection. Better to spend most of time on primitives execution. | \n", - "|create| Time for primitives creation. Primitives creation happens once. Better to spend less time on primitive creation. | " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 5: Time breakdown for architecture type\n", - "The supported architecture only includes CPU. \n", - "so users should see 100% CPU time. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "onednn.breakdown(exec_data,\"arch\",\"time\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 6: Time breakdown for primitives type\n", - "The primitives type includes convolution, reorder, sum, etc. \n", - "For this simple convolution net example, convolution and inner product primitives are expected to spend most of time. \n", - "However, the exact time percentage of different primitivies may vary among different architectures. 
\n", - "Users can easily identify top hotpots of primitives executions with this time breakdown. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "onednn.breakdown(exec_data,\"type\",\"time\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 7: Time breakdown for JIT kernel type\n", - "oneDNN uses just-in-time compilation (JIT) to generate optimal code for some functions based on input parameters and instruction set supported by the system. \n", - "Therefore, users can see different JIT kernel type among different CPU architectures. \n", - "For example, users can see avx_core_vnni JIT kernel if the workload uses VNNI instruction on Cascake Lake platform. \n", - "Moreover, users can identify the top hotspots of JIT kernel executions with this time breakdown. \n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "onednn.breakdown(exec_data,\"jit\",\"time\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The output(both stdout and stderr) is displayed on the command line console" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py39", - "language": "python", - "name": "py39" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb deleted file mode 100644 index c4bca3199..000000000 --- a/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb +++ /dev/null @@ -1,589 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "75f9200e-7830-4ee5-8637-e67b5df57eac", - "metadata": {}, - "source": [ - "# PyTorch Inference Optimizations with Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) Bfloat16 Integer8" - ] - }, - { - "cell_type": "markdown", - "id": "48eb565f-ef03-40cb-9182-5b2b752331e8", - "metadata": {}, - "source": [ - "The `PyTorch* Inference Optimizations with Advanced Matrix Extensions Bfloat16 Integer8` sample demonstrates how to perform inference using the ResNet50 and BERT models using the Intelยฎ Extension for PyTorch (IPEX).\n", - "\n", - "The Intelยฎ Extension for PyTorch (IPEX) extends PyTorch* with optimizations for extra performance boost on Intelยฎ hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intelยฎ hardware. 
For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX).\n", - "\n", - "| Area | Description\n", - "|:--- |:---\n", - "| What you will learn | Inference performance improvements using Intelยฎ Extension for PyTorch (IPEX) with Intelยฎ AMX BF16/INT8\n", - "| Time to complete | 5 minutes\n", - "| Category | Code Optimization\n", - "\n", - "## Purpose\n", - "\n", - "The Intelยฎ Extension for PyTorch (IPEX) allows you to speed up inference on Intelยฎ Xeon Scalable processors with lower precision data formats and specialized computer instructions. The bfloat16 (BF16) data format uses half the bit width of floating-point-32 (FP32), which lessens the amount of memory needed and execution time to process. Likewise, the integer8 (INT8) data format uses half the bit width of BF16. You should notice performance optimization with the Intelยฎ AMX instruction set when compared to Intelยฎ Vector Neural Network Instructions (Intelยฎ VNNI).\n", - "\n", - "## Prerequisites\n", - "\n", - "| Optimized for | Description\n", - "|:--- |:---\n", - "| OS | Ubuntu* 18.04 or newer\n", - "| Hardware | 4th Gen Intelยฎ Xeonยฎ Scalable Processors or newer\n", - "| Software | Intelยฎ Extension for PyTorch (IPEX)\n", - "\n", - "## Key Implementation Details\n", - "\n", - "This code sample will perform inference on the ResNet50 and BERT models while using Intelยฎ Extension for PyTorch (IPEX). For each pretrained model, there will be a warm up of 20 samples before running inference on the specified number of samples (i.e. 1000) to record the time. Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) is supported on BF16 and INT8 data types starting with the 4th Generation of Xeon Scalable Processors. The inference time will be compared, showcasing the speedup over FP32 when using AVX-512, Intelยฎ AMX, BF16, and INT8. The following run cases are executed: \n", - "\n", - "1. FP32 (baseline)\n", - "2. BF16 using AVX512_CORE_AMX\n", - "3. INT8 using AVX512_CORE_VNNI\n", - "4. INT8 using AVX512_CORE_AMX\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "4c254afc", - "metadata": {}, - "source": [ - "## Installation of required packages\n", - "\n", - "Ensure the kernel is set to Pytorch-CPU before running the follwing code." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa457cee-5b1e-4ec9-b03a-2a7b2a8b464e", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install matplotlib transformers py-cpuinfo sentencepiece sacremoses " - ] - }, - { - "cell_type": "markdown", - "id": "4e41ce52-c94c-4bdf-a528-0e0200fd5501", - "metadata": {}, - "source": [ - "## Imports, Constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e4eedf0-5c7c-49d3-be15-f46b4988d9ff", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "from time import time\n", - "import matplotlib.pyplot as plt\n", - "import torch\n", - "import intel_extension_for_pytorch as ipex\n", - "from intel_extension_for_pytorch.quantization import prepare, convert\n", - "import torchvision\n", - "from torchvision import models\n", - "from transformers import BertModel" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17246f67-0059-4b5f-afe8-a105d767b139", - "metadata": {}, - "outputs": [], - "source": [ - "SUPPORTED_MODELS = [\"resnet50\", \"bert\"] # models supported by this code sample\n", - "\n", - "# ResNet sample data parameters\n", - "RESNET_BATCH_SIZE = 64\n", - "\n", - "# BERT sample data parameters\n", - "BERT_BATCH_SIZE = 64\n", - "BERT_SEQ_LENGTH = 512" - ] - }, - { - "cell_type": "markdown", - "id": "9771f165", - "metadata": {}, - "source": [ - "## Identify Supported ISA \n", - "We identify the underlying supported ISA to determine whether Intelยฎ AMX is supported. The 4th Gen Intelยฎ Xeonยฎ Scalable Processor (codenamed Sapphire Rapids) or newer must be used to run this sample. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25c339a4", - "metadata": {}, - "outputs": [], - "source": [ - "# Check if hardware supports Intelยฎ AMX\n", - "import sys\n", - "sys.path.append('../../')\n", - "from cpuinfo import get_cpu_info\n", - "info = get_cpu_info()\n", - "flags = info['flags']\n", - "amx_supported = False\n", - "for flag in flags:\n", - " if \"amx\" in flag:\n", - " amx_supported = True\n", - " break\n", - "if not amx_supported:\n", - " print(\"Intelยฎ AMX is not supported on current hardware. Code sample cannot be run.\\n\")" - ] - }, - { - "cell_type": "markdown", - "id": "7b3f461d", - "metadata": {}, - "source": [ - "If the message \"Intelยฎ AMX is not supported on current hardware. Code sample cannot be run.\" is printed above, the hardware being used does not support Intelยฎ AMX. Therefore, this code sample cannot proceed." - ] - }, - { - "cell_type": "markdown", - "id": "6ccd66ee-aac5-4a60-8f66-417612d4d3af", - "metadata": {}, - "source": [ - "## Running Inference\n", - "The function runInference() will perform inference on the selected model, precision, and whether Intelยฎ AMX is to be enabled. The environment variable `ONEDNN_MAX_CPU_ISA` is used to enable or disable Intelยฎ AMX. **Note that this environment variable is only initialized once.** This means to run with Intelยฎ AMX and Intelยฎ VNNI, there will need to be separate processes. The best practice is to set this environment variable before running your script. For more information, refer to the [oneDNN documentation on CPU Dispatcher Control](https://www.intel.com/content/www/us/en/develop/documentation/onednn-developer-guide-and-reference/top/performance-profiling-and-inspection/cpu-dispatcher-control.html). \n", - "\n", - "To use BF16 in operations, use the `torch.cpu.amp.autocast()` function to perform forward pass. 
For INT8, the quantization feature from Intelยฎ Extension for PyTorch (IPEX) is used to quantize the FP32 model to INT8 before running inference.\n", - "\n", - "Torchscript is also utilized to deploy the model in graph mode instead of imperative mode for faster runtime." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f08d718", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ[\"ONEDNN_MAX_CPU_ISA\"] = \"AVX512_CORE_AMX\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b8e21c9-aaa5-4f75-b00a-0d875cc0bfba", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Function to perform inference on Resnet50 and BERT\n", - "\"\"\"\n", - "def runInference(model, data, modelName=\"resnet50\", dataType=\"FP32\", amx=True):\n", - " \"\"\"\n", - " Input parameters\n", - " model: the PyTorch model object used for inference\n", - " data: a sample input into the model\n", - " modelName: str representing the name of the model, supported values - resnet50, bert\n", - " dataType: str representing the data type for model parameters, supported values - FP32, BF16, INT8\n", - " amx: set to False to disable Intelยฎ AMX on BF16, Default: True\n", - " Return value\n", - " inference_time: the time in seconds it takes to perform inference with the model\n", - " \"\"\"\n", - " \n", - " # Display run case\n", - " if amx:\n", - " isa_text = \"AVX512_CORE_AMX\"\n", - " else:\n", - " isa_text = \"AVX512_CORE_VNNI\"\n", - " print(\"%s %s inference with %s\" %(modelName, dataType, isa_text))\n", - "\n", - " # Special variables for specific models\n", - " batch_size = None\n", - " if \"resnet50\" == modelName:\n", - " batch_size = RESNET_BATCH_SIZE\n", - " elif \"bert\" == modelName:\n", - " d = torch.randint(model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH]) # sample data input for torchscript and inference\n", - " batch_size = BERT_BATCH_SIZE\n", - " else:\n", - " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", - "\n", - " # Prepare model for inference based on precision (FP32, BF16, INT8)\n", - " if \"INT8\" == dataType:\n", - " # Quantize model to INT8 if needed (one time)\n", - " model_filename = \"quantized_model_%s.pt\" %modelName\n", - " if not os.path.exists(model_filename):\n", - " qconfig = ipex.quantization.default_static_qconfig\n", - " prepared_model = prepare(model, qconfig, example_inputs=data, inplace=False)\n", - " converted_model = convert(prepared_model)\n", - " with torch.no_grad():\n", - " if \"resnet50\" == modelName:\n", - " traced_model = torch.jit.trace(converted_model, data)\n", - " elif \"bert\" == modelName:\n", - " traced_model = torch.jit.trace(converted_model, (d,), check_trace=False, strict=False)\n", - " else:\n", - " raise Exception(\"ERROR: modelName %s is not supported. 
Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", - " traced_model = torch.jit.freeze(traced_model)\n", - " traced_model.save(model_filename)\n", - "\n", - " # Load INT8 model for inference\n", - " model = torch.jit.load(model_filename)\n", - " model.eval()\n", - " model = torch.jit.freeze(model)\n", - " elif \"BF16\" == dataType:\n", - " model = ipex.optimize(model, dtype=torch.bfloat16)\n", - " with torch.no_grad():\n", - " with torch.cpu.amp.autocast():\n", - " if \"resnet50\" == modelName:\n", - " model = torch.jit.trace(model, data)\n", - " elif \"bert\" == modelName:\n", - " model = torch.jit.trace(model, (d,), check_trace=False, strict=False)\n", - " else:\n", - " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", - " model = torch.jit.freeze(model)\n", - " else: # FP32\n", - " with torch.no_grad():\n", - " if \"resnet50\" == modelName:\n", - " model = torch.jit.trace(model, data)\n", - " elif \"bert\" == modelName:\n", - " model = torch.jit.trace(model, (d,), check_trace=False, strict=False)\n", - " else:\n", - " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", - " model = torch.jit.freeze(model)\n", - "\n", - " # Run inference\n", - " with torch.no_grad():\n", - " if \"BF16\" == dataType:\n", - " with torch.cpu.amp.autocast():\n", - " # Warm up\n", - " for i in range(5):\n", - " model(data)\n", - " \n", - " # Measure latency\n", - " start_time = time()\n", - " model(data)\n", - " end_time = time()\n", - " else:\n", - " # Warm up\n", - " for i in range(5):\n", - " model(data)\n", - " \n", - " # Measure latency\n", - " start_time = time()\n", - " model(data)\n", - " end_time = time()\n", - " inference_time = end_time - start_time\n", - " print(\"Inference on batch size %d took %.3f seconds\" %(batch_size, inference_time))\n", - "\n", - " return inference_time" - ] - }, - { - "cell_type": "markdown", - "id": "1dad2dae", - "metadata": {}, - "source": [ - "The function summarizeResults() displays the inference times and generates one graph for comparing the inference times and another graph for comparing the speedup using FP32 as the baseline." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0cf736a2", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "Prints out results and displays figures summarizing output.\n", - "\"\"\"\n", - "def summarizeResults(modelName=\"\", results=None, batch_size=1):\n", - " \"\"\"\n", - " Input parameters\n", - " modelName: a str representing the name of the model\n", - " results: a dict with the run case and its corresponding time in seconds\n", - " batch_size: an integer for the batch size\n", - " Return value\n", - " None\n", - " \"\"\"\n", - "\n", - " # Inference time results\n", - " print(\"\\nSummary for %s (Batch Size = %d)\" %(modelName, batch_size))\n", - " for key in results.keys():\n", - " print(\"%s inference time: %.3f seconds\" %(key, results[key]))\n", - "\n", - " # Create bar chart with inference time results\n", - " plt.figure()\n", - " plt.title(\"%s Inference Time (Batch Size = %d)\" %(modelName, batch_size))\n", - " plt.xlabel(\"Run Case\")\n", - " plt.ylabel(\"Inference Time (seconds)\")\n", - " plt.bar(results.keys(), results.values())\n", - "\n", - " # Calculate speedup when using Intelยฎ AMX\n", - " print(\"\\n\")\n", - " bf16_with_amx_speedup = results[\"FP32\"] / results[\"BF16_with_AMX\"]\n", - " print(\"BF16 with Intelยฎ AMX is %.2fX faster than FP32\" %bf16_with_amx_speedup)\n", - " int8_with_vnni_speedup = results[\"FP32\"] / results[\"INT8_with_VNNI\"]\n", - " print(\"INT8 without Intelยฎ AMX is %.2fX faster than FP32\" %int8_with_vnni_speedup)\n", - " int8_with_amx_speedup = results[\"FP32\"] / results[\"INT8_with_AMX\"]\n", - " print(\"INT8 with Intelยฎ AMX is %.2fX faster than FP32\" %int8_with_amx_speedup)\n", - " print(\"\\n\\n\")\n", - "\n", - " # Create bar chart with speedup results\n", - " plt.figure()\n", - " plt.title(\"%s Intelยฎ AMX BF16/INT8 Speedup over FP32\" %modelName)\n", - " plt.xlabel(\"Run Case\")\n", - " plt.ylabel(\"Speedup\")\n", - " plt.bar(results.keys(), \n", - " [1, bf16_with_amx_speedup, int8_with_vnni_speedup, int8_with_amx_speedup]\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "9e42672a", - "metadata": {}, - "source": [ - "### VNNI: ResNet50 and BERT\n", - "Since ONEDNN_MAX_CPU_ISA is initialized ONCE when a workload is being run, another process must be used to run with a different setting. \n", - "In other words, changing ONEDNN_MAX_CPU_ISA during runtime in the same process will not have any effect.\n", - "Thus, to run with VNNI, a separate script is run." 
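Because `ONEDNN_MAX_CPU_ISA` is read only once per process, the separate VNNI run mentioned above has to start in a fresh child process with the variable already set in its environment. A minimal sketch of that launch (illustrative only; it reuses the notebook's own `python/pytorch_inference_vnni.py` helper) is:

```python
# Sketch: run the VNNI comparison in a separate process so that its
# ONEDNN_MAX_CPU_ISA setting does not conflict with the AMX run in this one.
import os
import subprocess
import sys

child_env = os.environ.copy()
child_env["ONEDNN_MAX_CPU_ISA"] = "AVX512_CORE_VNNI"  # cap the ISA for the child only
subprocess.run(
    [sys.executable, "python/pytorch_inference_vnni.py"],
    env=child_env,
    check=True,
)
```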
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "555ec5a9", - "metadata": {}, - "outputs": [], - "source": [ - "!python python/pytorch_inference_vnni.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d194fa7d", - "metadata": {}, - "outputs": [], - "source": [ - "# Record the inference times for INT8 using AVX-512\n", - "int8_with_vnni_resnet_inference_time = 0.033 #TODO: enter in inference time\n", - "int8_with_vnni_bert_inference_time = 0.691 #TODO: enter in inference time" - ] - }, - { - "cell_type": "markdown", - "id": "c61288e7", - "metadata": {}, - "source": [ - "### : ResNet50" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4a6a84c", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up ResNet50 model and sample data\n", - "resnet_model = models.resnet50(pretrained=True)\n", - "resnet_data = torch.rand(RESNET_BATCH_SIZE, 3, 224, 224)\n", - "resnet_model.eval()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b26789b9", - "metadata": {}, - "outputs": [], - "source": [ - "# FP32 (baseline)\n", - "fp32_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"FP32\", amx=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ad0c512", - "metadata": {}, - "outputs": [], - "source": [ - "# BF16 using Intelยฎ \n", - "bf16_amx_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"BF16\", amx=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cd9f1bd", - "metadata": {}, - "outputs": [], - "source": [ - "# INT8 using Intelยฎ \n", - "int8_amx_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"INT8\", amx=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "59fcbbe2", - "metadata": {}, - "outputs": [], - "source": [ - "# Summarize and display results\n", - "results_resnet = {\n", - " \"FP32\": fp32_resnet_inference_time,\n", - " \"BF16_with_AMX\": bf16_amx_resnet_inference_time,\n", - " \"INT8_with_VNNI\": int8_with_vnni_resnet_inference_time,\n", - " \"INT8_with_AMX\": int8_amx_resnet_inference_time\n", - " }\n", - "summarizeResults(\"ResNet50\", results_resnet, RESNET_BATCH_SIZE)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "75a62b72", - "metadata": {}, - "source": [ - "The first graph displays the inference times on the specified number of samples. In general, the times should be decreasing from left to right because using lower precision and with accelerates the computations. The second graph displays the relative speedup of each run case compared to that of FP32. In general the speedup should be increasing from left to right." 
- ] - }, - { - "cell_type": "markdown", - "id": "b36fa4b3", - "metadata": {}, - "source": [ - "### BERT" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "27f173e7", - "metadata": {}, - "outputs": [], - "source": [ - "# Set up BERT model and sample data\n", - "bert_model = BertModel.from_pretrained(\"bert-base-uncased\")\n", - "bert_data = torch.randint(bert_model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH])\n", - "bert_model.eval()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a5847c1", - "metadata": {}, - "outputs": [], - "source": [ - "# FP32 (baseline)\n", - "fp32_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"FP32\", amx=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d35fc58e", - "metadata": {}, - "outputs": [], - "source": [ - "# BF16 using Intelยฎ \n", - "bf16_amx_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"BF16\", amx=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3b3d2ccd", - "metadata": {}, - "outputs": [], - "source": [ - "# INT8 using Intelยฎ \n", - "int8_amx_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"INT8\", amx=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3721e698", - "metadata": {}, - "outputs": [], - "source": [ - "# Summarize and display results\n", - "results_bert = {\n", - " \"FP32\": fp32_bert_inference_time,\n", - " \"BF16_with_AMX\": bf16_amx_bert_inference_time,\n", - " \"INT8_with_VNNI\": int8_with_vnni_bert_inference_time,\n", - " \"INT8_with_AMX\": int8_amx_bert_inference_time\n", - " }\n", - "summarizeResults(\"BERT\", results_bert, BERT_BATCH_SIZE)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "03e63f93", - "metadata": {}, - "source": [ - "The first graph displays the inference times on the specified number of samples. In general, the times should be decreasing from left to right because using lower precision and with accelerates the computations. The second graph displays the relative speedup of each run case compared to that of FP32. In general the speedup should be increasing from left to right." - ] - }, - { - "cell_type": "markdown", - "id": "b559aeb8", - "metadata": {}, - "source": [ - "## Conclusion" - ] - }, - { - "cell_type": "markdown", - "id": "0da073a6", - "metadata": {}, - "source": [ - "This code sample shows how to enable and disable during runtime, as well as the performance improvements using BF16 and INT8 for inference on the ResNet50 and BERT models. Performance will vary based on your hardware and software versions. To see a larger performance gap between VNNI and , increase the batch size. For even more speedup, consider using the Intelยฎ Extension for PyTorch (IPEX) [Launch Script](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/launch_script.html). 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa0877d6-e045-4091-b5e4-4dfcb6d04f7d", - "metadata": {}, - "outputs": [], - "source": [ - "print('[CODE_SAMPLE_COMPLETED_SUCCESSFULLY]')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - }, - "vscode": { - "interpreter": { - "hash": "ed6ae0d06e7bec0fef5f1fb38f177ceea45508ce95c68ed2f49461dd6a888a39" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb deleted file mode 100644 index 7e3f0a889..000000000 --- a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb +++ /dev/null @@ -1,521 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "5b88f590-e457-4052-9dbd-74d7be597dc1", - "metadata": {}, - "outputs": [], - "source": [ - "# =============================================================\n", - "# Copyright ยฉ 2023 Intel Corporation\n", - "# \n", - "# SPDX-License-Identifier: MIT\n", - "# =============================================================" - ] - }, - { - "cell_type": "markdown", - "id": "6f25b97a-56f7-4309-87fa-d9626baecf5e", - "metadata": {}, - "source": [ - "# Interactive Chat Based on DialoGPT Model Using Intelยฎ Extension for PyTorch* Quantization\n", - "\n", - "This code sample shows usage of DiloGPT model as interactive chat with Intelยฎ Extension for PyTorch* INT8 quantization.\n", - "\n", - "## DialoGPT\n", - "\n", - "DialoGPT is a model based on GPT-2 architecture proposed by Microsoft in 2019. It's goal was to create open-domain chatbots capable of producing natural responses to a variety of conversational topics." - ] - }, - { - "cell_type": "markdown", - "id": "f7c87090-2f40-4c29-b70b-c9d413bd3bff", - "metadata": {}, - "source": [ - "The `Interactive chat based on DialoGPT model using Intelยฎ Extension for PyTorch* Quantization` sample demonstrates how to create interactive chat based on pre-trained DialoGPT model and add the Intelยฎ Extension for PyTorch* quantization to it.\n", - "\n", - "| Area | Description|\n", - "|-----------------------|------------|\n", - "| What you will learn | How to create interactive chat and add INT8 dynamic quantization form Intelยฎ Extension for PyTorch*|\n", - "| Time to complete | 10 minutes|\n", - "| Category | Concepts and Functionality|\n", - "\n", - "The Intelยฎ Extension for PyTorch* extends PyTorch* with optimizations for extra performance boost on Intelยฎ hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intelยฎ hardware. For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX).\n", - "\n", - "## Purpose\n", - "\n", - "This sample shows how to create interactive chat based on the pre-trained DialoGPT model from HuggingFace and how to add INT8 dynamic quantization to it. 
The Intelยฎ Extension for PyTorch* gives users the ability to speed up operations on processors with INT8 data format and specialized computer instructions. The INT8 data format uses quarter the bit width of floating-point-32 (FP32), lowering the amount of memory needed and execution time to process with minimum to zero accuracy loss.\n", - "\n", - "## Prerequisites\n", - "\n", - "| Optimized for | Description|\n", - "|-------------------------|------------|\n", - "| OS | Ubuntu* 20.04 or newer|\n", - "| Hardware | Intelยฎ Xeonยฎ Scalable Processor family|\n", - "| Software | Intelยฎ Extension for PyTorch*|" - ] - }, - { - "cell_type": "markdown", - "id": "0174e7dd-58ae-47ea-8f11-fa3d1ee8c317", - "metadata": {}, - "source": [ - "## Environment Setup" - ] - }, - { - "cell_type": "markdown", - "id": "cc24bdae-fcb7-40a5-8bb8-76472598730b", - "metadata": {}, - "source": [ - "### Install Jupyter notebook by Conda\n", - "\n", - "Please refer to the guide in README.md to setup running environment:\n", - "\n", - "1. Create Conda running environment.\n", - "2. Install Jupyter notebook.\n", - "3. Install Intelยฎ Extension for PyTorch* for CPU packages.\n", - "4. Startup Jupyter notebook service and open by web browser.\n", - "\n", - "\n", - "#### Set Kernel to PyTorch-CPU\n", - "\n", - "In Jupyter notebook menu, change kernel \"PyTorch-CPU\" by Kernel->Change Kernel." - ] - }, - { - "cell_type": "markdown", - "id": "d2d9847c-2d72-4dfc-a4a9-b87c987ff363", - "metadata": {}, - "source": [ - "### Install other python packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f9e7f549-65dd-4286-9e04-8d13a766c0e3", - "metadata": {}, - "outputs": [], - "source": [ - "!python -m pip install transformers matplotlib" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8c1ae1f1-4878-4dc6-bbb3-5a0f17fbbd00", - "metadata": {}, - "source": [ - "Let's start with importing all necessary packages." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45146b66-e41e-400e-8a1b-5e680bbb7575", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from transformers import AutoModelForCausalLM, AutoTokenizer\n", - "import torch\n", - "\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "markdown", - "id": "2e158dd4-e2a7-44ca-af2d-052f88247e97", - "metadata": {}, - "source": [ - "## Model and tokenizer loading\n", - "\n", - "The first implemented function is loading tokenizer and model. \n", - "\n", - "Function input is link to the pre-trained model. In this sample we are using `microsoft/DialoGPT-large` from HuggingFace. This is also default parameter for this function. Of course, you can use also `microsoft/DialoGPT-medium` or `microsoft/DialoGPT-samll` models. Especially if you have limited resources. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6142753-eab1-4167-9818-4b40c900473c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def load_tokenizer_and_model(model=\"microsoft/DialoGPT-large\"):\n", - " \"\"\"\n", - " Load tokenizer and model instance for some specific DialoGPT model.\n", - " \"\"\"\n", - " # Initialize tokenizer and model\n", - " print(\"Loading model...\")\n", - " tokenizer = AutoTokenizer.from_pretrained(model, padding_side='left')\n", - " model = AutoModelForCausalLM.from_pretrained(model)\n", - " \n", - " # Return tokenizer and model\n", - " return tokenizer, model" - ] - }, - { - "cell_type": "markdown", - "id": "a4e150e6-4976-4998-93be-5f5f9ddcbb5b", - "metadata": { - "tags": [] - }, - "source": [ - "## INT8 Dynamic Quantization\n", - "\n", - "**Quantization** is a systematic reduction of the precision of all or several layers within the model. This means that we turn a higher-precision type, such as the FP32 (32 bits) most commonly used in Deep Learning, into a lower-precision type, such as FP16 (16 bits) or INT8 (8 bits). \n", - "\n", - "With type reduction, it is possible to effectively reduce the size of the model and also faster inference. That means:\n", - "\n", - "* lower memory bandwidth, \n", - "* lower storage, \n", - "* higher performance with minimum to zero accuracy loss. \n", - "\n", - "This is especially important, with large models such as those based on the Transformers architecture, like BERT or used in this sample GPT. \n", - "\n", - "We can distinguish 2 types of quantization:\n", - "\n", - "* static - requires an additional pass over a dataset to work, only activations do calibration,\n", - "* dynamic - multiplies input values by the scale factor, then rounds the result to the nearest, the scale factor for activations is determined dynamically based on the data range observed in runtime.\n", - "\n", - "In this sample we are using **the dynamic quantization**." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cca006fa-6fce-4e5f-81c0-240d12757493", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from intel_extension_for_pytorch.quantization import prepare, convert\n", - "import intel_extension_for_pytorch as ipex\n", - "\n", - "def quantize_model(tokenizer, model):\n", - " \"\"\"\n", - " Adding Intelยฎ Extension for PyTorch* dynamic quantization to the model\n", - " \"\"\"\n", - " # Evaluate model\n", - " model.eval()\n", - " \n", - " print(\"Quantization in progress...\")\n", - " \n", - " # Prepare example outputs for the model\n", - " question, text = \"What is SYCL?\", \"SYCL is an industry-driven standard, developed by Kronos Group and announced in March 2014.\"\n", - " inputs = tokenizer(question, text, return_tensors=\"pt\")\n", - " jit_inputs = tuple((inputs['input_ids']))\n", - " \n", - " # Create configuration for dynamic quantization\n", - " qconfig = ipex.quantization.default_dynamic_qconfig\n", - " \n", - " # Optimize model\n", - " model = ipex.optimize(model)\n", - " \n", - " # Prepare model for quantization using previously prepared parameters\n", - " prepared_model = prepare(model, qconfig, example_inputs=jit_inputs, inplace=False)\n", - " \n", - " # Convert types in model\n", - " converted_model = convert(prepared_model)\n", - " \n", - " return tokenizer, converted_model" - ] - }, - { - "cell_type": "markdown", - "id": "0efd690e-96bd-49aa-8a6b-863d4de3cdfa", - "metadata": {}, - "source": [ - "## Response generation \n", - "\n", - "Response generation in DialoGPT architecture based on **encoder-decoder** model. It means that first we need to *encode input sentence*, to later on be able to *decode it* generating response.\n", - "\n", - "As the model based on transformers architecture they have known issue of copying things. To avoid repetition in chat responses we used Top-K sampling and Top-p sampling.\n", - "\n", - "**Top-K sampling** filters the K most likely next words and redistributes the probability mass among only those K next words. **Top-p sampling**, rather than selecting only the most likely K words, selects the smallest possible set of words whose cumulative probability exceeds the probability p. The probability mass is then redistributed among the words in this set. As a result, the size of the set of words can be dynamically increased and decreased based on the probability distribution of the next word." 
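A tiny, self-contained illustration of how the two strategies pick the set of candidate tokens, using a made-up next-token distribution (the probabilities below are hypothetical and only serve to show how the kept set is formed):

```python
import torch

# Hypothetical next-token probabilities over a 6-token vocabulary.
probs = torch.tensor([0.45, 0.25, 0.15, 0.08, 0.05, 0.02])

# Top-K sampling (K=3): keep the 3 most likely tokens and renormalize.
topk_probs, topk_idx = probs.topk(3)
topk_probs = topk_probs / topk_probs.sum()

# Top-p sampling (p=0.9): keep the smallest prefix of the sorted distribution
# whose cumulative probability exceeds p, then renormalize.
sorted_probs, sorted_idx = probs.sort(descending=True)
keep = int((sorted_probs.cumsum(0) < 0.9).sum()) + 1   # 4 tokens are kept here
nucleus_probs = sorted_probs[:keep] / sorted_probs[:keep].sum()

print("top-k keeps tokens", topk_idx.tolist())
print("top-p keeps tokens", sorted_idx[:keep].tolist())
```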
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d90bd2c3-ff9c-4e52-994d-341792e3e035", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def generate_response(tokenizer, model, chat_round, chat_history_ids):\n", - " \"\"\"\n", - " Generate a response to some user input.\n", - " \"\"\"\n", - " # Encode user input and End-of-String (EOS) token\n", - " new_input_ids = tokenizer.encode(input(\">> You:\") + tokenizer.eos_token, return_tensors='pt')\n", - " \n", - " # Append tokens to chat history\n", - " bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_round > 0 else new_input_ids\n", - " \n", - " # Generate response given maximum chat length history of 2000 tokens\n", - " chat_history_ids = model.generate(\n", - " bot_input_ids,\n", - " do_sample=True, \n", - " max_length=2000,\n", - " top_k=50, \n", - " top_p=0.95,\n", - " pad_token_id=tokenizer.eos_token_id\n", - " )\n", - " \n", - " # Print response\n", - " print(\"DialoGPT: {}\".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))\n", - " \n", - " # Return the chat history ids\n", - " return chat_history_ids" - ] - }, - { - "cell_type": "markdown", - "id": "db1b079b-476c-47da-8a6a-3d42fccc32d4", - "metadata": {}, - "source": [ - "The next step is to prepare a function that allows interactive conversation for `n` rounds. This means that we will use the previously prepared `generate_response` function n-times." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28968553-b051-442d-abc2-92d8ac34415a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def chat_for_n_rounds(tokenizer, model, n=5):\n", - " \"\"\"\n", - " Chat with chatbot for n rounds (n = 5 by default)\n", - " \"\"\"\n", - "\n", - " # Initialize history variable\n", - " chat_history_ids = None\n", - "\n", - " # Chat for n rounds\n", - " for chat_round in range(n):\n", - " chat_history_ids = generate_response(tokenizer, model, chat_round, chat_history_ids)" - ] - }, - { - "cell_type": "markdown", - "id": "41b0f86f-2b17-41cd-911e-9ac9a92be4a0", - "metadata": {}, - "source": [ - "Now, it is time to use implemented functions - initializing the model and adding INT8 dynamic quantization." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1749c7a-4bba-4731-bbc6-da560edcfed2", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize tokenizer and model\n", - "tokenizer, model = load_tokenizer_and_model()\n", - "\n", - "# Adding ipex quantization to the model\n", - "tokenizer, model = quantize_model(tokenizer, model)" - ] - }, - { - "cell_type": "markdown", - "id": "31bae96c-276c-463f-8085-2cd8e97b5f30", - "metadata": {}, - "source": [ - "Let's play with the model by 5 rounds. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d79fa9d7-5713-4ceb-b489-90f1a4f6a4cf", - "metadata": {}, - "outputs": [], - "source": [ - "chat_for_n_rounds(tokenizer, model, 5)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "16402940-0779-44a1-98b5-3f23c5784bd4", - "metadata": {}, - "source": [ - "## Performance comparison\n", - "\n", - "Now that we know that the DialoGPT model still performs well as a chat bot after quantization, let's compare the model's performance before and after applying INT8 dynamic quantization.\n", - "\n", - "Let's start with defining function that will measure time that model needs for inference." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c8b4677-4635-4abd-8eca-9ed43b9b6624", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from time import time\n", - "def test_inference(model, data, warmup=5 , iters=25):\n", - " print(\"Warmup...\")\n", - " for i in range(warmup):\n", - " out = model(data)\n", - "\n", - " print(\"Inference...\")\n", - " inference_time = 0\n", - " for i in range(iters):\n", - " start_time = time()\n", - " out = model(data)\n", - " end_time = time()\n", - " inference_time = inference_time + (end_time - start_time)\n", - "\n", - " inference_time = inference_time / iters\n", - " return inference_time" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "a41f2a40-2176-4f04-b277-e3622df90430", - "metadata": {}, - "source": [ - "First, let's measure average time of inference for original model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6cb034fe-9b8b-4ee9-9975-8a6a03ce79a4", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"Inference with FP32\")\n", - "tokenizer_fp32, model_fp32 = load_tokenizer_and_model()\n", - "data = torch.randint(model_fp32.config.vocab_size, size=[1, 512])\n", - "fp32_inference_time = test_inference(model_fp32, data = data)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "6c58546c-5c60-482c-9782-ac901855ddce", - "metadata": { - "tags": [] - }, - "source": [ - "Then, the average inference time of model after INT8 dynamic quantization." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05fcd18c-0674-4715-a606-ce5ce9e42560", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"Inference with Dynamic INT8\")\n", - "tokenizer_int8, model_int8 = load_tokenizer_and_model()\n", - "tokenizer_int8, model_int8 = quantize_model(tokenizer_int8, model_int8)\n", - "data = torch.randint(model_int8.config.vocab_size, size=[1, 512])\n", - "int8_inference_time = test_inference(model_int8, data = data)" - ] - }, - { - "cell_type": "markdown", - "id": "2ef0648b-c926-42ac-8367-e1a3edb067ea", - "metadata": {}, - "source": [ - "Now, it's time to show nup the results on the bar chart using `matplotlib` library." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d492e5d-e188-489b-a18d-aa32cca0a1b8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "\n", - "# Create bar chart with training time results\n", - "plt.figure(figsize=(4,3))\n", - "plt.title(\"DialoGPT Inference Time\")\n", - "plt.ylabel(\"Inference Time (seconds)\")\n", - "plt.bar([\"FP32\", \"INT8 dynamic\"], [fp32_inference_time, int8_inference_time])" - ] - }, - { - "cell_type": "markdown", - "id": "d2c31e73-2d6d-4323-9609-f04191f8863d", - "metadata": {}, - "source": [ - "DialoGPT by Microsoft is another conversational chatbot that everyone can use. \n", - "\n", - "Based on this architecture, we created an interactive chat in this sample. The use of top-k and top-p allowed us to avoid some of the repetition in the chat answers. Furthermore, the addition of dynamic INT8 quantization reduced memory usage." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b56ff32-34d0-4866-9050-df1bdf7ad736", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "print(\"[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89043271-f3dc-4d4d-a630-40c570c53d98", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb deleted file mode 100644 index 03020685e..000000000 --- a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb +++ /dev/null @@ -1,347 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Optimize PyTorch Models using Intelยฎ Extension for PyTorch (IPEX) Quantization\n", - "This code sample will quantize a ResNet50 model while using Intel's Extension for PyTorch (IPEX). The model will run inference with FP32 and INT8 precision, including static INT8 quantization and dynamic INT8 quantization. During Static Quantization, the model calibrated with the CIFAR10 dataset. The inference time will be compared, showcasing the speedup of INT8 Quantization.\n", - "\n", - "## Environment Setup\n", - "Ensure the PyTorch kernel is activated before running this notebook.\n", - "\n", - "## Imports, Dataset, Hyperparameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torchvision\n", - "from time import time\n", - "import os\n", - "import matplotlib.pyplot as plt\n", - "import intel_extension_for_pytorch as ipex\n", - "from intel_extension_for_pytorch.quantization import prepare, convert\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Hyperparameters and constants\n", - "LR = 0.001\n", - "DOWNLOAD = True\n", - "DATA = 'datasets/cifar10/'\n", - "WARMUP = 3\n", - "ITERS = 100\n", - "transform = torchvision.transforms.Compose([\n", - "torchvision.transforms.Resize((224, 224)),\n", - "torchvision.transforms.ToTensor(),\n", - "torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n", - "])\n", - "test_dataset = torchvision.datasets.CIFAR10(\n", - " root=DATA,\n", - " train=False,\n", - " transform=transform,\n", - " download=DOWNLOAD,\n", - ")\n", - "calibration_data_loader = torch.utils.data.DataLoader(\n", - " dataset=test_dataset,\n", - " batch_size=128\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get model from torchvision" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = torch.rand(1, 3, 224, 224)\n", - "model_fp32 = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)\n", - "model_fp32.eval()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
Inference with FP32 model\n", - "\n", - "The function below will test the inference time with input model and return the average inference time for 1 iteration." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def inference(model, WARMUP, ITERS, data):\n", - " print(\"Warmup before benchmark ...\")\n", - " for i in range(WARMUP):\n", - " out = model(data)\n", - "\n", - " print(\"Inference ...\")\n", - " inference_time = 0\n", - " for i in range(ITERS):\n", - " start_time = time()\n", - " out = model(data)\n", - " end_time = time()\n", - " inference_time = inference_time + (end_time - start_time)\n", - "\n", - " inference_time = inference_time / ITERS\n", - " print(\"Inference Time Avg: \", inference_time)\n", - " return inference_time" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Static Quantization \n", - "The function below staticQuantize will calibrate the fp32 model with calibration dataloader and return the quantized static int8 model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def staticQuantize(model_fp32, data, calibration_data_loader):\n", - " # Acquire inference times for static quantization INT8 model \n", - " qconfig_static = ipex.quantization.default_static_qconfig\n", - " # # Alternatively, define your own qconfig:\n", - " # from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig\n", - " # qconfig = QConfig(activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),\n", - " # weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric))\n", - " prepared_model_static = prepare(model_fp32, qconfig_static, example_inputs=data, inplace=False)\n", - " print(\"Calibration with Static Quantization ...\")\n", - " for batch_idx, (data, target) in enumerate(calibration_data_loader):\n", - " prepared_model_static(data)\n", - " if batch_idx % 10 == 0:\n", - " print(\"Batch %d/%d complete, continue ...\" %(batch_idx+1, len(calibration_data_loader)))\n", - " print(\"Calibration Done\")\n", - "\n", - " converted_model_static = convert(prepared_model_static)\n", - " with torch.no_grad():\n", - " traced_model_static = torch.jit.trace(converted_model_static, data)\n", - " traced_model_static = torch.jit.freeze(traced_model_static)\n", - "\n", - " # save the quantized static model \n", - " traced_model_static.save(\"quantized_model_static.pt\")\n", - " return traced_model_static\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dynamic Quantization \n", - "The function below dynamicQuantize will quantize the fp32 model with dynamic quantization and return the quantized dynamic int8 model." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def dynamicQuantize(model_fp32, data):\n", - " # Acquire inference times for dynamic quantization INT8 model\n", - " qconfig_dynamic = ipex.quantization.default_dynamic_qconfig\n", - " print(\"Quantize Model with Dynamic Quantization ...\")\n", - "\n", - " prepared_model_dynamic = prepare(model_fp32, qconfig_dynamic, example_inputs=data, inplace=False)\n", - "\n", - " converted_model_dynamic = convert(prepared_model_dynamic)\n", - " with torch.no_grad():\n", - " traced_model_dynamic = torch.jit.trace(converted_model_dynamic, data)\n", - " traced_model_dynamic = torch.jit.freeze(traced_model_dynamic)\n", - "\n", - " # save the quantized dynamic model \n", - " traced_model_dynamic.save(\"quantized_model_dynamic.pt\")\n", - " return traced_model_dynamic\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Quantize the FP32 Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists('quantized_model_static.pt'):\n", - " # Static Quantizaton & Save Model to quantized_model_static.pt\n", - " print('quantize the model with static quantization')\n", - " staticQuantize(model_fp32, data, calibration_data_loader)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if not os.path.exists('quantized_model_dynamic.pt'):\n", - " # Dynamic Quantization & Save Model to quantized_model_dynamic.pt\n", - " print('quantize the model with dynamic quantization')\n", - " dynamicQuantize(model_fp32, data)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference With FP32 Model, Static INT8 Model and Dynamic INT8 Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Inference with FP32\")\n", - "fp32_inference_time = inference(model_fp32, WARMUP, ITERS, data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Inference with Static INT8\")\n", - "traced_model_static = torch.jit.load('quantized_model_static.pt')\n", - "traced_model_static.eval()\n", - "traced_model_static = torch.jit.freeze(traced_model_static)\n", - "int8_inference_time_static = inference(traced_model_static, WARMUP, ITERS, data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Inference with Dynamic INT8\")\n", - "traced_model_dynamic = torch.jit.load('quantized_model_dynamic.pt')\n", - "traced_model_dynamic.eval()\n", - "traced_model_dynamic = torch.jit.freeze(traced_model_dynamic)\n", - "int8_inference_time_dynamic = inference(traced_model_dynamic, WARMUP, ITERS, data)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary of Results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Inference time results\n", - "print(\"Summary\")\n", - "print(\"FP32 inference time: %.3f\" %fp32_inference_time)\n", - "print(\"INT8 static quantization inference time: %.3f\" %int8_inference_time_static)\n", - "print(\"INT8 dynamic quantization inference time: %.3f\" %int8_inference_time_dynamic)\n", - "\n", - "# Create bar chart with training time results\n", - 
"plt.figure(figsize=(4,3))\n", - "plt.title(\"ResNet Inference Time\")\n", - "plt.xlabel(\"Test Case\")\n", - "plt.ylabel(\"Inference Time (seconds)\")\n", - "plt.bar([\"FP32\", \"INT8 static\", \"INT8 dynamic\"], [fp32_inference_time, int8_inference_time_static, int8_inference_time_dynamic])\n", - "\n", - "# Calculate speedup when using quantization\n", - "speedup_from_fp32_static = fp32_inference_time / int8_inference_time_static\n", - "print(\"Staic INT8 %.2fX faster than FP32\" %speedup_from_fp32_static)\n", - "speedup_from_fp32_dynamic = fp32_inference_time / int8_inference_time_dynamic\n", - "print(\"Dynamic INT8 %.2fX faster than FP32\" %speedup_from_fp32_dynamic)\n", - "\n", - "\n", - "# Create bar chart with speedup results\n", - "plt.figure(figsize=(4,3))\n", - "plt.title(\"Quantization Speedup\")\n", - "plt.xlabel(\"Test Case\")\n", - "plt.ylabel(\"Speedup\")\n", - "plt.bar([\"FP32\",\"Static INT8\", \"Dynamic INT8\"], [1, speedup_from_fp32_static, speedup_from_fp32_dynamic])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.15" - }, - "vscode": { - "interpreter": { - "hash": "4678fb2792a22465205165c52aab2f7cff7494375a364749bf16e0ac11f2a502" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/cpu/inference/python/jupyter-notebooks/README.md b/examples/cpu/inference/python/jupyter-notebooks/README.md deleted file mode 100644 index 2c9dfc91a..000000000 --- a/examples/cpu/inference/python/jupyter-notebooks/README.md +++ /dev/null @@ -1,61 +0,0 @@ -# Environment Setup for Jupyter Notebook with Intel Pytorch CPU - -The Intelยฎ Extension for PyTorch (IPEX) extends PyTorch* with optimizations for extra performance boost on Intelยฎ hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intelยฎ hardware. For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX). - -## Prerequisites - -| Optimized for | Description -|:--- |:--- -| OS | Ubuntu* 18.04 or newer -| Hardware | 4th Gen Intelยฎ Xeonยฎ Scalable Processors or newer -| Software | Intelยฎ Extension for PyTorch (IPEX) - -## For Local Development Environments - -- **Install Jupyter Notebook with Conda** - -Python 3.8,3.9,3.10,3.11 are supported. -Please create a **new conda environment** for each sample. - -``` -conda create -n ipex_cpu python=3.10 -y -conda activate ipex_cpu -pip install notebook ipykernel -``` - -If encounter any issue for jupyter notebook, please refer to [*Installing Jupyter*](https://jupyter.org/install) for detailed installation instructions. 
- 
- - **Install Intel® Extension for PyTorch* with Conda**
- 
- Follow these instructions to install the latest released Intel® Extension for PyTorch*:
- 
- ```
- pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
- pip install intel-extension-for-pytorch
- ```
- 
- If a specific version is needed, please follow the Installation Section and Sanity Check Section in the [installation guide](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu) to install Intel® Extension for PyTorch* with a specific version.
- 
- - **Register ipykernel with Conda**
- 
- ```
- python -m ipykernel install --user --name=PyTorch-CPU
- ```
- 
- - **Running the Jupyter Notebook**
- 
- 1. Change to the sample directory.
- 2. Launch Jupyter Notebook.
- ```
- jupyter notebook --ip=0.0.0.0 --port 8888 --allow-root
- ```
- 3. Follow the instructions to open the URL with the token in your browser.
- 4. Locate and select the Notebook.
- 5. Change your Jupyter Notebook kernel to **PyTorch-CPU**.
- 6. Run every cell in the Notebook in sequence.
- 
- ## Example Output
- 
- If successful, the sample displays `[CODE_SAMPLE_COMPLETED_SUCCESSFULLY]`. Additionally, the sample generates performance and analysis diagrams for comparison.
- 
diff --git a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
deleted file mode 100644
index fe1fb8b52..000000000
--- a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb
+++ /dev/null
@@ -1,888 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "1110103c-70d0-4ac0-8208-6a678b88deae",
- "metadata": {},
- "source": [
- "# Optimize PyTorch Models using Intel® Extension for PyTorch* (IPEX)\n",
- "\n",
- "This notebook guides you through the process of extending your PyTorch* code with Intel® Extension for PyTorch* (IPEX) optimizations to achieve performance boosts on Intel® hardware.\n",
- "\n",
- "\n",
- "| Area | Description\n",
- "|:--- |:---\n",
- "| What you will learn | Applying Intel® Extension for PyTorch* (IPEX) optimizations to a PyTorch workload in a step-by-step manner to gain a performance boost\n",
- "| Time to complete | 30 minutes\n",
- "| Category | Code Optimization\n",
- "\n",
- "## Purpose\n",
- "\n",
- "This sample notebook shows how to get started with Intel® Extension for PyTorch* (IPEX) for sample Computer Vision and NLP workloads.\n",
- "\n",
- "The sample starts by loading two models from the PyTorch hub: **Faster-RCNN** (Faster R-CNN) and **distilbert** (DistilBERT). After loading the models, the sample applies sequential optimizations from Intel® Extension for PyTorch* (IPEX) and examines the performance gains for each incremental change.\n",
- "You can make code changes quickly on top of existing PyTorch code to obtain the performance speedups for model inference.\n",
- "\n", 
- "We will be generating synthetic data to be used for inference with sample computer vision and NLP workloads. We will first use stock PyTorch models to generate predictions. Then, with minimal code changes using Intel® Extension for PyTorch* (IPEX), we will see how speedups can be gained over stock PyTorch on Intel® hardware. 
We will also see how quantization features from Intelยฎ Extension for PyTorch* (IPEX) can be used to reduce the inference time of a model.\n", - "\n", - "## Prerequisites\n", - "\n", - "\n", - "| Optimized for | Description\n", - "|:--- |:---\n", - "| OS | Ubuntu* 20.04 or newer\n", - "| Hardware | Intelยฎ Xeonยฎ Scalable processor family\n", - "| Software | Intelยฎ Extension for PyTorch*\n" - ] - }, - { - "cell_type": "markdown", - "id": "431d988d-40f1-4f98-96fd-2e17b4126eb4", - "metadata": {}, - "source": [ - "# Key Takeaways" - ] - }, - { - "cell_type": "markdown", - "id": "7438fa45-81e6-4d42-847b-fbe895ae8eed", - "metadata": {}, - "source": [ - "- Get started with Intelยฎ Extension for PyTorch* (IPEX) for drop-in acceleration\n", - "- Learn how to use the *optimize* method from Intelยฎ Extension for PyTorch* (IPEX) to apply optimizations at Python frontend to the given model (nn.Module)\n", - "- Learn how to use Quantization features from Intelยฎ Extension for PyTorch* (IPEX) to convert model to INT8\n", - "- Learn how to use Intelยฎ Extension for PyTorch* (IPEX) Launch Script module to set additional configurations on top of the previously mentioned optimizations to boost performance" - ] - }, - { - "cell_type": "markdown", - "id": "d72174e0", - "metadata": {}, - "source": [ - "# Samples" - ] - }, - { - "cell_type": "markdown", - "id": "06a26381", - "metadata": {}, - "source": [ - "## Install Intelยฎ Extension for PyTorch* for CPU and dependency packages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ccfb9a4", - "metadata": {}, - "outputs": [], - "source": [ - "!python -m pip install transformers matplotlib" - ] - }, - { - "cell_type": "markdown", - "id": "b7eb6281-5db9-4a4f-9c6a-3f9c132f30f9", - "metadata": { - "tags": [] - }, - "source": [ - "## Computer Vision Workload - Faster R-CNN, Resnet50 Backbone" - ] - }, - { - "cell_type": "markdown", - "id": "d911257c-c9b0-4365-a308-95a4b3aea487", - "metadata": {}, - "source": [ - "Faster R-CNN is a convolutional neural network used for object detection. We are going to use the **optimize** method from Intelยฎ Extension for PyTorch* (IPEX) to apply optimizations. Following this, we will also use TorchScript to obtain performance gains." 
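Condensed into a single sketch, the pattern that the following cells build up step by step looks roughly like this (the weights enum and image size mirror the cells below; this is a preview under those assumptions, not a replacement for them):

```python
import torch
import torchvision
import intel_extension_for_pytorch as ipex

weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=weights).eval()
image = torch.randn(1, 3, 1200, 1200).to(memory_format=torch.channels_last)

model = model.to(memory_format=torch.channels_last)  # channels-last layout
model = ipex.optimize(model)                          # IPEX frontend optimizations

with torch.no_grad():                                 # TorchScript on the backbone
    model.backbone = torch.jit.trace(model.backbone, image, strict=False)
    model.backbone = torch.jit.freeze(model.backbone)
    predictions = model(image)
```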
- ] - }, - { - "cell_type": "markdown", - "id": "96966a7b-b036-4f1c-8dd8-3b90b76f98c9", - "metadata": {}, - "source": [ - "Let's start by importing all the necessary packages and modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a4cb03a-f6b4-465b-9363-b435b00336c8", - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import torch\n", - "import torchvision\n", - "import os\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "markdown", - "id": "5f613973-ffb0-482a-bc56-c295bce3c088", - "metadata": {}, - "source": [ - "**Prepare Sample Data**" - ] - }, - { - "cell_type": "markdown", - "id": "22eeae78-534e-4d44-a0f3-c48a4213ac3f", - "metadata": {}, - "source": [ - "Let's generate a random image using torch to test performance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e75c62d-dbbb-47da-aa07-717e9719d86a", - "metadata": {}, - "outputs": [], - "source": [ - "# set the device to cpu\n", - "device = 'cpu'\n", - "# generate a random image to observe speedup on\n", - "image = torch.randn(1, 3, 1200, 1200)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "678f5795-dcd8-439d-8d98-f4197a2417e4", - "metadata": {}, - "outputs": [], - "source": [ - "# explore image shape\n", - "\n", - "print(image.shape)" - ] - }, - { - "cell_type": "markdown", - "id": "32a54de2-76e7-42f9-9506-5574f5bb95a4", - "metadata": {}, - "source": [ - "**Helper Functions**" - ] - }, - { - "cell_type": "markdown", - "id": "43d0d63d-1d2b-4edc-8d36-0e2dd4ff3780", - "metadata": {}, - "source": [ - "Some functions to help us with loading the model and summarizing the optimizations. The functions below will help us record the time taken to run and, plot comparison charts." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c265eb1f-ef85-4ed1-941d-158d2ec16af6", - "metadata": {}, - "outputs": [], - "source": [ - "def load_model_eval_mode():\n", - " \"\"\"\n", - " Loads model and returns it in eval mode\n", - " \"\"\"\n", - " model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=weights, progress=True,\n", - " num_classes=91, weights_backbone=weights_backbone).to(device)\n", - " model = model.eval()\n", - " \n", - " return model\n", - "\n", - "def get_average_inference_time(model, image):\n", - " \"\"\"\n", - " does a model warm up and times the model runtime\n", - " \"\"\"\n", - " with torch.no_grad():\n", - " # warm up\n", - " for _ in range(25):\n", - " model(image)\n", - "\n", - " # measure\n", - " import time\n", - " start = time.time()\n", - " for _ in range(25):\n", - " output = model(image)\n", - " end = time.time()\n", - " average_inference_time = (end-start)/25*1000\n", - " \n", - " return average_inference_time\n", - "\n", - "def plot_speedup(inference_time_stock, inference_time_optimized):\n", - " \"\"\"\n", - " Plots a bar chart comparing the time taken by stock PyTorch model and the time taken by\n", - " the model optimized by Intelยฎ Extension for PyTorch* (IPEX)\n", - " \"\"\"\n", - " data = {'stock_pytorch_time': inference_time_stock, 'optimized_time': inference_time_optimized}\n", - " model_type = list(data.keys())\n", - " times = list(data.values())\n", - "\n", - " fig = plt.figure(figsize = (10, 5))\n", - "\n", - " # creating the bar plot\n", - " plt.bar(model_type, times, color ='blue',\n", - " width = 0.4)\n", - "\n", - " plt.ylabel(\"Runtime (ms)\")\n", - " plt.title(f\"Speedup acheived - {inference_time_stock/inference_time_optimized:.2f}x\")\n", - " 
plt.show()\n", - " \n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "46eb1386-203a-4667-aa9c-9b26adcd02c4", - "metadata": {}, - "source": [ - "**Baseline PyTorch Model**" - ] - }, - { - "cell_type": "markdown", - "id": "c441b52d-597b-422c-bd01-524a687f8024", - "metadata": {}, - "source": [ - "A baseline model is the simplest version of the model that can be loaded from the PyTorch hub. Let's load the baseline for Faster R-CNN model and get predictions." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d4efa17-9abe-4595-bcc1-8b9c3b966211", - "metadata": {}, - "outputs": [], - "source": [ - "# model configs\n", - "weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT\n", - "weights_backbone = torchvision.models.ResNet50_Weights.DEFAULT" - ] - }, - { - "cell_type": "markdown", - "id": "858fb895-8617-4997-a8ec-8974d00e4606", - "metadata": {}, - "source": [ - "**Input Image Memory Format**" - ] - }, - { - "cell_type": "markdown", - "id": "43a1fa1e-2e89-4c43-b1e4-7682686c2a4a", - "metadata": {}, - "source": [ - "There are two ways to represent image data that are inputs to a CNN model. Channels-First, and Channels-Last. In Channels-First, the channels dimension comes first followed by height and width. For example - (3, 224, 224) or NCHW where N is batch size, C is channels, H is height, and W is width. In Channels-Last, the channels dimension comes last. For example - (224, 223, 3) or NHWC." - ] - }, - { - "cell_type": "markdown", - "id": "a12280f4-b8a1-4c9a-89d3-2f092396f431", - "metadata": {}, - "source": [ - "**Channels-First**" - ] - }, - { - "cell_type": "markdown", - "id": "59f09f4f-de2c-4006-83e0-d201b58097e6", - "metadata": {}, - "source": [ - "PyTorch uses channels-first by default" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f054f770-c8d6-4fe7-baed-7ed77c510a13", - "metadata": {}, - "outputs": [], - "source": [ - "# send the input to the device and pass it through the network to\n", - "# get the detections and predictions\n", - "\n", - "model = load_model_eval_mode()\n", - "\n", - "inference_time_stock = get_average_inference_time(model, image)\n", - "\n", - "print(f\"time taken for forward pass: {inference_time_stock} ms\")" - ] - }, - { - "cell_type": "markdown", - "id": "3285b34b-03a2-4654-8061-a4dd9f66d1e9", - "metadata": {}, - "source": [ - "**Channels-Last**" - ] - }, - { - "cell_type": "markdown", - "id": "31439e30-344e-455c-816a-a88eb868ede4", - "metadata": {}, - "source": [ - "Channels-Last memory format is a different way of ordering NCHW tensors allowing us to make Channels-Last memory format optimizations on Intelยฎ hardware" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0afc270-0d24-4e8d-a1a9-271dd66fc350", - "metadata": {}, - "outputs": [], - "source": [ - "model = load_model_eval_mode()\n", - "model = model.to(memory_format=torch.channels_last)\n", - "image_channels_last = image.to(memory_format=torch.channels_last)\n", - "\n", - "inference_time_stock = get_average_inference_time(model, image_channels_last)\n", - "\n", - "print(f\"time taken for forward pass: {inference_time_stock} ms\")" - ] - }, - { - "cell_type": "markdown", - "id": "9cfff9fc-0e9d-4703-8369-da1db780d15e", - "metadata": {}, - "source": [ - "Now that we have timed the stock PyTorch model, let's add minimal code changes from Intelยฎ Extension for PyTorch* (IPEX) to obtain speedups. 
The minimal code changes are highlighted in the following cell" - ] - }, - { - "cell_type": "markdown", - "id": "aef650d5-02db-40a6-9688-90a03fee7da2", - "metadata": {}, - "source": [ - "**Intelยฎ Extension for PyTorch* (IPEX)**" - ] - }, - { - "cell_type": "markdown", - "id": "05237e85-1039-4fb5-8651-bb815824a1d9", - "metadata": {}, - "source": [ - "As described above, Intelยฎ Extension for PyTorch* (IPEX) provides us with the ability to make minimal code changes to apply optimizations over stock PyTorch models using Intelยฎ hardware. The simple code changes are indicated below." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6356714f-c2b3-46c7-8dab-103d40054eb0", - "metadata": {}, - "outputs": [], - "source": [ - "model = load_model_eval_mode()\n", - "model = model.to(memory_format=torch.channels_last)\n", - "image_channels_last = image.to(memory_format=torch.channels_last)\n", - "#################### code changes ####################\n", - "import intel_extension_for_pytorch as ipex\n", - "model = ipex.optimize(model)\n", - "######################################################" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6bdd609b-13ec-4bb7-93ec-b2f8a6dd0f68", - "metadata": {}, - "outputs": [], - "source": [ - "inference_time_optimized = get_average_inference_time(model, image_channels_last)\n", - "\n", - "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5843fe0f-52c3-49a5-b231-f4ca514395a6", - "metadata": {}, - "outputs": [], - "source": [ - "# plot performance gain bar chart\n", - "\n", - "plot_speedup(inference_time_stock, inference_time_optimized)" - ] - }, - { - "cell_type": "markdown", - "id": "d263abf8-e0b6-4d99-a14a-67e46c197c3d", - "metadata": {}, - "source": [ - "> **_NOTE:_** If a below par performance is observed, please restart the notebook kernel." - ] - }, - { - "cell_type": "markdown", - "id": "6d465fcc-8115-46de-9542-2c4c8d7c1771", - "metadata": {}, - "source": [ - "**TorchScript**" - ] - }, - { - "cell_type": "markdown", - "id": "e96cbe96-92c2-4429-b90b-4e78a4bac04d", - "metadata": {}, - "source": [ - "TorchScript is a way to create serializable and optimizable models from PyTorch code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87254f28-ce41-4066-81c1-bcd646a0b871", - "metadata": {}, - "outputs": [], - "source": [ - "model = load_model_eval_mode()\n", - "model = model.to(memory_format=torch.channels_last)\n", - "with torch.no_grad():\n", - " model.backbone = torch.jit.trace(model.backbone, image_channels_last, strict=False)\n", - " model.backbone = torch.jit.freeze(model.backbone)\n", - " inference_time_optimized = get_average_inference_time(model, image_channels_last)\n", - "\n", - "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ad01762-ded5-4380-b649-7a3db30c3b34", - "metadata": {}, - "outputs": [], - "source": [ - "# plot performance gain bar chart\n", - "\n", - "plot_speedup(inference_time_stock, inference_time_optimized)" - ] - }, - { - "cell_type": "markdown", - "id": "11b2888e-1171-400d-a5f9-783e6c52f01e", - "metadata": {}, - "source": [ - "## NLP Workload - DistilBERT Base Uncased" - ] - }, - { - "cell_type": "markdown", - "id": "62666465-5647-4ccb-a53c-d389d3261629", - "metadata": {}, - "source": [ - "DistilBERT is a transformer model, smaller and faster than BERT. 
We will use the Quantization feature from Intelยฎ Extension for PyTorch* (IPEX) to convert the model into INT8 for faster inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e10eba7b-3d6e-46e8-8baf-8eddcc5f9d2a", - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import DistilBertTokenizer, DistilBertModel, logging\n", - "logging.set_verbosity_error()" - ] - }, - { - "cell_type": "markdown", - "id": "8b7899a8-205d-4ebc-b304-e9ee65ad3643", - "metadata": {}, - "source": [ - "**Helper Functions**" - ] - }, - { - "cell_type": "markdown", - "id": "797de740-3a76-48cc-838c-ef8e981cd41b", - "metadata": {}, - "source": [ - "Similar functions as before to help us load the model and summarize the optimizations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b0b68ba1-9215-4408-b3ee-5d6d610cffd7", - "metadata": {}, - "outputs": [], - "source": [ - "def load_model_eval_mode():\n", - " \"\"\"\n", - " Loads model and returns it in eval mode\n", - " \"\"\"\n", - " model = DistilBertModel.from_pretrained('distilbert-base-uncased-distilled-squad')\n", - " model.eval()\n", - " \n", - " return model\n", - "\n", - "def get_average_inference_time(model, inputs):\n", - " \"\"\"\n", - " does a model warm up and times the model runtime\n", - " \"\"\"\n", - " with torch.no_grad():\n", - " # warm up\n", - " for _ in range(25):\n", - " model(**inputs)\n", - "\n", - " # measure\n", - " import time\n", - " start = time.time()\n", - " for _ in range(25):\n", - " outputs = model(**inputs)\n", - " end = time.time()\n", - " average_inference_time = (end-start)/25*1000\n", - " \n", - " return average_inference_time" - ] - }, - { - "cell_type": "markdown", - "id": "81ec7560-5a3d-4450-badd-7ff3e5d6ae9b", - "metadata": {}, - "source": [ - "Generate sample text and tokenize using the transformers tokenizer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d94e1ea-03f8-4f2b-9c05-ae4a97d7d7d1", - "metadata": {}, - "outputs": [], - "source": [ - "# tokenizer for distilbert\n", - "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')\n", - "\n", - "# sample data\n", - "question, text = \"Who was Jim Henson?\", \"Jim Henson was a nice puppet\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e146748d-7b31-4e62-82fb-92e3cd231f0e", - "metadata": {}, - "outputs": [], - "source": [ - "model = load_model_eval_mode()\n", - "\n", - "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", - "\n", - "inference_time_stock = get_average_inference_time(model, inputs)\n", - "\n", - "print(f\"time taken for forward pass: {inference_time_stock} ms\")" - ] - }, - { - "cell_type": "markdown", - "id": "fd79e93e-bc11-48b2-8a92-b74f29e4d2bf", - "metadata": {}, - "source": [ - "**Quantization**" - ] - }, - { - "cell_type": "markdown", - "id": "1cfbb3fe-d99f-4e74-abd0-04bdbbe6e632", - "metadata": {}, - "source": [ - "Quantization allows us to perform operations and store tensors at a lower precision than FP32, like INT8 for example. This compact model and data representation results in a lower memory requirement." 
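As a rough back-of-the-envelope check of that claim, counting only the quantized weights (the parameter count below is an approximation for DistilBERT-base; activations and any layers left in FP32 are ignored):

```python
# FP32 stores each weight in 4 bytes, INT8 in 1 byte, so quantized weights
# take roughly a quarter of the space.
num_params = 66_000_000          # approximate parameter count of DistilBERT-base
fp32_mb = num_params * 4 / 1e6
int8_mb = num_params * 1 / 1e6
print(f"FP32 weights ~{fp32_mb:.0f} MB, INT8 weights ~{int8_mb:.0f} MB "
      f"(~{fp32_mb / int8_mb:.0f}x smaller)")
```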
- ] - }, - { - "cell_type": "markdown", - "id": "1c8ed525-7075-4a88-80d2-dfb739468994", - "metadata": {}, - "source": [ - "Let's import the quantization modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "342873f5-72a3-4105-a2a5-68778d0599c8", - "metadata": {}, - "outputs": [], - "source": [ - "from intel_extension_for_pytorch.quantization import prepare, convert\n", - "import intel_extension_for_pytorch as ipex" - ] - }, - { - "cell_type": "markdown", - "id": "b71e64d0-21f7-497f-86c3-d1afcaeefd6c", - "metadata": {}, - "source": [ - "**Static Quantization** \n", - " Static quantization quantizes the weights and activations of the model. It fuses activations into preceding layers where possible. It requires calibration with a representative dataset to determine optimal quantization parameters for activations." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3ef972ab-a272-45dd-bd55-fbd9f7db5017", - "metadata": {}, - "outputs": [], - "source": [ - "model = load_model_eval_mode()\n", - "\n", - "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", - "\n", - "jit_inputs = tuple((inputs['input_ids'], inputs['attention_mask']))\n", - "\n", - "qconfig_mapping = ipex.quantization.default_static_qconfig_mapping # for static quantization\n", - "prepared_model = ipex.quantization.prepare(model, qconfig_mapping, example_inputs=jit_inputs, inplace=False)\n", - "\n", - "for i in range(2):\n", - " calibration_output = prepared_model(**inputs)\n", - "\n", - "model = convert(prepared_model)\n", - "with torch.no_grad():\n", - " model = torch.jit.trace(model, jit_inputs, strict=False)\n", - " model = torch.jit.freeze(model)\n", - " y = model(**inputs)\n", - " y = model(**inputs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7983dfa3-8bb2-4f9e-994c-24cbd556b94d", - "metadata": {}, - "outputs": [], - "source": [ - "inference_time_optimized = get_average_inference_time(model, inputs)\n", - "\n", - "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54e19218-dd71-4711-a25a-8e7c8a787d19", - "metadata": {}, - "outputs": [], - "source": [ - "# plot performance gain bar chart\n", - "\n", - "plot_speedup(inference_time_stock, inference_time_optimized)" - ] - }, - { - "cell_type": "markdown", - "id": "5e79ec28-6063-4d21-95d5-32be78bd49af", - "metadata": {}, - "source": [ - "**Dynamic Quantization** \n", - " In dynamic quantization the weights are quantized ahead of time but the activations are dynamically quantized during inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4bef3494-b5c5-4acd-8431-71e3199a9764", - "metadata": {}, - "outputs": [], - "source": [ - "model = load_model_eval_mode()\n", - "\n", - "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", - "\n", - "jit_inputs = tuple((inputs['input_ids'], inputs['attention_mask']))\n", - "\n", - "\n", - "qconfig_mapping = ipex.quantization.default_dynamic_qconfig_mapping # for dynamic quantization\n", - "prepared_model = ipex.quantization.prepare(model, qconfig_mapping, example_inputs=jit_inputs, inplace=False)\n", - "model = convert(prepared_model)\n", - "with torch.no_grad():\n", - " model = torch.jit.trace(model, jit_inputs, strict=False)\n", - " model = torch.jit.freeze(model)\n", - " y = model(**inputs)\n", - " y = model(**inputs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3cc311f2-6215-41ff-a7ed-e1942e0cbbde", - 
"metadata": {}, - "outputs": [], - "source": [ - "inference_time_optimized = get_average_inference_time(model, inputs)\n", - "\n", - "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb75733c-91d9-4d38-97a8-a5bb4b3063a8", - "metadata": {}, - "outputs": [], - "source": [ - "# plot performance gain bar chart\n", - "\n", - "plot_speedup(inference_time_stock, inference_time_optimized)" - ] - }, - { - "cell_type": "markdown", - "id": "bff70b14-26d9-4ef8-894a-6d0a40973324", - "metadata": {}, - "source": [ - "## Intelยฎ Extension for PyTorch* (IPEX) Launch Script" - ] - }, - { - "cell_type": "markdown", - "id": "63f71568-902f-4e9f-a130-cc5fae1961db", - "metadata": {}, - "source": [ - "Default primitives of PyTorch and Intelยฎ Extension for PyTorch* (IPEX) are highly optimized, there are things users can do improve performance. Setting configuration options properly contributes to a performance boost. However, there is no unified configuration that is optimal to all topologies. Users need to try different combinations by themselves." - ] - }, - { - "cell_type": "markdown", - "id": "f826ac48-17e2-48b4-ab2c-0a47620790a7", - "metadata": {}, - "source": [ - "**Single instance for inference**" - ] - }, - { - "cell_type": "markdown", - "id": "f300e009-f7cb-4403-8229-938bf89b0920", - "metadata": {}, - "source": [ - "The launch script is provided as a module of Intelยฎ Extension for PyTorch* (IPEX). Below are some of those configurations that can be set using the launch script for a single instance. The launch script can be run as a shell command from a Jupyter notebook or from the shell itself." - ] - }, - { - "cell_type": "markdown", - "id": "35ed3f44-c019-4997-a73d-a043ddfa12ee", - "metadata": {}, - "source": [ - "To explore the features of the launch script module, we will be using a ResNet-50 model, which is a a convolutional neural network that is 50 layers deep.The model script is present in the scripts folder" - ] - }, - { - "cell_type": "markdown", - "id": "edbd3506-dc86-4f18-b666-dfa9e9e3705a", - "metadata": {}, - "source": [ - "It is recommended that the user check the output of [htop](https://htop.dev/) in an accompanying terminal to check the usage of cores while running the cells below. The output from htop looks as shown below." - ] - }, - { - "cell_type": "markdown", - "id": "b5015d8d-6ac6-41c4-a10e-509069b699ee", - "metadata": {}, - "source": [ - "![htop](https://intel.github.io/intel-extension-for-pytorch/latest/_images/1ins_phy.gif)" - ] - }, - { - "cell_type": "markdown", - "id": "91737a0e-1a14-4582-a9c4-63b24e963ff7", - "metadata": {}, - "source": [ - "By running the below command, One main worker thread will be launched, then it will launch threads on 2 other physical cores." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76265fe7-52cd-4574-9657-18efa5c25514", - "metadata": {}, - "outputs": [], - "source": [ - "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncore_per_instance 3 --log_path ./logs ./python/resnet50.py" - ] - }, - { - "cell_type": "markdown", - "id": "6fa2dd27-cf4f-4e1d-808a-271c35cd508b", - "metadata": {}, - "source": [ - "Similarly by increasing the number of cores, we can see an improvement in the inference time as shown below " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c522568-db0f-4438-ae1f-66623e07c96f", - "metadata": {}, - "outputs": [], - "source": [ - "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncore_per_instance 6 --log_path ./logs ./python/resnet50.py" - ] - }, - { - "cell_type": "markdown", - "id": "a3a00650-d15f-4aa7-b526-a21c6823d50a", - "metadata": {}, - "source": [ - "We saw a small example usage of the launch script module. This [documentation](https://intel.github.io/intel-extension-for-pytorch/cpu/1.12.100+cpu/tutorials/performance_tuning/launch_script.html) provides many more examples to use the launch script. As mentioned earlier, each deep learning topology can benefit from custom tuning to achieve the best performance on top of the optimizations we have discussed so far." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bd6e13ed-0c0e-4e97-acaa-7a8dfba259c4", - "metadata": {}, - "outputs": [], - "source": [ - "print(\"[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - }, - "nbTranslate": { - "displayLangs": [ - "*" - ], - "hotkey": "alt-t", - "langInMainMenu": true, - "sourceLang": "en", - "targetLang": "fr", - "useGoogleTranslate": true - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py b/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py deleted file mode 100644 index 6bc1e29a8..000000000 --- a/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 - -''' -============================================================== - Copyright ยฉ 2023 Intel Corporation - - SPDX-License-Identifier: MIT -============================================================== -''' - -import os -from time import time -import matplotlib.pyplot as plt -import torch -import intel_extension_for_pytorch as ipex -from intel_extension_for_pytorch.quantization import prepare, convert -import torchvision -from torchvision import models -from transformers import BertModel - -SUPPORTED_MODELS = ["resnet50", "bert"] # models supported by this code sample - -# ResNet sample data parameters -RESNET_BATCH_SIZE = 64 - -# BERT sample data parameters 
-BERT_BATCH_SIZE = 64 -BERT_SEQ_LENGTH = 512 - -os.environ["ONEDNN_MAX_CPU_ISA"] = "AVX512_CORE_VNNI" - -""" -Function to perform inference on Resnet50 and BERT -""" -def runInference(model, data, modelName="resnet50", dataType="FP32", amx=True): - """ - Input parameters - model: the PyTorch model object used for inference - data: a sample input into the model - modelName: str representing the name of the model, supported values - resnet50, bert - dataType: str representing the data type for model parameters, supported values - FP32, BF16, INT8 - amx: set to False to disable AMX on BF16, Default: True - Return value - inference_time: the time in seconds it takes to perform inference with the model - """ - - # Display run case - if amx: - isa_text = "AVX512_CORE_AMX" - else: - isa_text = "AVX512_CORE_VNNI" - print("%s %s inference with %s" %(modelName, dataType, isa_text)) - - # Special variables for specific models - batch_size = None - if "resnet50" == modelName: - batch_size = RESNET_BATCH_SIZE - elif "bert" == modelName: - d = torch.randint(model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH]) # sample data input for torchscript and inference - batch_size = BERT_BATCH_SIZE - else: - raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS)) - - # Prepare model for inference based on precision (FP32, BF16, INT8) - if "INT8" == dataType: - # Quantize model to INT8 if needed (one time) - model_filename = "quantized_model_%s.pt" %modelName - if not os.path.exists(model_filename): - qconfig = ipex.quantization.default_static_qconfig - prepared_model = prepare(model, qconfig, example_inputs=data, inplace=False) - converted_model = convert(prepared_model) - with torch.no_grad(): - if "resnet50" == modelName: - traced_model = torch.jit.trace(converted_model, data) - elif "bert" == modelName: - traced_model = torch.jit.trace(converted_model, (d,), check_trace=False, strict=False) - else: - raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS)) - traced_model = torch.jit.freeze(traced_model) - traced_model.save(model_filename) - - # Load INT8 model for inference - model = torch.jit.load(model_filename) - model.eval() - model = torch.jit.freeze(model) - elif "BF16" == dataType: - model = ipex.optimize(model, dtype=torch.bfloat16) - with torch.no_grad(): - with torch.cpu.amp.autocast(): - if "resnet50" == modelName: - model = torch.jit.trace(model, data) - elif "bert" == modelName: - model = torch.jit.trace(model, (d,), check_trace=False, strict=False) - else: - raise Exception("ERROR: modelName %s is not supported. Choose from %s" %(modelName, SUPPORTED_MODELS)) - model = torch.jit.freeze(model) - else: # FP32 - with torch.no_grad(): - if "resnet50" == modelName: - model = torch.jit.trace(model, data) - elif "bert" == modelName: - model = torch.jit.trace(model, (d,), check_trace=False, strict=False) - else: - raise Exception("ERROR: modelName %s is not supported. 
Choose from %s" %(modelName, SUPPORTED_MODELS)) - model = torch.jit.freeze(model) - - # Run inference - with torch.no_grad(): - if "BF16" == dataType: - with torch.cpu.amp.autocast(): - # Warm up - for i in range(5): - model(data) - - # Measure latency - start_time = time() - model(data) - end_time = time() - else: - # Warm up - for i in range(5): - model(data) - - # Measure latency - start_time = time() - model(data) - end_time = time() - inference_time = end_time - start_time - print("Inference on batch size %d took %.3f seconds" %(batch_size, inference_time)) - - return inference_time - - -""" -Perform all types of inference in main function - -Inference run cases for both Resnet50 and BERT -1) INT8 using AVX512_CORE_VNNI -""" -def main(): - # ResNet50 - resnet_model = models.resnet50(pretrained=True) - resnet_data = torch.rand(RESNET_BATCH_SIZE, 3, 224, 224) - resnet_model.eval() - int8_with_vnni_resnet_inference_time = runInference(resnet_model, resnet_data, modelName="resnet50", dataType="INT8", amx=False) - - # BERT - bert_model = BertModel.from_pretrained("bert-base-uncased") -#torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') - bert_data = torch.randint(bert_model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH]) - bert_model.eval() - int8_with_vnni_bert_inference_time = runInference(bert_model, bert_data, modelName="bert", dataType="INT8", amx=False) - -if __name__ == '__main__': - main() diff --git a/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py b/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py deleted file mode 100644 index dae594af2..000000000 --- a/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py +++ /dev/null @@ -1,60 +0,0 @@ -import torch -import torchvision.models as models - -def inference(model, data): - with torch.no_grad(): - # warm up - for _ in range(100): - model(data) - - # measure - import time - start = time.time() - for _ in range(100): - output = model(data) - end = time.time() - print('Inference took {:.2f} ms in average'.format((end-start)/100*1000)) - -def main(args): - model = models.resnet50(pretrained=False) - model.eval() - - data = torch.rand(1, 3, 224, 224) - - import intel_extension_for_pytorch as ipex - - model = model.to(memory_format=torch.channels_last) - data = data.to(memory_format=torch.channels_last) - - if args.dtype == 'float32': - model = ipex.optimize(model, dtype=torch.float32) - elif args.dtype == 'bfloat16': - model = ipex.optimize(model, dtype=torch.bfloat16) - else: # int8 - from intel_extension_for_pytorch.quantization import prepare, convert - - qconfig = ipex.quantization.default_static_qconfig - model = prepare(model, qconfig, example_inputs=data, inplace=False) - - # calibration - n_iter = 100 - for i in range(n_iter): - model(data) - - model = convert(model) - - with torch.cpu.amp.autocast(enabled=args.dtype=='bfloat16'): - if args.torchscript: - with torch.no_grad(): - model = torch.jit.trace(model, data) - model = torch.jit.freeze(model) - - inference(model, data) - -if __name__ == '__main__': - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('--dtype', default='float32', choices=['float32', 'bfloat16', 'int8']) - parser.add_argument("--torchscript", default=False, action="store_true") - - main(parser.parse_args()) diff --git a/examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py similarity index 100% 
rename from examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py rename to examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py b/examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py rename to examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/python-scripts/resnet50_general_inference_script.py b/examples/cpu/inference/python/resnet50_general_inference_script.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/resnet50_general_inference_script.py rename to examples/cpu/inference/python/resnet50_general_inference_script.py diff --git a/examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py rename to examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py b/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py rename to examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py rename to examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py b/examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py rename to examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py diff --git a/examples/cpu/training/python-scripts/README.md b/examples/cpu/training/README.md similarity index 74% rename from examples/cpu/training/python-scripts/README.md rename to examples/cpu/training/README.md index 066425954..2169e8df9 100644 --- a/examples/cpu/training/python-scripts/README.md +++ b/examples/cpu/training/README.md @@ -23,6 +23,22 @@ git clone https://github.com/intel/intel-extension-for-pytorch.git cd intel-extension-for-pytorch/examples/cpu/training ``` +Running ResNet50 Float32 single precision training example: + +```bash +python single_instance_training_fp32.py +``` + +We provided support for BFloat16 half precision training. +Please refer to [Automatic Mixed Precision (AMP) introduction](https://pytorch.org/docs/stable/amp.html) for more details. +BFloat16 calculations are further accelerated on the processors supporting [Intelยฎ Advanced Matrix Extensions (AMX)](https://en.wikipedia.org/wiki/Advanced_Matrix_Extensions) instructions. 
+ +Running ResNet50 BFloat16 half precision training example: + +```bash +python single_instance_training_bf16.py +``` + Running ResNet50 distributed training example: ```bash @@ -33,4 +49,4 @@ ipexrun --nnodes 1 distributed_data_parallel_training.py Please check [the training examples in Intelยฎ Extension for PyTorch\* online document](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/examples.html#training) for more details. -For more information and examples about distributed training via PyTorch\* DDP, please visit [oneAPI Collective Communications Library Bindings for Pytorch\* Github repository](https://github.com/intel/torch-ccl). +For more information and examples about distributed training via PyTorch\* DDP, please visit [oneAPI Collective Communications Library Bindings for Pytorch\* Github repository](https://github.com/intel/torch-ccl). \ No newline at end of file diff --git a/examples/cpu/training/python-scripts/distributed_data_parallel_training.py b/examples/cpu/training/distributed_data_parallel_training.py similarity index 100% rename from examples/cpu/training/python-scripts/distributed_data_parallel_training.py rename to examples/cpu/training/distributed_data_parallel_training.py diff --git a/examples/cpu/training/single_instance_training_bf16.py b/examples/cpu/training/single_instance_training_bf16.py new file mode 100644 index 000000000..fa596e686 --- /dev/null +++ b/examples/cpu/training/single_instance_training_bf16.py @@ -0,0 +1,51 @@ +import torch +import torchvision +import intel_extension_for_pytorch as ipex + +LR = 0.001 +DOWNLOAD = True +DATA = "datasets/cifar10/" + +transform = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize((224, 224)), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] +) +train_dataset = torchvision.datasets.CIFAR10( + root=DATA, + train=True, + transform=transform, + download=DOWNLOAD, +) +train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128) + +model = torchvision.models.resnet50() +criterion = torch.nn.CrossEntropyLoss() +optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9) +model.train() + +model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16) +# Uncomment the code below to enable beta feature `torch.compile` +# model = torch.compile(model, backend="ipex") + +for batch_idx, (data, target) in enumerate(train_loader): + optimizer.zero_grad() + # Note: bf16 training requires amp.autocast() context # noqa F401 + with torch.cpu.amp.autocast(): + output = model(data) + loss = criterion(output, target) + loss.backward() + optimizer.step() + print(batch_idx) + +torch.save( + { + "model_state_dict": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + }, + "checkpoint.pth", +) + +print("Execution finished") diff --git a/examples/cpu/training/single_instance_training_fp32.py b/examples/cpu/training/single_instance_training_fp32.py new file mode 100644 index 000000000..ae2b970ad --- /dev/null +++ b/examples/cpu/training/single_instance_training_fp32.py @@ -0,0 +1,49 @@ +import torch +import torchvision +import intel_extension_for_pytorch as ipex + +LR = 0.001 +DOWNLOAD = True +DATA = "datasets/cifar10/" + +transform = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize((224, 224)), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] +) +train_dataset = torchvision.datasets.CIFAR10( 
+ root=DATA, + train=True, + transform=transform, + download=DOWNLOAD, +) +train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128) + +model = torchvision.models.resnet50() +criterion = torch.nn.CrossEntropyLoss() +optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9) +model.train() + +model, optimizer = ipex.optimize(model, optimizer=optimizer) +# Uncomment the code below to enable beta feature `torch.compile` +# model = torch.compile(model, backend="ipex") + +for batch_idx, (data, target) in enumerate(train_loader): + optimizer.zero_grad() + output = model(data) + loss = criterion(output, target) + loss.backward() + optimizer.step() + print(batch_idx) + +torch.save( + { + "model_state_dict": model.state_dict(), + "optimizer_state_dict": optimizer.state_dict(), + }, + "checkpoint.pth", +) + +print("Execution finished") diff --git a/scripts/build_doc.sh b/scripts/build_doc.sh index 8bc92cdba..a34293801 100644 --- a/scripts/build_doc.sh +++ b/scripts/build_doc.sh @@ -173,23 +173,25 @@ parse_example() { cp ${MDEXAMPLE} tutorials/examples.md.bk if [[ ${DEVICE} == "cpu" ]]; then - parse_example "../examples/cpu/training/python-scripts/distributed_data_parallel_training.py" ${MDEXAMPLE} "(marker_train_ddp_complete)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_fp32)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_fp32)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_bf16)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_bf16)" "python" + parse_example "../examples/cpu/training/single_instance_training_fp32.py" ${MDEXAMPLE} "(marker_train_single_fp32_complete)" "python" + parse_example "../examples/cpu/training/single_instance_training_bf16.py" ${MDEXAMPLE} "(marker_train_single_bf16_complete)" "python" + parse_example "../examples/cpu/training/distributed_data_parallel_training.py" ${MDEXAMPLE} 
"(marker_train_ddp_complete)" "python" + parse_example "../examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" + parse_example "../examples/cpu/inference/python/bert_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" + parse_example "../examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" + parse_example "../examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" + parse_example "../examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_fp32)" "python" + parse_example "../examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_fp32)" "python" + parse_example "../examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" + parse_example "../examples/cpu/inference/python/bert_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" + parse_example "../examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" + parse_example "../examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" + parse_example "../examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_bf16)" "python" + parse_example "../examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_bf16)" "python" parse_example "../examples/cpu/features/fast_bert/fast_bert_inference_bf16.py" ${MDEXAMPLE} "(marker_feature_fastbert_bf16)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py" ${MDEXAMPLE} "(marker_int8_dynamic)" "python" - parse_example "../examples/cpu/inference/python/python-scripts/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" + parse_example "../examples/cpu/inference/python/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" + parse_example "../examples/cpu/inference/python/int8_quantization_dynamic.py" ${MDEXAMPLE} "(marker_int8_dynamic)" "python" + parse_example "../examples/cpu/inference/python/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" parse_example "../examples/cpu/features/llm/llm_optimize.py" ${MDEXAMPLE} "(marker_llm_optimize)" "python" parse_example "../examples/cpu/features/llm/llm_optimize_smoothquant.py" ${MDEXAMPLE} "(marker_llm_optimize_sq)" "python" parse_example "../examples/cpu/features/llm/llm_optimize_woq.py" ${MDEXAMPLE} "(marker_llm_optimize_woq)" "python" @@ -205,24 +207,24 @@ if [[ ${DEVICE} == "cpu" ]]; then parse_example "../examples/cpu/features/graph_optimization/int8.py" tutorials/features/graph_optimization.md "(marker_feature_graph_optimization_int8)" "python" parse_example "../examples/cpu/features/graph_optimization/folding.py" tutorials/features/graph_optimization.md "(marker_feature_graph_optimization_folding)" "python" elif [[ ${DEVICE} == "gpu" ]]; then - parse_example "../examples/gpu/training/python-scripts/single_instance_training_fp32.py" ${MDEXAMPLE} 
"(marker_train_single_fp32_complete)" "python" - parse_example "../examples/gpu/training/python-scripts/single_instance_training_bf16.py" ${MDEXAMPLE} "(marker_train_single_bf16_complete)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp16)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp16)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp16)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp16)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp32_alt.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32_alt)" "python" - # parse_example "../examples/gpu/inference/python/python-scripts/int8_calibration_static_imperative.py" ${MDEXAMPLE} "(marker_int8_static_imperative)" "python" - parse_example "../examples/gpu/inference/python/python-scripts/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" - # parse_example "../examples/gpu/inference/python/python-scripts/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" + parse_example "../examples/gpu/training/single_instance_training_fp32.py" ${MDEXAMPLE} "(marker_train_single_fp32_complete)" "python" + parse_example "../examples/gpu/training/single_instance_training_bf16.py" ${MDEXAMPLE} "(marker_train_single_bf16_complete)" "python" + parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" + parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" + parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" + parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" + parse_example 
"../examples/gpu/inference/python/resnet50_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" + parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" + parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" + parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" + parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp16)" "python" + parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp16)" "python" + parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp16)" "python" + parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp16)" "python" + parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp32_alt.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32_alt)" "python" + # parse_example "../examples/gpu/inference/python/int8_calibration_static_imperative.py" ${MDEXAMPLE} "(marker_int8_static_imperative)" "python" + parse_example "../examples/gpu/inference/python/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" + # parse_example "../examples/gpu/inference/python/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" parse_example "../examples/gpu/inference/cpp/example-app/example-app.cpp" ${MDEXAMPLE} "(marker_cppsdk_sample_app)" "cpp" parse_example "../examples/gpu/inference/cpp/example-app/CMakeLists.txt" ${MDEXAMPLE} "(marker_cppsdk_cmake_app)" "cmake" parse_example "../examples/gpu/inference/cpp/example-usm/example-usm.cpp" ${MDEXAMPLE} "(marker_cppsdk_sample_usm)" "cpp" From 27f99744ec584fbdb01f7b0e43242cac94689efc Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Tue, 4 Jun 2024 14:53:20 +0900 Subject: [PATCH 104/199] Jingxu10/example restructure main (#2958) * restruct example directories * add jupyter notebook of IntelPytorch Inference AMX BF16 and INT8 * mv 2 examples from onesample (#2787) * mv 2 examples from onesample * fix license format * add jupyter notebook readme * move oneAPI IPEX inference sample optimize (#2798) * clear output of notebook * Update example. 
Add example 'complete flag' * update readme, remove aikit and refer ipex installation guide * remove installation part in jupyter notebook * remove installation part in jupyter notebook and add kernel select * each sample use conda env seperately * Update cpu example jupyter nootbook README * rm install jupyter and refer to readme, fix table format * Create IPEX_Getting_Started.ipynb * Create IntelPytorch_Quantization.ipynb * remove training examples * fix lint issues --------- Co-authored-by: Zheng, Zhaoqiong Co-authored-by: Neo Zhang Jianyu Co-authored-by: xiguiw <111278656+xiguiw@users.noreply.github.com> Co-authored-by: Wang, Xigui Co-authored-by: yqiu-intel <113460727+YuningQiu@users.noreply.github.com> --- docs/tutorials/examples.md | 45 - .../python/jupyter-notebooks/.gitkeep | 0 .../IPEX_Getting_Started.ipynb | 367 ++++++++ ...InferenceOptimizations_AMX_BF16_INT8.ipynb | 589 ++++++++++++ ...ytorch_Interactive_Chat_Quantization.ipynb | 521 ++++++++++ .../IntelPytorch_Quantization.ipynb | 347 +++++++ .../python/jupyter-notebooks/README.md | 61 ++ .../optimize_pytorch_models_with_ipex.ipynb | 888 ++++++++++++++++++ .../python/pytorch_inference_vnni.py | 183 ++++ .../jupyter-notebooks/python/resnet50.py | 67 ++ .../python/{ => python-scripts}/README.md | 2 +- .../bert_eager_mode_inference_bf16.py | 0 .../bert_eager_mode_inference_fp32.py | 0 .../bert_general_inference_script.py | 0 .../bert_torchdynamo_mode_inference_bf16.py | 0 .../bert_torchdynamo_mode_inference_fp32.py | 0 .../bert_torchscript_mode_inference_bf16.py | 0 .../bert_torchscript_mode_inference_fp32.py | 0 .../{ => python-scripts}/int8_deployment.py | 0 .../int8_quantization_dynamic.py | 0 .../int8_quantization_static.py | 0 .../resnet50_eager_mode_inference_bf16.py | 0 .../resnet50_eager_mode_inference_fp32.py | 0 .../resnet50_general_inference_script.py | 0 ...esnet50_torchdynamo_mode_inference_bf16.py | 0 ...esnet50_torchdynamo_mode_inference_fp32.py | 0 ...esnet50_torchscript_mode_inference_bf16.py | 0 ...esnet50_torchscript_mode_inference_fp32.py | 0 .../training/{ => python-scripts}/README.md | 18 +- .../distributed_data_parallel_training.py | 0 .../training/single_instance_training_bf16.py | 51 - .../training/single_instance_training_fp32.py | 49 - scripts/build_doc.sh | 70 +- 33 files changed, 3059 insertions(+), 199 deletions(-) create mode 100644 examples/cpu/inference/python/jupyter-notebooks/.gitkeep create mode 100644 examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/README.md create mode 100644 examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb create mode 100644 examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py create mode 100644 examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py rename examples/cpu/inference/python/{ => python-scripts}/README.md (97%) rename examples/cpu/inference/python/{ => python-scripts}/bert_eager_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_eager_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => 
python-scripts}/bert_general_inference_script.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_torchdynamo_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_torchdynamo_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_torchscript_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/bert_torchscript_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/int8_deployment.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/int8_quantization_dynamic.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/int8_quantization_static.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_eager_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_eager_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_general_inference_script.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_torchdynamo_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_torchdynamo_mode_inference_fp32.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_torchscript_mode_inference_bf16.py (100%) rename examples/cpu/inference/python/{ => python-scripts}/resnet50_torchscript_mode_inference_fp32.py (100%) rename examples/cpu/training/{ => python-scripts}/README.md (74%) rename examples/cpu/training/{ => python-scripts}/distributed_data_parallel_training.py (100%) delete mode 100644 examples/cpu/training/single_instance_training_bf16.py delete mode 100644 examples/cpu/training/single_instance_training_fp32.py diff --git a/docs/tutorials/examples.md b/docs/tutorials/examples.md index f90505a2d..3c9eacaa6 100644 --- a/docs/tutorials/examples.md +++ b/docs/tutorials/examples.md @@ -25,51 +25,6 @@ Before running these examples, please note the following: ### Training -#### Single-instance Training - -To use Intelยฎ Extension for PyTorch\* on training, you need to make the following changes in your code: - -1. Import `intel_extension_for_pytorch` as `ipex`. -2. Invoke the `ipex.optimize` function to apply optimizations against the model and optimizer objects, as shown below: - - -```python -... -import torch -import intel_extension_for_pytorch as ipex -... -model = Model() -criterion = ... -optimizer = ... -model.train() -# For Float32 -model, optimizer = ipex.optimize(model, optimizer=optimizer) -# For BFloat16 -model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16) -# Invoke the code below to enable beta feature torch.compile -model = torch.compile(model, backend="ipex") -... -optimizer.zero_grad() -output = model(data) -... -``` - -Below you can find complete code examples demonstrating how to use the extension on training for different data types: - -##### Float32 - -**Note:** You need to install `torchvision` Python package to run the following example. - -[//]: # (marker_train_single_fp32_complete) -[//]: # (marker_train_single_fp32_complete) - -##### BFloat16 - -**Note:** You need to install `torchvision` Python package to run the following example. 
- -[//]: # (marker_train_single_bf16_complete) -[//]: # (marker_train_single_bf16_complete) - #### Distributed Training Distributed training with PyTorch DDP is accelerated by oneAPI Collective Communications Library Bindings for Pytorch\* (oneCCL Bindings for Pytorch\*). The extension supports FP32 and BF16 data types. More detailed information and examples are available at the [Github repo](https://github.com/intel/torch-ccl). diff --git a/examples/cpu/inference/python/jupyter-notebooks/.gitkeep b/examples/cpu/inference/python/jupyter-notebooks/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb new file mode 100644 index 000000000..10c934360 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/IPEX_Getting_Started.ipynb @@ -0,0 +1,367 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting Started with Intelยฎ Extension for PyTorch (IPEX)\n", + "This code sample will guide users on how to run a PyTorch inference workload on CPU by using the oneAPI AI Analytics Toolkit and also analyze the CPU usage via oneDNN verbose logs." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resnet50 Inference on CPU\n", + "***\n", + "This section shows users how to run resnet50 inference on CPU." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prerequisites" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ignore all warning messages\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import os" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the installation path of your oneAPI AI Analytics toolkit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%env ONEAPI_INSTALL=/opt/intel/oneapi" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download the resnet50 inference sample from the Intelยฎ Extension for PyTorch (IPEX) GitHub repository" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/intel/intel-extension-for-pytorch/master/examples/cpu/inference/python/resnet50_general_inference_script.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the PyTorch and Intelยฎ Extension for PyTorch (IPEX) versions in the current IPython kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run ../../version_check.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run resnet50 on CPU" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Run on CPU via Intelยฎ Extension for PyTorch (IPEX)\n", + "There is a PyTorch conda environment with Intelยฎ Extension for PyTorch (IPEX) installed in the current AI Kit installation.\n", + "Users can run resnet50_general_inference_script.py on an Intel CPU in this PyTorch conda environment."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile run.sh\n", + "#!/bin/bash\n", + "source $ONEAPI_INSTALL/setvars.sh --force > /dev/null 2>&1\n", + "source activate pytorch\n", + "echo \"########## Executing the run\"\n", + "DNNL_VERBOSE=1 python resnet50_general_inference_script.py > infer_rn50_cpu.csv\n", + "echo \"########## Done with the run\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Submitting build.sh and run.sh to the job queue\n", + "\n", + "Now we can submit build.sh and run.sh to the job queue.\n", + "\n", + "NOTE - it is possible to execute any of the build and run commands in local environments.\n", + "To enable users to run their scripts either on the Intel DevCloud or in local environments, this and subsequent training checks for the existence of the job submission command qsub. If the check fails, it is assumed that build/run will be local." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! chmod 755 ../../q; chmod 755 run.sh;if [ -x \"$(command -v qsub)\" ]; then ../../q run.sh; else ./run.sh; fi" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Analyze Verbose Logs\n", + "***\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download profile_utils.py to parse oneDNN verbose logs from previous section." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!wget https://raw.githubusercontent.com/oneapi-src/oneAPI-samples/master/Libraries/oneDNN/tutorials/profiling/profile_utils.py" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: List out all oneDNN verbose logs\n", + "users should see the verbose log listed in the table below.\n", + "\n", + "|Log File Name | Description |\n", + "|:-----|:----|\n", + "|infer_rn50_cpu.csv| log for cpu run |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "filenames= os.listdir (\".\") \n", + "result = []\n", + "keyword = \".csv\"\n", + "for filename in filenames: \n", + " #if os.path.isdir(os.path.join(os.path.abspath(\".\"), filename)): \n", + " if filename.find(keyword) != -1:\n", + " result.append(filename)\n", + "result.sort()\n", + "\n", + "index =0 \n", + "for folder in result:\n", + " print(\" %d : %s \" %(index, folder))\n", + " index+=1" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Pick a verbose log by putting its index value below\n", + "Users can pick cpu log for analysis. \n", + "Once users finish Step 2 to Step 7 for one log file, they can go back to step 2 and select another log file for analysis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FdIndex=0" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Parse verbose log and get the data back\n", + "> Users will also get a oneDNN.json file with timeline information for oneDNN primitives. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "logfile = result[FdIndex]\n", + "print(logfile)\n", + "from profile_utils import oneDNNUtils, oneDNNLog\n", + "onednn = oneDNNUtils()\n", + "log1 = oneDNNLog()\n", + "log1.load_log(logfile)\n", + "data = log1.data\n", + "exec_data = log1.exec_data" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: Time breakdown for exec type\n", + "The exec type includes exec and create. \n", + "\n", + "|exec type | Description | \n", + "|:-----|:----| \n", + "|exec | Time for primitives exection. Better to spend most of time on primitives execution. | \n", + "|create| Time for primitives creation. Primitives creation happens once. Better to spend less time on primitive creation. | " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5: Time breakdown for architecture type\n", + "The supported architecture only includes CPU. \n", + "so users should see 100% CPU time. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onednn.breakdown(exec_data,\"arch\",\"time\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 6: Time breakdown for primitives type\n", + "The primitives type includes convolution, reorder, sum, etc. \n", + "For this simple convolution net example, convolution and inner product primitives are expected to spend most of time. \n", + "However, the exact time percentage of different primitivies may vary among different architectures. \n", + "Users can easily identify top hotpots of primitives executions with this time breakdown. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onednn.breakdown(exec_data,\"type\",\"time\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 7: Time breakdown for JIT kernel type\n", + "oneDNN uses just-in-time compilation (JIT) to generate optimal code for some functions based on input parameters and instruction set supported by the system. \n", + "Therefore, users can see different JIT kernel type among different CPU architectures. \n", + "For example, users can see avx_core_vnni JIT kernel if the workload uses VNNI instruction on Cascake Lake platform. \n", + "Moreover, users can identify the top hotspots of JIT kernel executions with this time breakdown. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "onednn.breakdown(exec_data,\"jit\",\"time\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output(both stdout and stderr) is displayed on the command line console" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py39", + "language": "python", + "name": "py39" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb new file mode 100644 index 000000000..c4bca3199 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPyTorch_InferenceOptimizations_AMX_BF16_INT8.ipynb @@ -0,0 +1,589 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "75f9200e-7830-4ee5-8637-e67b5df57eac", + "metadata": {}, + "source": [ + "# PyTorch Inference Optimizations with Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) Bfloat16 Integer8" + ] + }, + { + "cell_type": "markdown", + "id": "48eb565f-ef03-40cb-9182-5b2b752331e8", + "metadata": {}, + "source": [ + "The `PyTorch* Inference Optimizations with Advanced Matrix Extensions Bfloat16 Integer8` sample demonstrates how to perform inference using the ResNet50 and BERT models using the Intelยฎ Extension for PyTorch (IPEX).\n", + "\n", + "The Intelยฎ Extension for PyTorch (IPEX) extends PyTorch* with optimizations for extra performance boost on Intelยฎ hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intelยฎ hardware. For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX).\n", + "\n", + "| Area | Description\n", + "|:--- |:---\n", + "| What you will learn | Inference performance improvements using Intelยฎ Extension for PyTorch (IPEX) with Intelยฎ AMX BF16/INT8\n", + "| Time to complete | 5 minutes\n", + "| Category | Code Optimization\n", + "\n", + "## Purpose\n", + "\n", + "The Intelยฎ Extension for PyTorch (IPEX) allows you to speed up inference on Intelยฎ Xeon Scalable processors with lower precision data formats and specialized computer instructions. The bfloat16 (BF16) data format uses half the bit width of floating-point-32 (FP32), which lessens the amount of memory needed and execution time to process. Likewise, the integer8 (INT8) data format uses half the bit width of BF16. 
You should notice performance optimization with the Intelยฎ AMX instruction set when compared to Intelยฎ Vector Neural Network Instructions (Intelยฎ VNNI).\n", + "\n", + "## Prerequisites\n", + "\n", + "| Optimized for | Description\n", + "|:--- |:---\n", + "| OS | Ubuntu* 18.04 or newer\n", + "| Hardware | 4th Gen Intelยฎ Xeonยฎ Scalable Processors or newer\n", + "| Software | Intelยฎ Extension for PyTorch (IPEX)\n", + "\n", + "## Key Implementation Details\n", + "\n", + "This code sample will perform inference on the ResNet50 and BERT models while using Intelยฎ Extension for PyTorch (IPEX). For each pretrained model, there will be a warm up of 20 samples before running inference on the specified number of samples (i.e. 1000) to record the time. Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX) is supported on BF16 and INT8 data types starting with the 4th Generation of Xeon Scalable Processors. The inference time will be compared, showcasing the speedup over FP32 when using AVX-512, Intelยฎ AMX, BF16, and INT8. The following run cases are executed: \n", + "\n", + "1. FP32 (baseline)\n", + "2. BF16 using AVX512_CORE_AMX\n", + "3. INT8 using AVX512_CORE_VNNI\n", + "4. INT8 using AVX512_CORE_AMX\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "4c254afc", + "metadata": {}, + "source": [ + "## Installation of required packages\n", + "\n", + "Ensure the kernel is set to Pytorch-CPU before running the following code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa457cee-5b1e-4ec9-b03a-2a7b2a8b464e", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install matplotlib transformers py-cpuinfo sentencepiece sacremoses " + ] + }, + { + "cell_type": "markdown", + "id": "4e41ce52-c94c-4bdf-a528-0e0200fd5501", + "metadata": {}, + "source": [ + "## Imports, Constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e4eedf0-5c7c-49d3-be15-f46b4988d9ff", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from time import time\n", + "import matplotlib.pyplot as plt\n", + "import torch\n", + "import intel_extension_for_pytorch as ipex\n", + "from intel_extension_for_pytorch.quantization import prepare, convert\n", + "import torchvision\n", + "from torchvision import models\n", + "from transformers import BertModel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17246f67-0059-4b5f-afe8-a105d767b139", + "metadata": {}, + "outputs": [], + "source": [ + "SUPPORTED_MODELS = [\"resnet50\", \"bert\"] # models supported by this code sample\n", + "\n", + "# ResNet sample data parameters\n", + "RESNET_BATCH_SIZE = 64\n", + "\n", + "# BERT sample data parameters\n", + "BERT_BATCH_SIZE = 64\n", + "BERT_SEQ_LENGTH = 512" + ] + }, + { + "cell_type": "markdown", + "id": "9771f165", + "metadata": {}, + "source": [ + "## Identify Supported ISA \n", + "We identify the underlying supported ISA to determine whether Intelยฎ AMX is supported. The 4th Gen Intelยฎ Xeonยฎ Scalable Processor (codenamed Sapphire Rapids) or newer must be used to run this sample. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25c339a4", + "metadata": {}, + "outputs": [], + "source": [ + "# Check if hardware supports Intelยฎ AMX\n", + "import sys\n", + "sys.path.append('../../')\n", + "from cpuinfo import get_cpu_info\n", + "info = get_cpu_info()\n", + "flags = info['flags']\n", + "amx_supported = False\n", + "for flag in flags:\n", + " if \"amx\" in flag:\n", + " amx_supported = True\n", + " break\n", + "if not amx_supported:\n", + " print(\"Intelยฎ AMX is not supported on current hardware. Code sample cannot be run.\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "7b3f461d", + "metadata": {}, + "source": [ + "If the message \"Intelยฎ AMX is not supported on current hardware. Code sample cannot be run.\" is printed above, the hardware being used does not support Intelยฎ AMX. Therefore, this code sample cannot proceed." + ] + }, + { + "cell_type": "markdown", + "id": "6ccd66ee-aac5-4a60-8f66-417612d4d3af", + "metadata": {}, + "source": [ + "## Running Inference\n", + "The function runInference() will perform inference on the selected model, precision, and whether Intelยฎ AMX is to be enabled. The environment variable `ONEDNN_MAX_CPU_ISA` is used to enable or disable Intelยฎ AMX. **Note that this environment variable is only initialized once.** This means to run with Intelยฎ AMX and Intelยฎ VNNI, there will need to be separate processes. The best practice is to set this environment variable before running your script. For more information, refer to the [oneDNN documentation on CPU Dispatcher Control](https://www.intel.com/content/www/us/en/develop/documentation/onednn-developer-guide-and-reference/top/performance-profiling-and-inspection/cpu-dispatcher-control.html). \n", + "\n", + "To use BF16 in operations, use the `torch.cpu.amp.autocast()` function to perform forward pass. For INT8, the quantization feature from Intelยฎ Extension for PyTorch (IPEX) is used to quantize the FP32 model to INT8 before running inference.\n", + "\n", + "Torchscript is also utilized to deploy the model in graph mode instead of imperative mode for faster runtime." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f08d718", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"ONEDNN_MAX_CPU_ISA\"] = \"AVX512_CORE_AMX\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b8e21c9-aaa5-4f75-b00a-0d875cc0bfba", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Function to perform inference on Resnet50 and BERT\n", + "\"\"\"\n", + "def runInference(model, data, modelName=\"resnet50\", dataType=\"FP32\", amx=True):\n", + " \"\"\"\n", + " Input parameters\n", + " model: the PyTorch model object used for inference\n", + " data: a sample input into the model\n", + " modelName: str representing the name of the model, supported values - resnet50, bert\n", + " dataType: str representing the data type for model parameters, supported values - FP32, BF16, INT8\n", + " amx: set to False to disable Intelยฎ AMX on BF16, Default: True\n", + " Return value\n", + " inference_time: the time in seconds it takes to perform inference with the model\n", + " \"\"\"\n", + " \n", + " # Display run case\n", + " if amx:\n", + " isa_text = \"AVX512_CORE_AMX\"\n", + " else:\n", + " isa_text = \"AVX512_CORE_VNNI\"\n", + " print(\"%s %s inference with %s\" %(modelName, dataType, isa_text))\n", + "\n", + " # Special variables for specific models\n", + " batch_size = None\n", + " if \"resnet50\" == modelName:\n", + " batch_size = RESNET_BATCH_SIZE\n", + " elif \"bert\" == modelName:\n", + " d = torch.randint(model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH]) # sample data input for torchscript and inference\n", + " batch_size = BERT_BATCH_SIZE\n", + " else:\n", + " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", + "\n", + " # Prepare model for inference based on precision (FP32, BF16, INT8)\n", + " if \"INT8\" == dataType:\n", + " # Quantize model to INT8 if needed (one time)\n", + " model_filename = \"quantized_model_%s.pt\" %modelName\n", + " if not os.path.exists(model_filename):\n", + " qconfig = ipex.quantization.default_static_qconfig\n", + " prepared_model = prepare(model, qconfig, example_inputs=data, inplace=False)\n", + " converted_model = convert(prepared_model)\n", + " with torch.no_grad():\n", + " if \"resnet50\" == modelName:\n", + " traced_model = torch.jit.trace(converted_model, data)\n", + " elif \"bert\" == modelName:\n", + " traced_model = torch.jit.trace(converted_model, (d,), check_trace=False, strict=False)\n", + " else:\n", + " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", + " traced_model = torch.jit.freeze(traced_model)\n", + " traced_model.save(model_filename)\n", + "\n", + " # Load INT8 model for inference\n", + " model = torch.jit.load(model_filename)\n", + " model.eval()\n", + " model = torch.jit.freeze(model)\n", + " elif \"BF16\" == dataType:\n", + " model = ipex.optimize(model, dtype=torch.bfloat16)\n", + " with torch.no_grad():\n", + " with torch.cpu.amp.autocast():\n", + " if \"resnet50\" == modelName:\n", + " model = torch.jit.trace(model, data)\n", + " elif \"bert\" == modelName:\n", + " model = torch.jit.trace(model, (d,), check_trace=False, strict=False)\n", + " else:\n", + " raise Exception(\"ERROR: modelName %s is not supported. 
Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", + " model = torch.jit.freeze(model)\n", + " else: # FP32\n", + " with torch.no_grad():\n", + " if \"resnet50\" == modelName:\n", + " model = torch.jit.trace(model, data)\n", + " elif \"bert\" == modelName:\n", + " model = torch.jit.trace(model, (d,), check_trace=False, strict=False)\n", + " else:\n", + " raise Exception(\"ERROR: modelName %s is not supported. Choose from %s\" %(modelName, SUPPORTED_MODELS))\n", + " model = torch.jit.freeze(model)\n", + "\n", + " # Run inference\n", + " with torch.no_grad():\n", + " if \"BF16\" == dataType:\n", + " with torch.cpu.amp.autocast():\n", + " # Warm up\n", + " for i in range(5):\n", + " model(data)\n", + " \n", + " # Measure latency\n", + " start_time = time()\n", + " model(data)\n", + " end_time = time()\n", + " else:\n", + " # Warm up\n", + " for i in range(5):\n", + " model(data)\n", + " \n", + " # Measure latency\n", + " start_time = time()\n", + " model(data)\n", + " end_time = time()\n", + " inference_time = end_time - start_time\n", + " print(\"Inference on batch size %d took %.3f seconds\" %(batch_size, inference_time))\n", + "\n", + " return inference_time" + ] + }, + { + "cell_type": "markdown", + "id": "1dad2dae", + "metadata": {}, + "source": [ + "The function summarizeResults() displays the inference times and generates one graph for comparing the inference times and another graph for comparing the speedup using FP32 as the baseline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cf736a2", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Prints out results and displays figures summarizing output.\n", + "\"\"\"\n", + "def summarizeResults(modelName=\"\", results=None, batch_size=1):\n", + " \"\"\"\n", + " Input parameters\n", + " modelName: a str representing the name of the model\n", + " results: a dict with the run case and its corresponding time in seconds\n", + " batch_size: an integer for the batch size\n", + " Return value\n", + " None\n", + " \"\"\"\n", + "\n", + " # Inference time results\n", + " print(\"\\nSummary for %s (Batch Size = %d)\" %(modelName, batch_size))\n", + " for key in results.keys():\n", + " print(\"%s inference time: %.3f seconds\" %(key, results[key]))\n", + "\n", + " # Create bar chart with inference time results\n", + " plt.figure()\n", + " plt.title(\"%s Inference Time (Batch Size = %d)\" %(modelName, batch_size))\n", + " plt.xlabel(\"Run Case\")\n", + " plt.ylabel(\"Inference Time (seconds)\")\n", + " plt.bar(results.keys(), results.values())\n", + "\n", + " # Calculate speedup when using Intelยฎ AMX\n", + " print(\"\\n\")\n", + " bf16_with_amx_speedup = results[\"FP32\"] / results[\"BF16_with_AMX\"]\n", + " print(\"BF16 with Intelยฎ AMX is %.2fX faster than FP32\" %bf16_with_amx_speedup)\n", + " int8_with_vnni_speedup = results[\"FP32\"] / results[\"INT8_with_VNNI\"]\n", + " print(\"INT8 without Intelยฎ AMX is %.2fX faster than FP32\" %int8_with_vnni_speedup)\n", + " int8_with_amx_speedup = results[\"FP32\"] / results[\"INT8_with_AMX\"]\n", + " print(\"INT8 with Intelยฎ AMX is %.2fX faster than FP32\" %int8_with_amx_speedup)\n", + " print(\"\\n\\n\")\n", + "\n", + " # Create bar chart with speedup results\n", + " plt.figure()\n", + " plt.title(\"%s Intelยฎ AMX BF16/INT8 Speedup over FP32\" %modelName)\n", + " plt.xlabel(\"Run Case\")\n", + " plt.ylabel(\"Speedup\")\n", + " plt.bar(results.keys(), \n", + " [1, bf16_with_amx_speedup, int8_with_vnni_speedup, int8_with_amx_speedup]\n", + " )" + ] + }, + { + 
"cell_type": "markdown", + "id": "9e42672a", + "metadata": {}, + "source": [ + "### VNNI: ResNet50 and BERT\n", + "Since ONEDNN_MAX_CPU_ISA is initialized ONCE when a workload is being run, another process must be used to run with a different setting. \n", + "In other words, changing ONEDNN_MAX_CPU_ISA during runtime in the same process will not have any effect.\n", + "Thus, to run with VNNI, a separate script is run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "555ec5a9", + "metadata": {}, + "outputs": [], + "source": [ + "!python python/pytorch_inference_vnni.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d194fa7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Record the inference times for INT8 using AVX-512\n", + "int8_with_vnni_resnet_inference_time = 0.033 #TODO: enter in inference time\n", + "int8_with_vnni_bert_inference_time = 0.691 #TODO: enter in inference time" + ] + }, + { + "cell_type": "markdown", + "id": "c61288e7", + "metadata": {}, + "source": [ + "### : ResNet50" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4a6a84c", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up ResNet50 model and sample data\n", + "resnet_model = models.resnet50(pretrained=True)\n", + "resnet_data = torch.rand(RESNET_BATCH_SIZE, 3, 224, 224)\n", + "resnet_model.eval()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b26789b9", + "metadata": {}, + "outputs": [], + "source": [ + "# FP32 (baseline)\n", + "fp32_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"FP32\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ad0c512", + "metadata": {}, + "outputs": [], + "source": [ + "# BF16 using Intelยฎ \n", + "bf16_amx_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"BF16\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cd9f1bd", + "metadata": {}, + "outputs": [], + "source": [ + "# INT8 using Intelยฎ \n", + "int8_amx_resnet_inference_time = runInference(resnet_model, resnet_data, modelName=\"resnet50\", dataType=\"INT8\", amx=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59fcbbe2", + "metadata": {}, + "outputs": [], + "source": [ + "# Summarize and display results\n", + "results_resnet = {\n", + " \"FP32\": fp32_resnet_inference_time,\n", + " \"BF16_with_AMX\": bf16_amx_resnet_inference_time,\n", + " \"INT8_with_VNNI\": int8_with_vnni_resnet_inference_time,\n", + " \"INT8_with_AMX\": int8_amx_resnet_inference_time\n", + " }\n", + "summarizeResults(\"ResNet50\", results_resnet, RESNET_BATCH_SIZE)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "75a62b72", + "metadata": {}, + "source": [ + "The first graph displays the inference times on the specified number of samples. In general, the times should be decreasing from left to right because using lower precision and with accelerates the computations. The second graph displays the relative speedup of each run case compared to that of FP32. In general the speedup should be increasing from left to right." 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b36fa4b3",
+   "metadata": {},
+   "source": [
+    "### BERT"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27f173e7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set up BERT model and sample data\n",
+    "bert_model = BertModel.from_pretrained(\"bert-base-uncased\")\n",
+    "bert_data = torch.randint(bert_model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH])\n",
+    "bert_model.eval()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a5847c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# FP32 (baseline)\n",
+    "fp32_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"FP32\", amx=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d35fc58e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# BF16 using Intel® AMX\n",
+    "bf16_amx_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"BF16\", amx=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b3d2ccd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# INT8 using Intel® AMX\n",
+    "int8_amx_bert_inference_time = runInference(bert_model, bert_data, modelName=\"bert\", dataType=\"INT8\", amx=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3721e698",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Summarize and display results\n",
+    "results_bert = {\n",
+    "    \"FP32\": fp32_bert_inference_time,\n",
+    "    \"BF16_with_AMX\": bf16_amx_bert_inference_time,\n",
+    "    \"INT8_with_VNNI\": int8_with_vnni_bert_inference_time,\n",
+    "    \"INT8_with_AMX\": int8_amx_bert_inference_time\n",
+    "    }\n",
+    "summarizeResults(\"BERT\", results_bert, BERT_BATCH_SIZE)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03e63f93",
+   "metadata": {},
+   "source": [
+    "The first graph displays the inference times on the specified number of samples. In general, the times should be decreasing from left to right because using lower precision with Intel® AMX accelerates the computations. The second graph displays the relative speedup of each run case compared to that of FP32. In general, the speedup should be increasing from left to right."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b559aeb8",
+   "metadata": {},
+   "source": [
+    "## Conclusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0da073a6",
+   "metadata": {},
+   "source": [
+    "This code sample shows how to enable and disable Intel® AMX during runtime, as well as the performance improvements from using BF16 and INT8 for inference on the ResNet50 and BERT models. Performance will vary based on your hardware and software versions. To see a larger performance gap between VNNI and Intel® AMX, increase the batch size. For even more speedup, consider using the Intel® Extension for PyTorch (IPEX) [Launch Script](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/launch_script.html). 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa0877d6-e045-4091-b5e4-4dfcb6d04f7d", + "metadata": {}, + "outputs": [], + "source": [ + "print('[CODE_SAMPLE_COMPLETED_SUCCESSFULLY]')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "vscode": { + "interpreter": { + "hash": "ed6ae0d06e7bec0fef5f1fb38f177ceea45508ce95c68ed2f49461dd6a888a39" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb new file mode 100644 index 000000000..7e3f0a889 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Interactive_Chat_Quantization.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5b88f590-e457-4052-9dbd-74d7be597dc1", + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================\n", + "# Copyright ยฉ 2023 Intel Corporation\n", + "# \n", + "# SPDX-License-Identifier: MIT\n", + "# =============================================================" + ] + }, + { + "cell_type": "markdown", + "id": "6f25b97a-56f7-4309-87fa-d9626baecf5e", + "metadata": {}, + "source": [ + "# Interactive Chat Based on DialoGPT Model Using Intelยฎ Extension for PyTorch* Quantization\n", + "\n", + "This code sample shows usage of DiloGPT model as interactive chat with Intelยฎ Extension for PyTorch* INT8 quantization.\n", + "\n", + "## DialoGPT\n", + "\n", + "DialoGPT is a model based on GPT-2 architecture proposed by Microsoft in 2019. It's goal was to create open-domain chatbots capable of producing natural responses to a variety of conversational topics." + ] + }, + { + "cell_type": "markdown", + "id": "f7c87090-2f40-4c29-b70b-c9d413bd3bff", + "metadata": {}, + "source": [ + "The `Interactive chat based on DialoGPT model using Intelยฎ Extension for PyTorch* Quantization` sample demonstrates how to create interactive chat based on pre-trained DialoGPT model and add the Intelยฎ Extension for PyTorch* quantization to it.\n", + "\n", + "| Area | Description|\n", + "|-----------------------|------------|\n", + "| What you will learn | How to create interactive chat and add INT8 dynamic quantization form Intelยฎ Extension for PyTorch*|\n", + "| Time to complete | 10 minutes|\n", + "| Category | Concepts and Functionality|\n", + "\n", + "The Intelยฎ Extension for PyTorch* extends PyTorch* with optimizations for extra performance boost on Intelยฎ hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intelยฎ hardware. For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX).\n", + "\n", + "## Purpose\n", + "\n", + "This sample shows how to create interactive chat based on the pre-trained DialoGPT model from HuggingFace and how to add INT8 dynamic quantization to it. 
The Intelยฎ Extension for PyTorch* gives users the ability to speed up operations on processors with INT8 data format and specialized computer instructions. The INT8 data format uses quarter the bit width of floating-point-32 (FP32), lowering the amount of memory needed and execution time to process with minimum to zero accuracy loss.\n", + "\n", + "## Prerequisites\n", + "\n", + "| Optimized for | Description|\n", + "|-------------------------|------------|\n", + "| OS | Ubuntu* 20.04 or newer|\n", + "| Hardware | Intelยฎ Xeonยฎ Scalable Processor family|\n", + "| Software | Intelยฎ Extension for PyTorch*|" + ] + }, + { + "cell_type": "markdown", + "id": "0174e7dd-58ae-47ea-8f11-fa3d1ee8c317", + "metadata": {}, + "source": [ + "## Environment Setup" + ] + }, + { + "cell_type": "markdown", + "id": "cc24bdae-fcb7-40a5-8bb8-76472598730b", + "metadata": {}, + "source": [ + "### Install Jupyter notebook by Conda\n", + "\n", + "Please refer to the guide in README.md to setup running environment:\n", + "\n", + "1. Create Conda running environment.\n", + "2. Install Jupyter notebook.\n", + "3. Install Intelยฎ Extension for PyTorch* for CPU packages.\n", + "4. Startup Jupyter notebook service and open by web browser.\n", + "\n", + "\n", + "#### Set Kernel to PyTorch-CPU\n", + "\n", + "In Jupyter notebook menu, change kernel \"PyTorch-CPU\" by Kernel->Change Kernel." + ] + }, + { + "cell_type": "markdown", + "id": "d2d9847c-2d72-4dfc-a4a9-b87c987ff363", + "metadata": {}, + "source": [ + "### Install other python packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9e7f549-65dd-4286-9e04-8d13a766c0e3", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install transformers matplotlib" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8c1ae1f1-4878-4dc6-bbb3-5a0f17fbbd00", + "metadata": {}, + "source": [ + "Let's start with importing all necessary packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45146b66-e41e-400e-8a1b-5e680bbb7575", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "import torch\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "id": "2e158dd4-e2a7-44ca-af2d-052f88247e97", + "metadata": {}, + "source": [ + "## Model and tokenizer loading\n", + "\n", + "The first implemented function is loading tokenizer and model. \n", + "\n", + "Function input is link to the pre-trained model. In this sample we are using `microsoft/DialoGPT-large` from HuggingFace. This is also default parameter for this function. Of course, you can use also `microsoft/DialoGPT-medium` or `microsoft/DialoGPT-samll` models. Especially if you have limited resources. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6142753-eab1-4167-9818-4b40c900473c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def load_tokenizer_and_model(model=\"microsoft/DialoGPT-large\"):\n", + " \"\"\"\n", + " Load tokenizer and model instance for some specific DialoGPT model.\n", + " \"\"\"\n", + " # Initialize tokenizer and model\n", + " print(\"Loading model...\")\n", + " tokenizer = AutoTokenizer.from_pretrained(model, padding_side='left')\n", + " model = AutoModelForCausalLM.from_pretrained(model)\n", + " \n", + " # Return tokenizer and model\n", + " return tokenizer, model" + ] + }, + { + "cell_type": "markdown", + "id": "a4e150e6-4976-4998-93be-5f5f9ddcbb5b", + "metadata": { + "tags": [] + }, + "source": [ + "## INT8 Dynamic Quantization\n", + "\n", + "**Quantization** is a systematic reduction of the precision of all or several layers within the model. This means that we turn a higher-precision type, such as the FP32 (32 bits) most commonly used in Deep Learning, into a lower-precision type, such as FP16 (16 bits) or INT8 (8 bits). \n", + "\n", + "With type reduction, it is possible to effectively reduce the size of the model and also faster inference. That means:\n", + "\n", + "* lower memory bandwidth, \n", + "* lower storage, \n", + "* higher performance with minimum to zero accuracy loss. \n", + "\n", + "This is especially important, with large models such as those based on the Transformers architecture, like BERT or used in this sample GPT. \n", + "\n", + "We can distinguish 2 types of quantization:\n", + "\n", + "* static - requires an additional pass over a dataset to work, only activations do calibration,\n", + "* dynamic - multiplies input values by the scale factor, then rounds the result to the nearest, the scale factor for activations is determined dynamically based on the data range observed in runtime.\n", + "\n", + "In this sample we are using **the dynamic quantization**." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cca006fa-6fce-4e5f-81c0-240d12757493",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from intel_extension_for_pytorch.quantization import prepare, convert\n",
+    "import intel_extension_for_pytorch as ipex\n",
+    "\n",
+    "def quantize_model(tokenizer, model):\n",
+    "    \"\"\"\n",
+    "    Add Intel® Extension for PyTorch* dynamic quantization to the model\n",
+    "    \"\"\"\n",
+    "    # Put the model in evaluation mode\n",
+    "    model.eval()\n",
+    "    \n",
+    "    print(\"Quantization in progress...\")\n",
+    "    \n",
+    "    # Prepare example inputs for the model\n",
+    "    question, text = \"What is SYCL?\", \"SYCL is an industry-driven standard, developed by the Khronos Group and announced in March 2014.\"\n",
+    "    inputs = tokenizer(question, text, return_tensors=\"pt\")\n",
+    "    jit_inputs = (inputs['input_ids'],)\n",
+    "    \n",
+    "    # Create configuration for dynamic quantization\n",
+    "    qconfig = ipex.quantization.default_dynamic_qconfig\n",
+    "    \n",
+    "    # Optimize model\n",
+    "    model = ipex.optimize(model)\n",
+    "    \n",
+    "    # Prepare model for quantization using previously prepared parameters\n",
+    "    prepared_model = prepare(model, qconfig, example_inputs=jit_inputs, inplace=False)\n",
+    "    \n",
+    "    # Convert types in model\n",
+    "    converted_model = convert(prepared_model)\n",
+    "    \n",
+    "    return tokenizer, converted_model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0efd690e-96bd-49aa-8a6b-863d4de3cdfa",
+   "metadata": {},
+   "source": [
+    "## Response generation\n",
+    "\n",
+    "To generate a response with DialoGPT, we first *encode the input sentence* into tokens, let the model generate a continuation, and then *decode* the generated tokens back into text.\n",
+    "\n",
+    "Since models based on the Transformer architecture have a known issue of repeating themselves, we use Top-K sampling and Top-p sampling to avoid repetition in the chat responses.\n",
+    "\n",
+    "**Top-K sampling** filters the K most likely next words and redistributes the probability mass among only those K next words. **Top-p sampling**, rather than selecting only the most likely K words, selects the smallest possible set of words whose cumulative probability exceeds the probability p. The probability mass is then redistributed among the words in this set. As a result, the size of the set of words can be dynamically increased and decreased based on the probability distribution of the next word."
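To make the Top-p idea concrete, here is a tiny toy illustration over a made-up, already-sorted distribution. It is not part of the sample; `model.generate` applies this filtering internally when `top_p` is set.

```python
# Toy illustration of top-p (nucleus) filtering.
import torch

probs = torch.tensor([0.40, 0.25, 0.15, 0.10, 0.06, 0.04])  # sorted, sums to 1
top_p = 0.90

cumulative = torch.cumsum(probs, dim=0)      # 0.40, 0.65, 0.80, 0.90, 0.96, 1.00
keep = cumulative <= top_p                   # smallest set reaching the target mass
keep[0] = True                               # always keep the most likely token
nucleus = probs[keep] / probs[keep].sum()    # renormalize the kept probability mass
print(nucleus)                               # the next token is sampled from this set
```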
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d90bd2c3-ff9c-4e52-994d-341792e3e035", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def generate_response(tokenizer, model, chat_round, chat_history_ids):\n", + " \"\"\"\n", + " Generate a response to some user input.\n", + " \"\"\"\n", + " # Encode user input and End-of-String (EOS) token\n", + " new_input_ids = tokenizer.encode(input(\">> You:\") + tokenizer.eos_token, return_tensors='pt')\n", + " \n", + " # Append tokens to chat history\n", + " bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_round > 0 else new_input_ids\n", + " \n", + " # Generate response given maximum chat length history of 2000 tokens\n", + " chat_history_ids = model.generate(\n", + " bot_input_ids,\n", + " do_sample=True, \n", + " max_length=2000,\n", + " top_k=50, \n", + " top_p=0.95,\n", + " pad_token_id=tokenizer.eos_token_id\n", + " )\n", + " \n", + " # Print response\n", + " print(\"DialoGPT: {}\".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))\n", + " \n", + " # Return the chat history ids\n", + " return chat_history_ids" + ] + }, + { + "cell_type": "markdown", + "id": "db1b079b-476c-47da-8a6a-3d42fccc32d4", + "metadata": {}, + "source": [ + "The next step is to prepare a function that allows interactive conversation for `n` rounds. This means that we will use the previously prepared `generate_response` function n-times." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28968553-b051-442d-abc2-92d8ac34415a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def chat_for_n_rounds(tokenizer, model, n=5):\n", + " \"\"\"\n", + " Chat with chatbot for n rounds (n = 5 by default)\n", + " \"\"\"\n", + "\n", + " # Initialize history variable\n", + " chat_history_ids = None\n", + "\n", + " # Chat for n rounds\n", + " for chat_round in range(n):\n", + " chat_history_ids = generate_response(tokenizer, model, chat_round, chat_history_ids)" + ] + }, + { + "cell_type": "markdown", + "id": "41b0f86f-2b17-41cd-911e-9ac9a92be4a0", + "metadata": {}, + "source": [ + "Now, it is time to use implemented functions - initializing the model and adding INT8 dynamic quantization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1749c7a-4bba-4731-bbc6-da560edcfed2", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize tokenizer and model\n", + "tokenizer, model = load_tokenizer_and_model()\n", + "\n", + "# Adding ipex quantization to the model\n", + "tokenizer, model = quantize_model(tokenizer, model)" + ] + }, + { + "cell_type": "markdown", + "id": "31bae96c-276c-463f-8085-2cd8e97b5f30", + "metadata": {}, + "source": [ + "Let's play with the model by 5 rounds. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d79fa9d7-5713-4ceb-b489-90f1a4f6a4cf", + "metadata": {}, + "outputs": [], + "source": [ + "chat_for_n_rounds(tokenizer, model, 5)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "16402940-0779-44a1-98b5-3f23c5784bd4", + "metadata": {}, + "source": [ + "## Performance comparison\n", + "\n", + "Now that we know that the DialoGPT model still performs well as a chat bot after quantization, let's compare the model's performance before and after applying INT8 dynamic quantization.\n", + "\n", + "Let's start with defining function that will measure time that model needs for inference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c8b4677-4635-4abd-8eca-9ed43b9b6624", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from time import time\n", + "def test_inference(model, data, warmup=5 , iters=25):\n", + " print(\"Warmup...\")\n", + " for i in range(warmup):\n", + " out = model(data)\n", + "\n", + " print(\"Inference...\")\n", + " inference_time = 0\n", + " for i in range(iters):\n", + " start_time = time()\n", + " out = model(data)\n", + " end_time = time()\n", + " inference_time = inference_time + (end_time - start_time)\n", + "\n", + " inference_time = inference_time / iters\n", + " return inference_time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a41f2a40-2176-4f04-b277-e3622df90430", + "metadata": {}, + "source": [ + "First, let's measure average time of inference for original model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cb034fe-9b8b-4ee9-9975-8a6a03ce79a4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Inference with FP32\")\n", + "tokenizer_fp32, model_fp32 = load_tokenizer_and_model()\n", + "data = torch.randint(model_fp32.config.vocab_size, size=[1, 512])\n", + "fp32_inference_time = test_inference(model_fp32, data = data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "6c58546c-5c60-482c-9782-ac901855ddce", + "metadata": { + "tags": [] + }, + "source": [ + "Then, the average inference time of model after INT8 dynamic quantization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05fcd18c-0674-4715-a606-ce5ce9e42560", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Inference with Dynamic INT8\")\n", + "tokenizer_int8, model_int8 = load_tokenizer_and_model()\n", + "tokenizer_int8, model_int8 = quantize_model(tokenizer_int8, model_int8)\n", + "data = torch.randint(model_int8.config.vocab_size, size=[1, 512])\n", + "int8_inference_time = test_inference(model_int8, data = data)" + ] + }, + { + "cell_type": "markdown", + "id": "2ef0648b-c926-42ac-8367-e1a3edb067ea", + "metadata": {}, + "source": [ + "Now, it's time to show nup the results on the bar chart using `matplotlib` library." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d492e5d-e188-489b-a18d-aa32cca0a1b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "# Create bar chart with training time results\n", + "plt.figure(figsize=(4,3))\n", + "plt.title(\"DialoGPT Inference Time\")\n", + "plt.ylabel(\"Inference Time (seconds)\")\n", + "plt.bar([\"FP32\", \"INT8 dynamic\"], [fp32_inference_time, int8_inference_time])" + ] + }, + { + "cell_type": "markdown", + "id": "d2c31e73-2d6d-4323-9609-f04191f8863d", + "metadata": {}, + "source": [ + "DialoGPT by Microsoft is another conversational chatbot that everyone can use. \n", + "\n", + "Based on this architecture, we created an interactive chat in this sample. The use of top-k and top-p allowed us to avoid some of the repetition in the chat answers. Furthermore, the addition of dynamic INT8 quantization reduced memory usage." 
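In addition to the bar chart above, a small follow-up cell (a sketch that reuses the `fp32_inference_time` and `int8_inference_time` values measured earlier) can put a single number on the gain:

```python
# Sketch: express the measured gain as a speedup factor.
speedup = fp32_inference_time / int8_inference_time
print(f"Dynamic INT8 inference is {speedup:.2f}x faster than FP32 for this input")
```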
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b56ff32-34d0-4866-9050-df1bdf7ad736", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89043271-f3dc-4d4d-a630-40c570c53d98", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb new file mode 100644 index 000000000..03020685e --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/IntelPytorch_Quantization.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Optimize PyTorch Models using Intelยฎ Extension for PyTorch (IPEX) Quantization\n", + "This code sample will quantize a ResNet50 model while using Intel's Extension for PyTorch (IPEX). The model will run inference with FP32 and INT8 precision, including static INT8 quantization and dynamic INT8 quantization. During Static Quantization, the model calibrated with the CIFAR10 dataset. The inference time will be compared, showcasing the speedup of INT8 Quantization.\n", + "\n", + "## Environment Setup\n", + "Ensure the PyTorch kernel is activated before running this notebook.\n", + "\n", + "## Imports, Dataset, Hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torchvision\n", + "from time import time\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "import intel_extension_for_pytorch as ipex\n", + "from intel_extension_for_pytorch.quantization import prepare, convert\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hyperparameters and constants\n", + "LR = 0.001\n", + "DOWNLOAD = True\n", + "DATA = 'datasets/cifar10/'\n", + "WARMUP = 3\n", + "ITERS = 100\n", + "transform = torchvision.transforms.Compose([\n", + "torchvision.transforms.Resize((224, 224)),\n", + "torchvision.transforms.ToTensor(),\n", + "torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))\n", + "])\n", + "test_dataset = torchvision.datasets.CIFAR10(\n", + " root=DATA,\n", + " train=False,\n", + " transform=transform,\n", + " download=DOWNLOAD,\n", + ")\n", + "calibration_data_loader = torch.utils.data.DataLoader(\n", + " dataset=test_dataset,\n", + " batch_size=128\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get model from torchvision" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = torch.rand(1, 3, 224, 224)\n", + "model_fp32 = torchvision.models.resnet50(weights=torchvision.models.ResNet50_Weights.DEFAULT)\n", + "model_fp32.eval()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference 
with FP32 model\n", + "\n", + "The function below will test the inference time with input model and return the average inference time for 1 iteration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def inference(model, WARMUP, ITERS, data):\n", + " print(\"Warmup before benchmark ...\")\n", + " for i in range(WARMUP):\n", + " out = model(data)\n", + "\n", + " print(\"Inference ...\")\n", + " inference_time = 0\n", + " for i in range(ITERS):\n", + " start_time = time()\n", + " out = model(data)\n", + " end_time = time()\n", + " inference_time = inference_time + (end_time - start_time)\n", + "\n", + " inference_time = inference_time / ITERS\n", + " print(\"Inference Time Avg: \", inference_time)\n", + " return inference_time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Static Quantization \n", + "The function below staticQuantize will calibrate the fp32 model with calibration dataloader and return the quantized static int8 model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def staticQuantize(model_fp32, data, calibration_data_loader):\n", + " # Acquire inference times for static quantization INT8 model \n", + " qconfig_static = ipex.quantization.default_static_qconfig\n", + " # # Alternatively, define your own qconfig:\n", + " # from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig\n", + " # qconfig = QConfig(activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8),\n", + " # weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric))\n", + " prepared_model_static = prepare(model_fp32, qconfig_static, example_inputs=data, inplace=False)\n", + " print(\"Calibration with Static Quantization ...\")\n", + " for batch_idx, (data, target) in enumerate(calibration_data_loader):\n", + " prepared_model_static(data)\n", + " if batch_idx % 10 == 0:\n", + " print(\"Batch %d/%d complete, continue ...\" %(batch_idx+1, len(calibration_data_loader)))\n", + " print(\"Calibration Done\")\n", + "\n", + " converted_model_static = convert(prepared_model_static)\n", + " with torch.no_grad():\n", + " traced_model_static = torch.jit.trace(converted_model_static, data)\n", + " traced_model_static = torch.jit.freeze(traced_model_static)\n", + "\n", + " # save the quantized static model \n", + " traced_model_static.save(\"quantized_model_static.pt\")\n", + " return traced_model_static\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dynamic Quantization \n", + "The function below dynamicQuantize will quantize the fp32 model with dynamic quantization and return the quantized dynamic int8 model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def dynamicQuantize(model_fp32, data):\n", + " # Acquire inference times for dynamic quantization INT8 model\n", + " qconfig_dynamic = ipex.quantization.default_dynamic_qconfig\n", + " print(\"Quantize Model with Dynamic Quantization ...\")\n", + "\n", + " prepared_model_dynamic = prepare(model_fp32, qconfig_dynamic, example_inputs=data, inplace=False)\n", + "\n", + " converted_model_dynamic = convert(prepared_model_dynamic)\n", + " with torch.no_grad():\n", + " traced_model_dynamic = torch.jit.trace(converted_model_dynamic, data)\n", + " traced_model_dynamic = torch.jit.freeze(traced_model_dynamic)\n", + "\n", + " # save the quantized dynamic model \n", + " traced_model_dynamic.save(\"quantized_model_dynamic.pt\")\n", + " return traced_model_dynamic\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quantize the FP32 Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists('quantized_model_static.pt'):\n", + " # Static Quantizaton & Save Model to quantized_model_static.pt\n", + " print('quantize the model with static quantization')\n", + " staticQuantize(model_fp32, data, calibration_data_loader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists('quantized_model_dynamic.pt'):\n", + " # Dynamic Quantization & Save Model to quantized_model_dynamic.pt\n", + " print('quantize the model with dynamic quantization')\n", + " dynamicQuantize(model_fp32, data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference With FP32 Model, Static INT8 Model and Dynamic INT8 Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Inference with FP32\")\n", + "fp32_inference_time = inference(model_fp32, WARMUP, ITERS, data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Inference with Static INT8\")\n", + "traced_model_static = torch.jit.load('quantized_model_static.pt')\n", + "traced_model_static.eval()\n", + "traced_model_static = torch.jit.freeze(traced_model_static)\n", + "int8_inference_time_static = inference(traced_model_static, WARMUP, ITERS, data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Inference with Dynamic INT8\")\n", + "traced_model_dynamic = torch.jit.load('quantized_model_dynamic.pt')\n", + "traced_model_dynamic.eval()\n", + "traced_model_dynamic = torch.jit.freeze(traced_model_dynamic)\n", + "int8_inference_time_dynamic = inference(traced_model_dynamic, WARMUP, ITERS, data)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary of Results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference time results\n", + "print(\"Summary\")\n", + "print(\"FP32 inference time: %.3f\" %fp32_inference_time)\n", + "print(\"INT8 static quantization inference time: %.3f\" %int8_inference_time_static)\n", + "print(\"INT8 dynamic quantization inference time: %.3f\" %int8_inference_time_dynamic)\n", + "\n", + "# Create bar chart with training time results\n", + 
"plt.figure(figsize=(4,3))\n", + "plt.title(\"ResNet Inference Time\")\n", + "plt.xlabel(\"Test Case\")\n", + "plt.ylabel(\"Inference Time (seconds)\")\n", + "plt.bar([\"FP32\", \"INT8 static\", \"INT8 dynamic\"], [fp32_inference_time, int8_inference_time_static, int8_inference_time_dynamic])\n", + "\n", + "# Calculate speedup when using quantization\n", + "speedup_from_fp32_static = fp32_inference_time / int8_inference_time_static\n", + "print(\"Staic INT8 %.2fX faster than FP32\" %speedup_from_fp32_static)\n", + "speedup_from_fp32_dynamic = fp32_inference_time / int8_inference_time_dynamic\n", + "print(\"Dynamic INT8 %.2fX faster than FP32\" %speedup_from_fp32_dynamic)\n", + "\n", + "\n", + "# Create bar chart with speedup results\n", + "plt.figure(figsize=(4,3))\n", + "plt.title(\"Quantization Speedup\")\n", + "plt.xlabel(\"Test Case\")\n", + "plt.ylabel(\"Speedup\")\n", + "plt.bar([\"FP32\",\"Static INT8\", \"Dynamic INT8\"], [1, speedup_from_fp32_static, speedup_from_fp32_dynamic])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print('[CODE_SAMPLE_COMPLETED_SUCCESFULLY]')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + }, + "vscode": { + "interpreter": { + "hash": "4678fb2792a22465205165c52aab2f7cff7494375a364749bf16e0ac11f2a502" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/README.md b/examples/cpu/inference/python/jupyter-notebooks/README.md new file mode 100644 index 000000000..2c9dfc91a --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/README.md @@ -0,0 +1,61 @@ +# Environment Setup for Jupyter Notebook with Intel Pytorch CPU + +The Intelยฎ Extension for PyTorch (IPEX) extends PyTorch* with optimizations for extra performance boost on Intelยฎ hardware. While most of the optimizations will be included in future PyTorch* releases, the extension delivers up-to-date features and optimizations for PyTorch on Intelยฎ hardware. For example, newer optimizations include AVX-512 Vector Neural Network Instructions (AVX512 VNNI) and Intelยฎ Advanced Matrix Extensions (Intelยฎ AMX). + +## Prerequisites + +| Optimized for | Description +|:--- |:--- +| OS | Ubuntu* 18.04 or newer +| Hardware | 4th Gen Intelยฎ Xeonยฎ Scalable Processors or newer +| Software | Intelยฎ Extension for PyTorch (IPEX) + +## For Local Development Environments + +- **Install Jupyter Notebook with Conda** + +Python 3.8,3.9,3.10,3.11 are supported. +Please create a **new conda environment** for each sample. + +``` +conda create -n ipex_cpu python=3.10 -y +conda activate ipex_cpu +pip install notebook ipykernel +``` + +If encounter any issue for jupyter notebook, please refer to [*Installing Jupyter*](https://jupyter.org/install) for detailed installation instructions. 
+ + +- **Install Intelยฎ Extension for Pytorch* with Conda** + +Follow this instructions to install latest released Intelยฎ Extension for Pytorch* + +``` +pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +pip install intel-extension-for-pytorch +``` + +If a specific version is needed, please follow the Installation Section and Sanity Check Section in the [installation guide](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu) to install Intelยฎ Extension for Pytroch* with specific version. + +- **Register ipykernel with Conda** + +``` +python -m ipykernel install --user --name=PyTorch-CPU +``` + +- **Runing the jupyter notebook** + +1. Change to the sample directory. +2. Launch Jupyter Notebook. +``` +jupyter notebook --ip=0.0.0.0 --port 8888 --allow-root +``` +3. Follow the instructions to open the URL with the token in your browser. +4. Locate and select the Notebook. +5. Change your Jupyter Notebook kernel to **PyTorch-CPU**. +6. Run every cell in the Notebook in sequence. + +## Example Output + +If successful, the sample displays `[CODE_SAMPLE_COMPLETED_SUCCESSFULLY]`. Additionally, the sample generates performance and analysis diagrams for comparison. + diff --git a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb new file mode 100644 index 000000000..fe1fb8b52 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb @@ -0,0 +1,888 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1110103c-70d0-4ac0-8208-6a678b88deae", + "metadata": {}, + "source": [ + "# Optimize PyTorch Models using Intelยฎ Extension for PyTorch* (IPEX)\n", + "\n", + "This notebook guides you through the process of extending your PyTorch* code with Intelยฎ Extension for PyTorch* (IPEX) with optimizations to achieve performance boosts on Intelยฎ hardware.\n", + "\n", + "\n", + "| Area | Description\n", + "|:--- |:---\n", + "| What you will learn | Applying Intelยฎ Extension for PyTorch* (IPEX) Optimizations to a PyTorch workload in a step-by-step manner to gain performance boost\n", + "| Time to complete | 30 minutes\n", + "| Category | Code Optimization\n", + "\n", + "## Purpose\n", + "\n", + "This sample notebook shows how to get started with Intelยฎ Extension for PyTorch* (IPEX) for sample Computer Vision and NLP workloads.\n", + "\n", + "The sample starts by loading two models from the PyTorch hub: **Faster-RCNN** (Faster R-CNN) and **distilbert** (DistilBERT). After loading the models, the sample applies sequential optimizations from Intelยฎ Extension for PyTorch* (IPEX) and examines performance gains for each incremental change.\n", + "You can make code changes quickly on top of existing PyTorch code to obtain the performance speedups for model inference.\n", + "\n", + "We will be generating synthetic data to be used for inference with sample computer vision and NLP workloads. We will first use stock PyTorch models to generate predictions. Then, with minimal code changes using Intelยฎ Extension for PyTorch* (IPEX), we will see how speedups can be gained over stock PyTorch on Intelยฎ hardware. 
We will also see how quantization features from Intelยฎ Extension for PyTorch* (IPEX) can be used to reduce the inference time of a model.\n", + "\n", + "## Prerequisites\n", + "\n", + "\n", + "| Optimized for | Description\n", + "|:--- |:---\n", + "| OS | Ubuntu* 20.04 or newer\n", + "| Hardware | Intelยฎ Xeonยฎ Scalable processor family\n", + "| Software | Intelยฎ Extension for PyTorch*\n" + ] + }, + { + "cell_type": "markdown", + "id": "431d988d-40f1-4f98-96fd-2e17b4126eb4", + "metadata": {}, + "source": [ + "# Key Takeaways" + ] + }, + { + "cell_type": "markdown", + "id": "7438fa45-81e6-4d42-847b-fbe895ae8eed", + "metadata": {}, + "source": [ + "- Get started with Intelยฎ Extension for PyTorch* (IPEX) for drop-in acceleration\n", + "- Learn how to use the *optimize* method from Intelยฎ Extension for PyTorch* (IPEX) to apply optimizations at Python frontend to the given model (nn.Module)\n", + "- Learn how to use Quantization features from Intelยฎ Extension for PyTorch* (IPEX) to convert model to INT8\n", + "- Learn how to use Intelยฎ Extension for PyTorch* (IPEX) Launch Script module to set additional configurations on top of the previously mentioned optimizations to boost performance" + ] + }, + { + "cell_type": "markdown", + "id": "d72174e0", + "metadata": {}, + "source": [ + "# Samples" + ] + }, + { + "cell_type": "markdown", + "id": "06a26381", + "metadata": {}, + "source": [ + "## Install Intelยฎ Extension for PyTorch* for CPU and dependency packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ccfb9a4", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install transformers matplotlib" + ] + }, + { + "cell_type": "markdown", + "id": "b7eb6281-5db9-4a4f-9c6a-3f9c132f30f9", + "metadata": { + "tags": [] + }, + "source": [ + "## Computer Vision Workload - Faster R-CNN, Resnet50 Backbone" + ] + }, + { + "cell_type": "markdown", + "id": "d911257c-c9b0-4365-a308-95a4b3aea487", + "metadata": {}, + "source": [ + "Faster R-CNN is a convolutional neural network used for object detection. We are going to use the **optimize** method from Intelยฎ Extension for PyTorch* (IPEX) to apply optimizations. Following this, we will also use TorchScript to obtain performance gains." 
+ ] + }, + { + "cell_type": "markdown", + "id": "96966a7b-b036-4f1c-8dd8-3b90b76f98c9", + "metadata": {}, + "source": [ + "Let's start by importing all the necessary packages and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a4cb03a-f6b4-465b-9363-b435b00336c8", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import torch\n", + "import torchvision\n", + "import os\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "5f613973-ffb0-482a-bc56-c295bce3c088", + "metadata": {}, + "source": [ + "**Prepare Sample Data**" + ] + }, + { + "cell_type": "markdown", + "id": "22eeae78-534e-4d44-a0f3-c48a4213ac3f", + "metadata": {}, + "source": [ + "Let's generate a random image using torch to test performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e75c62d-dbbb-47da-aa07-717e9719d86a", + "metadata": {}, + "outputs": [], + "source": [ + "# set the device to cpu\n", + "device = 'cpu'\n", + "# generate a random image to observe speedup on\n", + "image = torch.randn(1, 3, 1200, 1200)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "678f5795-dcd8-439d-8d98-f4197a2417e4", + "metadata": {}, + "outputs": [], + "source": [ + "# explore image shape\n", + "\n", + "print(image.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "32a54de2-76e7-42f9-9506-5574f5bb95a4", + "metadata": {}, + "source": [ + "**Helper Functions**" + ] + }, + { + "cell_type": "markdown", + "id": "43d0d63d-1d2b-4edc-8d36-0e2dd4ff3780", + "metadata": {}, + "source": [ + "Some functions to help us with loading the model and summarizing the optimizations. The functions below will help us record the time taken to run and, plot comparison charts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c265eb1f-ef85-4ed1-941d-158d2ec16af6", + "metadata": {}, + "outputs": [], + "source": [ + "def load_model_eval_mode():\n", + " \"\"\"\n", + " Loads model and returns it in eval mode\n", + " \"\"\"\n", + " model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=weights, progress=True,\n", + " num_classes=91, weights_backbone=weights_backbone).to(device)\n", + " model = model.eval()\n", + " \n", + " return model\n", + "\n", + "def get_average_inference_time(model, image):\n", + " \"\"\"\n", + " does a model warm up and times the model runtime\n", + " \"\"\"\n", + " with torch.no_grad():\n", + " # warm up\n", + " for _ in range(25):\n", + " model(image)\n", + "\n", + " # measure\n", + " import time\n", + " start = time.time()\n", + " for _ in range(25):\n", + " output = model(image)\n", + " end = time.time()\n", + " average_inference_time = (end-start)/25*1000\n", + " \n", + " return average_inference_time\n", + "\n", + "def plot_speedup(inference_time_stock, inference_time_optimized):\n", + " \"\"\"\n", + " Plots a bar chart comparing the time taken by stock PyTorch model and the time taken by\n", + " the model optimized by Intelยฎ Extension for PyTorch* (IPEX)\n", + " \"\"\"\n", + " data = {'stock_pytorch_time': inference_time_stock, 'optimized_time': inference_time_optimized}\n", + " model_type = list(data.keys())\n", + " times = list(data.values())\n", + "\n", + " fig = plt.figure(figsize = (10, 5))\n", + "\n", + " # creating the bar plot\n", + " plt.bar(model_type, times, color ='blue',\n", + " width = 0.4)\n", + "\n", + " plt.ylabel(\"Runtime (ms)\")\n", + " plt.title(f\"Speedup acheived - {inference_time_stock/inference_time_optimized:.2f}x\")\n", + " 
plt.show()\n", + " \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "46eb1386-203a-4667-aa9c-9b26adcd02c4", + "metadata": {}, + "source": [ + "**Baseline PyTorch Model**" + ] + }, + { + "cell_type": "markdown", + "id": "c441b52d-597b-422c-bd01-524a687f8024", + "metadata": {}, + "source": [ + "A baseline model is the simplest version of the model that can be loaded from the PyTorch hub. Let's load the baseline for Faster R-CNN model and get predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d4efa17-9abe-4595-bcc1-8b9c3b966211", + "metadata": {}, + "outputs": [], + "source": [ + "# model configs\n", + "weights = torchvision.models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT\n", + "weights_backbone = torchvision.models.ResNet50_Weights.DEFAULT" + ] + }, + { + "cell_type": "markdown", + "id": "858fb895-8617-4997-a8ec-8974d00e4606", + "metadata": {}, + "source": [ + "**Input Image Memory Format**" + ] + }, + { + "cell_type": "markdown", + "id": "43a1fa1e-2e89-4c43-b1e4-7682686c2a4a", + "metadata": {}, + "source": [ + "There are two ways to represent image data that are inputs to a CNN model. Channels-First, and Channels-Last. In Channels-First, the channels dimension comes first followed by height and width. For example - (3, 224, 224) or NCHW where N is batch size, C is channels, H is height, and W is width. In Channels-Last, the channels dimension comes last. For example - (224, 223, 3) or NHWC." + ] + }, + { + "cell_type": "markdown", + "id": "a12280f4-b8a1-4c9a-89d3-2f092396f431", + "metadata": {}, + "source": [ + "**Channels-First**" + ] + }, + { + "cell_type": "markdown", + "id": "59f09f4f-de2c-4006-83e0-d201b58097e6", + "metadata": {}, + "source": [ + "PyTorch uses channels-first by default" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f054f770-c8d6-4fe7-baed-7ed77c510a13", + "metadata": {}, + "outputs": [], + "source": [ + "# send the input to the device and pass it through the network to\n", + "# get the detections and predictions\n", + "\n", + "model = load_model_eval_mode()\n", + "\n", + "inference_time_stock = get_average_inference_time(model, image)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_stock} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "3285b34b-03a2-4654-8061-a4dd9f66d1e9", + "metadata": {}, + "source": [ + "**Channels-Last**" + ] + }, + { + "cell_type": "markdown", + "id": "31439e30-344e-455c-816a-a88eb868ede4", + "metadata": {}, + "source": [ + "Channels-Last memory format is a different way of ordering NCHW tensors allowing us to make Channels-Last memory format optimizations on Intelยฎ hardware" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f0afc270-0d24-4e8d-a1a9-271dd66fc350", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "model = model.to(memory_format=torch.channels_last)\n", + "image_channels_last = image.to(memory_format=torch.channels_last)\n", + "\n", + "inference_time_stock = get_average_inference_time(model, image_channels_last)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_stock} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "9cfff9fc-0e9d-4703-8369-da1db780d15e", + "metadata": {}, + "source": [ + "Now that we have timed the stock PyTorch model, let's add minimal code changes from Intelยฎ Extension for PyTorch* (IPEX) to obtain speedups. 
The minimal code changes are highlighted in the following cell" + ] + }, + { + "cell_type": "markdown", + "id": "aef650d5-02db-40a6-9688-90a03fee7da2", + "metadata": {}, + "source": [ + "**Intelยฎ Extension for PyTorch* (IPEX)**" + ] + }, + { + "cell_type": "markdown", + "id": "05237e85-1039-4fb5-8651-bb815824a1d9", + "metadata": {}, + "source": [ + "As described above, Intelยฎ Extension for PyTorch* (IPEX) provides us with the ability to make minimal code changes to apply optimizations over stock PyTorch models using Intelยฎ hardware. The simple code changes are indicated below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6356714f-c2b3-46c7-8dab-103d40054eb0", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "model = model.to(memory_format=torch.channels_last)\n", + "image_channels_last = image.to(memory_format=torch.channels_last)\n", + "#################### code changes ####################\n", + "import intel_extension_for_pytorch as ipex\n", + "model = ipex.optimize(model)\n", + "######################################################" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bdd609b-13ec-4bb7-93ec-b2f8a6dd0f68", + "metadata": {}, + "outputs": [], + "source": [ + "inference_time_optimized = get_average_inference_time(model, image_channels_last)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5843fe0f-52c3-49a5-b231-f4ca514395a6", + "metadata": {}, + "outputs": [], + "source": [ + "# plot performance gain bar chart\n", + "\n", + "plot_speedup(inference_time_stock, inference_time_optimized)" + ] + }, + { + "cell_type": "markdown", + "id": "d263abf8-e0b6-4d99-a14a-67e46c197c3d", + "metadata": {}, + "source": [ + "> **_NOTE:_** If a below par performance is observed, please restart the notebook kernel." + ] + }, + { + "cell_type": "markdown", + "id": "6d465fcc-8115-46de-9542-2c4c8d7c1771", + "metadata": {}, + "source": [ + "**TorchScript**" + ] + }, + { + "cell_type": "markdown", + "id": "e96cbe96-92c2-4429-b90b-4e78a4bac04d", + "metadata": {}, + "source": [ + "TorchScript is a way to create serializable and optimizable models from PyTorch code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87254f28-ce41-4066-81c1-bcd646a0b871", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "model = model.to(memory_format=torch.channels_last)\n", + "with torch.no_grad():\n", + " model.backbone = torch.jit.trace(model.backbone, image_channels_last, strict=False)\n", + " model.backbone = torch.jit.freeze(model.backbone)\n", + " inference_time_optimized = get_average_inference_time(model, image_channels_last)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ad01762-ded5-4380-b649-7a3db30c3b34", + "metadata": {}, + "outputs": [], + "source": [ + "# plot performance gain bar chart\n", + "\n", + "plot_speedup(inference_time_stock, inference_time_optimized)" + ] + }, + { + "cell_type": "markdown", + "id": "11b2888e-1171-400d-a5f9-783e6c52f01e", + "metadata": {}, + "source": [ + "## NLP Workload - DistilBERT Base Uncased" + ] + }, + { + "cell_type": "markdown", + "id": "62666465-5647-4ccb-a53c-d389d3261629", + "metadata": {}, + "source": [ + "DistilBERT is a transformer model, smaller and faster than BERT. 
We will use the Quantization feature from Intelยฎ Extension for PyTorch* (IPEX) to convert the model into INT8 for faster inference." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e10eba7b-3d6e-46e8-8baf-8eddcc5f9d2a", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import DistilBertTokenizer, DistilBertModel, logging\n", + "logging.set_verbosity_error()" + ] + }, + { + "cell_type": "markdown", + "id": "8b7899a8-205d-4ebc-b304-e9ee65ad3643", + "metadata": {}, + "source": [ + "**Helper Functions**" + ] + }, + { + "cell_type": "markdown", + "id": "797de740-3a76-48cc-838c-ef8e981cd41b", + "metadata": {}, + "source": [ + "Similar functions as before to help us load the model and summarize the optimizations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0b68ba1-9215-4408-b3ee-5d6d610cffd7", + "metadata": {}, + "outputs": [], + "source": [ + "def load_model_eval_mode():\n", + " \"\"\"\n", + " Loads model and returns it in eval mode\n", + " \"\"\"\n", + " model = DistilBertModel.from_pretrained('distilbert-base-uncased-distilled-squad')\n", + " model.eval()\n", + " \n", + " return model\n", + "\n", + "def get_average_inference_time(model, inputs):\n", + " \"\"\"\n", + " does a model warm up and times the model runtime\n", + " \"\"\"\n", + " with torch.no_grad():\n", + " # warm up\n", + " for _ in range(25):\n", + " model(**inputs)\n", + "\n", + " # measure\n", + " import time\n", + " start = time.time()\n", + " for _ in range(25):\n", + " outputs = model(**inputs)\n", + " end = time.time()\n", + " average_inference_time = (end-start)/25*1000\n", + " \n", + " return average_inference_time" + ] + }, + { + "cell_type": "markdown", + "id": "81ec7560-5a3d-4450-badd-7ff3e5d6ae9b", + "metadata": {}, + "source": [ + "Generate sample text and tokenize using the transformers tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d94e1ea-03f8-4f2b-9c05-ae4a97d7d7d1", + "metadata": {}, + "outputs": [], + "source": [ + "# tokenizer for distilbert\n", + "tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')\n", + "\n", + "# sample data\n", + "question, text = \"Who was Jim Henson?\", \"Jim Henson was a nice puppet\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e146748d-7b31-4e62-82fb-92e3cd231f0e", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "\n", + "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", + "\n", + "inference_time_stock = get_average_inference_time(model, inputs)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_stock} ms\")" + ] + }, + { + "cell_type": "markdown", + "id": "fd79e93e-bc11-48b2-8a92-b74f29e4d2bf", + "metadata": {}, + "source": [ + "**Quantization**" + ] + }, + { + "cell_type": "markdown", + "id": "1cfbb3fe-d99f-4e74-abd0-04bdbbe6e632", + "metadata": {}, + "source": [ + "Quantization allows us to perform operations and store tensors at a lower precision than FP32, like INT8 for example. This compact model and data representation results in a lower memory requirement." 
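As a rough, back-of-the-envelope illustration (assuming DistilBERT's roughly 66M parameters), storing weights in INT8 instead of FP32 cuts the weight storage by about 4x:

```python
# Back-of-the-envelope weight-storage estimate (illustration only).
num_params = 66_000_000           # DistilBERT-base has roughly 66M parameters
fp32_mb = num_params * 4 / 1e6    # 4 bytes per FP32 weight  -> ~264 MB
int8_mb = num_params * 1 / 1e6    # 1 byte per INT8 weight   -> ~66 MB
print(f"FP32 weights: ~{fp32_mb:.0f} MB, INT8 weights: ~{int8_mb:.0f} MB")
```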
+ ] + }, + { + "cell_type": "markdown", + "id": "1c8ed525-7075-4a88-80d2-dfb739468994", + "metadata": {}, + "source": [ + "Let's import the quantization modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "342873f5-72a3-4105-a2a5-68778d0599c8", + "metadata": {}, + "outputs": [], + "source": [ + "from intel_extension_for_pytorch.quantization import prepare, convert\n", + "import intel_extension_for_pytorch as ipex" + ] + }, + { + "cell_type": "markdown", + "id": "b71e64d0-21f7-497f-86c3-d1afcaeefd6c", + "metadata": {}, + "source": [ + "**Static Quantization** \n", + " Static quantization quantizes the weights and activations of the model. It fuses activations into preceding layers where possible. It requires calibration with a representative dataset to determine optimal quantization parameters for activations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ef972ab-a272-45dd-bd55-fbd9f7db5017", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "\n", + "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", + "\n", + "jit_inputs = tuple((inputs['input_ids'], inputs['attention_mask']))\n", + "\n", + "qconfig_mapping = ipex.quantization.default_static_qconfig_mapping # for static quantization\n", + "prepared_model = ipex.quantization.prepare(model, qconfig_mapping, example_inputs=jit_inputs, inplace=False)\n", + "\n", + "for i in range(2):\n", + " calibration_output = prepared_model(**inputs)\n", + "\n", + "model = convert(prepared_model)\n", + "with torch.no_grad():\n", + " model = torch.jit.trace(model, jit_inputs, strict=False)\n", + " model = torch.jit.freeze(model)\n", + " y = model(**inputs)\n", + " y = model(**inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7983dfa3-8bb2-4f9e-994c-24cbd556b94d", + "metadata": {}, + "outputs": [], + "source": [ + "inference_time_optimized = get_average_inference_time(model, inputs)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54e19218-dd71-4711-a25a-8e7c8a787d19", + "metadata": {}, + "outputs": [], + "source": [ + "# plot performance gain bar chart\n", + "\n", + "plot_speedup(inference_time_stock, inference_time_optimized)" + ] + }, + { + "cell_type": "markdown", + "id": "5e79ec28-6063-4d21-95d5-32be78bd49af", + "metadata": {}, + "source": [ + "**Dynamic Quantization** \n", + " In dynamic quantization the weights are quantized ahead of time but the activations are dynamically quantized during inference" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bef3494-b5c5-4acd-8431-71e3199a9764", + "metadata": {}, + "outputs": [], + "source": [ + "model = load_model_eval_mode()\n", + "\n", + "inputs = tokenizer(question, text, return_tensors=\"pt\")\n", + "\n", + "jit_inputs = tuple((inputs['input_ids'], inputs['attention_mask']))\n", + "\n", + "\n", + "qconfig_mapping = ipex.quantization.default_dynamic_qconfig_mapping # for dynamic quantization\n", + "prepared_model = ipex.quantization.prepare(model, qconfig_mapping, example_inputs=jit_inputs, inplace=False)\n", + "model = convert(prepared_model)\n", + "with torch.no_grad():\n", + " model = torch.jit.trace(model, jit_inputs, strict=False)\n", + " model = torch.jit.freeze(model)\n", + " y = model(**inputs)\n", + " y = model(**inputs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cc311f2-6215-41ff-a7ed-e1942e0cbbde", + 
"metadata": {}, + "outputs": [], + "source": [ + "inference_time_optimized = get_average_inference_time(model, inputs)\n", + "\n", + "print(f\"time taken for forward pass: {inference_time_optimized} ms\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb75733c-91d9-4d38-97a8-a5bb4b3063a8", + "metadata": {}, + "outputs": [], + "source": [ + "# plot performance gain bar chart\n", + "\n", + "plot_speedup(inference_time_stock, inference_time_optimized)" + ] + }, + { + "cell_type": "markdown", + "id": "bff70b14-26d9-4ef8-894a-6d0a40973324", + "metadata": {}, + "source": [ + "## Intelยฎ Extension for PyTorch* (IPEX) Launch Script" + ] + }, + { + "cell_type": "markdown", + "id": "63f71568-902f-4e9f-a130-cc5fae1961db", + "metadata": {}, + "source": [ + "Default primitives of PyTorch and Intelยฎ Extension for PyTorch* (IPEX) are highly optimized, there are things users can do improve performance. Setting configuration options properly contributes to a performance boost. However, there is no unified configuration that is optimal to all topologies. Users need to try different combinations by themselves." + ] + }, + { + "cell_type": "markdown", + "id": "f826ac48-17e2-48b4-ab2c-0a47620790a7", + "metadata": {}, + "source": [ + "**Single instance for inference**" + ] + }, + { + "cell_type": "markdown", + "id": "f300e009-f7cb-4403-8229-938bf89b0920", + "metadata": {}, + "source": [ + "The launch script is provided as a module of Intelยฎ Extension for PyTorch* (IPEX). Below are some of those configurations that can be set using the launch script for a single instance. The launch script can be run as a shell command from a Jupyter notebook or from the shell itself." + ] + }, + { + "cell_type": "markdown", + "id": "35ed3f44-c019-4997-a73d-a043ddfa12ee", + "metadata": {}, + "source": [ + "To explore the features of the launch script module, we will be using a ResNet-50 model, which is a a convolutional neural network that is 50 layers deep.The model script is present in the scripts folder" + ] + }, + { + "cell_type": "markdown", + "id": "edbd3506-dc86-4f18-b666-dfa9e9e3705a", + "metadata": {}, + "source": [ + "It is recommended that the user check the output of [htop](https://htop.dev/) in an accompanying terminal to check the usage of cores while running the cells below. The output from htop looks as shown below." + ] + }, + { + "cell_type": "markdown", + "id": "b5015d8d-6ac6-41c4-a10e-509069b699ee", + "metadata": {}, + "source": [ + "![htop](https://intel.github.io/intel-extension-for-pytorch/latest/_images/1ins_phy.gif)" + ] + }, + { + "cell_type": "markdown", + "id": "91737a0e-1a14-4582-a9c4-63b24e963ff7", + "metadata": {}, + "source": [ + "By running the below command, One main worker thread will be launched, then it will launch threads on 2 other physical cores." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76265fe7-52cd-4574-9657-18efa5c25514", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncore_per_instance 3 --log_path ./logs ./python/resnet50.py" + ] + }, + { + "cell_type": "markdown", + "id": "6fa2dd27-cf4f-4e1d-808a-271c35cd508b", + "metadata": {}, + "source": [ + "Similarly by increasing the number of cores, we can see an improvement in the inference time as shown below " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c522568-db0f-4438-ae1f-66623e07c96f", + "metadata": {}, + "outputs": [], + "source": [ + "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncore_per_instance 6 --log_path ./logs ./python/resnet50.py" + ] + }, + { + "cell_type": "markdown", + "id": "a3a00650-d15f-4aa7-b526-a21c6823d50a", + "metadata": {}, + "source": [ + "We saw a small example usage of the launch script module. This [documentation](https://intel.github.io/intel-extension-for-pytorch/cpu/1.12.100+cpu/tutorials/performance_tuning/launch_script.html) provides many more examples to use the launch script. As mentioned earlier, each deep learning topology can benefit from custom tuning to achieve the best performance on top of the optimizations we have discussed so far." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd6e13ed-0c0e-4e97-acaa-7a8dfba259c4", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"[CODE_SAMPLE_COMPLETED_SUCCESFULLY]\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "nbTranslate": { + "displayLangs": [ + "*" + ], + "hotkey": "alt-t", + "langInMainMenu": true, + "sourceLang": "en", + "targetLang": "fr", + "useGoogleTranslate": true + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py b/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py new file mode 100644 index 000000000..a087de1af --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/python/pytorch_inference_vnni.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python +# encoding: utf-8 + +""" +============================================================== + Copyright ยฉ 2023 Intel Corporation + + SPDX-License-Identifier: MIT +============================================================== +""" + +import os +from time import time +import torch +import intel_extension_for_pytorch as ipex +from intel_extension_for_pytorch.quantization import prepare, convert +from torchvision import models +from transformers import BertModel + +SUPPORTED_MODELS = ["resnet50", "bert"] # models supported by this code sample + +# ResNet sample data parameters +RESNET_BATCH_SIZE = 64 + +# BERT sample data parameters +BERT_BATCH_SIZE = 64 +BERT_SEQ_LENGTH = 512 + 
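+# Cap the instruction set oneDNN is allowed to dispatch to. With
+# AVX512_CORE_VNNI the INT8 kernels stay on VNNI (AMX is excluded), which is
+# the code path this sample is meant to measure. oneDNN picks this value up
+# when it is first used, so it is set here before any inference runs.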
+os.environ["ONEDNN_MAX_CPU_ISA"] = "AVX512_CORE_VNNI" + +""" +Function to perform inference on Resnet50 and BERT +""" + + +def runInference(model, data, modelName="resnet50", dataType="FP32", amx=True): + """ + Input parameters + model: the PyTorch model object used for inference + data: a sample input into the model + modelName: str representing the name of the model, supported values - resnet50, bert + dataType: str representing the data type for model parameters, supported values - FP32, BF16, INT8 + amx: set to False to disable AMX on BF16, Default: True + Return value + inference_time: the time in seconds it takes to perform inference with the model + """ + + # Display run case + if amx: + isa_text = "AVX512_CORE_AMX" + else: + isa_text = "AVX512_CORE_VNNI" + print("%s %s inference with %s" % (modelName, dataType, isa_text)) + + # Special variables for specific models + batch_size = None + if "resnet50" == modelName: + batch_size = RESNET_BATCH_SIZE + elif "bert" == modelName: + d = torch.randint( + model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH] + ) # sample data input for torchscript and inference + batch_size = BERT_BATCH_SIZE + else: + raise Exception( + "ERROR: modelName %s is not supported. Choose from %s" + % (modelName, SUPPORTED_MODELS) + ) + + # Prepare model for inference based on precision (FP32, BF16, INT8) + if "INT8" == dataType: + # Quantize model to INT8 if needed (one time) + model_filename = "quantized_model_%s.pt" % modelName + if not os.path.exists(model_filename): + qconfig = ipex.quantization.default_static_qconfig + prepared_model = prepare(model, qconfig, example_inputs=data, inplace=False) + converted_model = convert(prepared_model) + with torch.no_grad(): + if "resnet50" == modelName: + traced_model = torch.jit.trace(converted_model, data) + elif "bert" == modelName: + traced_model = torch.jit.trace( + converted_model, (d,), check_trace=False, strict=False + ) + else: + raise Exception( + "ERROR: modelName %s is not supported. Choose from %s" + % (modelName, SUPPORTED_MODELS) + ) + traced_model = torch.jit.freeze(traced_model) + traced_model.save(model_filename) + + # Load INT8 model for inference + model = torch.jit.load(model_filename) + model.eval() + model = torch.jit.freeze(model) + elif "BF16" == dataType: + model = ipex.optimize(model, dtype=torch.bfloat16) + with torch.no_grad(): + with torch.cpu.amp.autocast(): + if "resnet50" == modelName: + model = torch.jit.trace(model, data) + elif "bert" == modelName: + model = torch.jit.trace( + model, (d,), check_trace=False, strict=False + ) + else: + raise Exception( + "ERROR: modelName %s is not supported. Choose from %s" + % (modelName, SUPPORTED_MODELS) + ) + model = torch.jit.freeze(model) + else: # FP32 + with torch.no_grad(): + if "resnet50" == modelName: + model = torch.jit.trace(model, data) + elif "bert" == modelName: + model = torch.jit.trace(model, (d,), check_trace=False, strict=False) + else: + raise Exception( + "ERROR: modelName %s is not supported. 
Choose from %s" + % (modelName, SUPPORTED_MODELS) + ) + model = torch.jit.freeze(model) + + # Run inference + with torch.no_grad(): + if "BF16" == dataType: + with torch.cpu.amp.autocast(): + # Warm up + for i in range(5): + model(data) + + # Measure latency + start_time = time() + model(data) + end_time = time() + else: + # Warm up + for i in range(5): + model(data) + + # Measure latency + start_time = time() + model(data) + end_time = time() + inference_time = end_time - start_time + print("Inference on batch size %d took %.3f seconds" % (batch_size, inference_time)) + + return inference_time + + +""" +Perform all types of inference in main function + +Inference run cases for both Resnet50 and BERT +1) INT8 using AVX512_CORE_VNNI +""" + + +def main(): + # ResNet50 + resnet_model = models.resnet50(pretrained=True) + resnet_data = torch.rand(RESNET_BATCH_SIZE, 3, 224, 224) + resnet_model.eval() + int8_with_vnni_resnet_inference_time = runInference( + resnet_model, resnet_data, modelName="resnet50", dataType="INT8", amx=False + ) + + # BERT + bert_model = BertModel.from_pretrained("bert-base-uncased") + # torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased') + bert_data = torch.randint( + bert_model.config.vocab_size, size=[BERT_BATCH_SIZE, BERT_SEQ_LENGTH] + ) + bert_model.eval() + int8_with_vnni_bert_inference_time = runInference( + bert_model, bert_data, modelName="bert", dataType="INT8", amx=False + ) + + +if __name__ == "__main__": + main() diff --git a/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py b/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py new file mode 100644 index 000000000..4aa479161 --- /dev/null +++ b/examples/cpu/inference/python/jupyter-notebooks/python/resnet50.py @@ -0,0 +1,67 @@ +import torch +import torchvision.models as models + + +def inference(model, data): + with torch.no_grad(): + # warm up + for _ in range(100): + model(data) + + # measure + import time + + start = time.time() + for _ in range(100): + output = model(data) + end = time.time() + print("Inference took {:.2f} ms in average".format((end - start) / 100 * 1000)) + + +def main(args): + model = models.resnet50(pretrained=False) + model.eval() + + data = torch.rand(1, 3, 224, 224) + + import intel_extension_for_pytorch as ipex + + model = model.to(memory_format=torch.channels_last) + data = data.to(memory_format=torch.channels_last) + + if args.dtype == "float32": + model = ipex.optimize(model, dtype=torch.float32) + elif args.dtype == "bfloat16": + model = ipex.optimize(model, dtype=torch.bfloat16) + else: # int8 + from intel_extension_for_pytorch.quantization import prepare, convert + + qconfig = ipex.quantization.default_static_qconfig + model = prepare(model, qconfig, example_inputs=data, inplace=False) + + # calibration + n_iter = 100 + for i in range(n_iter): + model(data) + + model = convert(model) + + with torch.cpu.amp.autocast(enabled=args.dtype == "bfloat16"): + if args.torchscript: + with torch.no_grad(): + model = torch.jit.trace(model, data) + model = torch.jit.freeze(model) + + inference(model, data) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument( + "--dtype", default="float32", choices=["float32", "bfloat16", "int8"] + ) + parser.add_argument("--torchscript", default=False, action="store_true") + + main(parser.parse_args()) diff --git a/examples/cpu/inference/python/README.md b/examples/cpu/inference/python/python-scripts/README.md similarity index 97% rename 
from examples/cpu/inference/python/README.md rename to examples/cpu/inference/python/python-scripts/README.md index 044dc73e2..b20cf38c4 100644 --- a/examples/cpu/inference/python/README.md +++ b/examples/cpu/inference/python/python-scripts/README.md @@ -1,7 +1,7 @@ ๏ปฟ# Model Inference with Intelยฎ Extension for PyTorch\* Optimizations We provided examples about how to use Intelยฎ Extension for PyTorch\* to accelerate model inference. -The `ipex.optimize` function of Intelยฎ Extension for PyTorch* applies optimizations to the model, bringing additional performance boosts. +The `ipex.optimize` function of Intelยฎ Extension for PyTorch\* applies optimizations to the model, bringing additional performance boosts. For both computer vision workloads and NLP workloads, we recommend applying the `ipex.optimize` function against the model object. ## Environment Setup diff --git a/examples/cpu/inference/python/bert_eager_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/bert_eager_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/bert_eager_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/bert_eager_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/bert_general_inference_script.py b/examples/cpu/inference/python/python-scripts/bert_general_inference_script.py similarity index 100% rename from examples/cpu/inference/python/bert_general_inference_script.py rename to examples/cpu/inference/python/python-scripts/bert_general_inference_script.py diff --git a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/int8_deployment.py b/examples/cpu/inference/python/python-scripts/int8_deployment.py similarity index 100% rename from 
examples/cpu/inference/python/int8_deployment.py rename to examples/cpu/inference/python/python-scripts/int8_deployment.py diff --git a/examples/cpu/inference/python/int8_quantization_dynamic.py b/examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py similarity index 100% rename from examples/cpu/inference/python/int8_quantization_dynamic.py rename to examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py diff --git a/examples/cpu/inference/python/int8_quantization_static.py b/examples/cpu/inference/python/python-scripts/int8_quantization_static.py similarity index 100% rename from examples/cpu/inference/python/int8_quantization_static.py rename to examples/cpu/inference/python/python-scripts/int8_quantization_static.py diff --git a/examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/resnet50_general_inference_script.py b/examples/cpu/inference/python/python-scripts/resnet50_general_inference_script.py similarity index 100% rename from examples/cpu/inference/python/resnet50_general_inference_script.py rename to examples/cpu/inference/python/python-scripts/resnet50_general_inference_script.py diff --git a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py diff --git a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py b/examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py similarity index 100% rename from examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py rename to examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py diff --git a/examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py b/examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py similarity index 100% rename from examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py rename to examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py diff --git a/examples/cpu/training/README.md b/examples/cpu/training/python-scripts/README.md similarity index 74% rename from examples/cpu/training/README.md rename 
to examples/cpu/training/python-scripts/README.md index 2169e8df9..066425954 100644 --- a/examples/cpu/training/README.md +++ b/examples/cpu/training/python-scripts/README.md @@ -23,22 +23,6 @@ git clone https://github.com/intel/intel-extension-for-pytorch.git cd intel-extension-for-pytorch/examples/cpu/training ``` -Running ResNet50 Float32 single precision training example: - -```bash -python single_instance_training_fp32.py -``` - -We provided support for BFloat16 half precision training. -Please refer to [Automatic Mixed Precision (AMP) introduction](https://pytorch.org/docs/stable/amp.html) for more details. -BFloat16 calculations are further accelerated on the processors supporting [Intelยฎ Advanced Matrix Extensions (AMX)](https://en.wikipedia.org/wiki/Advanced_Matrix_Extensions) instructions. - -Running ResNet50 BFloat16 half precision training example: - -```bash -python single_instance_training_bf16.py -``` - Running ResNet50 distributed training example: ```bash @@ -49,4 +33,4 @@ ipexrun --nnodes 1 distributed_data_parallel_training.py Please check [the training examples in Intelยฎ Extension for PyTorch\* online document](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/examples.html#training) for more details. -For more information and examples about distributed training via PyTorch\* DDP, please visit [oneAPI Collective Communications Library Bindings for Pytorch\* Github repository](https://github.com/intel/torch-ccl). \ No newline at end of file +For more information and examples about distributed training via PyTorch\* DDP, please visit [oneAPI Collective Communications Library Bindings for Pytorch\* Github repository](https://github.com/intel/torch-ccl). diff --git a/examples/cpu/training/distributed_data_parallel_training.py b/examples/cpu/training/python-scripts/distributed_data_parallel_training.py similarity index 100% rename from examples/cpu/training/distributed_data_parallel_training.py rename to examples/cpu/training/python-scripts/distributed_data_parallel_training.py diff --git a/examples/cpu/training/single_instance_training_bf16.py b/examples/cpu/training/single_instance_training_bf16.py deleted file mode 100644 index fa596e686..000000000 --- a/examples/cpu/training/single_instance_training_bf16.py +++ /dev/null @@ -1,51 +0,0 @@ -import torch -import torchvision -import intel_extension_for_pytorch as ipex - -LR = 0.001 -DOWNLOAD = True -DATA = "datasets/cifar10/" - -transform = torchvision.transforms.Compose( - [ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), - ] -) -train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, -) -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128) - -model = torchvision.models.resnet50() -criterion = torch.nn.CrossEntropyLoss() -optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9) -model.train() - -model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16) -# Uncomment the code below to enable beta feature `torch.compile` -# model = torch.compile(model, backend="ipex") - -for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - # Note: bf16 training requires amp.autocast() context # noqa F401 - with torch.cpu.amp.autocast(): - output = model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - print(batch_idx) - 
-torch.save( - { - "model_state_dict": model.state_dict(), - "optimizer_state_dict": optimizer.state_dict(), - }, - "checkpoint.pth", -) - -print("Execution finished") diff --git a/examples/cpu/training/single_instance_training_fp32.py b/examples/cpu/training/single_instance_training_fp32.py deleted file mode 100644 index ae2b970ad..000000000 --- a/examples/cpu/training/single_instance_training_fp32.py +++ /dev/null @@ -1,49 +0,0 @@ -import torch -import torchvision -import intel_extension_for_pytorch as ipex - -LR = 0.001 -DOWNLOAD = True -DATA = "datasets/cifar10/" - -transform = torchvision.transforms.Compose( - [ - torchvision.transforms.Resize((224, 224)), - torchvision.transforms.ToTensor(), - torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), - ] -) -train_dataset = torchvision.datasets.CIFAR10( - root=DATA, - train=True, - transform=transform, - download=DOWNLOAD, -) -train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=128) - -model = torchvision.models.resnet50() -criterion = torch.nn.CrossEntropyLoss() -optimizer = torch.optim.SGD(model.parameters(), lr=LR, momentum=0.9) -model.train() - -model, optimizer = ipex.optimize(model, optimizer=optimizer) -# Uncomment the code below to enable beta feature `torch.compile` -# model = torch.compile(model, backend="ipex") - -for batch_idx, (data, target) in enumerate(train_loader): - optimizer.zero_grad() - output = model(data) - loss = criterion(output, target) - loss.backward() - optimizer.step() - print(batch_idx) - -torch.save( - { - "model_state_dict": model.state_dict(), - "optimizer_state_dict": optimizer.state_dict(), - }, - "checkpoint.pth", -) - -print("Execution finished") diff --git a/scripts/build_doc.sh b/scripts/build_doc.sh index a34293801..8bc92cdba 100644 --- a/scripts/build_doc.sh +++ b/scripts/build_doc.sh @@ -173,25 +173,23 @@ parse_example() { cp ${MDEXAMPLE} tutorials/examples.md.bk if [[ ${DEVICE} == "cpu" ]]; then - parse_example "../examples/cpu/training/single_instance_training_fp32.py" ${MDEXAMPLE} "(marker_train_single_fp32_complete)" "python" - parse_example "../examples/cpu/training/single_instance_training_bf16.py" ${MDEXAMPLE} "(marker_train_single_bf16_complete)" "python" - parse_example "../examples/cpu/training/distributed_data_parallel_training.py" ${MDEXAMPLE} "(marker_train_ddp_complete)" "python" - parse_example "../examples/cpu/inference/python/resnet50_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" - parse_example "../examples/cpu/inference/python/bert_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" - parse_example "../examples/cpu/inference/python/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" - parse_example "../examples/cpu/inference/python/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" - parse_example "../examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_fp32)" "python" - parse_example "../examples/cpu/inference/python/bert_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_fp32)" "python" - parse_example "../examples/cpu/inference/python/resnet50_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" - parse_example "../examples/cpu/inference/python/bert_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" - parse_example 
"../examples/cpu/inference/python/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" - parse_example "../examples/cpu/inference/python/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" - parse_example "../examples/cpu/inference/python/resnet50_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_bf16)" "python" - parse_example "../examples/cpu/inference/python/bert_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_bf16)" "python" + parse_example "../examples/cpu/training/python-scripts/distributed_data_parallel_training.py" ${MDEXAMPLE} "(marker_train_ddp_complete)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_fp32)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_eager_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/resnet50_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_dynamo_bf16)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/bert_torchdynamo_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_dynamo_bf16)" "python" parse_example "../examples/cpu/features/fast_bert/fast_bert_inference_bf16.py" ${MDEXAMPLE} "(marker_feature_fastbert_bf16)" "python" - parse_example "../examples/cpu/inference/python/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" - parse_example "../examples/cpu/inference/python/int8_quantization_dynamic.py" ${MDEXAMPLE} "(marker_int8_dynamic)" "python" - parse_example "../examples/cpu/inference/python/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/int8_quantization_dynamic.py" ${MDEXAMPLE} "(marker_int8_dynamic)" "python" + parse_example "../examples/cpu/inference/python/python-scripts/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" parse_example 
"../examples/cpu/features/llm/llm_optimize.py" ${MDEXAMPLE} "(marker_llm_optimize)" "python" parse_example "../examples/cpu/features/llm/llm_optimize_smoothquant.py" ${MDEXAMPLE} "(marker_llm_optimize_sq)" "python" parse_example "../examples/cpu/features/llm/llm_optimize_woq.py" ${MDEXAMPLE} "(marker_llm_optimize_woq)" "python" @@ -207,24 +205,24 @@ if [[ ${DEVICE} == "cpu" ]]; then parse_example "../examples/cpu/features/graph_optimization/int8.py" tutorials/features/graph_optimization.md "(marker_feature_graph_optimization_int8)" "python" parse_example "../examples/cpu/features/graph_optimization/folding.py" tutorials/features/graph_optimization.md "(marker_feature_graph_optimization_folding)" "python" elif [[ ${DEVICE} == "gpu" ]]; then - parse_example "../examples/gpu/training/single_instance_training_fp32.py" ${MDEXAMPLE} "(marker_train_single_fp32_complete)" "python" - parse_example "../examples/gpu/training/single_instance_training_bf16.py" ${MDEXAMPLE} "(marker_train_single_bf16_complete)" "python" - parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" - parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" - parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" - parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" - parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" - parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" - parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" - parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" - parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp16)" "python" - parse_example "../examples/gpu/inference/python/bert_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp16)" "python" - parse_example "../examples/gpu/inference/python/resnet50_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp16)" "python" - parse_example "../examples/gpu/inference/python/bert_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp16)" "python" - parse_example "../examples/gpu/inference/python/resnet50_imperative_mode_inference_fp32_alt.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32_alt)" "python" - # parse_example "../examples/gpu/inference/python/int8_calibration_static_imperative.py" ${MDEXAMPLE} "(marker_int8_static_imperative)" "python" - parse_example "../examples/gpu/inference/python/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" - # parse_example "../examples/gpu/inference/python/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" + parse_example "../examples/gpu/training/python-scripts/single_instance_training_fp32.py" ${MDEXAMPLE} "(marker_train_single_fp32_complete)" "python" + parse_example "../examples/gpu/training/python-scripts/single_instance_training_bf16.py" ${MDEXAMPLE} 
"(marker_train_single_bf16_complete)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp32)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp32)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp32.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp32)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_bf16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_bf16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_bf16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_bf16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_bf16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_imperative_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_imp_fp16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_rn50_ts_fp16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/bert_torchscript_mode_inference_fp16.py" ${MDEXAMPLE} "(marker_inf_bert_ts_fp16)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/resnet50_imperative_mode_inference_fp32_alt.py" ${MDEXAMPLE} "(marker_inf_rn50_imp_fp32_alt)" "python" + # parse_example "../examples/gpu/inference/python/python-scripts/int8_calibration_static_imperative.py" ${MDEXAMPLE} "(marker_int8_static_imperative)" "python" + parse_example "../examples/gpu/inference/python/python-scripts/int8_quantization_static.py" ${MDEXAMPLE} "(marker_int8_static)" "python" + # parse_example "../examples/gpu/inference/python/python-scripts/int8_deployment.py" ${MDEXAMPLE} "(marker_int8_deploy)" "python" parse_example "../examples/gpu/inference/cpp/example-app/example-app.cpp" ${MDEXAMPLE} "(marker_cppsdk_sample_app)" "cpp" parse_example "../examples/gpu/inference/cpp/example-app/CMakeLists.txt" ${MDEXAMPLE} "(marker_cppsdk_cmake_app)" "cmake" parse_example "../examples/gpu/inference/cpp/example-usm/example-usm.cpp" ${MDEXAMPLE} "(marker_cppsdk_sample_usm)" "cpp" From 1707152edd4af04d083fae0729d8a65928d49288 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Thu, 6 Jun 2024 21:06:46 -0700 Subject: [PATCH 105/199] enable docker support for multi-nodes LLM inference via deepspeed (#2664) * first change change for docker build from source add run scaling script mutli-node config fix for master address fix some cpu affinity issue add a error handling fix an issue for run.py Update README.md add build arg for custom sshd port Rename run_scaling.sh to .run_scaling.sh hidden the run_scaling.sh since it is not ready. 
update dockerfile to use native python env Add files via upload add passwordless ssh connection fixes for run_scaling.sh Update README.md for multinodes run change nodefile to hostfile.txt according to README update files for bug fixes * Update README.md according to Daisy's feedback * move commands into code blocks * fix lint issues --------- Co-authored-by: Jing Xu --- examples/cpu/inference/python/llm/README.md | 31 ++++- examples/cpu/inference/python/llm/run.py | 6 +- .../inference/python/llm/tools/run_scaling.sh | 112 ++++++++++++++++++ 3 files changed, 146 insertions(+), 3 deletions(-) create mode 100644 examples/cpu/inference/python/llm/tools/run_scaling.sh diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index 7123f243e..3c9d8782a 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -94,7 +94,10 @@ git submodule sync git submodule update --init --recursive # Build an image with the provided Dockerfile by compiling Intelยฎ Extension for PyTorch\* from source -DOCKER_BUILDKIT=1 docker build -f examples/cpu/inference/python/llm/Dockerfile --build-arg COMPILE=ON -t ipex-llm:main . +# To have a custom ssh server port for multi-nodes run, please add --build-arg PORT_SSH= ex: 2345, otherwise use the default 22 SSH port +DOCKER_BUILDKIT=1 docker build -f examples/cpu/inference/python/llm/Dockerfile --build-arg COMPILE=ON --build-arg PORT_SSH=2345 -t ipex-llm:main . + + # Run the container with command below docker run --rm -it --privileged ipex-llm:main bash @@ -264,6 +267,32 @@ deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` deepspeed --num_accelerators 2 --master_addr `hostname -I | sed -e 's/\s.*$//'` --bind_cores_to_rank run_accuracy_with_deepspeed.py --model meta-llama/Llama-2-7b-hf --ipex-weight-only-quantization --weight-dtype INT8 --quant-with-amp --tasks lambada_openai ``` +#### 4.1.2.3 Distributed inference among multiple nodes with TCP + +A bash script (`tools/run_scaling.sh`) is provided to simplify environment configuration and the command launch. + +Steps: + +2. Enter the `llm` directory +3. Create a `hostfile.txt` following [instructions of deepspeed](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) +4. Find out the network interface name used for node communication via `ifconfig` or `ibv_devices` ex : eth0 +5. Open `tools/run_scaling.sh` script to update required information in line 3 to line 11 according to your environment and needs +6. run the command below to run distributed inference among nodes + +```bash +bash tools/run_scaling.sh +``` + +The docker image built in Section 3.1 functions ssh connection for distributed executions across multiple machines via Ethernet. However, it is supposed to be running with 1 single container on each machine. Inside each docker container, multiple inference instances can be launched by the `deepspeed` command. + +Use the command below on all machines to launch the docker containers. This command uses the host network interfaces inside the docker container. Thus, you need to put the host ip addresses into the `hostfile.txt`. Do NOT launch multiple docker containers on one single machine from the same docker image. These docker containers listen on the same machine on the same port, will result in unpredicable ssh connections. 
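+
+For reference, a minimal `hostfile.txt` in the DeepSpeed resource-configuration format linked in the steps above could look like the sketch below; the IP addresses and slot counts are placeholders for your own machines:
+
+```
+192.168.20.11 slots=2
+192.168.20.12 slots=2
+```
+
+With the hostfile prepared, launch one container per machine with the command below.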
+ +```bash +docker run --rm -it --privileged --net host ipex-llm:main bash +``` + +**Note:** For models on HuggingFace require access privileges, you need to run the `huggingface-cli login` command in each docker container to config a HuggingFace access token. + ## 4.2 Detail usage of running LLM models ### 4.2.1 Run generation with one instance diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index e29689b30..e7fe93c85 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -624,7 +624,8 @@ def main(args_in: Optional[List[str]] = None) -> None: shard_cmd.extend( ["--save-path", str(args.output_dir) + str(MODEL_CLASSES[model_type])] ) - shard_cmd.extend(["--local_rank", str(args.local_rank)]) + if args.local_rank is not None: + shard_cmd.extend(["--local_rank", str(args.local_rank)]) print("LLM RUNTIME INFO: sharding model...") result = subprocess.run(shard_cmd) if result.returncode != 0: @@ -652,7 +653,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--num-iter", str(args.num_iter)]) infer_cmd.extend(["--num-warmup", str(args.num_warmup)]) infer_cmd.extend(["--batch-size", str(args.batch_size)]) - infer_cmd.extend(["--local_rank", str(args.local_rank)]) + if args.local_rank is not None: + infer_cmd.extend(["--local_rank", str(args.local_rank)]) if args.greedy: infer_cmd.extend(["--greedy"]) if args.streaming: diff --git a/examples/cpu/inference/python/llm/tools/run_scaling.sh b/examples/cpu/inference/python/llm/tools/run_scaling.sh new file mode 100644 index 000000000..d235df357 --- /dev/null +++ b/examples/cpu/inference/python/llm/tools/run_scaling.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +MASTER_NET_IF=eth0 +model_id=meta-llama/Llama-2-7b-hf +data_type=float32 +batch_size=1 +output=32 +input=32 +num_iter=10 +warmup=2 +ONECCL_NUM_WORKERS=4 ## You could tune the worker number for your workload + +NODEFILE=hostfile.txt +if ! [ -f $NODEFILE ]; then + echo "File does not exist." + exit 0 +fi + +WORKDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +function get_hw_info() +{ + number_threads=`nproc --all` + number_cores=`lscpu -b -p=Core,Socket | grep -v '^#' | sort -u | wc -l` + number_sockets=`grep physical.id /proc/cpuinfo | sort -u | wc -l` + cpu_per_socket=$((number_cores/number_sockets)) +} + +# Config OneCCL +function nw_config() +{ + worker=$1 + omp_worker=$2 + get_hw_info + if [ $number_sockets != 1 ]; then + ccl_cpu_list=$(seq -s, $((cpu_per_socket - worker/number_sockets)) $((cpu_per_socket - 1))),$(seq -s, $((number_cores - worker/number_sockets)) $((number_cores - 1))) + omp_cpu_list=$(seq -s, $((0)) $((omp_worker - 1))),$(seq -s, $((cpu_per_socket)) $((cpu_per_socket + omp_worker -1))) + else + ccl_cpu_list=$(seq -s, $((cpu_per_socket - worker)) $((cpu_per_socket - 1))) + omp_cpu_list=$(seq -s, $((0)) $((omp_worker - 1))) + fi + export CCL_WORKER_AFFINITY=$ccl_cpu_list + export CCL_WORKER_COUNT=$((worker/number_sockets)) + + export CCL_ALLREDUCE=rabenseifner # Other algorithms inlcude nreduce, ring and recursive_doubling. 
Rabenseifner algorithm is more friendly for latency sensitive workload + export CCL_ATL_TRANSPORT=ofi #Other option is mpi +} + +# Create mpi argments +# Assume your ibv_devices output has 4 nics: irdma-cvl01tf2,irdma-cvl02tf2,irdma-cvl11tf2,irdma-cvl12tf2 +function build_launch_args_fi_tcp(){ + PKG_PATH=$1 + margs="--genv CCL_WORKER_COUNT=${CCL_WORKER_COUNT}" + #margs="$margs --genv CCL_MNIC=global" # Select all NICs local for the NUMA node that corresponds to process pinning + margs="$margs --genv CCL_MNIC_COUNT=1" # The maximum number of NICs that should be selected for oneCCL workers. + margs="$margs --genv CCL_MNIC_NAME=${MASTER_NET_IF}" # to control multi-NIC selection by NIC names + margs="$margs --genv CCL_WORKER_AFFINITY=${CCL_WORKER_AFFINITY}" + margs="$margs --genv CCL_ATL_TRANSPORT=$CCL_ATL_TRANSPORT" # Select the transport for inter-process communications + margs="$margs --genv I_MPI_PIN=0" + #margs="$margs --genv FI_LOG_LEVEL=debug" + margs="$margs --genv FI_PROVIDER=tcp" + margs="$margs --genv FI_TCP_IFACE=${MASTER_NET_IF}" + margs="$margs --genv I_MPI_OFI_PROVIDER=tcp" + margs="$margs --genv I_MPI_FABRICS=ofi" + margs="$margs --genv I_MPI_HYDRA_IFACE=${MASTER_NET_IF}" + margs="$margs --genv CCL_KVS_IFACE=${MASTER_NET_IF}" + margs="$margs --genv PDSH_RCMD_TYPE=ssh" +} + +function build_launch_args_fi_psm3(){ + PKG_PATH=$1 + margs="--genv CCL_WORKER_COUNT=${CCL_WORKER_COUNT}" + margs="$margs --genv CCL_ALLREDUCE=${CCL_ALLREDUCE}" + margs="$margs --genv CCL_MNIC=global" + margs="$margs --genv CCL_LOG_LEVEL=debug" + margs="$margs --genv CCL_MNIC_COUNT=1" + margs="$margs --genv CCL_MNIC_NAME=${MASTER_NET_IF}" + margs="$margs --genv CCL_WORKER_AFFINITY=${CCL_WORKER_AFFINITY}" + margs="$margs --genv CCL_ATL_TRANSPORT=ofi" + margs="$margs --genv PSM3_ALLOW_ROUTERS=1" + margs="$margs --genv PSM3_RDMA=1" + margs="$margs --genv PSM3_IDENTIFY=1" + margs="$margs --genv PSM3_RV_MR_CACHE_SIZE=8192" + margs="$margs --genv FI_PROVIDER_PATH=${PKG_PATH}/oneccl_bindings_for_pytorch/lib/" # Specify the location of the installed PSM3 provider, when use torch-ccl the version in torch-ccl enviroment will be used + margs="$margs --genv PSM3_NIC_SPEED=100000" + margs="$margs --genv PSM3_KASSIST_MODE=none" + margs="$margs --genv PSM3_NIC=${MASTER_NET_IF}" + margs="$margs --genv I_MPI_PIN=0" + margs="$margs --genv I_MPI_PIN_PROCESSOR_LIST=1,33" + #margs="$margs --genv FI_LOG_LEVEL=debug" + margs="$margs --genv FI_PROVIDER=psm3" + margs="$margs --genv I_MPI_OFI_PROVIDER=psm3" + margs="$margs --genv FI_TCP_IFACE=${MASTER_NET_IF}" + margs="$margs --genv I_MPI_FABRICS=ofi" + margs="$margs --genv I_MPI_HYDRA_IFACE=${MASTER_NET_IF}" + #margs="$margs --genv PSM3_DEVICES=\'self,nic\'" +} + +# Run +PKG_PATH=$(python -m pip show intel-extension-for-pytorch | grep "Location" | cut -d " " -f 2) +source ${PKG_PATH}/intel_extension_for_pytorch/env/setvars.sh +export MASTER_ADDR=$(ifconfig $MASTER_NET_IF | grep 'inet ' | awk '{print $2}') +export MASTER_PORT=29500 +echo $MASTER_ADDR + +get_hw_info +OMP_NUM_THREADS=$((cpu_per_socket - ONECCL_NUM_WORKERS/number_sockets)) # Leave some cores for CCL worker and leave some cores idle to reduce stragger effect +nw_config $ONECCL_NUM_WORKERS $OMP_NUM_THREADS +build_launch_args_fi_tcp ${PKG_PATH} + +# For example to run bf16 +deepspeed --master_addr=${MASTER_ADDR} --master_port=${MASTER_PORT} --no_ssh_check --hostfile=${NODEFILE} --bind_cores_to_rank --bind_core_list ${omp_cpu_list} --launcher impi --launcher_args " --genv LD_LIBRARY_PATH=$LD_LIBRARY_PATH $margs" 
distributed/run_generation_with_deepspeed.py --model-id $model_id --dtype $data_type --ipex --batch-size $batch_size --benchmark --max-new-tokens ${output} --input-tokens ${input} --token-latency --num-iter ${num_iter} --num-warmup ${warmup} From 8336f0d8f3c12814ac5feb958071e903144b9df1 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Fri, 7 Jun 2024 22:03:07 +0800 Subject: [PATCH 106/199] update oneDNN to 525e7b7791 on rls-v3.5 (#2964) Co-authored-by: WeizhuoZhang-intel --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index e2a639872..e01a42907 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit e2a6398723651c8e09a387d63475c82fbcb0dcf5 +Subproject commit e01a4290724b3b28401b22f63c199b8f289cd3ae From 4027749462a5bb5ece1bcf89fdae463e883e3934 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Sat, 8 Jun 2024 17:10:31 +0900 Subject: [PATCH 107/199] update compile bundle for certain conda env (#2967) --- scripts/compile_bundle.sh | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/scripts/compile_bundle.sh b/scripts/compile_bundle.sh index b4ded6057..0fcd74bcb 100644 --- a/scripts/compile_bundle.sh +++ b/scripts/compile_bundle.sh @@ -136,7 +136,7 @@ if [ ! -z "${MAX_JOBS}" ]; then fi # Install dependencies -python -m pip install cmake==3.28.4 +python -m pip install cmake==3.28.4 make # Compare the torch torchvision and torchaudio version function ver_compare_eq() { @@ -159,30 +159,27 @@ if python -c "import torch; print(torch.__version__)" &> /dev/null; then VER_COMP_AUDIO=$(ver_compare_eq ${torchaudio_version} ${VER_TORCHAUDIO}) fi if [ ${VER_COMP_TORCH} -ne 1 ] || [ ${VER_COMP_VISION} -ne 1 ] || [ ${VER_COMP_AUDIO} -ne 1 ]; then - if [ ${VER_COMP_TORCH} -ne 1 ]; then + if [ ! -z ${VER_COMP_TORCH} ] && [ ${VER_COMP_TORCH} -ne 1 ]; then printf "WARNING: Found installed torch version ${torch_version}, the required version for compiling is ${VER_TORCH}\\n" fi - if [ ${VER_COMP_VISION} -ne 1 ]; then + if [ ! -z ${VER_COMP_VISION} ] && [ ${VER_COMP_VISION} -ne 1 ]; then printf " Found installed torchvision version ${torchvision_version}, the required version for compiling is ${VER_TORCHVISION}\\n" fi - if [ ${VER_COMP_AUDIO} -ne 1 ]; then + if [ ! -z ${VER_COMP_AUDIO} ] && [ ${VER_COMP_AUDIO} -ne 1 ]; then printf " Found installed torchaudio version ${torchaudio_version}, the required version for compiling is ${VER_COMP_AUDIO}\\n" fi - printf "Continue to run the compile script will replace the current torch/torchvision/torchaudio package\\n" + printf "Continue to run the compile script will replace the current torch/torchvision/torchaudio package\\n" printf "Are sure you want to continue the compilation? yes for continue, no for quit. [yes|no]\\n" printf "[yes] >>> " read -r ans ans=$(echo "${ans}" | tr '[:lower:]' '[:upper:]') - if [ "${ans}" != "YES" ] && [ "${ans}" != "Y" ] - then + if [ ! -z ${ans} ] && [ "${ans}" != "YES" ] && [ "${ans}" != "Y" ]; then printf "Aborting compilation\\n" exit 2 fi fi fi - - python -m pip uninstall -y torch torchvision torchaudio intel-extension-for-pytorch oneccl_bind_pt set +e echo ${VER_TORCH} | grep "dev" > /dev/null @@ -221,7 +218,7 @@ ABI=$(python -c "import torch; print(int(torch._C._GLIBCXX_USE_CXX11_ABI))") if [ ${GCC_CONDA} -eq 1 ]; then if [ ${EXIST_CONDA} -gt 0 ]; then echo "Command \"conda\" not found. Exit." 
- exit 2 + exit 2 fi conda install -y sysroot_linux-64 conda install -y gcc==12.3 gxx==12.3 cxx-compiler -c conda-forge @@ -244,12 +241,6 @@ if [[ ! -z ${LDFLAGS} ]]; then function join { local IFS="$1"; shift; echo "$*"; } export LDFLAGS=$(join ' ' "${ldflags[@]}") fi -set +e -command -v make > /dev/null -if [ $? -gt 0 ]; then - python -m pip install make -fi -set -e # LLVM LLVM_ROOT="$(pwd)/llvm-release" @@ -296,14 +287,14 @@ unset LLVM_DIR export LD_LIBRARY_PATH=${LD_LIBRARY_PATH_BK} export PATH=${PATH_BK} python -m pip uninstall -y mkl-static mkl-include -python -m pip install dist/*.whl +python -m pip install --force-reinstall dist/*.whl cd .. # Torch-CCL if [ $((${MODE} & 0x01)) -ne 0 ]; then cd torch-ccl python setup.py clean python setup.py bdist_wheel 2>&1 | tee build.log - python -m pip install dist/*.whl + python -m pip install --force-reinstall dist/*.whl cd .. fi export LD_PRELOAD=$(bash ./intel-extension-for-pytorch/tools/get_libstdcpp_lib.sh) From 46f03403f5f6a34f5d949ca43de7f38f215a03ed Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 11 Jun 2024 12:20:46 +0800 Subject: [PATCH 108/199] add api to disable concat linear in JIT pass (#2959) --- csrc/cpu/jit/auto_opt_config.h | 10 ++ csrc/cpu/jit/fusion_pass.cpp | 6 +- .../csrc/cpu/Module.cpp | 9 ++ tests/cpu/test_jit.py | 92 +++++++++++++++++++ 4 files changed, 115 insertions(+), 2 deletions(-) diff --git a/csrc/cpu/jit/auto_opt_config.h b/csrc/cpu/jit/auto_opt_config.h index 4a1a27286..f73a4c1ca 100644 --- a/csrc/cpu/jit/auto_opt_config.h +++ b/csrc/cpu/jit/auto_opt_config.h @@ -26,6 +26,14 @@ class IPEX_API AutoOptConfig { return jit_repack_for_linear_; } + inline void set_jit_concat_linear(bool jit_concat_linear) { + jit_concat_linear_ = jit_concat_linear; + } + + inline bool get_jit_concat_linear() { + return jit_concat_linear_; + } + private: AutoOptConfig() : jit_fuse_(true), @@ -38,6 +46,7 @@ class IPEX_API AutoOptConfig { // will be the best format. (2) Linear + binary cannot be folded if // we do not do repack, since it is implemented on aten:linear jit_repack_for_linear_(true), + jit_concat_linear_(true), calibration_step_(false), qscheme_(at::QScheme::PER_TENSOR_AFFINE) {} @@ -47,6 +56,7 @@ class IPEX_API AutoOptConfig { bool jit_fuse_; bool jit_repack_for_linear_; + bool jit_concat_linear_; // the flag for one iteration of calibration step whether end or not. 
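+  // Runtime switch for the FrozenConcatLinear step of the JIT fusion pass:
+  // when disabled, linear layers sharing the same input are kept as separate
+  // ops instead of being concatenated. Exposed to Python through the
+  // enable/disable_jit_concat_linear bindings added in Module.cpp.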
bool calibration_step_; at::QScheme qscheme_; diff --git a/csrc/cpu/jit/fusion_pass.cpp b/csrc/cpu/jit/fusion_pass.cpp index 1ea265825..ca056a529 100644 --- a/csrc/cpu/jit/fusion_pass.cpp +++ b/csrc/cpu/jit/fusion_pass.cpp @@ -138,8 +138,10 @@ void IPEXFusionPass(std::shared_ptr& graph) { graph, aten_linear_recorder.use_mkl()); } // concat multi-linear with same input - torch_ipex::jit::FrozenConcatLinear( - graph, aten_linear_recorder.get_records()); + if (AutoOptConfig::singleton().get_jit_concat_linear()) { + torch_ipex::jit::FrozenConcatLinear( + graph, aten_linear_recorder.get_records()); + } graph_rewrite::FrozenLinearFolding(graph); // linear fusion diff --git a/intel_extension_for_pytorch/csrc/cpu/Module.cpp b/intel_extension_for_pytorch/csrc/cpu/Module.cpp index 3852bb961..055b2a53b 100644 --- a/intel_extension_for_pytorch/csrc/cpu/Module.cpp +++ b/intel_extension_for_pytorch/csrc/cpu/Module.cpp @@ -166,6 +166,15 @@ void InitIpexModuleBindings(py::module m) { m.def("get_jit_linear_repack", []() { return AutoOptConfig::singleton().get_jit_repack_for_linear(); }); + m.def("disable_jit_concat_linear", []() { + AutoOptConfig::singleton().set_jit_concat_linear(false); + }); + m.def("enable_jit_concat_linear", []() { + AutoOptConfig::singleton().set_jit_concat_linear(true); + }); + m.def("get_jit_concat_linear", []() { + return AutoOptConfig::singleton().get_jit_concat_linear(); + }); // BF32 py::enum_(m, "FP32MathMode") diff --git a/tests/cpu/test_jit.py b/tests/cpu/test_jit.py index bfab04bf5..2f1f28168 100644 --- a/tests/cpu/test_jit.py +++ b/tests/cpu/test_jit.py @@ -1610,6 +1610,14 @@ def _texpr_enable(self, strategy): finally: torch._C._jit_set_texpr_fuser_enabled(old_texpr_fuser_state) + @contextlib.contextmanager + def _disable_concat_linear(self): + ipex._C.disable_jit_concat_linear() + try: + yield + finally: + ipex._C.enable_jit_concat_linear() + def _test_output( self, base_model, @@ -2011,6 +2019,90 @@ def check_op_count(graph_str, op_names=None): ) self.assertEqual(linear_count_ori_v1, 2) + # Test disable concat linear + origin_model = ModMultLinear(50, 60).eval() + test_val1 = torch.rand([50, 5]) + with self._disable_concat_linear(): + # call mkl path(fp32) + model = ipex.optimize( + origin_model, + dtype=torch.float32, + weights_prepack=False, + ) + ori_res = model(test_val1) + with torch.no_grad(): + model_jit = torch.jit.trace(model, (test_val1)) + graph_ori = str(model_jit.graph_for(test_val1)) + linear_count_ori = check_op_count(graph_ori, ["aten::linear"]) + self.assertEqual(linear_count_ori, 4) + model_jit = torch.jit.freeze(model_jit) + jit_res = model_jit(test_val1) + self.assertEqual(ori_res, jit_res) + graph_opt = str(model_jit.graph_for(test_val1)) + linear_count_ori = check_op_count(graph_opt, ["aten::linear"]) + self.assertEqual(linear_count_ori, 4) + # call prepack mkl path(fp32) + model = ipex.optimize(origin_model, dtype=torch.float32) + ori_res = model(test_val1) + with torch.no_grad(): + model_jit = torch.jit.trace(model, (test_val1)) + graph_ori = str(model_jit.graph_for(test_val1)) + linear_count_ori = check_op_count( + graph_ori, ["ipex_prepack::mkl_sgemm_run"] + ) + self.assertEqual(linear_count_ori, 4) + model_jit = torch.jit.freeze(model_jit) + jit_res = model_jit(test_val1) + self.assertEqual(ori_res, jit_res) + graph_opt = str(model_jit.graph_for(test_val1)) + linear_count_ori = check_op_count( + graph_opt, ["ipex_prepack::mkl_sgemm_run"] + ) + self.assertEqual(linear_count_ori, 4) + + # call onednn path(fp32) + model = ipex.optimize( + 
origin_model, + dtype=torch.float32, + auto_kernel_selection=True, + ) + ori_res = model(test_val1) + with torch.no_grad(): + model_jit = torch.jit.trace(model, (test_val1)) + graph_ori = str(model_jit.graph_for(test_val1)) + linear_count_ori = check_op_count( + graph_ori, ["ipex_prepack::linear_run"] + ) + self.assertEqual(linear_count_ori, 4) + model_jit = torch.jit.freeze(model_jit) + jit_res = model_jit(test_val1) + self.assertEqual(ori_res, jit_res) + graph_opt = str(model_jit.graph_for(test_val1)) + linear_count_ori = check_op_count( + graph_opt, ["ipex_prepack::linear_run"] + ) + self.assertEqual(linear_count_ori, 4) + + model = ipex.optimize(origin_model, dtype=torch.bfloat16) + test_val1 = test_val1.bfloat16() + with torch.cpu.amp.autocast(), torch.no_grad(): + ori_res = model(test_val1) + model_jit = torch.jit.trace(model, (test_val1)) + graph_ori = str(model_jit.graph_for(test_val1)) + linear_count_ori = check_op_count( + graph_ori, ["ipex_prepack::linear_run"] + ) + self.assertEqual(linear_count_ori, 4) + model_jit = torch.jit.freeze(model_jit) + model_jit(test_val1) + graph_opt = str(model_jit.graph_for(test_val1)) + jit_res = model_jit(test_val1) + self.assertEqual(ori_res[1], jit_res[1]) + linear_count_ori = check_op_count( + graph_opt, ["ipex_prepack::linear_run"] + ) + self.assertEqual(linear_count_ori, 4) + def test_add_layernorm(self): for dim in [768, 100]: with torch.no_grad(): From 1b823482c1079830519c3ec5a697870bace4bbc8 Mon Sep 17 00:00:00 2001 From: Ryan Tao <65508217+RanTao123@users.noreply.github.com> Date: Tue, 11 Jun 2024 15:08:43 +0800 Subject: [PATCH 109/199] support activation symmetric quant for WOQ weight INT4 low_p mode INT8 (#2940) * support symmetric quant for activation int8 * clang format * remove unnecessary change * modify according to comment * modify according to comment * clang-format * fix bug * clang-format * set default act mode * fix bugs * fix bug * add annotation * revert code * unnecessary change --- csrc/cpu/aten/kernels/WoqTppKrnl.cpp | 769 ++++++++++++++---- csrc/cpu/tpp/xsmm_functors.h | 16 +- .../llm/single_instance/run_quantization.py | 19 +- .../quantization/_qconfig.py | 4 + tests/cpu/test_quantization_default_recipe.py | 139 ++++ 5 files changed, 795 insertions(+), 152 deletions(-) diff --git a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp index 6bb2a03d5..8ca6442ac 100644 --- a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp @@ -128,10 +128,22 @@ constexpr long LOOP_K_UNROLL = 4; // TODO(jgong5): do not hard-code #define QUANT_A_PER_K_BLOCK 1 #define QUANT_A_PER_M 2 #define QUANT_A_PER_M_K_BLOCK 3 +#define QUANT_A_PER_TENSOR_SYM 4 +#define QUANT_A_PER_K_BLOCK_SYM 5 +#define QUANT_A_PER_M_SYM 6 +#define QUANT_A_PER_M_K_BLOCK_SYM 7 #define QUANT_W_PER_CHANNEL 0 #define QUANT_W_PER_K_BLOCK 1 +// negate elements in a according to b's sign +static inline __m512i _mm512_sign_epi8(__m512i a, __m512i b) { + __m512i zero = _mm512_setzero_si512(); + __mmask64 blt0 = _mm512_movepi8_mask(b); + return _mm512_mask_sub_epi8(a, blt0, zero, a); + ; +} + template struct load_dequant_zp_only_4bit { template @@ -800,7 +812,9 @@ struct GemmMicroKernel< vscales[i] = _mm512_loadu_ps(scales + i * 16); // TODO(jgong5): should we use 512 or two 256 here? 
vzps[i] = combine_m256i(load_zps_4vnni(zps + i * 16)); - vcompensate[i] = _mm512_setzero_epi32(); + if (zp_a) { + vcompensate[i] = _mm512_setzero_epi32(); + } }); compile_time_for::op( @@ -821,13 +835,21 @@ struct GemmMicroKernel< if constexpr (row == 0) { vb[col] = combine_m256i(load_int4_as_int8(pqB[k / 4][col * 16])); vb[col] = _mm512_sub_epi8(vb[col], vzps[col]); - vcompensate[col] = _mm512_dpbusd_epi32(vcompensate[col], ones, vb[col]); + if (zp_a) { + vcompensate[col] = + _mm512_dpbusd_epi32(vcompensate[col], ones, vb[col]); + } if constexpr (PREFETCH_K_DIST > 0) { _mm_prefetch(pqB[(k + PREFETCH_K_DIST) / 4][col * 16], _MM_HINT_T0); } } - - vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]); + if (zp_a) { + vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]); + } else { + auto vsb = _mm512_sign_epi8(vb[col], va); + auto vabsa = _mm512_sign_epi8(va, va); + vc[i] = _mm512_dpbusds_epi32(vc[i], vabsa, vsb); + } }; // Accumulate along k @@ -852,24 +874,34 @@ struct GemmMicroKernel< __m512 vc_float; if constexpr ( quant_a_mode == QUANT_A_PER_TENSOR || - quant_a_mode == QUANT_A_PER_K_BLOCK) { - vc[i] = _mm512_sub_epi32( - vc[i], - _mm512_mullo_epi32(vcompensate[col], _mm512_set1_epi32(*zp_a))); + quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_TENSOR_SYM || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { + if (zp_a) { + vc[i] = _mm512_sub_epi32( + vc[i], + _mm512_mullo_epi32(vcompensate[col], _mm512_set1_epi32(*zp_a))); + } vc_float = _mm512_cvtepi32_ps(vc[i]); vc_float = _mm512_mul_ps(vc_float, _mm512_set1_ps(*scale_a)); - } else if constexpr (quant_a_mode == QUANT_A_PER_M) { - vc[i] = _mm512_sub_epi32( - vc[i], - _mm512_mullo_epi32( - vcompensate[col], _mm512_set1_epi32(*(zp_a + row)))); + } else if constexpr ( + quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { + if (zp_a) { + vc[i] = _mm512_sub_epi32( + vc[i], + _mm512_mullo_epi32( + vcompensate[col], _mm512_set1_epi32(*(zp_a + row)))); + } vc_float = _mm512_cvtepi32_ps(vc[i]); vc_float = _mm512_mul_ps(vc_float, _mm512_set1_ps(*(scale_a + row))); } else { - vc[i] = _mm512_sub_epi32( - vc[i], - _mm512_mullo_epi32( - vcompensate[col], _mm512_set1_epi32(*(zp_a + row * k_groups)))); + if (zp_a) { + vc[i] = _mm512_sub_epi32( + vc[i], + _mm512_mullo_epi32( + vcompensate[col], + _mm512_set1_epi32(*(zp_a + row * k_groups)))); + } vc_float = _mm512_cvtepi32_ps(vc[i]); vc_float = _mm512_mul_ps( vc_float, _mm512_set1_ps(*(scale_a + row * k_groups))); @@ -1509,6 +1541,9 @@ class DequantGemmTPP< static_assert(N % 16 == 0, "N must be a multiple of 16"); TLA_ASSERT(K % 4 == 0, "Kb must be a multiple of 4 for int8 VNNI"); // TODO(jgong5): output fp32 directly + // set is_sym_quant true if quant_a_mode is larger than + // QUANT_A_PER_TENSOR_SYM + constexpr bool is_sym_quant = quant_a_mode >= QUANT_A_PER_TENSOR_SYM; pgemm = new TBrgemmTPP( M, N, @@ -1521,7 +1556,8 @@ class DequantGemmTPP< /*ACC*/ 0, /*transA*/ false, unroll_hint, - /*b_vnni*/ true); + /*b_vnni*/ true, + is_sym_quant); } ~DequantGemmTPP() { @@ -1552,12 +1588,22 @@ class DequantGemmTPP< int32_t* zp_a_m; if constexpr ( quant_a_mode == QUANT_A_PER_M || - quant_a_mode == QUANT_A_PER_M_K_BLOCK) { + quant_a_mode == QUANT_A_PER_M_K_BLOCK || + quant_a_mode == QUANT_A_PER_M_SYM || + quant_a_mode == QUANT_A_PER_M_K_BLOCK_SYM) { scale_a_m = scale_a + m * k_groups; - zp_a_m = zp_a + m * k_groups; + if constexpr ( + quant_a_mode == QUANT_A_PER_M || + quant_a_mode == QUANT_A_PER_M_K_BLOCK) { + zp_a_m = zp_a + m * k_groups; + } } else { scale_a_m = scale_a; - zp_a_m = zp_a; 
+ if constexpr ( + quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_TENSOR) { + zp_a_m = zp_a; + } } enumerate_dispatcher::call( block_m, @@ -1638,15 +1684,26 @@ class DequantGemmTPP< int32_t* zp_a_m; if constexpr ( quant_a_mode == QUANT_A_PER_M || - quant_a_mode == QUANT_A_PER_M_K_BLOCK) { + quant_a_mode == QUANT_A_PER_M_K_BLOCK || + quant_a_mode == QUANT_A_PER_M_SYM || + quant_a_mode == QUANT_A_PER_M_K_BLOCK_SYM) { scale_a_m = scale_a + m * k_groups; - zp_a_m = zp_a + m * k_groups; + if (zp_a) { + zp_a_m = zp_a + m * k_groups; + } } else { scale_a_m = scale_a; - zp_a_m = zp_a; + if (zp_a) { + zp_a_m = zp_a; + } + } + float c = 0; + if (zp_a_m) { + c = (qC[m][n] - compensation[n] * (*zp_a_m)) * (*scale_a_m) * + scales[n]; + } else { + c = (qC[m][n]) * (*scale_a_m) * scales[n]; } - float c = (qC[m][n] - compensation[n] * (*zp_a_m)) * (*scale_a_m) * - scales[n]; if constexpr (ACC) { C[m * ldc + n] += c; } else { @@ -2128,7 +2185,7 @@ void qlinear_woq_affine_impl( !(std::is_same()) || (std::is_same()), "T must be TComp if T is uint8_t"); - bool no_x_buf = std::is_same(); + bool no_x_buf = std::is_same() || std::is_same(); bool no_y_buf = std::is_same() && std::is_same() && k_splits == 1; @@ -2320,10 +2377,12 @@ void qlinear_woq_affine_impl( IPEX_KCB_BLOCK_SIZE, str_a); - auto pcvt_x_tpp = std::is_same() + auto pcvt_x_tpp = + std::is_same() || std::is_same() ? nullptr : std::make_shared>(BLOCK_M, Kb, K, Kb); - auto pcvt_x_rem_tpp = std::is_same() + auto pcvt_x_rem_tpp = + std::is_same() || std::is_same() ? nullptr : std::make_shared>( BLOCK_M_rem, Kb, K, Kb); @@ -2365,21 +2424,34 @@ void qlinear_woq_affine_impl( TLA_ASSERT( !sym_quant, "Calculation of uint8 does not support symmetric quant."); - if constexpr (quant_a_mode == QUANT_A_PER_TENSOR) { + if constexpr ( + quant_a_mode == QUANT_A_PER_TENSOR || + quant_a_mode == QUANT_A_PER_TENSOR_SYM) { scale_a = scales_a_ptr; - zp_a = zps_a_ptr; + if constexpr (quant_a_mode == QUANT_A_PER_TENSOR) { + zp_a = zps_a_ptr; + } } else if constexpr ( - quant_a_mode == QUANT_A_PER_K_BLOCK) { + quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { scale_a = scales_a_ptr + quant_offset; - zp_a = zps_a_ptr + quant_offset; - } else if constexpr (quant_a_mode == QUANT_A_PER_M) { + if constexpr (quant_a_mode == QUANT_A_PER_K_BLOCK) { + zp_a = zps_a_ptr + quant_offset; + } + } else if constexpr ( + quant_a_mode == QUANT_A_PER_M || + quant_a_mode == QUANT_A_PER_M_SYM) { scale_a = scales_a_ptr + m; - zp_a = zps_a_ptr + m; + if constexpr (quant_a_mode == QUANT_A_PER_M) { + zp_a = zps_a_ptr + m; + } k_groups = 1; } else { scale_a = scales_a_ptr + m * quant_k_blocks + quant_offset; - zp_a = zps_a_ptr + m * quant_k_blocks + quant_offset; + if constexpr (quant_a_mode == QUANT_A_PER_M_K_BLOCK) { + zp_a = zps_a_ptr + m * quant_k_blocks + quant_offset; + } k_groups = quant_k_blocks; } } @@ -2547,21 +2619,35 @@ void qlinear_woq_affine_impl( TLA_ASSERT( !sym_quant, "Calculation of uint8 does not support symmetric quant."); - if constexpr (quant_a_mode == QUANT_A_PER_TENSOR) { + if constexpr ( + quant_a_mode == QUANT_A_PER_TENSOR || + quant_a_mode == QUANT_A_PER_TENSOR_SYM) { scale_a = scales_a_ptr; - zp_a = zps_a_ptr; + if constexpr (quant_a_mode == QUANT_A_PER_TENSOR) { + zp_a = zps_a_ptr; + } } else if constexpr ( - quant_a_mode == QUANT_A_PER_K_BLOCK) { + quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { scale_a = scales_a_ptr + quant_offset; - zp_a = zps_a_ptr + quant_offset; - } else if 
constexpr (quant_a_mode == QUANT_A_PER_M) { + if constexpr (quant_a_mode == QUANT_A_PER_K_BLOCK) { + zp_a = zps_a_ptr + quant_offset; + } + } else if constexpr ( + quant_a_mode == QUANT_A_PER_M || + quant_a_mode == QUANT_A_PER_M_SYM) { scale_a = scales_a_ptr + m; - zp_a = zps_a_ptr + m; + if constexpr (quant_a_mode == QUANT_A_PER_M) { + zp_a = zps_a_ptr + m; + } k_groups = 1; } else { scale_a = scales_a_ptr + m * quant_k_blocks + quant_offset; - zp_a = zps_a_ptr + m * quant_k_blocks + quant_offset; + if constexpr (quant_a_mode == QUANT_A_PER_M_K_BLOCK) { + zp_a = + zps_a_ptr + m * quant_k_blocks + quant_offset; + } k_groups = quant_k_blocks; } } @@ -2900,21 +2986,24 @@ template void compute_int8_qparams_per_tensor( const at::Tensor& t, float* scale, - int32_t* zp) { + int32_t* zp, + bool is_sym_quant) { auto [t_min, t_max] = at::aminmax(t); auto min = t_min.item(); auto max = t_max.item(); min = std::min(min, 0.0f); max = std::max(max, 0.0f); - *scale = (max - min) / 255.0f; - *zp = (int32_t)(-std::nearbyint(min / *scale)); + *scale = is_sym_quant ? std::max(fabs(max), fabs(min)) / 127.0f + : (max - min) / 255.0f; + *zp = is_sym_quant ? 0 : (int32_t)(-std::nearbyint(min / *scale)); } template <> void compute_int8_qparams_per_tensor( const at::Tensor& t, float* scale, - int32_t* zp) { + int32_t* zp, + bool is_sym_quant) { auto in_ptr0 = t.data_ptr(); auto n = t.numel(); auto K = t.size(-1); @@ -2968,12 +3057,15 @@ void compute_int8_qparams_per_tensor( } auto min_elem_ptr = std::min_element(min_vals, min_vals + thread_used); auto max_elem_ptr = std::max_element(max_vals, max_vals + thread_used); - *scale = (*max_elem_ptr - *min_elem_ptr) / 255.0f; - *zp = (int32_t)(-std::nearbyint(*min_elem_ptr / *scale)); + *scale = is_sym_quant + ? std::max(fabs(*max_elem_ptr), fabs(*min_elem_ptr)) / 127.0f + : (*max_elem_ptr - *min_elem_ptr) / 255.0f; + *zp = is_sym_quant ? 0 : (int32_t)(-std::nearbyint(*min_elem_ptr / *scale)); } else { auto [min_val, max_val] = compute_block(in_ptr0, 0, n); - *scale = (max_val - min_val) / 255.0f; - *zp = (int32_t)(-std::nearbyint(min_val / *scale)); + *scale = is_sym_quant ? std::max(fabs(max_val), fabs(min_val)) / 127.0f + : (max_val - min_val) / 255.0f; + *zp = is_sym_quant ? 0 : (int32_t)(-std::nearbyint(min_val / *scale)); } } @@ -2981,7 +3073,8 @@ template <> void compute_int8_qparams_per_tensor( const at::Tensor& t, float* scale, - int32_t* zp) { + int32_t* zp, + bool is_sym_quant) { auto in_ptr0 = t.data_ptr(); auto n = t.numel(); auto K = t.size(-1); @@ -3039,12 +3132,15 @@ void compute_int8_qparams_per_tensor( } auto min_elem_ptr = std::min_element(min_vals, min_vals + thread_used); auto max_elem_ptr = std::max_element(max_vals, max_vals + thread_used); - *scale = (*max_elem_ptr - *min_elem_ptr) / 255.0f; - *zp = (int32_t)(-std::nearbyint(*min_elem_ptr / *scale)); + *scale = is_sym_quant + ? std::max(fabs(*max_elem_ptr), fabs(*min_elem_ptr)) / 127.0f + : (*max_elem_ptr - *min_elem_ptr) / 255.0f; + *zp = is_sym_quant ? 0 : (int32_t)(-std::nearbyint(*min_elem_ptr / *scale)); } else { auto [min_val, max_val] = compute_block(in_ptr0, 0, n); - *scale = (max_val - min_val) / 255.0f; - *zp = (int32_t)(-std::nearbyint(min_val / *scale)); + *scale = is_sym_quant ? std::max(fabs(max_val), fabs(min_val)) / 127.0f + : (max_val - min_val) / 255.0f; + *zp = is_sym_quant ? 
0 : (int32_t)(-std::nearbyint(min_val / *scale)); } } @@ -3052,19 +3148,24 @@ template std::pair compute_int8_qparams_per_block( const at::Tensor& t, int quant_block_k, - int quant_a_mode) { + int quant_a_mode, + bool is_sym_quant) { auto K = t.size(-1); auto n = t.numel(); auto M = n / K; auto t_reshape = t.reshape({M, K}); - if (quant_a_mode == QUANT_A_PER_M) { + if (quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { auto grouped_min = std::get<0>(t_reshape.min(-1)); auto grouped_max = std::get<0>(t_reshape.max(-1)); auto zeros = at::zeros_like(grouped_min); - auto min = at::minimum(grouped_min, zeros); - auto max = at::maximum(grouped_max, zeros); - auto scales = (max - min) / 255; - auto zps = -at::round(min / scales); + auto min = quant_a_mode == QUANT_A_PER_M ? at::minimum(grouped_min, zeros) + : grouped_min; + auto max = quant_a_mode == QUANT_A_PER_M ? at::maximum(grouped_max, zeros) + : grouped_max; + auto scales = is_sym_quant + ? at::maximum(at::absolute(max), at::absolute(min)) / 127.0f + : (max - min) / 255.0f; + auto zps = is_sym_quant ? at::Tensor() : -at::round(min / scales); return std::make_pair( std::move(scales.to(c10::kFloat)), std::move(zps.to(c10::kInt))); } @@ -3075,7 +3176,8 @@ std::pair compute_int8_qparams_per_block( .index({at::indexing::Slice(), at::indexing::Slice(0, K - k_rem)}) .view({M, K / quant_block_k, quant_block_k}); at::Tensor grouped_min, grouped_max; - if (quant_a_mode == QUANT_A_PER_K_BLOCK) { + if (quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { grouped_min = std::get<0>(std::get<0>(grouped.min(-1)).min(0)); grouped_max = std::get<0>(std::get<0>(grouped.max(-1)).max(0)); } else { @@ -3085,15 +3187,18 @@ std::pair compute_int8_qparams_per_block( auto zeros = at::zeros_like(grouped_min); auto min = at::minimum(grouped_min, zeros); auto max = at::maximum(grouped_max, zeros); - auto scales = (max - min) / 255.0f; - auto zps = -at::round(min / scales); + auto scales = is_sym_quant + ? at::maximum(at::absolute(max), at::absolute(min)) / 127.0f + : (max - min) / 255.0f; + auto zps = is_sym_quant ? at::Tensor() : -at::round(min / scales); if (k_rem) { auto grouped_rem = t_reshape .index({at::indexing::Slice(), at::indexing::Slice(K - k_rem, K)}) .view({M, 1, k_rem}); at::Tensor grouped_rem_min, grouped_rem_max; - if (quant_a_mode == QUANT_A_PER_K_BLOCK) { + if (quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { grouped_rem_min = std::get<0>(std::get<0>(grouped_rem.min(-1)).min(0)); grouped_rem_max = std::get<0>(std::get<0>(grouped_rem.max(-1)).max(0)); } else { @@ -3102,10 +3207,14 @@ std::pair compute_int8_qparams_per_block( } auto min_rem = at::minimum(grouped_rem_min, at::tensor({0})); auto max_rem = at::maximum(grouped_rem_max, at::tensor({0})); - auto scales_rem = (max_rem - min_rem) / 255; - auto zps_rem = -at::round(min_rem / scales_rem); + auto scales_rem = is_sym_quant + ? at::maximum(at::absolute(max_rem), at::absolute(min_rem)) / 127.0f + : (max_rem - min_rem) / 255.0f; + auto zps_rem = + is_sym_quant ? at::Tensor() : -at::round(min_rem / scales_rem); scales = at::cat({scales, scales_rem}, -1).contiguous(); - zps = at::cat({zps, zps_rem}, -1).contiguous(); + zps = + is_sym_quant ? 
at::Tensor() : at::cat({zps, zps_rem}, -1).contiguous(); } return std::make_pair( std::move(scales.to(c10::kFloat)), std::move(zps.to(c10::kInt))); @@ -3115,7 +3224,8 @@ template <> std::pair compute_int8_qparams_per_block( const at::Tensor& t, int quant_block_k, - int quant_a_mode) { + int quant_a_mode, + bool is_sym_quant) { auto in_ptr = t.data_ptr(); int K = t.size(-1); int n = t.numel(); @@ -3123,10 +3233,12 @@ std::pair compute_int8_qparams_per_block( int Kc = (K + quant_block_k - 1) / quant_block_k; auto vecsize = at::vec::Vectorized::size(); at::Tensor scales, zps; - if (quant_a_mode == QUANT_A_PER_K_BLOCK) { + if (quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { scales = at::empty({Kc}, t.options().dtype(at::kFloat)); zps = at::empty({Kc}, t.options().dtype(at::kInt)); - } else if (quant_a_mode == QUANT_A_PER_M) { + } else if ( + quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { scales = at::empty({M}, t.options().dtype(at::kFloat)); zps = at::empty({M}, t.options().dtype(at::kInt)); } else { @@ -3134,14 +3246,15 @@ std::pair compute_int8_qparams_per_block( zps = at::empty({M, Kc}, t.options().dtype(at::kInt)); } auto scales_ptr = scales.data_ptr(); - auto zps_ptr = zps.data_ptr(); + auto zps_ptr = is_sym_quant ? nullptr : zps.data_ptr(); auto compute_minmax = [vecsize, scales_ptr, zps_ptr]( at::BFloat16* ptr, int M, int K, int scale_offset, int zp_offset, - int ld) { + int ld, + bool is_sym_quant) { float min_val = std::numeric_limits::infinity(); float max_val = -std::numeric_limits::infinity(); auto in_ptr_ = ptr; @@ -3181,22 +3294,28 @@ std::pair compute_int8_qparams_per_block( return at::vec::maximum(x, y); }, max_vec)); - scales_ptr[scale_offset] = (max_val - min_val) / 255.0f; - zps_ptr[zp_offset] = - (int32_t)(-std::nearbyint(min_val / scales_ptr[scale_offset])); + scales_ptr[scale_offset] = is_sym_quant + ? 
std::max(fabs(max_val), fabs(min_val)) / 128.0f + : (max_val - min_val) / 255.0f; + if (!is_sym_quant) { + zps_ptr[zp_offset] = + (int32_t)(-std::nearbyint(min_val / scales_ptr[scale_offset])); + } }; - if (quant_a_mode == QUANT_A_PER_K_BLOCK) { + if (quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { #pragma omp parallel for for (int kc = 0; kc < Kc; kc++) { int offset = kc * quant_block_k; int block_k = std::min(quant_block_k, K - offset); - compute_minmax(in_ptr + offset, M, block_k, kc, kc, K); + compute_minmax(in_ptr + offset, M, block_k, kc, kc, K, is_sym_quant); } - } else if (quant_a_mode == QUANT_A_PER_M) { + } else if ( + quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { #pragma omp parallel for for (int m = 0; m < M; m++) { int offset = m * K; - compute_minmax(in_ptr + offset, 1, K, m, m, K); + compute_minmax(in_ptr + offset, 1, K, m, m, K, is_sym_quant); } } else { #pragma omp parallel for collapse(2) @@ -3206,7 +3325,8 @@ std::pair compute_int8_qparams_per_block( auto scale_offset = m * Kc + kc; auto zp_offset = m * Kc + kc; int block_k = std::min(quant_block_k, K - kc * quant_block_k); - compute_minmax(in_ptr0, 1, block_k, scale_offset, zp_offset, K); + compute_minmax( + in_ptr0, 1, block_k, scale_offset, zp_offset, K, is_sym_quant); } } } @@ -3218,7 +3338,8 @@ template <> std::pair compute_int8_qparams_per_block( const at::Tensor& t, int quant_block_k, - int quant_a_mode) { + int quant_a_mode, + bool is_sym_quant) { auto in_ptr = t.data_ptr(); int K = t.size(-1); int n = t.numel(); @@ -3226,10 +3347,12 @@ std::pair compute_int8_qparams_per_block( int Kc = (K + quant_block_k - 1) / quant_block_k; auto vecsize = at::vec::Vectorized::size(); at::Tensor scales, zps; - if (quant_a_mode == QUANT_A_PER_K_BLOCK) { + if (quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { scales = at::empty({Kc}, t.options().dtype(at::kFloat)); zps = at::empty({Kc}, t.options().dtype(at::kInt)); - } else if (quant_a_mode == QUANT_A_PER_M) { + } else if ( + quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { scales = at::empty({M}, t.options().dtype(at::kFloat)); zps = at::empty({M}, t.options().dtype(at::kInt)); } else { @@ -3237,14 +3360,15 @@ std::pair compute_int8_qparams_per_block( zps = at::empty({M, Kc}, t.options().dtype(at::kInt)); } auto scales_ptr = scales.data_ptr(); - auto zps_ptr = zps.data_ptr(); + auto zps_ptr = is_sym_quant ? nullptr : zps.data_ptr(); auto compute_minmax = [vecsize, scales_ptr, zps_ptr]( float* ptr, int M, int K, int scale_offset, int zp_offset, - int ld) { + int ld, + bool is_sym_quant) { float min_val = std::numeric_limits::infinity(); float max_val = -std::numeric_limits::infinity(); auto in_ptr_ = ptr; @@ -3280,22 +3404,28 @@ std::pair compute_int8_qparams_per_block( return at::vec::maximum(x, y); }, max_vec)); - scales_ptr[scale_offset] = (max_val - min_val) / 255.0f; - zps_ptr[zp_offset] = - (int32_t)(-std::nearbyint(min_val / scales_ptr[scale_offset])); + scales_ptr[scale_offset] = is_sym_quant + ? 
std::max(fabs(max_val), fabs(min_val)) / 128.0f + : (max_val - min_val) / 255.0f; + if (!is_sym_quant) { + zps_ptr[zp_offset] = + (int32_t)(-std::nearbyint(min_val / scales_ptr[scale_offset])); + } }; - if (quant_a_mode == QUANT_A_PER_K_BLOCK) { + if (quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { #pragma omp parallel for for (int kc = 0; kc < Kc; kc++) { int offset = kc * quant_block_k; int block_k = std::min(quant_block_k, K - offset); - compute_minmax(in_ptr + offset, M, block_k, kc, kc, K); + compute_minmax(in_ptr + offset, M, block_k, kc, kc, K, is_sym_quant); } - } else if (quant_a_mode == QUANT_A_PER_M) { + } else if ( + quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { #pragma omp parallel for for (int m = 0; m < M; m++) { int offset = m * K; - compute_minmax(in_ptr + offset, 1, K, m, m, K); + compute_minmax(in_ptr + offset, 1, K, m, m, K, is_sym_quant); } } else { #pragma omp parallel for collapse(2) @@ -3305,7 +3435,8 @@ std::pair compute_int8_qparams_per_block( auto scale_offset = m * Kc + kc; auto zp_offset = m * Kc + kc; int block_k = std::min(quant_block_k, K - kc * quant_block_k); - compute_minmax(in_ptr0, 1, block_k, scale_offset, zp_offset, K); + compute_minmax( + in_ptr0, 1, block_k, scale_offset, zp_offset, K, is_sym_quant); } } } @@ -3314,22 +3445,30 @@ std::pair compute_int8_qparams_per_block( } template -at::Tensor quantize_per_tensor(const at::Tensor& t, float scale, int32_t zp) { +at::Tensor quantize_per_tensor( + const at::Tensor& t, + float scale, + int32_t zp, + bool is_sym_quant) { // TODO(jgong5): optimize me - auto t_q = t / scale + zp; - t_q = at::clamp(at::round(t_q), 0, 255); - return t_q.to(at::kByte); + auto t_q = is_sym_quant ? t / scale : t / scale + zp; + t_q = is_sym_quant ? at::clamp(at::round(t_q), -128, 127) + : at::clamp(at::round(t_q), 0, 255); + return is_sym_quant ? t_q.to(at::kChar) : t_q.to(at::kByte); } template <> at::Tensor quantize_per_tensor( const at::Tensor& t, float scale, - int32_t zp) { + int32_t zp, + bool is_sym_quant) { #ifdef __AVX512F__ - at::Tensor out = at::empty_like(t, at::kByte); + auto out_dtype = is_sym_quant ? at::kChar : at::kByte; + at::Tensor out = at::empty_like(t, out_dtype); auto in_ptr0 = t.data_ptr(); - auto out_ptr0 = out.data_ptr(); + uint8_t* out_ptr0 = is_sym_quant ? nullptr : out.data_ptr(); + int8_t* out_sym_ptr0 = is_sym_quant ? 
out.data_ptr() : nullptr; auto n = t.numel(); auto K = t.size(-1); auto M = t.numel() / K; @@ -3376,6 +3515,47 @@ at::Tensor quantize_per_tensor( out_ptr[i1] = tmp10; } }; + auto quantize_block_sym = + [vecsize, scale, zp](float* in_ptr, int start, int end, int8_t* out_ptr) { + int i1; + for (i1 = start; i1 < end / vecsize * vecsize; i1 += vecsize) { + auto tmp0 = at::vec::Vectorized::loadu(in_ptr + i1, vecsize); + auto tmp1 = + tmp0 / at::vec::Vectorized(static_cast(scale)); + auto tmp2 = tmp1 + at::vec::Vectorized(static_cast(zp)); + auto tmp3 = tmp2.round(); + auto tmp4 = (tmp3); + auto tmp5 = at::vec::Vectorized(static_cast(-128.0)); + auto tmp6 = at::vec::maximum(tmp4, tmp5); + auto tmp7 = at::vec::Vectorized(static_cast(127.0)); + auto tmp8 = at::vec::minimum(tmp6, tmp7); + auto tmp9 = (tmp8); + auto tmp10 = at::vec::convert_float_to_int8(tmp9); + tmp10.store(out_ptr + i1, vecsize); + } + for (; i1 < end; i1++) { + auto tmp0 = in_ptr[i1]; + auto tmp1 = tmp0 / static_cast(scale); + auto tmp2 = tmp1 + static_cast(zp); + auto tmp3 = std::nearbyint(tmp2); + auto tmp4 = static_cast(tmp3); + auto tmp5 = static_cast(-128.0); + auto tmp6 = 0; + if (at::_isnan(tmp4)) { + tmp6 = tmp4; + } + tmp6 = tmp4 > tmp5 ? tmp4 : tmp5; + auto tmp7 = static_cast(127.0); + auto tmp8 = 0; + if (at::_isnan(tmp6)) { + tmp8 = tmp6; + } + tmp8 = tmp6 < tmp7 ? tmp6 : tmp7; + auto tmp9 = static_cast(tmp8); + auto tmp10 = static_cast(tmp9); + out_ptr[i1] = tmp10; + } + }; if (n > QUANT_A_THRESHOLD) { int num_threads = omp_get_max_threads(); int vec_per_thread = std::ceil((float)n / vecsize / num_threads); @@ -3383,10 +3563,18 @@ at::Tensor quantize_per_tensor( for (int i0 = 0; i0 < n; i0 += vec_per_thread * vecsize) { auto vec_start = i0; auto vec_end = std::min(i0 + vec_per_thread * vecsize, (int)n); - quantize_block(in_ptr0, vec_start, vec_end, out_ptr0); + if (is_sym_quant) { + quantize_block_sym(in_ptr0, vec_start, vec_end, out_sym_ptr0); + } else { + quantize_block(in_ptr0, vec_start, vec_end, out_ptr0); + } } } else { - quantize_block(in_ptr0, 0, n, out_ptr0); + if (is_sym_quant) { + quantize_block_sym(in_ptr0, 0, n, out_sym_ptr0); + } else { + quantize_block(in_ptr0, 0, n, out_ptr0); + } } return out; #else @@ -3398,11 +3586,14 @@ template <> at::Tensor quantize_per_tensor( const at::Tensor& t, float scale, - int32_t zp) { + int32_t zp, + bool is_sym_quant) { #ifdef __AVX512F__ - at::Tensor out = at::empty_like(t, at::kByte); + auto out_dtype = is_sym_quant ? at::kChar : at::kByte; + at::Tensor out = at::empty_like(t, out_dtype); auto in_ptr0 = t.data_ptr(); - auto out_ptr0 = out.data_ptr(); + uint8_t* out_ptr0 = is_sym_quant ? nullptr : out.data_ptr(); + int8_t* out_sym_ptr0 = is_sym_quant ? 
out.data_ptr() : nullptr; auto n = t.numel(); auto K = t.size(-1); auto M = t.numel() / K; @@ -3458,6 +3649,57 @@ at::Tensor quantize_per_tensor( out_ptr[i1] = tmp13; } }; + auto quantize_block_sym = + [vecsize, scale, zp]( + at::BFloat16* in_ptr, int start, int end, int8_t* out_ptr) { + int i1; + for (i1 = start; i1 < end / vecsize * vecsize; i1 += vecsize) { + auto tmp0 = + at::vec::Vectorized::loadu(in_ptr + i1, vecsize); + at::vec::Vectorized res_vec1(0); + at::vec::Vectorized res_vec2(0); + std::tie(res_vec1, res_vec2) = at::vec::convert_bfloat16_float(tmp0); + auto tmp1 = res_vec1; + auto tmp2 = at::vec::Vectorized(static_cast(scale)); + auto tmp3 = tmp1 / tmp2; + auto tmp4 = at::vec::Vectorized(static_cast(zp)); + auto tmp5 = tmp3 + tmp4; + auto tmp6 = tmp5.round(); + auto tmp7 = (tmp6); + auto tmp8 = at::vec::Vectorized(static_cast(-128.0)); + auto tmp9 = at::vec::maximum(tmp7, tmp8); + auto tmp10 = at::vec::Vectorized(static_cast(127.0)); + auto tmp11 = at::vec::minimum(tmp9, tmp10); + auto tmp12 = (tmp11); + auto tmp13 = at::vec::convert_float_to_int8(tmp12); + tmp13.store(out_ptr + i1, vecsize); + } + for (; i1 < end; i1++) { + auto tmp0 = in_ptr[i1]; + auto tmp1 = static_cast(tmp0); + auto tmp2 = static_cast(scale); + auto tmp3 = tmp1 / tmp2; + auto tmp4 = static_cast(zp); + auto tmp5 = tmp3 + tmp4; + auto tmp6 = std::nearbyint(tmp5); + auto tmp7 = static_cast(tmp6); + auto tmp8 = static_cast(-128.0); + auto tmp9 = 0; + if (at::_isnan(tmp7)) { + tmp9 = tmp7; + } + tmp9 = tmp7 > tmp8 ? tmp7 : tmp8; + auto tmp10 = static_cast(127.0); + auto tmp11 = 0; + if (at::_isnan(tmp9)) { + tmp11 = tmp9; + } + tmp11 = tmp9 < tmp10 ? tmp9 : tmp10; + auto tmp12 = static_cast(tmp11); + auto tmp13 = static_cast(tmp12); + out_ptr[i1] = tmp13; + } + }; if (n > QUANT_A_THRESHOLD) { auto num_threads = omp_get_max_threads(); int vec_per_thread = std::ceil((float)n / vecsize / num_threads); @@ -3465,10 +3707,18 @@ at::Tensor quantize_per_tensor( for (int i0 = 0; i0 < n; i0 += vec_per_thread * vecsize) { auto vec_start = i0; auto vec_end = std::min(i0 + vec_per_thread * vecsize, (int)n); - quantize_block(in_ptr0, vec_start, vec_end, out_ptr0); + if (is_sym_quant) { + quantize_block_sym(in_ptr0, vec_start, vec_end, out_sym_ptr0); + } else { + quantize_block(in_ptr0, vec_start, vec_end, out_ptr0); + } } } else { - quantize_block(in_ptr0, 0, n, out_ptr0); + if (is_sym_quant) { + quantize_block_sym(in_ptr0, 0, n, out_sym_ptr0); + } else { + quantize_block(in_ptr0, 0, n, out_ptr0); + } } return out; #else @@ -3482,7 +3732,8 @@ at::Tensor quantize_per_block( const at::Tensor& scale, const at::Tensor& zp, int quant_block_k, - int quant_a_mode) { + int quant_a_mode, + bool is_sym_quant) { auto K = t.size(-1); auto n = t.numel(); auto M = n / K; @@ -3500,19 +3751,29 @@ at::Tensor quantize_per_block( if (quant_a_mode == QUANT_A_PER_K_BLOCK) { out = at::clamp( at::round(grouped / scale.unsqueeze(1)) + zp.unsqueeze(1), 0, 255); + } else if (quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { + out = at::clamp(at::round(grouped / scale.unsqueeze(1)), -128, 127); } else if (quant_a_mode == QUANT_A_PER_M) { out = at::clamp( at::round(grouped / scale.unsqueeze(1).unsqueeze(2)) + zp.unsqueeze(1).unsqueeze(2), 0, 255); - } else { + } else if (quant_a_mode == QUANT_A_PER_M_SYM) { out = at::clamp( - at::round(grouped / scale.unsqueeze(-1)) + zp.unsqueeze(-1), 0, 255); + at::round(grouped / scale.unsqueeze(1).unsqueeze(2)), -128, 127); + } else { + out = is_sym_quant + ? 
at::clamp(at::round(grouped / scale.unsqueeze(-1)), -128, 127) + : at::clamp( + at::round(grouped / scale.unsqueeze(-1)) + zp.unsqueeze(-1), + 0, + 255); } out = out.view({-1, K_padded}) .index({at::indexing::Slice(), at::indexing::Slice(0, K)}); - return out.to(at::kByte).contiguous(); + return is_sym_quant ? out.to(at::kChar).contiguous() + : out.to(at::kByte).contiguous(); } template <> @@ -3521,16 +3782,19 @@ at::Tensor quantize_per_block( const at::Tensor& scale, const at::Tensor& zp, int quant_block_k, - int quant_a_mode) { + int quant_a_mode, + bool is_sym_quant) { int K = t.size(-1); int n = t.numel(); int M = n / K; - at::Tensor out = at::empty_like(t, at::kByte); + auto out_dtype = is_sym_quant ? at::kChar : at::kByte; + at::Tensor out = at::empty_like(t, out_dtype); + uint8_t* out_ptr = is_sym_quant ? nullptr : out.data_ptr(); + int8_t* out_sym_ptr = is_sym_quant ? out.data_ptr() : nullptr; int Kc = (K + quant_block_k - 1) / quant_block_k; auto scale_ptr = scale.data_ptr(); auto zp_ptr = zp.data_ptr(); auto in_ptr = t.data_ptr(); - auto out_ptr = out.data_ptr(); auto vecsize = at::vec::Vectorized::size(); auto quantize_block = [vecsize]( at::BFloat16* in_ptr, @@ -3587,28 +3851,95 @@ at::Tensor quantize_per_block( out_ptr[k] = tmp13; } }; - if (quant_a_mode == QUANT_A_PER_K_BLOCK) { + auto quantize_block_sym = [vecsize]( + at::BFloat16* in_ptr, + int8_t* out_ptr, + int block_k, + float scale_, + int zp_) { + int k; + for (k = 0; k < block_k / vecsize * vecsize; k += vecsize) { + auto in_ptr0 = in_ptr + k; + auto out_ptr0 = out_ptr + k; + auto tmp0 = at::vec::Vectorized::loadu(in_ptr0, vecsize); + at::vec::Vectorized res_vec1(0); + at::vec::Vectorized res_vec2(0); + std::tie(res_vec1, res_vec2) = at::vec::convert_bfloat16_float(tmp0); + auto tmp1 = res_vec1; + auto tmp2 = at::vec::Vectorized(static_cast(scale_)); + auto tmp3 = tmp1 / tmp2; + auto tmp4 = at::vec::Vectorized(static_cast(zp_)); + auto tmp5 = tmp3 + tmp4; + auto tmp6 = tmp5.round(); + auto tmp7 = (tmp6); + auto tmp8 = at::vec::Vectorized(static_cast(-128.0)); + auto tmp9 = at::vec::maximum(tmp7, tmp8); + auto tmp10 = at::vec::Vectorized(static_cast(127.0)); + auto tmp11 = at::vec::minimum(tmp9, tmp10); + auto tmp12 = (tmp11); + auto tmp13 = at::vec::convert_float_to_int8(tmp12); + tmp13.store(out_ptr0, vecsize); + } + for (; k < block_k; k++) { + auto tmp0 = in_ptr[k]; + auto tmp1 = static_cast(tmp0); + auto tmp2 = static_cast(scale_); + auto tmp3 = tmp1 / tmp2; + auto tmp4 = static_cast(zp_); + auto tmp5 = tmp3 + tmp4; + auto tmp6 = std::nearbyint(tmp5); + auto tmp7 = static_cast(tmp6); + auto tmp8 = static_cast(-128.0); + auto tmp9 = 0; + if (at::_isnan(tmp7)) { + tmp9 = tmp7; + } + tmp9 = tmp7 > tmp8 ? tmp7 : tmp8; + auto tmp10 = static_cast(127.0); + auto tmp11 = 0; + if (at::_isnan(tmp9)) { + tmp11 = tmp9; + } + tmp11 = tmp9 < tmp10 ? tmp9 : tmp10; + auto tmp12 = static_cast(tmp11); + auto tmp13 = static_cast(tmp12); + out_ptr[k] = tmp13; + } + }; + if (quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { #pragma omp parallel for collapse(2) for (int m = 0; m < M; m++) { for (int kc = 0; kc < Kc; kc++) { auto in_ptr0 = in_ptr + m * K + kc * quant_block_k; - auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; auto scale_ = scale_ptr[kc]; - auto zp_ = zp_ptr[kc]; + auto zp_ = is_sym_quant ? 
0 : zp_ptr[kc]; int block_k = std::min(quant_block_k, (int)K - kc * quant_block_k); - quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + if (is_sym_quant) { + auto out_ptr0 = out_sym_ptr + m * K + kc * quant_block_k; + quantize_block_sym(in_ptr0, out_ptr0, block_k, scale_, zp_); + } else { + auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; + quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + } } } - } else if (quant_a_mode == QUANT_A_PER_M) { + } else if ( + quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { #pragma omp parallel for collapse(2) for (int m = 0; m < M; m++) { for (int kc = 0; kc < Kc; kc++) { auto in_ptr0 = in_ptr + m * K + kc * quant_block_k; - auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; auto scale_ = scale_ptr[m]; - auto zp_ = zp_ptr[m]; + auto zp_ = is_sym_quant ? 0 : zp_ptr[m]; int block_k = std::min(quant_block_k, (int)K - kc * quant_block_k); - quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + if (is_sym_quant) { + auto out_ptr0 = out_sym_ptr + m * K + kc * quant_block_k; + quantize_block_sym(in_ptr0, out_ptr0, block_k, scale_, zp_); + } else { + auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; + quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + } } } } else { @@ -3616,14 +3947,20 @@ at::Tensor quantize_per_block( for (int m = 0; m < M; m++) { for (int kc = 0; kc < Kc; kc++) { auto in_ptr0 = in_ptr + m * K + kc * quant_block_k; - auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; auto scale_ = scale_ptr[m * Kc + kc]; - auto zp_ = zp_ptr[m * Kc + kc]; + auto zp_ = is_sym_quant ? 0 : zp_ptr[m * Kc + kc]; int block_k = std::min(quant_block_k, (int)K - kc * quant_block_k); - quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + if (is_sym_quant) { + auto out_ptr0 = out_sym_ptr + m * K + kc * quant_block_k; + quantize_block_sym(in_ptr0, out_ptr0, block_k, scale_, zp_); + } else { + auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; + quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + } } } } + return out; } @@ -3633,16 +3970,19 @@ at::Tensor quantize_per_block( const at::Tensor& scale, const at::Tensor& zp, int quant_block_k, - int quant_a_mode) { + int quant_a_mode, + bool is_sym_quant) { int K = t.size(-1); int n = t.numel(); int M = n / K; - at::Tensor out = at::empty_like(t, at::kByte); + auto out_dtype = is_sym_quant ? at::kChar : at::kByte; + at::Tensor out = at::empty_like(t, out_dtype); + uint8_t* out_ptr = is_sym_quant ? nullptr : out.data_ptr(); + int8_t* out_sym_ptr = is_sym_quant ? 
out.data_ptr() : nullptr; int Kc = (K + quant_block_k - 1) / quant_block_k; auto scale_ptr = scale.data_ptr(); auto zp_ptr = zp.data_ptr(); auto in_ptr = t.data_ptr(); - auto out_ptr = out.data_ptr(); auto vecsize = at::vec::Vectorized::size(); auto quantize_block = [vecsize]( @@ -3689,28 +4029,85 @@ at::Tensor quantize_per_block( out_ptr[k] = tmp10; } }; - if (quant_a_mode == QUANT_A_PER_K_BLOCK) { + auto quantize_block_sym = + [vecsize]( + float* in_ptr, int8_t* out_ptr, int block_k, float scale_, int zp_) { + int k; + for (k = 0; k < block_k / vecsize * vecsize; k += vecsize) { + auto in_ptr0 = in_ptr + k; + auto out_ptr0 = out_ptr + k; + auto tmp0 = at::vec::Vectorized::loadu(in_ptr0, vecsize); + auto tmp1 = + tmp0 / at::vec::Vectorized(static_cast(scale_)); + auto tmp2 = + tmp1 + at::vec::Vectorized(static_cast(zp_)); + auto tmp3 = tmp2.round(); + auto tmp4 = (tmp3); + auto tmp5 = at::vec::Vectorized(static_cast(-128.0)); + auto tmp6 = at::vec::maximum(tmp4, tmp5); + auto tmp7 = at::vec::Vectorized(static_cast(127.0)); + auto tmp8 = at::vec::minimum(tmp6, tmp7); + auto tmp9 = (tmp8); + auto tmp10 = at::vec::convert_float_to_int8(tmp9); + tmp10.store(out_ptr0, vecsize); + } + for (; k < block_k; k++) { + auto tmp0 = in_ptr[k]; + auto tmp1 = tmp0 / static_cast(scale_); + auto tmp2 = tmp1 + static_cast(zp_); + auto tmp3 = std::nearbyint(tmp2); + auto tmp4 = static_cast(tmp3); + auto tmp5 = static_cast(-128.0); + auto tmp6 = 0; + if (at::_isnan(tmp4)) { + tmp6 = tmp4; + } + tmp6 = tmp4 > tmp5 ? tmp4 : tmp5; + auto tmp7 = static_cast(127.0); + auto tmp8 = 0; + if (at::_isnan(tmp6)) { + tmp8 = tmp6; + } + tmp8 = tmp6 < tmp7 ? tmp6 : tmp7; + auto tmp9 = static_cast(tmp8); + auto tmp10 = static_cast(tmp9); + out_ptr[k] = tmp10; + } + }; + if (quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { #pragma omp parallel for collapse(2) for (int m = 0; m < M; m++) { for (int kc = 0; kc < Kc; kc++) { auto in_ptr0 = in_ptr + m * K + kc * quant_block_k; - auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; auto scale_ = scale_ptr[kc]; - auto zp_ = zp_ptr[kc]; + auto zp_ = is_sym_quant ? 0 : zp_ptr[kc]; int block_k = std::min(quant_block_k, (int)K - kc * quant_block_k); - quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + if (is_sym_quant) { + auto out_ptr0 = out_sym_ptr + m * K + kc * quant_block_k; + quantize_block_sym(in_ptr0, out_ptr0, block_k, scale_, zp_); + } else { + auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; + quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + } } } - } else if (quant_a_mode == QUANT_A_PER_M) { + } else if ( + quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { #pragma omp parallel for collapse(2) for (int m = 0; m < M; m++) { for (int kc = 0; kc < Kc; kc++) { auto in_ptr0 = in_ptr + m * K + kc * quant_block_k; - auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; auto scale_ = scale_ptr[m]; - auto zp_ = zp_ptr[m]; + auto zp_ = is_sym_quant ? 
0 : zp_ptr[m]; int block_k = std::min(quant_block_k, (int)K - kc * quant_block_k); - quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + if (is_sym_quant) { + auto out_ptr0 = out_sym_ptr + m * K + kc * quant_block_k; + quantize_block_sym(in_ptr0, out_ptr0, block_k, scale_, zp_); + } else { + auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; + quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + } } } } else { @@ -3718,11 +4115,16 @@ at::Tensor quantize_per_block( for (int m = 0; m < M; m++) { for (int kc = 0; kc < Kc; kc++) { auto in_ptr0 = in_ptr + m * K + kc * quant_block_k; - auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; auto scale_ = scale_ptr[m * Kc + kc]; - auto zp_ = zp_ptr[m * Kc + kc]; + auto zp_ = is_sym_quant ? 0 : zp_ptr[m * Kc + kc]; int block_k = std::min(quant_block_k, (int)K - kc * quant_block_k); - quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + if (is_sym_quant) { + auto out_ptr0 = out_sym_ptr + m * K + kc * quant_block_k; + quantize_block_sym(in_ptr0, out_ptr0, block_k, scale_, zp_); + } else { + auto out_ptr0 = out_ptr + m * K + kc * quant_block_k; + quantize_block(in_ptr0, out_ptr0, block_k, scale_, zp_); + } } } } @@ -4033,9 +4435,10 @@ at::Tensor qlinear_woq_affine( if (quant_a_mode == QUANT_A_PER_TENSOR) { float scale_a; int32_t zp_a; - compute_int8_qparams_per_tensor(x, &scale_a, &zp_a); + compute_int8_qparams_per_tensor( + x, &scale_a, &zp_a, false); auto x_quantized = - quantize_per_tensor(x, scale_a, zp_a); + quantize_per_tensor(x, scale_a, zp_a, false); qlinear_woq_affine_impl< uint8_t, uint8_t, @@ -4058,15 +4461,47 @@ at::Tensor qlinear_woq_affine( zp_list[int8_idx], &scale_a, &zp_a); - } else { + } else if (quant_a_mode == QUANT_A_PER_TENSOR_SYM) { + float scale_a; + int32_t zp_a; + compute_int8_qparams_per_tensor( + x, &scale_a, &zp_a, true); + auto x_quantized = + quantize_per_tensor(x, scale_a, zp_a, true); + qlinear_woq_affine_impl< + int8_t, + uint8_t, + /*TGemmOut*/ float, + act_type, + float, + int8_t, + QUANT_A_PER_TENSOR_SYM, + quant_w_mode_>( + x_quantized, + qw, + scales_list[fp32_idx], + biases[fp32_idx], + y, + qw_type, + k_splits, + fusion_type, + others_list, + quant_block_k, + zp_list[int8_idx], + &scale_a, + &zp_a); + } else if ( + quant_a_mode == QUANT_A_PER_K_BLOCK || + quant_a_mode == QUANT_A_PER_M_K_BLOCK || + quant_a_mode == QUANT_A_PER_M) { auto block_k = w_sizes[2]; if (quant_block_k <= 0) quant_block_k = block_k; auto [scale_a, zp_a] = compute_int8_qparams_per_block( - x, quant_block_k, quant_a_mode); + x, quant_block_k, quant_a_mode, false); auto x_quantized = quantize_per_block( - x, scale_a, zp_a, quant_block_k, quant_a_mode); + x, scale_a, zp_a, quant_block_k, quant_a_mode, false); float* scale_a_ptr = (float*)scale_a.data_ptr(); int32_t* zp_a_ptr = (int32_t*)zp_a.data_ptr(); range_dispatcher< @@ -4100,6 +4535,48 @@ at::Tensor qlinear_woq_affine( zp_a_ptr); }, [&](auto quant_a_mode_) { failing_fallback(); }); + } else { + auto block_k = w_sizes[2]; + if (quant_block_k <= 0) + quant_block_k = block_k; + auto [scale_a, zp_a] = + compute_int8_qparams_per_block( + x, quant_block_k, quant_a_mode, true); + auto x_quantized = quantize_per_block( + x, scale_a, zp_a, quant_block_k, quant_a_mode, true); + float* scale_a_ptr = (float*)scale_a.data_ptr(); + int32_t* zp_a_ptr = nullptr; + range_dispatcher< + long, + QUANT_A_PER_K_BLOCK_SYM, + QUANT_A_PER_M_K_BLOCK_SYM>:: + call( + quant_a_mode, + [&](auto quant_a_mode_) { + qlinear_woq_affine_impl< + int8_t, + uint8_t, + /*TGemmOut*/ float, + act_type, + 
float, + int8_t, + quant_a_mode_, + quant_w_mode_>( + x_quantized, + qw, + scales_list[fp32_idx], + biases[fp32_idx], + y, + qw_type, + k_splits, + fusion_type, + others_list, + quant_block_k, + zp_list[int8_idx], + scale_a_ptr, + zp_a_ptr); + }, + [&](auto quant_a_mode_) { failing_fallback(); }); } } }, diff --git a/csrc/cpu/tpp/xsmm_functors.h b/csrc/cpu/tpp/xsmm_functors.h index dc851c202..c89b514fc 100644 --- a/csrc/cpu/tpp/xsmm_functors.h +++ b/csrc/cpu/tpp/xsmm_functors.h @@ -1873,7 +1873,8 @@ class BrgemmTPP { float beta, int a_trans, int unroll_hint, - int b_vnni = 1) + int b_vnni = 1, + bool is_s8s8 = false) : M(M), N(N), K(K), @@ -1886,6 +1887,7 @@ class BrgemmTPP { a_trans(a_trans), unroll_hint(unroll_hint), b_vnni(b_vnni), + is_s8s8(is_s8s8), k_gemm_with_tc(this, 0), k_cfg(this, 1), k_rls(this, 2), @@ -1998,7 +2000,7 @@ class BrgemmTPP { protected: uint64_t hash_int() override { - std::array params = { + std::array params = { p->M, p->N, p->K, @@ -2012,8 +2014,9 @@ class BrgemmTPP { p->ldb, p->ldc, config, - p->b_vnni}; - uint64_t hash_value = string_to_hash_int<14>("brgemm", params); + p->b_vnni, + p->is_s8s8}; + uint64_t hash_value = string_to_hash_int<15>("brgemm", params); return hash_value; } void* build_kernel() override { @@ -2062,7 +2065,9 @@ class BrgemmTPP { l_shape.comp_type = LIBXSMM_DATATYPE_F32; // TODO(jgong5): we should not always assume u8*i8 for int8 gemm if (std::is_same()) { - l_flags |= LIBXSMM_GEMM_FLAG_B_UNSIGNED; + if (!p->is_s8s8) { + l_flags |= LIBXSMM_GEMM_FLAG_B_UNSIGNED; + } l_shape.comp_type = LIBXSMM_DATATYPE_I32; } l_shape.out_type = XsmmDtype(); @@ -2114,6 +2119,7 @@ class BrgemmTPP { int64_t brgemm_type = -1; int unroll_hint; int b_vnni; + bool is_s8s8; BrgemmKernel k_gemm_with_tc; BrgemmKernel k_cfg; BrgemmKernel k_rls; diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index c31160757..5395e5bae 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -212,7 +212,16 @@ ) parser.add_argument( "--act-quant-mode", - choices=["PER_TENSOR", "PER_IC_BLOCK", "PER_BATCH", "PER_BATCH_IC_BLOCK"], + choices=[ + "PER_TENSOR", + "PER_IC_BLOCK", + "PER_BATCH", + "PER_BATCH_IC_BLOCK", + "PER_TENSOR_SYM", + "PER_IC_BLOCK_SYM", + "PER_BATCH_SYM", + "PER_BATCH_IC_BLOCK_SYM", + ], default="PER_IC_BLOCK", type=str, help="Quantization mode for activation with different granularity. " @@ -222,6 +231,10 @@ "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; " "PER_BATCH(2): quantize per batch; " "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. " + "PER_TENSOR_SYM(4): symmetrically quantize per tensor; " + "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; " + "PER_BATCH_SYM(6): symmetrically quantize per batch; " + "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. 
" "IC_BLOCK is determined by IC automatically.", ) parser.add_argument( @@ -967,6 +980,10 @@ def calib_func(prepared_model): "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK, "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH, "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK, + "PER_TENSOR_SYM": ipex.quantization.WoqActQuantMode.PER_TENSOR_SYM, + "PER_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK_SYM, + "PER_BATCH_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_SYM, + "PER_BATCH_IC_BLOCK_SYM": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK_SYM, } qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( weight_dtype=weight_dtype, diff --git a/intel_extension_for_pytorch/quantization/_qconfig.py b/intel_extension_for_pytorch/quantization/_qconfig.py index 18a046717..90356161f 100644 --- a/intel_extension_for_pytorch/quantization/_qconfig.py +++ b/intel_extension_for_pytorch/quantization/_qconfig.py @@ -116,6 +116,10 @@ class WoqActQuantMode(IntEnum): PER_IC_BLOCK = 1 # IC = Input Channel PER_BATCH = 2 PER_BATCH_IC_BLOCK = 3 + PER_TENSOR_SYM = 4 + PER_IC_BLOCK_SYM = 5 + PER_BATCH_SYM = 6 + PER_BATCH_IC_BLOCK_SYM = 7 # Start from 1 to align with kernel diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py index d6b0befd4..5ab33059b 100644 --- a/tests/cpu/test_quantization_default_recipe.py +++ b/tests/cpu/test_quantization_default_recipe.py @@ -1337,6 +1337,71 @@ def _fakequant_by_group(self, t, quant_a_mode, groupsize): out = out[: orig_shape[0], : orig_shape[1]].contiguous() return out + def _fakequant_by_group_sym(self, t, quant_a_mode, groupsize): + assert quant_a_mode >= 4 and quant_a_mode <= 7 + if quant_a_mode == 4: + obs = torch.ao.quantization.MinMaxObserver( + torch.qint8, qscheme=torch.per_tensor_symmetric + ) + obs(t) + scale, zero_point = obs.calculate_qparams() + return ( + torch.quantize_per_tensor( + t.to(torch.float), scale, zero_point, torch.qint8 + ) + .dequantize() + .to(t.dtype) + ) + orig_shape = t.shape + if t.shape[-1] % groupsize: + pad_len = t.shape[-1] // groupsize * groupsize + groupsize - t.shape[-1] + t = torch.nn.functional.pad(t, (0, pad_len), value=0) + grouped = t.view(-1, t.shape[-1] // groupsize, groupsize) + if quant_a_mode == 5: + grouped_min = grouped.min(dim=-1)[0].min(dim=0)[0] + grouped_max = grouped.max(dim=-1)[0].max(dim=0)[0] + elif quant_a_mode == 6: + grouped_min = grouped.min(dim=-1)[0].min(dim=1)[0] + grouped_max = grouped.max(dim=-1)[0].max(dim=1)[0] + else: + grouped_min = grouped.min(dim=-1)[0] + grouped_max = grouped.max(dim=-1)[0] + min = grouped_min + max = grouped_max + eps = torch.tensor([torch.finfo(torch.float32).eps]) + scales = torch.max(torch.abs(max), torch.abs(min)) / 127 + scales = torch.max(scales, eps) + if quant_a_mode == 5: + qt = torch.clamp( + torch.round(grouped / scales.unsqueeze(1)), + min=-128, + max=127, + ) + out = ((qt) * scales.unsqueeze(1)).to(t.dtype).view(t.shape) + if orig_shape != out.shape: + out = out[: orig_shape[0], : orig_shape[1]].contiguous() + return out + elif quant_a_mode == 6: + qt = torch.clamp( + torch.round(grouped / scales.unsqueeze(1).unsqueeze(2)), + min=-128, + max=127, + ) + out = ((qt) * scales.unsqueeze(1).unsqueeze(2)).to(t.dtype).view(t.shape) + if orig_shape != out.shape: + out = out[: orig_shape[0], : orig_shape[1]].contiguous() + return out + else: + qt = torch.clamp( + torch.round(grouped / scales.unsqueeze(-1)), + min=-128, + max=127, + ) + out = ((qt) 
* scales.unsqueeze(-1)).to(t.dtype).view(t.shape) + if orig_shape != out.shape: + out = out[: orig_shape[0], : orig_shape[1]].contiguous() + return out + def test_weight_only_quantization_act_quant_mode(self): class Mod(nn.Module): @@ -1402,6 +1467,80 @@ def test(has_bias, act_quant_mode, M): for has_bias, quant_mode, M in cases: test(has_bias, quant_mode, M) + def test_weight_only_quantization_act_quant_sym_mode(self): + + class Mod(nn.Module): + def __init__(self, has_bias, K, N): + super(Mod, self).__init__() + self.linear = torch.nn.Linear(K, N, has_bias) + + def forward(self, x): + return self.linear(x) + + def test_sym(has_bias, act_quant_mode, shape): + dtype = torch.bfloat16 + model = Mod(has_bias, shape[1], shape[2]) + m = model.eval() + m2 = copy.deepcopy(m) + data = torch.randn(shape[0], shape[1]) * 0.5 + qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=WoqWeightDtype.INT4, + lowp_mode=WoqLowpMode.INT8, + act_quant_mode=act_quant_mode, + ) + fake_quant_x_sym = self._fakequant_by_group_sym( + data, act_quant_mode, groupsize + ) + prepared_model = prepare(m2, qconfig_mapping, inplace=True) + with torch.no_grad(), torch.autocast( + device_type="cpu", enabled=True, dtype=dtype + ): + woq_model = convert(prepared_model) + # Behavior of WOQ Linear to simulate: + # Quantize weight to int4 by float qparams at quantization time + # Quantize activation to int8 at runtime + # Convert weight and its zero points to INT8 for computation + qw = woq_model.linear._op_context.to_public( + woq_model.linear._op_context.get_weight() + ) + w_scales = woq_model.linear._op_context.get_scales() + w_zero_points = woq_model.linear._op_context.get_zero_points() + w = copy.deepcopy(m.linear.weight.data) + + qw, _, _ = quantize_per_channel( + w, WoqWeightDtype.INT4, w_scales, w_zero_points + ) + fake_quant_w = dequantize_per_channel( + qw, w_scales, w_zero_points.int(), WoqWeightDtype.INT4, w.shape + ) + m.linear.weight.data = fake_quant_w + y_ref = m(fake_quant_x_sym).to(dtype) + y = woq_model(data) + try: + torch.testing.assert_close(y, y_ref, atol=1e-2 * 5, rtol=1e-1 * 2) + except Exception: + # The fallback kernel does not support act quant mode + # It computes in fp32 by dequantizing weight. 
+ fake_quant_w = qw.dequantize() + y_ref = data @ fake_quant_w.T + (m.linear.bias if has_bias else 0) + y_ref = y_ref.to(dtype) + torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-1) + + groupsize = 64 + shape_list = [ + [3, 31, 31], + [4, 4096, 4096], + [4, 4096, 4095], + [9, 4095, 4095], + [196, 4095, 4095], + [1024, 512, 512], + ] + has_bias_list = [False, True] + quant_mode_sym_list = [4, 5, 6, 7] + cases_sym = itertools.product(has_bias_list, quant_mode_sym_list, shape_list) + for has_bias, quant_mode, shape in cases_sym: + test_sym(has_bias, quant_mode, shape) + def test_weight_only_quantization_group_size(self): class Mod(nn.Module): def __init__(self, ic, oc, has_bias): From 276670f6918990cbe71aaaf7cc507d6fbabbc407 Mon Sep 17 00:00:00 2001 From: blzheng Date: Thu, 13 Jun 2024 11:12:16 +0800 Subject: [PATCH 110/199] enable falcon 11b (#2953) --- intel_extension_for_pytorch/llm/__init__.py | 2 ++ intel_extension_for_pytorch/llm/utils.py | 31 ++++++++++++++++++- .../models/reference/modules/decoder.py | 17 ++++++++-- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/intel_extension_for_pytorch/llm/__init__.py b/intel_extension_for_pytorch/llm/__init__.py index 725a1257e..1ebcb8b72 100644 --- a/intel_extension_for_pytorch/llm/__init__.py +++ b/intel_extension_for_pytorch/llm/__init__.py @@ -15,10 +15,12 @@ _gradient_checkpointing_enable, _get_class_from_dynamic_module, _get_cached_module_file, + _get_imports, ) import transformers transformers.dynamic_module_utils.get_relative_imports = _get_relative_imports + transformers.dynamic_module_utils.get_imports = _get_imports transformers.dynamic_module_utils.get_cached_module_file = _get_cached_module_file transformers.dynamic_module_utils.get_class_from_dynamic_module = ( _get_class_from_dynamic_module diff --git a/intel_extension_for_pytorch/llm/utils.py b/intel_extension_for_pytorch/llm/utils.py index ff84b0bfb..fc73f7941 100644 --- a/intel_extension_for_pytorch/llm/utils.py +++ b/intel_extension_for_pytorch/llm/utils.py @@ -10,7 +10,7 @@ import shutil import typing from ..utils._logger import logger, WarningType -from typing import Dict, Optional, Union +from typing import Dict, Optional, Union, List from transformers.dynamic_module_utils import ( check_imports, create_dynamic_module, @@ -303,6 +303,35 @@ def _get_cached_module_file( return os.path.join(full_submodule, module_file) +def _get_imports(filename: Union[str, os.PathLike]) -> List[str]: + """ + Extracts all the libraries (not relative imports this time) that are imported in a file. + + Args: + filename (`str` or `os.PathLike`): The module file to inspect. + + Returns: + `List[str]`: The list of all packages required to use the input module. 
+ """ + with open(filename, "r", encoding="utf-8") as f: + content = f.read() + + # filter out try/except block so in custom code we can have try/except imports + content = re.sub( + r"\s*try\s*:\s*.*?\s*except\s*.*?:", "", content, flags=re.MULTILINE | re.DOTALL + ) + + # Imports of the form `import xxx` + imports = re.findall(r"^\s*import\s+(\S+)\s*$", content, flags=re.MULTILINE) + # Imports of the form `from xxx import yyy` + imports += re.findall(r"^\s*from\s+(\S+)\s+import", content, flags=re.MULTILINE) + # Only keep the top-level module + imports = [imp.split(".")[0] for imp in imports if not imp.startswith(".")] + while "flash_attn" in imports: + imports.remove("flash_attn") + return list(set(imports)) + + def _get_class_from_dynamic_module( class_reference: str, pretrained_model_name_or_path: Union[str, os.PathLike], diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index fb56ca61b..81a208230 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -192,9 +192,13 @@ def FalconDecoderLayer_forward( output_attentions: bool = False, ): residual = hidden_states - if self.self_attention.new_decoder_architecture or not hasattr( - self, "input_layernorm" - ): + if ( + self.self_attention.new_decoder_architecture + and not ( + hasattr(self.config, "num_ln_in_parallel_attn") + and self.config.num_ln_in_parallel_attn == 1 + ) + ) or not hasattr(self, "input_layernorm"): attention_layernorm_out = self.ln_attn(hidden_states) mlp_layernorm_out = self.ln_mlp(hidden_states) else: @@ -219,6 +223,13 @@ def FalconDecoderLayer_forward( else: residual = attention_output + residual mlp_layernorm_out = self.post_attention_layernorm(residual) + if ( + self.config.new_decoder_architecture + and self.config.parallel_attn + and hasattr(self.config, "num_ln_in_parallel_attn") + and self.config.num_ln_in_parallel_attn == 1 + ): + mlp_layernorm_out = attention_layernorm_out outputs = attn_outputs[1:] # MLP. From b7d2f9f6867a5958392aeba1078ef18b72fa0774 Mon Sep 17 00:00:00 2001 From: blzheng Date: Thu, 13 Jun 2024 12:55:16 +0800 Subject: [PATCH 111/199] remove unnecessary warnings and add attribute checks (#2974) --- .../python/llm/single_instance/run_quantization.py | 5 ++++- intel_extension_for_pytorch/llm/__init__.py | 2 +- .../transformers/models/reference/models.py | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 5395e5bae..31f185b94 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -817,7 +817,10 @@ def collate_batch(self, batch): return (model_inputs, last_ind) - if model.default_dataset == "librispeech_asr": + if ( + hasattr(model, "default_dataset") + and model.default_dataset == "librispeech_asr" + ): calib_dataset = load_dataset(model.default_dataset, split="train.clean.100") else: calib_dataset = load_dataset( diff --git a/intel_extension_for_pytorch/llm/__init__.py b/intel_extension_for_pytorch/llm/__init__.py index 1ebcb8b72..cb7e8997d 100644 --- a/intel_extension_for_pytorch/llm/__init__.py +++ b/intel_extension_for_pytorch/llm/__init__.py @@ -6,7 +6,7 @@ try: from . 
import generation except ImportError as e: - warnings.warn(f"failed to use huggingface generation fuctions due to: {e}.") + pass try: from .utils import ( diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index b4f2cf1e1..330771634 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -3545,9 +3545,9 @@ def detect_language( def output_hook(module: torch.nn.Module, args, kwargs, outputs: Any): - if module.config.use_return_dict or ( - "return_dict" in kwargs and kwargs["return_dict"] - ): + if ( + hasattr(module.config, "use_return_dict") and module.config.use_return_dict + ) or ("return_dict" in kwargs and kwargs["return_dict"]): idx = 0 loss = None aux_loss = None From 85179fd3777f514c16b284c275941d2577c07c6f Mon Sep 17 00:00:00 2001 From: DiweiSun <105627594+DiweiSun@users.noreply.github.com> Date: Thu, 13 Jun 2024 21:11:14 +0800 Subject: [PATCH 112/199] [Public CI] enable flake8 and clang format check for public repo (#2963) --- .github/workflows/format-check.yml | 43 ++++++ scripts/tools/setup/clang-format-diff.py | 176 +++++++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 .github/workflows/format-check.yml create mode 100644 scripts/tools/setup/clang-format-diff.py diff --git a/.github/workflows/format-check.yml b/.github/workflows/format-check.yml new file mode 100644 index 000000000..0ef6896d2 --- /dev/null +++ b/.github/workflows/format-check.yml @@ -0,0 +1,43 @@ +name: format-check + +on: + pull_request: + branches: + - main + - xpu-main + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + +permissions: read-all + +jobs: + format-check: + # Don't run on forked repos + # if: github.repository_owner == 'intel' + name: format-check + runs-on: ubuntu-latest + steps: + - name: Checkout intel_extension_for_pytorch + uses: actions/checkout@v2 + + - name: Checkout submodules + run: | + git submodule update --init --recursive + + - name: flake8-check + run: | + pwd + pip install lintrunner pip install lintrunner-adapters + lintrunner init + python scripts/tools/setup/flake8.py + + - name: clang-check + run: | + pwd + python -m pip install clang-format==12.0.1 + git diff -U0 --no-color ${{ github.event.pull_request.base.ref }} | python scripts/tools/setup/clang-format-diff.py -p1 + + + diff --git a/scripts/tools/setup/clang-format-diff.py b/scripts/tools/setup/clang-format-diff.py new file mode 100644 index 000000000..31ccdbd61 --- /dev/null +++ b/scripts/tools/setup/clang-format-diff.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python +# +# ===- clang-format-diff.py - ClangFormat Diff Reformatter ----*- python -*--===# +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +# ===------------------------------------------------------------------------===# + +r""" +ClangFormat Diff Reformatter +============================ + +This script reads input from a unified diff and reformats all the changed +lines. This is useful to reformat all the lines touched by a specific patch. 
+Example usage for git/svn users: + + git diff -U0 --no-color HEAD^ | clang-format-diff.py -p1 -i + svn diff --diff-cmd=diff -x-U0 | clang-format-diff.py -i + +""" +from __future__ import absolute_import, division, print_function + +import argparse +import difflib +import re +import subprocess +import sys + +if sys.version_info.major >= 3: + from io import StringIO +else: + from io import BytesIO as StringIO + + +def main(): + parser = argparse.ArgumentParser( + description="Reformat changed lines in diff. Without -i " + "option just output the diff that would be " + "introduced." + ) + parser.add_argument( + "-i", + action="store_true", + default=False, + help="apply edits to files instead of displaying a diff", + ) + parser.add_argument( + "-p", + metavar="NUM", + default=0, + help="strip the smallest prefix containing P slashes", + ) + parser.add_argument( + "-regex", + metavar="PATTERN", + default=None, + help="custom pattern selecting file paths to reformat " + "(case sensitive, overrides -iregex)", + ) + parser.add_argument( + "-iregex", + metavar="PATTERN", + default=r".*\.(cpp|cc|c\+\+|cxx|c|cl|h|hpp|m|mm|inc|js|ts|proto" + r"|protodevel|java)", + help="custom pattern selecting file paths to reformat " + "(case insensitive, overridden by -regex)", + ) + parser.add_argument( + "-sort-includes", + action="store_true", + default=False, + help="let clang-format sort include blocks", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="be more verbose, ineffective without -i", + ) + parser.add_argument( + "-style", + help="formatting style to apply (LLVM, Google, Chromium, " "Mozilla, WebKit)", + ) + parser.add_argument( + "-binary", + default="clang-format", + help="location of binary to use for clang-format", + ) + args = parser.parse_args() + + # Extract changed lines for each file. + filename = None + lines_by_file = {} + for line in sys.stdin: + match = re.search("^\\+\\+\\+\\ (.*?/){%s}(\\S*)" % args.p, line) + if match: + filename = match.group(2) + if filename is None: + continue + + if args.regex is not None: + if not re.match("^%s$" % args.regex, filename): + continue + else: + if not re.match("^%s$" % args.iregex, filename, re.IGNORECASE): + continue + + match = re.search("^@@.*\\+(\\d+)(,(\\d+))?", line) + if match: + start_line = int(match.group(1)) + line_count = 1 + if match.group(3): + line_count = int(match.group(3)) + if line_count == 0: + continue + end_line = start_line + line_count - 1 + lines_by_file.setdefault(filename, []).extend( + ["-lines", str(start_line) + ":" + str(end_line)] + ) + + nerr = 0 + # Reformat files containing changes in place. 
+ for filename, lines in lines_by_file.items(): + if args.i and args.verbose: + print("Formatting {}".format(filename)) + command = [args.binary, filename] + if args.i: + command.append("-i") + if args.sort_includes: + command.append("-sort-includes") + command.extend(lines) + if args.style: + command.extend(["-style", args.style]) + p = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=None, + stdin=subprocess.PIPE, + universal_newlines=True, + ) + stdout, stderr = p.communicate() + if p.returncode != 0: + sys.exit(p.returncode) + + if not args.i: + with open(filename) as f: + code = f.readlines() + formatted_code = StringIO(stdout).readlines() + diff = difflib.unified_diff( + code, + formatted_code, + filename, + filename, + "(before formatting)", + "(after formatting)", + ) + diff_string = "".join(diff) + if len(diff_string) > 0: + sys.stdout.write(diff_string) + nerr += 1 + + if nerr > 0: + sys.stdout.write( + "\nFormatter check failed. Please format the code" + " changes accoding to the formatting advice above.\n" + ) + sys.exit(1) + else: + sys.exit(0) + + +if __name__ == "__main__": + main() From 8f40e458dd27cec1c441c13a396b45adafb581c5 Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Thu, 13 Jun 2024 23:18:52 +0800 Subject: [PATCH 113/199] Add IPEX.distributed.allreduce/allgather/allgather-into-tensor support (#2925) --- csrc/cpu/aten/TPPShmAllReduceAdd.cpp | 14 + csrc/cpu/aten/TPPShmAllReduceAdd.h | 19 ++ .../aten/kernels/TPPSHMAllreduceAddKrnl.cpp | 318 ++++++++++++++++++ csrc/cpu/tpp/xsmm_functors.h | 24 +- intel_extension_for_pytorch/__init__.py | 1 + .../csrc/cpu/Module.cpp | 2 + .../distributed/__init__.py | 1 + .../distributed/dist.py | 263 +++++++++++++++ .../models/cpu/distributed/dist.py | 65 ++++ tests/cpu/test_ccl_primitive.py | 108 +++++- 10 files changed, 805 insertions(+), 10 deletions(-) create mode 100644 csrc/cpu/aten/TPPShmAllReduceAdd.cpp create mode 100644 csrc/cpu/aten/TPPShmAllReduceAdd.h create mode 100644 csrc/cpu/aten/kernels/TPPSHMAllreduceAddKrnl.cpp create mode 100644 intel_extension_for_pytorch/distributed/__init__.py create mode 100644 intel_extension_for_pytorch/distributed/dist.py create mode 100644 intel_extension_for_pytorch/transformers/models/cpu/distributed/dist.py diff --git a/csrc/cpu/aten/TPPShmAllReduceAdd.cpp b/csrc/cpu/aten/TPPShmAllReduceAdd.cpp new file mode 100644 index 000000000..e0788e74b --- /dev/null +++ b/csrc/cpu/aten/TPPShmAllReduceAdd.cpp @@ -0,0 +1,14 @@ +#include "TPPShmAllReduceAdd.h" +#include +namespace torch_ipex { +namespace cpu { +IPEX_DEFINE_DISPATCH(tpp_allreduce_kernel_stub); +void tpp_shmallreduce_forward( + at::Tensor t_in, + c10::intrusive_ptr process_group) { + RECORD_FUNCTION("tpp_all_reduce_add", c10::ArrayRef({})); + return tpp_allreduce_kernel_stub(kCPU, t_in, process_group); +} + +} // namespace cpu +} // namespace torch_ipex \ No newline at end of file diff --git a/csrc/cpu/aten/TPPShmAllReduceAdd.h b/csrc/cpu/aten/TPPShmAllReduceAdd.h new file mode 100644 index 000000000..93e11ba94 --- /dev/null +++ b/csrc/cpu/aten/TPPShmAllReduceAdd.h @@ -0,0 +1,19 @@ +#pragma once +#include +#include +#include + +namespace torch_ipex { +namespace cpu { + +void tpp_shmallreduce_forward( + at::Tensor t_in, + c10::intrusive_ptr process_group); + +using tpp_allreduce_impl_fn = + void (*)(at::Tensor, c10::intrusive_ptr); + +IPEX_DECLARE_DISPATCH(tpp_allreduce_impl_fn, tpp_allreduce_kernel_stub); + +} // namespace cpu +} // namespace torch_ipex \ No newline at end of file diff --git 
a/csrc/cpu/aten/kernels/TPPSHMAllreduceAddKrnl.cpp b/csrc/cpu/aten/kernels/TPPSHMAllreduceAddKrnl.cpp new file mode 100644 index 000000000..3a7f98f42 --- /dev/null +++ b/csrc/cpu/aten/kernels/TPPSHMAllreduceAddKrnl.cpp @@ -0,0 +1,318 @@ +#include +#include +#include +#include +#include +#include +#include "tpp/utils.h" +#include "tpp/xsmm_functors.h" + +namespace torch_ipex { +namespace cpu { + +namespace { + +#define BS 512 + +static const long master_port = torch_ipex::tpp::env2int("MASTER_PORT", 0); +namespace shm_tpp { +template +struct TppOps { + torch_ipex::tpp::CpyTPP cpy_tpp = torch_ipex::tpp::CpyTPP(BS); + torch_ipex::tpp::ConvertTPP ucvt_tpp = + torch_ipex::tpp::ConvertTPP(BS); + torch_ipex::tpp::ConvertTPP dcvt_tpp = + torch_ipex::tpp::ConvertTPP(BS); + torch_ipex::tpp::AddTPP add_tpp = + torch_ipex::tpp::AddTPP(BS); +}; + +static TppOps ops_f; +static TppOps ops_bf; +static TppOps ops_hf; + +template +static TppOps getOps() {} + +template <> +TppOps getOps() { + return ops_f; +} +template <> +TppOps getOps() { + return ops_bf; +} +template <> +TppOps getOps() { + return ops_hf; +} +} // namespace shm_tpp + +class SHMBuffer { + public: + static int SHMID; + static int BARID; + static const int MAX_RANKS = 64; + static const int DIRECT_THRESHOLD = 32 * 1024; + c10::intrusive_ptr pg; + int rank; + int size; + size_t bufsz; + int shmid[MAX_RANKS]; + int barid; + void* shm_data[MAX_RANKS]; + void* scratch_data[MAX_RANKS]; + void* bar_data; + volatile int* bar1; + volatile int* bar2; + + SHMBuffer(size_t bufsz_, c10::intrusive_ptr pg) : pg(pg) { + bufsz = ((bufsz_ + 4095) / 4096) * 4096 * 2; + rank = pg->getRank(); + size = pg->getSize(); + /* each process creates its own shared memory */ + shmid[rank] = shmget(SHMID + rank, bufsz, IPC_CREAT | 0666); + AT_ASSERT( + shmid[rank] >= 0, + "shmid cannot create shared memory of size %lu\n", + bufsz); + if (rank == 0) { + barid = shmget(BARID, 4096, IPC_CREAT | 0666); + AT_ASSERT(barid >= 0, "barid cannot create shared memory"); + } + pg->barrier()->wait(); + /* each process attaches itself with other processes */ + for (int i = 0; i < size; i++) { + if (i != rank) + shmid[i] = shmget(SHMID + i, bufsz, 0666); + AT_ASSERT(shmid[i] >= 0, "shmid cannot get shared memory\n"); + } + if (rank != 0) { + barid = shmget(BARID, 4096, IPC_CREAT | 0666); + AT_ASSERT(barid >= 0, "barid cannot create shared memory\n"); + } + for (int i = 0; i < size; i++) { + shm_data[i] = shmat(shmid[i], NULL, 0); + AT_ASSERT(shm_data[i], "shmat failed\n"); + scratch_data[i] = (void*)((char*)shm_data[i] + bufsz / 2); + } + bar_data = shmat(barid, NULL, 0); + AT_ASSERT(bar_data, "barat failed\n"); + bar1 = (int*)bar_data; + *bar1 = 0; + bar2 = bar1 + 128; + *bar2 = 0; + pg->barrier()->wait(); + shmctl(shmid[rank], IPC_RMID, NULL); + shmctl(barid, IPC_RMID, NULL); + } + + void cleanup_shm() { + // We can't use pg->barrier here as it may not be available + for (int i = 0; i < size; i++) + shmdt(shm_data[i]); + shmdt(bar_data); + } + + ~SHMBuffer() { + cleanup_shm(); + } + + static SHMBuffer* getInst( + size_t sz, + c10::intrusive_ptr pg) { + static size_t buf_sz = 0; + static SHMBuffer* inst = nullptr; + + // TODO: check for same pg as well + if (buf_sz < sz) { + if (inst != nullptr) { + delete inst; + inst = nullptr; + } + inst = new SHMBuffer(sz, pg); + AT_ASSERT(inst != nullptr, "Unable to create shm buffer\n"); + buf_sz = sz; + } + return inst; + } + + void barrier() { + static uint32_t count = 0; + if (count % 2) { + __sync_fetch_and_add(bar1, 1); + while ((*bar1 % 
size) != 0) + ; + } else { + __sync_fetch_and_add(bar2, 1); + while ((*bar2 % size) != 0) + ; + } + count++; + } + + at::Tensor getTensor(at::Tensor t) { + size_t sz = t.numel() * t.element_size(); + AT_ASSERT(sz <= bufsz, "Requested tensor size too big\n"); + auto ptr = shm_data[rank]; + auto t_new = torch::from_blob(ptr, t.sizes(), t.options()); + return t_new; + } + + template + void allreduce_impl(at::Tensor t) { + auto numel = t.numel(); + auto nBytes = numel * t.element_size(); + AT_ASSERT((size_t)nBytes <= bufsz / 2, "Too large allreduce size"); + long nBlk = (numel + BS - 1) / BS; + long max_threads = omp_get_max_threads(); + int nThreads = std::min(nBlk, max_threads); + T* ptr = (T*)t.data_ptr(); + long rem = numel % BS; + long numel_aligned = numel - rem; + bool need_copy = ptr != shm_data[rank]; + auto ops = shm_tpp::getOps(); + auto& cpy_tpp = ops.cpy_tpp; + auto& ucvt_tpp = ops.ucvt_tpp; + auto& dcvt_tpp = ops.dcvt_tpp; + auto& add_tpp = ops.add_tpp; + + if (need_copy) { + auto src = ptr; + auto dst = (T*)shm_data[rank]; +#pragma omp parallel for num_threads(nThreads) + for (int i = 0; i < numel_aligned; i += BS) { + cpy_tpp(src + i, dst + i); + } + if (rem > 0) { + for (int i = numel_aligned; i < numel; i++) { + dst[i] = src[i]; + } + } + } + + barrier(); + + if (numel <= DIRECT_THRESHOLD) { + auto dst = (T*)scratch_data[rank]; + auto lsrc = (T*)shm_data[rank]; +#pragma omp parallel for num_threads(nThreads) + for (int i = 0; i < numel; i += BS) { + float ldst[BS]; + ucvt_tpp(lsrc + i, ldst); + for (int r = 1; r < size; r++) { + int r1 = (r + rank) % size; + auto src = (T*)shm_data[r1]; + add_tpp(ldst, src + i, ldst); + } + dcvt_tpp(ldst, dst + i); + } + barrier(); + + if (true) { + auto src = (T*)scratch_data[rank]; + auto dst = ptr; +#pragma omp parallel for num_threads(nThreads) + for (int i = 0; i < numel_aligned; i += BS) { + cpy_tpp(src + i, dst + i); + } + if (rem > 0) { + for (int i = numel_aligned; i < numel; i++) { + dst[i] = src[i]; + } + } + } + } else { + int slice_start = (nBlk * rank / size) * BS; + int slice_end = (nBlk * (rank + 1) / size) * BS; + + auto dst = (T*)scratch_data[rank]; + auto lsrc = (T*)shm_data[rank]; +#pragma omp parallel for num_threads(nThreads) + for (int i = slice_start; i < slice_end; i += BS) { + float ldst[BS]; + ucvt_tpp(lsrc + i, ldst); + for (int r = 1; r < size; r++) { + int r1 = (r + rank) % size; + auto src = (T*)shm_data[r1]; + add_tpp(ldst, src + i, ldst); + } + dcvt_tpp(ldst, dst + i); + } + barrier(); + if (true) { + for (int r = 0; r < size; r++) { + int r1 = (r + rank) % size; + int slice_start = (nBlk * r1 / size) * BS; + int slice_end = (nBlk * (r1 + 1) / size) * BS; + bool handle_last_blk = false; + if (slice_end > numel) { + slice_end -= BS; + handle_last_blk = true; + } + + auto src = (T*)scratch_data[r1]; + auto dst = ptr; +#pragma omp parallel for num_threads(nThreads) + for (int i = slice_start; i < slice_end; i += BS) { + cpy_tpp(src + i, dst + i); + } + if (handle_last_blk) { + for (int i = slice_end; i < numel; i++) { + dst[i] = src[i]; + } + } + } + } + } + } + + void allreduce(at::Tensor t) { + auto dt = t.dtype(); + if (dt == at::kFloat) { + allreduce_impl(t); + } else if (dt == at::kBFloat16) { + allreduce_impl(t); + } else if (dt == at::kHalf) { + allreduce_impl(t); + } else { + AT_ASSERT(0, "Unsupported dtype in allreduce\n"); + } + } +}; + +int SHMBuffer::SHMID = 100 + master_port; +int SHMBuffer::BARID = 10000 + master_port; + +static const long TPP_SHM_BUF_SIZE = + 
torch_ipex::tpp::env2int("TPP_SHM_BUF_SIZE", 64 * 1024 * 1024); +// Using master port to distinguist multiple distributed instances for setting +// up shared memory +void tpp_allreduce_impl( + at::Tensor t_in, + c10::intrusive_ptr process_group) { + auto shm_inst = SHMBuffer::getInst(TPP_SHM_BUF_SIZE, process_group); + long max_elem = TPP_SHM_BUF_SIZE / t_in.element_size(); + long numel = t_in.numel(); + if (numel <= max_elem) { + shm_inst->allreduce(t_in); + } else { + t_in = t_in.view({-1}); + for (int64_t i = 0; i < numel; i += max_elem) { + auto start = i; + auto end = start + max_elem; + if (end > numel) + end = numel; + auto t = t_in.slice(0, start, end, 1); + shm_inst->allreduce(t); + } + } +} +#undef BS + +} // namespace + +IPEX_REGISTER_DISPATCH(tpp_allreduce_kernel_stub, &tpp_allreduce_impl); + +} // namespace cpu +} // namespace torch_ipex \ No newline at end of file diff --git a/csrc/cpu/tpp/xsmm_functors.h b/csrc/cpu/tpp/xsmm_functors.h index c89b514fc..e1f1d1534 100644 --- a/csrc/cpu/tpp/xsmm_functors.h +++ b/csrc/cpu/tpp/xsmm_functors.h @@ -667,8 +667,8 @@ class ConvertTPP { private: int rows = 0; int cols = 0; - int ldi; - int ldo; + int ldi = 0; + int ldo = 0; UnaryTPP kernel; bool init_done = false; }; @@ -884,34 +884,39 @@ class AddBiasTPP { ConvertTPP cvt; }; -template +template class AddTPP { public: AddTPP() {} AddTPP(int N) : AddTPP(1, N) {} AddTPP(int rows, int cols) : AddTPP(rows, cols, cols, cols) {} AddTPP(int rows, int cols, int ldi, int ldo) + : AddTPP(rows, cols, ldi, ldi, ldo) {} + AddTPP(int rows, int cols, int ldi0, int ldi1, int ldo) : rows(rows), cols(cols), - ldi(ldi), + ldi0(ldi0), + ldi1(ldi1), ldo(ldo), kernel( rows, cols, - ldi, + ldi0, + ldi1, ldo, XsmmDtype(), + XsmmDtype(), XsmmDtype(), LIBXSMM_DATATYPE_F32, LIBXSMM_MELTW_FLAG_BINARY_NONE, LIBXSMM_MELTW_TYPE_BINARY_ADD) {} - void operator()(Tin* in0, Tin* in1, Tout* out) { + void operator()(Tin* in0, Tin2* in1, Tout* out) { kernel((void*)in0, (void*)in1, (void*)out); } - void ref(Tin* in0, Tin* in1, Tout* out) { + void ref(Tin* in0, Tin2* in1, Tout* out) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { - out[r * ldo + c] = (float)in0[r * ldi + c] + (float)in1[r * ldi + c]; + out[r * ldo + c] = (float)in0[r * ldi0 + c] + (float)in1[r * ldi1 + c]; } } } @@ -919,7 +924,8 @@ class AddTPP { private: int rows = 0; int cols = 0; - int ldi; + int ldi0; + int ldi1; int ldo; BinaryTPP kernel; }; diff --git a/intel_extension_for_pytorch/__init__.py b/intel_extension_for_pytorch/__init__.py index 17bcddca0..812d21a7c 100644 --- a/intel_extension_for_pytorch/__init__.py +++ b/intel_extension_for_pytorch/__init__.py @@ -144,6 +144,7 @@ _set_optimized_model_for_generation, ) from . import llm +from . 
import distributed from .frontend import enable_auto_channels_last, disable_auto_channels_last from .frontend import set_fp32_math_mode, get_fp32_math_mode, FP32MathMode from .cpu._auto_kernel_selection import _enable_dnnl, _disable_dnnl, _using_dnnl diff --git a/intel_extension_for_pytorch/csrc/cpu/Module.cpp b/intel_extension_for_pytorch/csrc/cpu/Module.cpp index 055b2a53b..c530496d1 100644 --- a/intel_extension_for_pytorch/csrc/cpu/Module.cpp +++ b/intel_extension_for_pytorch/csrc/cpu/Module.cpp @@ -36,6 +36,7 @@ #include "TaskModule.h" #include "aten/EmbeddingBag.h" +#include "aten/TPPShmAllReduceAdd.h" #include "comm/comm.h" #include "runtime/CPUPool.h" #include "runtime/TaskExecutor.h" @@ -280,6 +281,7 @@ void InitIpexModuleBindings(py::module m) { // communication related m.def("get_rank", &torch_ipex::cpu::get_rank); + m.def("tpp_shm_allreduce", &torch_ipex::cpu::tpp_shmallreduce_forward); m.def("get_world_size", &torch_ipex::cpu::get_world_size); m.def("barrier", &torch_ipex::cpu::barrier); diff --git a/intel_extension_for_pytorch/distributed/__init__.py b/intel_extension_for_pytorch/distributed/__init__.py new file mode 100644 index 000000000..edac5eb89 --- /dev/null +++ b/intel_extension_for_pytorch/distributed/__init__.py @@ -0,0 +1 @@ +from .dist import all_gather, all_reduce, all_gather_into_tensor, init_process_group diff --git a/intel_extension_for_pytorch/distributed/dist.py b/intel_extension_for_pytorch/distributed/dist.py new file mode 100644 index 000000000..0e5915a5c --- /dev/null +++ b/intel_extension_for_pytorch/distributed/dist.py @@ -0,0 +1,263 @@ +import sys +from intel_extension_for_pytorch.transformers.models.cpu.distributed.dist import ( # noqa + all_reduce_cpu, + all_gather_cpu, + all_gather_into_tensor_cpu, +) +import torch +from torch.distributed import Backend, default_pg_timeout, Store, ReduceOp +import torch.distributed as dist +from datetime import timedelta +from typing import Union, Optional, Any + + +def _get_function_from_device(device_type: str, f): + assert device_type in [ + "cpu", + "xpu", + ], "The device is not in the supported device list." + target_f_name = f.__name__ + "_" + device_type + assert hasattr( + sys.modules[__name__], target_f_name + ), f"Target function {f.__name__} on {device_type} haven't implemented yet." + target_f = getattr(sys.modules[__name__], target_f_name) + return target_f + + +def init_process_group( + backend: Union[str, Backend] = None, + init_method: Optional[str] = None, + timeout: timedelta = default_pg_timeout, + world_size: int = -1, + rank: int = -1, + store: Optional[Store] = None, + group_name: str = "", + pg_options: Optional[Any] = None, + device_id: Optional[torch.device] = None, +): + """ + Initialize the default distributed process group. + This will also initialize the distributed package. + There are 2 main ways to initialize a process group: + 1. Specify ``store``, ``rank``, and ``world_size`` explicitly. + 2. Specify ``init_method`` (a URL string) which indicates where/how + to discover peers. Optionally specify ``rank`` and ``world_size``, + or encode all required parameters in the URL and omit them. + If neither is specified, ``init_method`` is assumed to be "env://". + Args: + backend (str or Backend, optional): The backend to use. Depending on + build-time configurations, valid values include ``mpi``, ``gloo``, + ``nccl``, ``ccl``, and ``ucc``. If the backend is not provided, then both a ``gloo`` + and ``nccl`` backend will be created, see notes below for how multiple + backends are managed. 
This field can be given as a lowercase string + (e.g., ``"gloo"``), which can also be accessed via + :class:`Backend` attributes (e.g., ``Backend.GLOO``). If using + multiple processes per machine with ``nccl`` backend, each process + must have exclusive access to every GPU it uses, as sharing GPUs + between processes can result in deadlocks. ``ucc`` backend is + experimental. + init_method (str, optional): URL specifying how to initialize the + process group. Default is "env://" if no + ``init_method`` or ``store`` is specified. + Mutually exclusive with ``store``. + world_size (int, optional): Number of processes participating in + the job. Required if ``store`` is specified. + rank (int, optional): Rank of the current process (it should be a + number between 0 and ``world_size``-1). + Required if ``store`` is specified. + store(Store, optional): Key/value store accessible to all workers, used + to exchange connection/address information. + Mutually exclusive with ``init_method``. + timeout (timedelta, optional): Timeout for operations executed against + the process group. Default value is 10 minutes for NCCL and 30 minutes for other backends. + This is the duration after which collectives will be aborted asynchronously and the process will crash. + This is done since CUDA execution is async and it is no longer safe to continue executing user code since + failed async NCCL operations might result in subsequent CUDA operations running on corrupted data. + When TORCH_NCCL_BLOCKING_WAIT is set, the process will block and wait for this timeout. + group_name (str, optional, deprecated): Group name. This argument is ignored + pg_options (ProcessGroupOptions, optional): process group options + specifying what additional options need to be passed in during + the construction of specific process groups. As of now, the only + options we support is ``ProcessGroupNCCL.Options`` for the ``nccl`` + backend, ``is_high_priority_stream`` can be specified so that + the nccl backend can pick up high priority cuda streams when + there're compute kernels waiting. + device_id (torch.device, optional): a single, specific device + to "bind" this process to, allowing for backend-specific + optimizations. Currently this has two effects, only under + NCCL: the communicator is immediately formed (calling + ``ncclCommInit*`` immediately rather than the normal lazy + call) and sub-groups will use ``ncclCommSplit`` when + possible to avoid unnecessary overhead of group creation. If you + want to know NCCL initialization error early, you can also use this + field. + .. note:: To enable ``backend == Backend.MPI``, PyTorch needs to be built from source + on a system that supports MPI. + .. note:: Support for multiple backends is experimental. Currently when no backend is + specified, both ``gloo`` and ``nccl`` backends will be created. The ``gloo`` backend + will be used for collectives with CPU tensors and the ``nccl`` backend will be used + for collectives with CUDA tensors. A custom backend can be specified by passing in + a string with format ":,:", e.g. + "cpu:gloo,cuda:custom_backend". + .. note:: To enable backend ``ccl``, oneccl_bindings_for_pytorch needs to be installed + and it will be imported automatically. 
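For orientation, a minimal two-rank usage sketch follows. It is illustrative only: it assumes the processes are started by an external launcher (for example `mpirun -n 2` or `torchrun --nproc_per_node 2`) so that `RANK`, `WORLD_SIZE`, `MASTER_ADDR` and `MASTER_PORT` are already present in the environment, and that `oneccl_bindings_for_pytorch` is installed for the `ccl` backend.

```python
import torch
import intel_extension_for_pytorch as ipex

# With env:// initialization, rank and world size are read from the environment
# variables set by the launcher.
ipex.distributed.init_process_group(backend="ccl", init_method="env://")

rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()

# Each rank contributes a different constant; the SUM all-reduce runs in place.
t = torch.full((4,), float(rank + 1))
ipex.distributed.all_reduce(t)
print(f"rank {rank}: {t}")  # every element equals world_size * (world_size + 1) / 2
```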
+ """ + if backend == "ccl": + try: + import oneccl_bindings_for_pytorch # noqa + except ImportError as e: + raise RuntimeError("oneccl_bindings_for_pytorch is not installed!") + return dist.init_process_group( + backend, + init_method, + timeout, + world_size, + rank, + store, + group_name, + pg_options, + device_id, + ) + + +def all_reduce(tensor, op=ReduceOp.SUM, group=None, async_op=False): + """ + Reduces the tensor data across all machines in such a way that all get + the final result. + After the call ``tensor`` is going to be bitwise identical in all processes. + Complex tensors are supported. + Args: + tensor (Tensor): Input and output of the collective. The function + operates in-place. + op (optional): One of the values from + ``torch.distributed.ReduceOp`` + enum. Specifies an operation used for element-wise reductions. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + async_op (bool, optional): Whether this op should be an async op + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + Examples: + >>> # xdoctest: +SKIP("no rank") + >>> # All tensors below are of torch.int64 type. + >>> # We have 2 process groups, 2 ranks. + >>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank + >>> tensor + tensor([1, 2]) # Rank 0 + tensor([3, 4]) # Rank 1 + >>> dist.all_reduce(tensor, op=ReduceOp.SUM) + >>> tensor + tensor([4, 6]) # Rank 0 + tensor([4, 6]) # Rank 1 + >>> # All tensors below are of torch.cfloat type. + >>> # We have 2 process groups, 2 ranks. + >>> tensor = torch.tensor([1+1j, 2+2j], dtype=torch.cfloat) + 2 * rank * (1+1j) + >>> tensor + tensor([1.+1.j, 2.+2.j]) # Rank 0 + tensor([3.+3.j, 4.+4.j]) # Rank 1 + >>> dist.all_reduce(tensor, op=ReduceOp.SUM) + >>> tensor + tensor([4.+4.j, 6.+6.j]) # Rank 0 + tensor([4.+4.j, 6.+6.j]) # Rank 1 + """ + f = _get_function_from_device(tensor.device.type, all_reduce) + return f(tensor, op, group, async_op) + + +def all_gather(tensor_list, tensor, group=None, async_op=False): + """ + Gathers tensors from the whole group in a list. + Complex tensors are supported. + Args: + tensor_list (list[Tensor]): Output list. It should contain + correctly-sized tensors to be used for output of the collective. + tensor (Tensor): Tensor to be broadcast from current process. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + async_op (bool, optional): Whether this op should be an async op + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + Examples: + >>> # xdoctest: +SKIP("need process group init") + >>> # All tensors below are of torch.int64 dtype. + >>> # We have 2 process groups, 2 ranks. + >>> tensor_list = [torch.zeros(2, dtype=torch.int64) for _ in range(2)] + >>> tensor_list + [tensor([0, 0]), tensor([0, 0])] # Rank 0 and 1 + >>> tensor = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank + >>> tensor + tensor([1, 2]) # Rank 0 + tensor([3, 4]) # Rank 1 + >>> dist.all_gather(tensor_list, tensor) + >>> tensor_list + [tensor([1, 2]), tensor([3, 4])] # Rank 0 + [tensor([1, 2]), tensor([3, 4])] # Rank 1 + >>> # All tensors below are of torch.cfloat dtype. + >>> # We have 2 process groups, 2 ranks. 
+ >>> tensor_list = [torch.zeros(2, dtype=torch.cfloat) for _ in range(2)] + >>> tensor_list + [tensor([0.+0.j, 0.+0.j]), tensor([0.+0.j, 0.+0.j])] # Rank 0 and 1 + >>> tensor = torch.tensor([1+1j, 2+2j], dtype=torch.cfloat) + 2 * rank * (1+1j) + >>> tensor + tensor([1.+1.j, 2.+2.j]) # Rank 0 + tensor([3.+3.j, 4.+4.j]) # Rank 1 + >>> dist.all_gather(tensor_list, tensor) + >>> tensor_list + [tensor([1.+1.j, 2.+2.j]), tensor([3.+3.j, 4.+4.j])] # Rank 0 + [tensor([1.+1.j, 2.+2.j]), tensor([3.+3.j, 4.+4.j])] # Rank 1 + """ + f = _get_function_from_device(tensor.device.type, all_gather) + return f(tensor_list, tensor, group, async_op) + + +def all_gather_into_tensor(output_tensor, input_tensor, group=None, async_op=False): + """ + Gather tensors from all ranks and put them in a single output tensor. + Args: + output_tensor (Tensor): Output tensor to accommodate tensor elements + from all ranks. It must be correctly sized to have one of the + following forms: + (i) a concatenation of all the input tensors along the primary + dimension; for definition of "concatenation", see ``torch.cat()``; + (ii) a stack of all the input tensors along the primary dimension; + for definition of "stack", see ``torch.stack()``. + Examples below may better explain the supported output forms. + input_tensor (Tensor): Tensor to be gathered from current rank. + Different from the ``all_gather`` API, the input tensors in this + API must have the same size across all ranks. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + async_op (bool, optional): Whether this op should be an async op + Returns: + Async work handle, if async_op is set to True. + None, if not async_op or if not part of the group + Examples: + >>> # xdoctest: +SKIP("need process group init") + >>> # All tensors below are of torch.int64 dtype and on XPU devices. + >>> # We have two ranks. + >>> device = torch.device(f'xpu:{rank}') + >>> tensor_in = torch.arange(2, dtype=torch.int64, device=device) + 1 + 2 * rank + >>> tensor_in + tensor([1, 2], device='xpu:0') # Rank 0 + tensor([3, 4], device='xpu:1') # Rank 1 + >>> # Output in concatenation form + >>> tensor_out = torch.zeros(world_size * 2, dtype=torch.int64, device=device) + >>> dist.all_gather_into_tensor(tensor_out, tensor_in) + >>> tensor_out + tensor([1, 2, 3, 4], device='xpu:0') # Rank 0 + tensor([1, 2, 3, 4], device='xpu:1') # Rank 1 + >>> # Output in stack form + >>> tensor_out2 = torch.zeros(world_size, 2, dtype=torch.int64, device=device) + >>> dist.all_gather_into_tensor(tensor_out2, tensor_in) + >>> tensor_out2 + tensor([[1, 2], + [3, 4]], device='xpu:0') # Rank 0 + tensor([[1, 2], + [3, 4]], device='xpu:1') # Rank 1 + .. warning:: + The Gloo backend does not support this API. 
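The doctest above shows the XPU case; on CPU the call pattern is identical. A minimal CPU-side sketch, assuming the default process group was initialized with the `ccl` backend as sketched under `init_process_group` and that two ranks are running (the Gloo backend does not support this collective, per the warning above):

```python
import torch
import intel_extension_for_pytorch as ipex

rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()

# Concatenation form: the output holds every rank's slice along dim 0.
tensor_in = torch.arange(2, dtype=torch.int64) + 1 + 2 * rank  # [1, 2] on rank 0, [3, 4] on rank 1
tensor_out = torch.zeros(world_size * 2, dtype=torch.int64)
ipex.distributed.all_gather_into_tensor(tensor_out, tensor_in)
# Every rank now holds tensor([1, 2, 3, 4]); tensor_out.view(world_size, -1) is the stacked form.
```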
+ """ + f = _get_function_from_device(output_tensor.device.type, all_gather_into_tensor) + return f(output_tensor, input_tensor, group, async_op) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/distributed/dist.py b/intel_extension_for_pytorch/transformers/models/cpu/distributed/dist.py new file mode 100644 index 000000000..ede523b9f --- /dev/null +++ b/intel_extension_for_pytorch/transformers/models/cpu/distributed/dist.py @@ -0,0 +1,65 @@ +import torch +from typing import List +import os +import intel_extension_for_pytorch as ipex +import torch.distributed as dist +from torch.distributed import ReduceOp + + +def get_int_from_env(env_keys, default): + """Returns the first positive env value found in the `env_keys` list or the default.""" + for e in env_keys: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +USE_SHM_ALLREDUCE = -1 + + +def all_reduce_cpu(t: torch.Tensor, op=ReduceOp.SUM, group=None, async_op=False): + pg = ( + torch.distributed.distributed_c10d._get_default_group() + if group is None + else group + ) + global USE_SHM_ALLREDUCE + if USE_SHM_ALLREDUCE == -1: + word_size = torch.distributed.get_world_size(pg) + local_size = get_int_from_env( + [ + "MPI_LOCALNRANKS", + "OMPI_COMM_WORLD_LOCAL_SIZE", + "MV2_COMM_WORLD_LOCAL_SIZE", + "LOCAL_WORLD_SIZE", + ], + -1, + ) + if local_size >= 0 and local_size == word_size: + USE_SHM_ALLREDUCE = 1 + else: + USE_SHM_ALLREDUCE = -1 + + if ( + USE_SHM_ALLREDUCE == 1 + and async_op is False + and op is ReduceOp.SUM + and torch.distributed.is_available() + and torch.distributed.is_initialized() + ): + + ipex._C.tpp_shm_allreduce(t, pg) + return t + else: + return dist.all_reduce(t, op, group, async_op) + + +def all_gather_cpu( + t_list: List[torch.Tensor], t: torch.Tensor, group=None, async_op=False +): + return dist.all_gather(t_list, t, group, async_op) + + +def all_gather_into_tensor_cpu(output_tensor, input_tensor, group=None, async_op=False): + return dist.all_gather_into_tensor(output_tensor, input_tensor, group, async_op) diff --git a/tests/cpu/test_ccl_primitive.py b/tests/cpu/test_ccl_primitive.py index 1b4ba4160..a55162e4f 100644 --- a/tests/cpu/test_ccl_primitive.py +++ b/tests/cpu/test_ccl_primitive.py @@ -7,8 +7,113 @@ world_size = 0 if not has_ccl else ipex.cpu.comm.get_world_size() -@unittest.skipIf(not (has_ccl and world_size > 1), "oneccl is not built") +def get_int_from_env(env_keys, default): + """Returns the first positive env value found in the `env_keys` list or the default.""" + for e in env_keys: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +ipex_llm_world_size = get_int_from_env( + ["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1 +) + + class CCLTester(unittest.TestCase): + def init_env(self): + _local_rank = get_int_from_env( + [ + "LOCAL_RANK", + "MPI_LOCALRANKID", + "OMPI_COMM_WORLD_LOCAL_RANK", + "MV2_COMM_WORLD_LOCAL_RANK", + ], + -1, + ) + _rank = get_int_from_env( + ["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0 + ) + _world_size = get_int_from_env( + ["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1 + ) + os.environ["RANK"] = str(_rank) + os.environ["WORLD_SIZE"] = str(_world_size) + os.environ["LOCAL_RANK"] = str(_local_rank) + if not os.environ.get("MASTER_PORT", None): + os.environ["MASTER_PORT"] = "29500" + os.environ["MASTER_ADDR"] = "127.0.0.1" + if torch.distributed.is_available() and not torch.distributed.is_initialized(): + 
ipex.distributed.init_process_group() + return _local_rank, _rank, _world_size + + @unittest.skipIf(not ipex_llm_world_size > 1, "only test with distributed") + def test_ipex_llm_all_reduce_add(self): + _, _rank, _world_size = self.init_env() + ipex.enable_onednn_fusion(False) # just to workaround the flake8 + dtypes = [torch.float32, torch.float16, torch.bfloat16] + tensor_sizes = [4096, 4096 * 32, 8 * 1024 * 5120 * 4 * 2] + # Less than 8 * 1024 * 5120 * 4 use SHM, otherwise use ccl allreduce + # The above dispatch rule is transparent to users + for dtype in dtypes: + for tensor_size in tensor_sizes: + input_tensor = torch.tensor([_rank + 1.0]).to(dtype).repeat(tensor_size) + target_tensor = ( + torch.tensor([float(_world_size * (_world_size + 1) / 2)]) + .to(dtype) + .repeat(tensor_size) + ) + ipex.distributed.all_reduce(input_tensor) + torch.allclose(input_tensor, target_tensor) + + @unittest.skipIf(not ipex_llm_world_size > 1, "only test with distributed") + def test_ipex_llm_allgather(self): + _, _rank, _world_size = self.init_env() + count = 14336 + for dtypes in [torch.float32, torch.float16, torch.bfloat16]: + data = torch.arange(count, dtype=dtypes) + 1 + 2 * _rank + data_list = [torch.randn(count, dtype=dtypes) for _ in range(_world_size)] + ipex.distributed.all_gather(data_list, data) + for i in range(_world_size): + found = False + target_data = ( + torch.arange( + count, + dtype=dtypes, + ) + + 1 + + 2 * i + ) + for all_gather_data in data_list: + if torch.equal(all_gather_data, target_data): + found = True + break + assert ( + found is True + ), "Error: ipex.distributed.all_gather failed, no target tensor in tensor list." + + @unittest.skipIf(not ipex_llm_world_size > 1, "only test with distributed") + def test_all_gather_into_tensor(self): + _, _rank, _world_size = self.init_env() + count = 14336 + for datatype in [torch.bfloat16, torch.float]: + data = torch.arange(count, dtype=datatype) + 1 + 2 * _rank + dst = torch.empty(count * _world_size, dtype=datatype) + ipex.distributed.all_gather_into_tensor(dst, data) + chunk_list = dst.chunk(_world_size) + found = False + for i in range(_world_size): + target_data = torch.arange(count, dtype=datatype) + 1 + 2 * i + for single_chunk in chunk_list: + found = torch.allclose(single_chunk, target_data) + if found: + break + assert ( + found + ), "Error: ipex.distributed.all_gather_into_tensor failed, no target tensor in dst tensor." 
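A quick standalone check of the constant used for `target_tensor` in `test_ipex_llm_all_reduce_add` above: each rank fills its tensor with `rank + 1`, so a SUM all-reduce over `W` ranks yields `1 + 2 + ... + W = W * (W + 1) / 2` in every element. The snippet below reproduces that arithmetic locally for a hypothetical world size of 4; no process group is needed.

```python
# Hypothetical world size chosen only for illustration.
world_size = 4
contributions = [rank + 1.0 for rank in range(world_size)]
assert sum(contributions) == world_size * (world_size + 1) / 2  # 10.0
```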
+ + @unittest.skipIf(not (has_ccl and world_size > 1), "oneccl is not built") def test_all_reduce_add(self): mpi_world_size = int(os.environ.get("PMI_SIZE", -1)) mpi_rank = int(os.environ.get("PMI_RANK", -1)) @@ -34,6 +139,7 @@ def test_all_reduce_add(self): self.assertEqual(mpi_world_size, ipex.cpu.comm.get_world_size()) self.assertEqual(mpi_rank, ipex.cpu.comm.get_rank()) + @unittest.skipIf(not (has_ccl and world_size > 1), "oneccl is not built") def test_allgather(self): mpi_world_size = int(os.environ.get("PMI_SIZE", -1)) mpi_rank = int(os.environ.get("PMI_RANK", -1)) From 605cf609008eb0a3b960aef3145bab49d2b084ef Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Fri, 14 Jun 2024 09:27:35 +0800 Subject: [PATCH 114/199] [Rls2.3] Support gqa for varlen_atten API (#2947) (#2976) --- .../transformers/models/cpu/fusions/mha_fusion.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py index a656e674d..b1b78a0bd 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/mha_fusion.py @@ -384,6 +384,12 @@ class _IPEXVarlenScaledDotProductCPU(nn.Module): def __init__(self): super().__init__() + @classmethod + def repeat_kv(cls, x: torch.Tensor, n_rep: int) -> torch.Tensor: + if n_rep == 1: + return x + return torch.repeat_interleave(x, dim=1, repeats=n_rep) + @classmethod def apply_function( cls, @@ -405,6 +411,11 @@ def apply_function( assert return_softmax is False, "ipex do not support return_softmax option" assert gen_ is None, "ipex do not support custom random generator" assert zero_tensors is False, "ipex varlen_fwd do not support zero tensors" + + # Repeat kv if it is GQA. + key = cls.repeat_kv(key, int(query.shape[1] / key.shape[1])) + value = cls.repeat_kv(value, int(query.shape[1] / value.shape[1])) + total_q, num_head, head_size = query.size() total_k, num_head_k, _ = key.size() batch_size = seqlen_q.size(0) - 1 From 602fb2b7c845b29e7c03438f13898956bfa623ed Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Fri, 14 Jun 2024 11:21:31 +0800 Subject: [PATCH 115/199] Backport some changes about LLM README.md and GPTQ scripts from release/2.3 (#2978) * Update README.md (#2744) Co-authored-by: Chunyuan WU * Update README.md for llama-3-70B WQO recipe (#2972) * Update README.md for llama-3-70B WQO recipe * Update README.md * Fix runtime error for Baichuan2-13b and fix logger in GPTQ module (#2973) --------- Co-authored-by: MingxuZh <109504044+MingxuZh@users.noreply.github.com> Co-authored-by: Chunyuan WU --- examples/cpu/inference/python/llm/README.md | 4 ++++ .../cpu/inference/python/llm/utils/run_gptq.py | 2 +- .../quantization/_GPTQ/_gptq_utils.py | 4 +++- .../quantization/_GPTQ/_quantize.py | 12 ++++++++---- .../quantization/_GPTQ/gptq/gptq.py | 18 +++++++++--------- .../quantization/_GPTQ/gptq/model_utils.py | 3 ++- 6 files changed, 27 insertions(+), 16 deletions(-) diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index 3c9d8782a..223416806 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -373,6 +373,8 @@ The command above works for most models we listed. 
However, to get better accura | EleutherAI/gpt-neox-20b | Remove "`--quant-with-amp`"; add "`--group-size 256`" | | facebook/opt-30b | Remove "`--quant-with-amp`" | | databricks/dolly-v2-12b | Remove "`--quant-with-amp`"; add "`--lowp-mode FP32`" | +| stabilityai/stablelm-2-1_6b | Add "`--group-size 128`" | +| meta-llama/Meta-Llama-3-70B | Add "`--group-size 128`" | - Command (INT4): ```bash @@ -458,6 +460,8 @@ Similar to single instance usage, we need to update some arguments of the runnin | bigscience/bloom-1b7 | Remove "`--quant-with-amp`"; add "`--group-size 128`" | | facebook/opt-30b | Remove "`--quant-with-amp`"; add "`--dtype float32`" | | databricks/dolly-v2-12b | Remove "`--quant-with-amp`"; add "`--lowp-mode FP32 --dtype float32`" | +| stabilityai/stablelm-2-1_6b | Add "`--group-size 128`" | +| meta-llama/Meta-Llama-3-70B | Add "`--group-size 128`" | - An example of llama2 7b model: ```bash diff --git a/examples/cpu/inference/python/llm/utils/run_gptq.py b/examples/cpu/inference/python/llm/utils/run_gptq.py index 39fb3e8d0..96d861f7b 100644 --- a/examples/cpu/inference/python/llm/utils/run_gptq.py +++ b/examples/cpu/inference/python/llm/utils/run_gptq.py @@ -150,7 +150,7 @@ def get_user_model(): torchscript=torchscript, # torchscript will force `return_dict=False` to avoid jit errors trust_remote_code=True, ) - tokenizer = AutoTokenizer.from_pretrained(args.model) + tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) # Set model's seq_len when GPTQ calibration is enabled. user_model.seqlen = 2048 diff --git a/intel_extension_for_pytorch/quantization/_GPTQ/_gptq_utils.py b/intel_extension_for_pytorch/quantization/_GPTQ/_gptq_utils.py index 5df5a18ee..31804f72d 100644 --- a/intel_extension_for_pytorch/quantization/_GPTQ/_gptq_utils.py +++ b/intel_extension_for_pytorch/quantization/_GPTQ/_gptq_utils.py @@ -5,7 +5,8 @@ format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" logging.basicConfig(level=logging.INFO, format=format_str) -logger = logging.getLogger(__name__) +logger = logging.getLogger("GPTQ") +logger.setLevel(logging.INFO) def gptq_quantize( @@ -81,6 +82,7 @@ def gptq_export( scale_dtype=torch.float16, ): for k, v in weight_config.items(): + logger.info(f"Exporting {k}") num_bits = v["wbits"] group_size = v["group_size"] sym = v["sym"] diff --git a/intel_extension_for_pytorch/quantization/_GPTQ/_quantize.py b/intel_extension_for_pytorch/quantization/_GPTQ/_quantize.py index 19d7bfe2a..b3d612bbb 100644 --- a/intel_extension_for_pytorch/quantization/_GPTQ/_quantize.py +++ b/intel_extension_for_pytorch/quantization/_GPTQ/_quantize.py @@ -1,7 +1,11 @@ import logging import torch from pathlib import Path -from ...utils._logger import logger, WarningType + +format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +logging.basicConfig(level=logging.INFO, format=format_str) +logger = logging.getLogger("GPTQ") +logger.setLevel(logging.INFO) @torch.no_grad() @@ -66,8 +70,7 @@ def gptq( logger.warning( "You choose to use unified sequence length for calibration" + "but you have not set length value. Default sequence length" - + "is 2048 and this might cause inference error!", - _type=WarningType.WrongArgument, + + "is 2048 and this might cause inference error!" 
) model, gptq_config = gptq_quantize( model, @@ -79,6 +82,7 @@ def gptq( layer_wise, model_path, ) + logger.info("Exporting compressed model...") compressed_model = gptq_export( model, weight_config, @@ -91,7 +95,7 @@ def gptq( output_file_name = f"gptq_checkpoint_g{group_size}.pt" output_file_path = save_dir + "/" + output_file_name torch.save(compressed_model.state_dict(), output_file_path) - logging.info( + logger.info( "Low-precision checkpoint generated and saved to {}.".format(output_file_path) ) return compressed_model diff --git a/intel_extension_for_pytorch/quantization/_GPTQ/gptq/gptq.py b/intel_extension_for_pytorch/quantization/_GPTQ/gptq/gptq.py index f6a8a3a75..b70b6fe95 100644 --- a/intel_extension_for_pytorch/quantization/_GPTQ/gptq/gptq.py +++ b/intel_extension_for_pytorch/quantization/_GPTQ/gptq/gptq.py @@ -1,4 +1,4 @@ -from ....utils._logger import logger, WarningType +import logging import math import random import re @@ -16,6 +16,10 @@ ) DEBUG = False +format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +logging.basicConfig(level=logging.INFO, format=format_str) +logger = logging.getLogger("GPTQ") +logger.setLevel(logging.INFO) class GPTQuantizer(object): @@ -134,8 +138,7 @@ def obtain_first_n_samples(self, seed=0): length = batch["input_ids"].shape[-1] except Exception: logger.warning( - "Please make sure your dict'like data contains key of 'input_ids'.", - _type=WarningType.WrongArgument, + "Please make sure your dict'like data contains key of 'input_ids'." ) continue batch_final = {} @@ -163,8 +166,7 @@ def obtain_first_n_samples(self, seed=0): self.dataloader.append(batch_final) if len(self.dataloader) < self.nsamples: logger.warning( - f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}.", - _type=WarningType.WrongArgument, + f"Try to use {self.nsamples} data, but entire dataset size is {len(self.dataloader)}." ) def obtain_first_n_samples_fulllength(self, seed=0): @@ -197,8 +199,7 @@ def obtain_first_n_samples_fulllength(self, seed=0): length = batch["input_ids"].shape[-1] except Exception: logger.warning( - "Please make sure your dict'like data contains key of 'input_ids'.", - _type=WarningType.WrongArgument, + "Please make sure your dict'like data contains key of 'input_ids'." ) continue batch_final = {} @@ -233,8 +234,7 @@ def obtain_first_n_samples_fulllength(self, seed=0): if len(self.dataloader) < self.nsamples: # pragma: no cover logger.warning( f"Trying to allocate {self.nsamples} data with fixed length {unified_length}," - + f"but only {len(self.dataloader)} samples are found. Please use smaller 'self.pad_max_length' value.", - _type=WarningType.WrongArgument, + + f"but only {len(self.dataloader)} samples are found. Please use smaller 'self.pad_max_length' value." 
) def get_full_layer_name(self, sub_layer_name, block_idx): diff --git a/intel_extension_for_pytorch/quantization/_GPTQ/gptq/model_utils.py b/intel_extension_for_pytorch/quantization/_GPTQ/gptq/model_utils.py index f5bf261ad..7597151d5 100644 --- a/intel_extension_for_pytorch/quantization/_GPTQ/gptq/model_utils.py +++ b/intel_extension_for_pytorch/quantization/_GPTQ/gptq/model_utils.py @@ -6,7 +6,8 @@ format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" logging.basicConfig(level=logging.INFO, format=format_str) -logger = logging.getLogger(__name__) +logger = logging.getLogger("GPTQ") +logger.setLevel(logging.INFO) def move_input_to_device(input, device=torch.device("cpu")): From 7a226b544eb2f9aee366ac5b1a96fd8ece4f5318 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Fri, 14 Jun 2024 16:31:58 +0800 Subject: [PATCH 116/199] update oneDNN to 6860e98e71 on rls-v3.5 (#2983) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index e01a42907..3cd211cb6 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit e01a4290724b3b28401b22f63c199b8f289cd3ae +Subproject commit 3cd211cb6a78392a8d3a6509d23d7d2bb03cf0d2 From 334ada194b07a2641bf619ed799b964729ce56f5 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Fri, 14 Jun 2024 22:48:42 +0800 Subject: [PATCH 117/199] Add note for SmoothQuant with AMP; update note for GPTQ; enforce SmoothQuant alpha being float (#2982) (#2986) --- examples/cpu/inference/python/llm/README.md | 4 ++-- intel_extension_for_pytorch/quantization/_qconfig.py | 4 ++-- intel_extension_for_pytorch/quantization/_smooth_quant.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/cpu/inference/python/llm/README.md b/examples/cpu/inference/python/llm/README.md index 223416806..89e39ca76 100644 --- a/examples/cpu/inference/python/llm/README.md +++ b/examples/cpu/inference/python/llm/README.md @@ -323,7 +323,7 @@ OMP_NUM_THREADS=56 numactl -m 0 -C 0-55 python run.py --benchmark -m meta-llama/ #### 4.2.1.3 Static quantization (INT8): -We use the SmoothQuant algorithm to get good accuracy of static quantization, which is a popular method for LLM models. Besides, by default, we enable quantization mixed fp32 inference (non-quantized OPs run with fp32 dtype). To get better performance, please add "--quant-with-amp" to enable quantization with [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html) inference (non-quantized OPs run with bf16 dtype, which may affect the accuracy). +We use the SmoothQuant algorithm to get good accuracy of static quantization, which is a popular method for LLM models. Besides, by default, we enable quantization mixed fp32 inference (non-quantized OPs run with fp32 dtype). To get better performance, you may add "--quant-with-amp" to enable quantization with [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html) inference (non-quantized OPs run with bf16 dtype). Please note that static quantization with AMP is still experimental and it may lead to accuracy drop and other issues. - Command: ```bash @@ -541,7 +541,7 @@ deepspeed --bind_cores_to_rank run.py --benchmark -m ./local_llama2_7b --dtype b Using INT4 weights can further improve performance by reducing memory bandwidth. However, direct per-channel quantization of weights to INT4 probably results in poor accuracy. 
Some algorithms can modify weights through calibration before quantizing weights to minimize accuracy drop. GPTQ is one of such algorithms. You may generate modified weights and quantization info (scales, zero points) for a certain model with a dataset by such algorithms. The low precision checkpoint is saved as a `state_dict` in a `.pt` file and can be loaded later for weight only quantization. We provide an example here to run GPTQ. -*Note:* Currently GPTQ API is verified on the following models: gpt-j, opt, llama, Llama-2, bloom, bloomz, dolly-v1, dolly-v2, gpt-neo, gpt-neox, mpt, falcon, starcoder. Some of them are not in the list of optimized models. Please use with care. +*Note:* The GPTQ API is verified on the following models: gpt-j, opt, llama, Llama-2, Llama-3, bloom, bloomz, dolly-v1, dolly-v2, gpt-neo, gpt-neox, mpt, falcon, starcoder, condegen, mistral, mixtral, stablelm, phi-2, phi-3. Pleaes note that it's still experimental, please use with care. Here is how to use it: diff --git a/intel_extension_for_pytorch/quantization/_qconfig.py b/intel_extension_for_pytorch/quantization/_qconfig.py index 90356161f..ef6990b53 100644 --- a/intel_extension_for_pytorch/quantization/_qconfig.py +++ b/intel_extension_for_pytorch/quantization/_qconfig.py @@ -86,14 +86,14 @@ def get_smooth_quant_qconfig_mapping( qconfig = QConfigSmoothQuant( activation=SmoothQuantActivationObserver.with_args( reduce_range=False, - alpha=alpha, + alpha=float(alpha), act_observer=act_observer, act_ic_observer=act_ic_observer, ), weight=SmoothQuantWeightObserver.with_args( dtype=torch.qint8, qscheme=torch.per_channel_symmetric, - alpha=alpha, + alpha=float(alpha), wei_observer=wei_observer, wei_ic_observer=wei_ic_observer, ), diff --git a/intel_extension_for_pytorch/quantization/_smooth_quant.py b/intel_extension_for_pytorch/quantization/_smooth_quant.py index ae16301e8..5731111d2 100644 --- a/intel_extension_for_pytorch/quantization/_smooth_quant.py +++ b/intel_extension_for_pytorch/quantization/_smooth_quant.py @@ -76,7 +76,7 @@ def __init__( # if smooth_quant_enabled is false, this observer acts as # a normal per-tensor observer self.smooth_quant_enabled = smooth_quant_enabled - self.alpha = alpha + self.alpha = float(alpha) # Normally we don't use min_val or max_val here # They are for checks, like `_check_observer_has_run` self.min_val = self.act_obs.min_val @@ -214,7 +214,7 @@ def __init__( # if smooth_quant_enabled is false, this observer acts as # a normal observer self.smooth_quant_enabled = smooth_quant_enabled - self.alpha = alpha + self.alpha = float(alpha) # Normally we don't use min_val or max_val here # They are for checks, like `_check_observer_has_run` self.min_val = self.oc_obs.min_val From fceb807223d302f80e070246308870786de5f5bc Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Mon, 17 Jun 2024 14:14:24 +0900 Subject: [PATCH 118/199] update launch script for feature requests (#2885) * update launch script for feature requests * deprecate arg --skip-cross-node-cores to --avoid-across-numa-nodes * warning message format adjustment * remove deprecate arguments * change the default strategy to scatter * update UTs to remove deprecated arguments * add bind-numa-node argument to launcher_distributed; bug fix when ninstances is less than number of numa nodes; --- .../performance_tuning/launch_script.md | 5 +- .../optimize_pytorch_models_with_ipex.ipynb | 4 +- .../cpu/launch/cpu_info.py | 285 +++++++++++------- .../cpu/launch/launch.py | 189 +----------- .../cpu/launch/launcher_base.py | 32 +- 
.../cpu/launch/launcher_distributed.py | 26 +- .../cpu/launch/launcher_multi_instances.py | 37 +-- tests/cpu/bench/custom_op_bench/README.md | 24 +- tests/cpu/run_distributed_test.sh | 2 +- tests/cpu/test_ipex_optimize.py | 8 +- tests/cpu/test_launcher.py | 80 ++--- 11 files changed, 291 insertions(+), 401 deletions(-) diff --git a/docs/tutorials/performance_tuning/launch_script.md b/docs/tutorials/performance_tuning/launch_script.md index 0c611d8fe..61c5826f3 100644 --- a/docs/tutorials/performance_tuning/launch_script.md +++ b/docs/tutorials/performance_tuning/launch_script.md @@ -32,11 +32,12 @@ Launcher Common Arguments: | knob | type | default value | help | | :-- | :--: | :--: | :-- | -| `--ncores-per-instance` | int | 0 | Number of cores per instance | +| `--ncores-per-instance` | int | 0 | Number of cores per instance. It has to be an integer larger than or equal to `-1`. When set to `0`, cores are evenly assigned to each instance. If number of cores cannot be divided by number of instances, residual cores are unused. When set to `-1`, cores are evenly assigned to each instance as much as possible to fully utilize all cores. When set to a number larger than `0`, designated number of cores are assigned to each instance. | | `--nodes-list` | str | '' | Specify nodes list for multiple instances to run on, in format of list of single node ids "node_id,node_id,..." or list of node ranges "node_id-node_id,...". By default all nodes will be used. | | `--use-e-cores` | - | False | Use Efficient-Cores on the workloads or not. By default, only Performance-Cores are used. | | `--memory-allocator` | str | 'auto' | Choose which memory allocator to run the workloads with. Supported choices are ['auto', 'default', 'tcmalloc', 'jemalloc']. | | `--omp-runtime` | str | 'auto' | Choose which OpenMP runtime to run the workloads with. Supported choices are ['auto', 'default', 'intel']. | +| `--strategy` | str | 'scatter' | Tell how cores are distributed over instances when only part of all cores are needed on a machine with multiple NUMA nodes. Supported choices are ['scatter', 'close']. With 'scatter', instances are distributed evenly as much as possible over all available NUMA nodes. While with 'close', instances are assigned to cores in order continuously. | Multi-instance Arguments: @@ -45,7 +46,7 @@ Multi-instance Arguments: | `--ninstances` | int | 0 | Number of instances | | `--instance-idx` | int | -1 | Inside the multi instance list, execute a specific instance at index. If it is set to -1, run all of them. | | `--use-logical-cores` | - | False | Use logical cores on the workloads or not. By default, only physical cores are used. | -| `--skip-cross-node-cores` | - | False | Allow instances to be executed on cores across NUMA nodes. | +| `--bind-numa-node` | - | False | Bind instances to be executed on cores on a single NUMA node. | | `--multi-task-manager` | str | 'auto' | Choose which multi task manager to run the workloads with. Supported choices are ['auto', 'none', 'numactl', 'taskset']. | | `--latency-mode` | - | False | Use 4 cores per instance over all physical cores. | | `--throughput-mode` | - | False | Run one instance per node with all physical cores. 
| diff --git a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb index fe1fb8b52..b0cec76a8 100644 --- a/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb +++ b/examples/cpu/inference/python/jupyter-notebooks/optimize_pytorch_models_with_ipex.ipynb @@ -801,7 +801,7 @@ "metadata": {}, "outputs": [], "source": [ - "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncore_per_instance 3 --log_path ./logs ./python/resnet50.py" + "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncores-per-instance 3 --log_path ./logs ./python/resnet50.py" ] }, { @@ -819,7 +819,7 @@ "metadata": {}, "outputs": [], "source": [ - "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncore_per_instance 6 --log_path ./logs ./python/resnet50.py" + "!python -m intel_extension_for_pytorch.cpu.launch --ninstances 1 --ncores-per-instance 6 --log_path ./logs ./python/resnet50.py" ] }, { diff --git a/intel_extension_for_pytorch/cpu/launch/cpu_info.py b/intel_extension_for_pytorch/cpu/launch/cpu_info.py index 162096710..caaed97ec 100644 --- a/intel_extension_for_pytorch/cpu/launch/cpu_info.py +++ b/intel_extension_for_pytorch/cpu/launch/cpu_info.py @@ -3,7 +3,6 @@ import platform import re import subprocess -from ...utils._logger import WarningType # lscpu Examples # # The following is the parsable format, which can be fed to other @@ -225,21 +224,21 @@ def verbose(self, level, msg, warning_type=None): """ Get CPU pools from all available CPU cores with designated criterias. - - ninstances [int]: Number of instances. Should be a non negative integer, 0 by default. \ + - ninstances [int]: Number of instances. Should be a non negative integer, 0 by default. When it is 0, it will be set according to usage scenarios automatically in the function. - - ncores_per_instance [int]: Number of cores per instance. Should be a non negative integer, 0 by default. \ + - ncores_per_instance [int]: Number of cores per instance. Should be a non negative integer, 0 by default. When it is 0, it will be set according to usage scenarios automatically in the function. - - use_logical_cores [bool]: Use logical cores on the workloads or not, False by default. When set to False, \ + - use_logical_cores [bool]: Use logical cores on the workloads or not, False by default. When set to False, only physical cores are used. - use_e_cores [bool]: Use Efficient-Cores, False by default. When set to False, only Performance-Cores are used. - - skip_cross_node_cores [bool]: Allow instances to be executed on cores across NUMA nodes, False by default. + - bind_numa_node [bool]: Bind instances to be executed on cores on a single NUMA node, False by default. - nodes_list [list]: A list containing all node ids that the execution is expected to be running on. - cores_list [list]: A list containing all cpu ids that the execution is expected to be running on. - - return_mode [str]: A string that defines how result values are formed, could be either of 'auto', \ - 'list' and 'range'. When set to 'list', a string with comma-separated cpu ids, '0,1,2,3,...', is returned. \ - When set to 'range', a string with comma-separated cpu id ranges, '0-2,6-8,...', is returned. \ - When set to 'auto', a 'list' or a 'range' whoever has less number of elements that are separated by \ - comma is returned. I.e. 
for a list '0,1,2,6,7,8' and a range '0-2,6-8', both reflect the same cpu \ + - return_mode [str]: A string that defines how result values are formed, could be either of 'auto', + 'list' and 'range'. When set to 'list', a string with comma-separated cpu ids, '0,1,2,3,...', is returned. + When set to 'range', a string with comma-separated cpu id ranges, '0-2,6-8,...', is returned. + When set to 'auto', a 'list' or a 'range' whoever has less number of elements that are separated by + comma is returned. I.e. for a list '0,1,2,6,7,8' and a range '0-2,6-8', both reflect the same cpu configuration, the range '0-2,6-8' is returned. """ @@ -249,9 +248,10 @@ def gen_pools_ondemand( ncores_per_instance=0, use_logical_cores=False, use_e_cores=False, - skip_cross_node_cores=False, + bind_numa_node=False, nodes_list=None, cores_list=None, + strategy="close", return_mode="auto", ): if nodes_list is None: @@ -265,40 +265,7 @@ def gen_pools_ondemand( assert set(cores_list).issubset( set(cores_available) ), f"Designated cores list {cores_list} contains invalid cores." - if use_logical_cores: - self.verbose( - "warning", - "Argument --use-logical-cores won't take effect when --cores-list is set." - + "please see https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/launch_script.html#launch-script-usage-guide" # noqa: B950 - + "for usage guide", - warning_type=WarningType.AmbiguousArgument, - ) - if use_e_cores: - self.verbose( - "warning", - "Argument --use-e-cores won't take effect when --cores-list is set.", - +"please see https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/launch_script.html#launch-script-usage-guide" # noqa: B950 - + "for usage guide", - warning_type=WarningType.AmbiguousArgument, - ) pool = [c for c in self.pool_all if c.cpu in cores_list] - nodes = list(set([c.node for c in pool])) - ncores_per_node = -1 - for n in nodes: - ncores_local = len([c for c in pool if c.node == n]) - if ncores_per_node == -1: - ncores_per_node = ncores_local - else: - if ncores_per_node != ncores_local and skip_cross_node_cores: - skip_cross_node_cores = False - self.verbose( - "warning", - "Argument --skip-cross-node-cores cannot take effect on the designated cores. Disabled.", - +"please see https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/launch_script.html#launch-script-usage-guide" # noqa: B950 - + "for usage guide", - warning_type=WarningType.WrongArgument, - ) - break else: if len(nodes_list) > 0: nodes_available = set([c.node for c in self.pool_all]) @@ -308,101 +275,201 @@ def gen_pools_ondemand( pool = [c for c in self.pool_all if c.node in nodes_list] else: pool = self.pool_all - if not use_logical_cores: - pool = [c for c in pool if c.is_physical_core] - if not use_e_cores: - pool = [c for c in pool if c.is_p_core] - e_cores = [c.cpu for c in pool if not c.is_p_core] - if len(e_cores) > 0: - self.verbose( - "info", - f"Efficient-Cores are detected ({e_cores}). Disabled for performance consideration. \ - You can enable them with argument --use-e-cores.", - ) + if not use_logical_cores: + pool = [c for c in pool if c.is_physical_core] + logical_cores = [c.cpu for c in pool if not c.is_physical_core] + if len(logical_cores) > 0: + self.verbose( + "info", + f"Logical cores are detected ({logical_cores}). Disabled for performance consideration. 
" + + "You can enable them with argument --use-logical-cores.", + ) + if not use_e_cores: + pool = [c for c in pool if c.is_p_core] + e_cores = [c.cpu for c in pool if not c.is_p_core] + if len(e_cores) > 0: + self.verbose( + "info", + f"Efficient-Cores are detected ({e_cores}). Disabled for performance consideration. " + + "You can enable them with argument --use-e-cores.", + ) # Determine ninstances and ncores_per_instance for grouping assert ( - ncores_per_instance >= 0 - ), "Argument --ncores-per-instance cannot be a negative value." + ncores_per_instance >= -1 + ), "Argument --ncores-per-instance cannot be a negative value other than -1." assert ninstances >= 0, "Argument --ninstances cannot be a negative value." - nodes = set([c.node for c in pool]) + pool.sort(key=lambda x: (x.core, 1 - int(x.is_physical_core))) + nodes = list(set([c.node for c in pool])) + is_greedy = False + if ncores_per_instance == -1: + is_greedy = True + ncores_per_instance = 0 if ncores_per_instance + ninstances == 0: # Both ncores_per_instance and ninstances are 0 ninstances = 1 - if ncores_per_instance * ninstances == 0: - # Either ncores_per_instance or ninstances is 0 - if skip_cross_node_cores: - ncores_per_node = len(pool) // len(nodes) - nresidual = 0 - if ncores_per_instance == 0: - nins_per_node = ninstances // len(nodes) - if ninstances % len(nodes) > 0: - nins_per_node += 1 - ncores_per_instance = ncores_per_node // nins_per_node - nresidual = ncores_per_node % nins_per_node - if ninstances == 0: - ninstances = ncores_per_node // ncores_per_instance * len(nodes) - nresidual = ncores_per_node % ncores_per_instance - if nresidual > 0: - cores_remove = [] - for n in nodes: - cores = [c for c in pool if c.node == n] - for i in range(nresidual): - cores_remove.append(cores[-1 * (i + 1)]) - for c in cores_remove: - pool.remove(c) + + rst = [] + if ncores_per_instance == 0: + pool_process = [] + ninstances_node = [] + if bind_numa_node: + for node in nodes: + pool_node = [c for c in pool if c.node == node] + pool_process.append(pool_node) + ninstances_local = (ninstances * len(pool_node)) // len(pool) + if (ninstances_local) == 0 or ( + (ninstances * len(pool_node)) % len(pool) > 0 + ): + ninstances_local += 1 + ninstances_node.append(ninstances_local) + for _ in range(int(sum(ninstances_node)) - ninstances): + ncores_per_instance_local = [] + for i in range(len(nodes)): + ncores_node = len([c for c in pool if c.node == nodes[i]]) + tmp = ncores_node / ninstances_node[i] + if ninstances_node[i] == 1: + tmp = len(pool) + ncores_per_instance_local.append(tmp) + ncores_per_instance_local_min = min(ncores_per_instance_local) + if ncores_per_instance_local_min == len(pool): + break + index = ncores_per_instance_local.index( + ncores_per_instance_local_min + ) + ninstances_node[index] -= 1 + delta = int(sum(ninstances_node)) - ninstances + if delta > 0: + ncores_per_instance_local = [] + for i in range(len(nodes)): + ncores_per_instance_local.append( + { + "index": i, + "count": len([c for c in pool if c.node == nodes[i]]), + } + ) + ncores_per_instance_local.sort( + key=lambda x: (x["count"], len(nodes) - x["index"]) + ) + for i in range(delta): + ninstances_node[ncores_per_instance_local[i]["index"]] -= 1 else: - if ninstances == 0: - ninstances = len(pool) // ncores_per_instance - if ncores_per_instance == 0: - ncores_per_instance = len(pool) // ninstances + pool_process.append(pool) + ninstances_node.append(ninstances) + for i in range(len(pool_process)): + p = pool_process[i] + n = ninstances_node[i] + if 
n == 0: + continue + tmp = [] + for j in range(n): + tmp.append({"ncores": len(p) // n, "pool": []}) + if is_greedy: + ncores_residual = len(p) % n + for j in range(ncores_residual): + tmp[j]["ncores"] += 1 + ncores_assigned = 0 + for j in range(len(tmp)): + tmp[j]["pool"] = p[ + ncores_assigned : ncores_assigned + tmp[j]["ncores"] + ] + ncores_assigned += tmp[j]["ncores"] + rst += tmp else: - # Neither ncores_per_instance nor ninstances is 0 - if skip_cross_node_cores: - self.verbose( - "warning", - "Argument --skip-cross-node-cores won't take effect when both --ninstances and" - + " --ncores-per-instance are explicitly set." - + "please see https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/launch_script.html#launch-script-usage-guide" # noqa: B950 - + "for usage guide", - warning_type=WarningType.AmbiguousArgument, + pool_process = [] + if bind_numa_node: + for node in nodes: + pool_process.append([c for c in pool if c.node == node]) + else: + pool_process.append(pool) + for i in range(len(pool_process)): + p = pool_process[i] + n = len(p) // ncores_per_instance + ncores_assigned = 0 + for _ in range(n): + item = {"ncores": 0, "node": nodes[i], "pool": []} + item["ncores"] = ncores_per_instance + item["pool"] = p[ + ncores_assigned : ncores_assigned + ncores_per_instance + ] + ncores_assigned += ncores_per_instance + rst.append(item) + if ninstances > 0: + assert ninstances <= len(rst), ( + f"Requested --ninstances ({ninstances}) and --ncores_per_instance ({ncores_per_instance}) " + + "combination is not supported. Please adjust either or both of these 2 parameters and try again." ) - assert ( - ninstances * ncores_per_instance > 0 - and ninstances * ncores_per_instance <= len(pool) - ), "Requested number of cores exceeds what is available." 
+ if ninstances < len(rst): + if strategy == "close": + rst = rst[:ninstances] + elif strategy == "scatter": + if len(pool_process) == 1: + step = len(rst) // ninstances + if len(rst) % ninstances > 0: + step += 1 + rst = rst[::step] + else: + rst_map = [] + ninstances_node_avai = [] + ninstances_node = [] + for node in nodes: + tmp = [r for r in rst if r["node"] == node] + rst_map.append(tmp) + ninstances_node_avai.append(len(tmp)) + ninstances_node.append(0) + index = 0 + for _ in range(ninstances): + while index < len(nodes): + index += 1 + if index == len(nodes): + index = 0 + if ninstances_node_avai[index - 1] > 0: + ninstances_node[index - 1] += 1 + ninstances_node_avai[index - 1] -= 1 + break + rst.clear() + for i in range(len(ninstances_node)): + rst += rst_map[i][: ninstances_node[i]] + else: + raise ValueError(f"Strategy {strategy} is not available.") # Split the aggregated pool into individual pools self.pools_ondemand.clear() - pool.sort(key=lambda x: (x.core, 1 - int(x.is_physical_core))) - for i in range(ninstances): + for item in rst: # Generate individual raw pool pool_local = CPUPool() - for j in range(ncores_per_instance): - pool_local.append(pool[i * ncores_per_instance + j]) + for c in item["pool"]: + pool_local.append(c) pool_local.sort(key=lambda x: x.cpu) self.pools_ondemand.append(pool_local) if __name__ == "__main__": lscpu_txt = """ +""" + try: + with open("example.txt", "r") as f: + lscpu_txt = f.read() + except Exception: + lscpu_txt = """ """ pools = CPUPoolList(lscpu_txt=lscpu_txt) pools.gen_pools_ondemand( use_logical_cores=False, return_mode="auto", - ninstances=3, - ncores_per_instance=0, + ninstances=10, + ncores_per_instance=-1, use_e_cores=True, - skip_cross_node_cores=False, + bind_numa_node=True, + strategy="scatter", ) print(f'capacity pool_auto: {pools.pool_all.get_pool_txt(return_mode="auto")}') - print(f'capacity pool_list: {pools.pool_all.get_pool_txt(return_mode="list")}') - print(f'capacity pool_range: {pools.pool_all.get_pool_txt(return_mode="range")}') + # print(f'capacity pool_list: {pools.pool_all.get_pool_txt(return_mode="list")}') + # print(f'capacity pool_range: {pools.pool_all.get_pool_txt(return_mode="range")}') print("") for i in range(len(pools.pools_ondemand)): p = pools.pools_ondemand[i] print(f'ondemand pool_auto: {i} {p.get_pool_txt(return_mode="auto")}') - print(f'ondemand pool_list: {i} {p.get_pool_txt(return_mode="list")}') - print(f'ondemand pool_range: {i} {p.get_pool_txt(return_mode="range")}') - print([c.cpu for c in p]) + # print(f'ondemand pool_list: {i} {p.get_pool_txt(return_mode="list")}') + # print(f'ondemand pool_range: {i} {p.get_pool_txt(return_mode="range")}') + # print([c.cpu for c in p]) diff --git a/intel_extension_for_pytorch/cpu/launch/launch.py b/intel_extension_for_pytorch/cpu/launch/launch.py index 580ecd5a2..7332ee380 100644 --- a/intel_extension_for_pytorch/cpu/launch/launch.py +++ b/intel_extension_for_pytorch/cpu/launch/launch.py @@ -122,198 +122,21 @@ def add_deprecated_params(parser): group = parser.add_argument_group("Deprecated Arguments") group.add_argument( - "--nproc_per_node", - metavar="\b", - type=int, - default=-1, - help="Deprecated by --nprocs-per-node.", - ) - group.add_argument( - "--more_mpi_params", - metavar="\b", - type=str, - default="", - help="Deprecated by --extra-mpi-params.", - ) - group.add_argument( - "--ncore_per_instance", - metavar="\b", - type=int, - default=-1, - help="Deprecated by --ncores-per-instance.", - ) - group.add_argument( - "--node_id", - metavar="\b", - 
type=int, - default=-1, - help="Deprecated by --nodes-list.", - ) - group.add_argument( - "--core_list", - metavar="\b", - type=str, - default="", - help="Deprecated by --cores-list.", - ) - group.add_argument( - "--logical_core_for_ccl", - action="store_true", - default=False, - help="Deprecated by --logical-cores-for-ccl.", - ) - group.add_argument( - "--enable_tcmalloc", + "--skip-cross-node-cores", + "--skip_cross_node_cores", action="store_true", default=False, - help="Deprecated by --memory-allocator.", - ) - group.add_argument( - "--enable_jemalloc", - action="store_true", - default=False, - help="Deprecated by --memory-allocator.", - ) - group.add_argument( - "--use_default_allocator", - action="store_true", - default=False, - help="Deprecated by --memory-allocator.", - ) - group.add_argument( - "--use_logical_core", - action="store_true", - default=False, - help="Deprecated by --use-logical-cores.", - ) - group.add_argument( - "--disable_numactl", - action="store_true", - default=False, - help="Deprecated by --multi-task-manager.", - ) - group.add_argument( - "--disable_taskset", - action="store_true", - default=False, - help="Deprecated by --multi-task-manager.", - ) - group.add_argument( - "--disable_iomp", - action="store_true", - default=False, - help="Deprecated by --omp-runtime.", - ) - group.add_argument( - "--log_path", type=str, default="", help="Deprecated by --log-dir." - ) - group.add_argument( - "--multi_instance", - action="store_true", - default=False, - help="Deprecated. Will be removed.", - ) - group.add_argument( - "--distributed", - action="store_true", - default=False, - help="Deprecated. Will be removed.", + help="Deprecated by --bind-numa-node.", ) def process_deprecated_params(args, logger): - if args.nproc_per_node != -1: - logger.warning( - "Argument --nproc_per_node is deprecated by --nprocs-per-node.", - _type=WarningType.DeprecatedArgument, - ) - args.nprocs_per_node = args.nproc_per_node - if args.more_mpi_params != "": - logger.warning( - "Argument --more_mpi_params is deprecated by --extra-mpi-params.", - _type=WarningType.DeprecatedArgument, - ) - args.extra_mpi_params = args.more_mpi_params - if args.ncore_per_instance != -1: - logger.warning( - "Argument --ncore_per_instance is deprecated by --ncores-per-instance.", - _type=WarningType.DeprecatedArgument, - ) - args.ncores_per_instance = args.ncore_per_instance - if args.node_id != -1: - logger.warning( - "Argument --node_id is deprecated by --nodes-list.", - _type=WarningType.DeprecatedArgument, - ) - args.nodes_list = str(args.node_id) - if args.core_list != "": - logger.warning( - "Argument --core_list is deprecated by --cores-list.", - _type=WarningType.DeprecatedArgument, - ) - args.cores_list = args.core_list - if args.logical_core_for_ccl: - logger.warning( - "Argument --logical_core_for_ccl is deprecated by --logical-cores-for-ccl.", - _type=WarningType.DeprecatedArgument, - ) - args.logical_cores_for_ccl = args.logical_core_for_ccl - if args.use_logical_core: - logger.warning( - "Argument --use_logical_core is deprecated by --use-logical-cores.", - _type=WarningType.DeprecatedArgument, - ) - args.use_logical_cores = args.use_logical_core - if args.log_path != "": - logger.warning( - "Argument --log_path is deprecated by --log-dir.", - _type=WarningType.DeprecatedArgument, - ) - args.log_dir = args.log_path - - if args.multi_instance: - logger.warning( - "Argument --multi_instance is deprecated. Will be removed." 
- + "If you are using the deprecated argument, please update it to the new one.", - _type=WarningType.DeprecatedArgument, - ) - if args.distributed: - logger.warning( - "Argument --distributed is deprecated. Will be removed." - + "If you are using the deprecated argument, please update it to the new one.", - _type=WarningType.DeprecatedArgument, - ) - - if args.enable_tcmalloc or args.enable_jemalloc or args.use_default_allocator: - logger.warning( - "Arguments --enable_tcmalloc, --enable_jemalloc and --use_default_allocator" - + "are deprecated by --memory-allocator tcmalloc/jemalloc/auto.", - _type=WarningType.DeprecatedArgument, - ) - if args.use_default_allocator: - args.memory_allocator = "default" - if args.enable_jemalloc: - args.memory_allocator = "jemalloc" - if args.enable_tcmalloc: - args.memory_allocator = "tcmalloc" - if args.disable_numactl: - logger.warning( - "Argument --disable_numactl is deprecated by --multi-task-manager taskset.", - _type=WarningType.DeprecatedArgument, - ) - args.multi_task_manager = "taskset" - if args.disable_taskset: - logger.warning( - "Argument --disable_taskset is deprecated by --multi-task-manager numactl.", - _type=WarningType.DeprecatedArgument, - ) - args.multi_task_manager = "numactl" - if args.disable_iomp: + if args.skip_cross_node_cores: logger.warning( - "Argument --disable_iomp is deprecated by --omp-runtime default.", + "Argument --skip-cross-node-cores is deprecated by --bind-numa-node.", _type=WarningType.DeprecatedArgument, ) - args.omp_runtime = "default" + args.bind_numa_node = args.skip_cross_node_cores class ArgumentTypesDefaultsHelpFormatter(argparse.HelpFormatter): diff --git a/intel_extension_for_pytorch/cpu/launch/launcher_base.py b/intel_extension_for_pytorch/cpu/launch/launcher_base.py index baaa4d1f7..8b8f24080 100644 --- a/intel_extension_for_pytorch/cpu/launch/launcher_base.py +++ b/intel_extension_for_pytorch/cpu/launch/launcher_base.py @@ -30,6 +30,7 @@ def __init__(self, logger=None, lscpu_txt=""): ) self.ma_supported = ["auto", "default", "tcmalloc", "jemalloc"] self.omp_supported = ["auto", "default", "intel"] + self.strategy_supported = ["scatter", "close"] self.environ_set = {} self.ld_preload = ( os.environ["LD_PRELOAD"].split(":") if "LD_PRELOAD" in os.environ else [] @@ -42,15 +43,19 @@ def add_common_params(self, parser): "--ncores_per_instance", default=0, type=int, - help="Number of cores used for computation per instance", + help="Number of cores used for computation per instance. It has to be an integer larger than -1. " + + "When set to 0, cores are evenly assigned to each instance. If number of cores cannot be divided " + + "by number of instances, residual cores are unused. When set to -1, cores are evenly assigned to " + + "each instance as much as possible to fully utilize all cores. When set to a number larger than 0, " + + "designated number of cores are assigned to each instance.", ) group.add_argument( "--nodes-list", "--nodes_list", default="", type=str, - help='Specify nodes list for multiple instances to run on, in format of list of single node ids \ - "node_id,node_id,..." or list of node ranges "node_id-node_id,...". By default all nodes will be used.', + help="Specify nodes list for multiple instances to run on, in format of list of single node ids " + + '"node_id,node_id,..." or list of node ranges "node_id-node_id,...". 
By default all nodes will be used.', ) group.add_argument( "--use-e-cores", @@ -59,6 +64,16 @@ def add_common_params(self, parser): default=False, help="Use Efficient-Cores on the workloads or not. By default, only Performance-Cores are used.", ) + group.add_argument( + "--strategy", + default="scatter", + type=str, + choices=self.strategy_supported, + help="Tell how cores are distributed over instances when only part of all cores are needed on a " + + f'machine with multiple NUMA nodes. Supported choices are {self.strategy_supported}. With "scatter", ' + + "instances are distributed evenly as much as possible over all available NUMA nodes. While with " + + '"close", instances are assigned to cores in order continuously.', + ) group.add_argument( "--memory-allocator", "--memory_allocator", @@ -117,7 +132,7 @@ def add_lib_preload(self, lib_type): break return lib_set or lib_found - def add_env(self, env_name, env_value): + def check_env(self, env_name, env_value): value = os.getenv(env_name, "") if value != "" and value != env_value: self.verbose( @@ -126,9 +141,12 @@ def add_env(self, env_name, env_value): + f" is {env_value}. Use the exsiting value. Please unset the {env_name} if you wish ipex launcher set it ", warning_type=WarningType.AmbiguousArgument, ) - self.environ_set[env_name] = os.environ[env_name] + return os.environ[env_name] else: - self.environ_set[env_name] = env_value + return env_value + + def add_env(self, env_name, env_value): + self.environ_set[env_name] = self.check_env(env_name, env_value) def set_lib_bin_from_list( self, @@ -183,7 +201,7 @@ def set_lib_bin_from_list( warning_type=WarningType.WrongArgument, ) if name_local == supported[0]: - self.verbose("info", "auto choosing bin...") + self.verbose("info", f"auto choosing {category}...") for name in supported[2:]: if name in skip_list: continue diff --git a/intel_extension_for_pytorch/cpu/launch/launcher_distributed.py b/intel_extension_for_pytorch/cpu/launch/launcher_distributed.py index 877ea16a7..6cded4502 100644 --- a/intel_extension_for_pytorch/cpu/launch/launcher_distributed.py +++ b/intel_extension_for_pytorch/cpu/launch/launcher_distributed.py @@ -150,9 +150,9 @@ def launch(self, args): if args.nnodes > 1: assert os.path.exists( args.hostfile - ), "A hostfile is required when you perform multi-node distributed training. \ - Please create the hostfile which includes ip addresses of nodes that you will use for \ - the distributed computation workload." + ), "A hostfile is required when you perform multi-node distributed training. " + +"Please create the hostfile which includes ip addresses of nodes that you will " + +"use for the distributed computation workload." ipv4_addr_pattern = r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" ip_list = [] with open(args.hostfile) as f: @@ -177,8 +177,8 @@ def launch(self, args): master_check = True assert ( master_check - ), "MASTER_ADDR is incorrect. Please make sure the first line ({ip_list[0]}) of the hostfile is the \ - ip address of the current node." + ), f"MASTER_ADDR is incorrect. Please make sure the first line ({ip_list[0]}) of the hostfile " + +"is the ip address of the current node." 
self.verbose("info", "Begin to validate SSH connections") args.master_addr = ip_list[0] @@ -218,16 +218,22 @@ def launch(self, args): use_logical_cores=True, use_e_cores=args.use_e_cores, nodes_list=nodes_list, + strategy=args.strategy, + bind_numa_node=args.bind_numa_node, ) self.set_memory_allocator(args.memory_allocator, False, ["jemalloc"]) self.set_omp_runtime(args.omp_runtime, True) - omp_num_threads = len( - [c for c in self.cpuinfo.pools_ondemand[0] if c.is_physical_core] - ) + ninstances = len(self.cpuinfo.pools_ondemand) + omp_num_threads = [] + for i in range(ninstances): + omp_num_threads.append( + len([c for c in self.cpuinfo.pools_ondemand[i] if c.is_physical_core]) + ) + omp_num_threads_value = min(omp_num_threads) if not args.logical_cores_for_ccl: - omp_num_threads -= args.ccl_worker_count - self.add_env("OMP_NUM_THREADS", str(omp_num_threads)) + omp_num_threads_value -= args.ccl_worker_count + self.add_env("OMP_NUM_THREADS", str(omp_num_threads_value)) # set distributed related environmental variables self.add_env("MASTER_ADDR", args.master_addr) diff --git a/intel_extension_for_pytorch/cpu/launch/launcher_multi_instances.py b/intel_extension_for_pytorch/cpu/launch/launcher_multi_instances.py index bfadbdc0f..c6ab7402d 100644 --- a/intel_extension_for_pytorch/cpu/launch/launcher_multi_instances.py +++ b/intel_extension_for_pytorch/cpu/launch/launcher_multi_instances.py @@ -29,8 +29,8 @@ def add_params(self, parser): "--instance_idx", default="", type=str, - help="Inside the multi instance list, execute a specific instance at indices. \ - If it is set to -1 or empty, run all of them.", + help="Inside the multi instance list, execute a specific instance at indices. " + + "If it is set to -1 or empty, run all of them.", ) group.add_argument( "--use-logical-cores", @@ -40,11 +40,11 @@ def add_params(self, parser): help="Use logical cores on the workloads or not. By default, only physical cores are used.", ) group.add_argument( - "--skip-cross-node-cores", - "--skip_cross_node_cores", + "--bind-numa-node", + "--bind_numa_node", action="store_true", default=False, - help="Allow instances to be executed on cores across NUMA nodes.", + help="Bind instances to be executed on cores on a single NUMA node.", ) group.add_argument( "--multi-task-manager", @@ -52,8 +52,7 @@ def add_params(self, parser): default="auto", type=str, choices=self.tm_supported, - help=f"Choose which multi task manager to run the workloads with. Supported choices are \ - {self.tm_supported}.", + help="Choose which multi task manager to run the workloads with. Supported choices are {self.tm_supported}.", ) group.add_argument( "--latency-mode", @@ -74,17 +73,17 @@ def add_params(self, parser): "--cores_list", default="", type=str, - help='Specify cores list for multiple instances to run on, in format of list of single core ids \ - "core_id,core_id,..." or list of core ranges "core_id-core_id,...". \ - By default all cores will be used.', + help="Specify cores list for multiple instances to run on, in format of list of single core ids " + + '"core_id,core_id,..." or list of core ranges "core_id-core_id,...". ' + + "By default all cores will be used.", ) group.add_argument( "--benchmark", action="store_true", default=False, - help="Enable benchmark config. JeMalloc's MALLOC_CONF has been tuned for low latency. \ - Recommend to use this for benchmarking purpose; for other use cases, \ - this MALLOC_CONF may cause Out-of-Memory crash.", + help="Enable benchmark config. 
JeMalloc's MALLOC_CONF has been tuned for low latency. " + + "Recommend to use this for benchmarking purpose; for other use cases, " + + "this MALLOC_CONF may cause Out-of-Memory crash.", ) def is_command_available(self, cmd): @@ -135,6 +134,7 @@ def execution_command_builder( pool_txt = pool.get_pool_txt() cores_list_local = pool_txt["cores"] nodes_list_local = pool_txt["nodes"] + self.verbose("info", f"========== instance {index} ==========") if task_mgr != self.tm_supported[1]: params = "" if task_mgr == "numactl": @@ -154,9 +154,11 @@ def execution_command_builder( k = "KMP_AFFINITY" v = f"granularity=fine,proclist=[{cores_list_local}],explicit" if k != "": - self.verbose("info", "==========") self.verbose("info", f"env: {k}={v}") environ_local[k] = v + omp_num_threads = self.check_env("OMP_NUM_THREADS", len(pool)) + environ_local["OMP_NUM_THREADS"] = str(omp_num_threads) + self.verbose("info", f"env: OMP_NUM_THREADS={omp_num_threads}") if not args.no_python: cmd.append(sys.executable) @@ -225,12 +227,12 @@ def launch(self, args): ncores_per_instance=args.ncores_per_instance, use_logical_cores=args.use_logical_cores, use_e_cores=args.use_e_cores, - skip_cross_node_cores=args.skip_cross_node_cores, + bind_numa_node=args.bind_numa_node, nodes_list=nodes_list, cores_list=cores_list, + strategy=args.strategy, ) args.ninstances = len(self.cpuinfo.pools_ondemand) - args.ncores_per_instance = len(self.cpuinfo.pools_ondemand[0]) is_iomp_set = False for item in self.ld_preload: @@ -239,7 +241,7 @@ def launch(self, args): break is_kmp_affinity_set = True if "KMP_AFFINITY" in os.environ else False set_kmp_affinity = True - # When using all cores on all nodes, including logical cores, setting KMP_AFFINITY disables logical cores. \ + # When using all cores on all nodes, including logical cores, setting KMP_AFFINITY disables logical cores. # Thus, KMP_AFFINITY should not be set. 
if args.use_logical_cores and len( set([c for p in self.cpuinfo.pools_ondemand for c in p]) @@ -251,7 +253,6 @@ def launch(self, args): self.set_memory_allocator(args.memory_allocator, args.benchmark) omp_runtime = self.set_omp_runtime(args.omp_runtime, set_kmp_affinity) - self.add_env("OMP_NUM_THREADS", str(args.ncores_per_instance)) skip_list = [] if is_iomp_set and is_kmp_affinity_set: diff --git a/tests/cpu/bench/custom_op_bench/README.md b/tests/cpu/bench/custom_op_bench/README.md index a898b8d88..271f9c8a7 100644 --- a/tests/cpu/bench/custom_op_bench/README.md +++ b/tests/cpu/bench/custom_op_bench/README.md @@ -15,32 +15,32 @@ conda install intel-openmp ``` export OMP_NUM_THREADS=1 export CORES=`lscpu | grep Core | awk '{print $4}'` -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 interaction.py --num-instance=$CORES --inference # for fp32 -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 interaction.py --num-instance=$CORES --inference --bf16 # for bf16 +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 interaction.py --num-instance=$CORES --inference # for fp32 +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 interaction.py --num-instance=$CORES --inference --bf16 # for bf16 ``` unset OMP_NUM_THREADS 2.Training: 1 instance on 1 socket in real world scenario ``` -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 interaction.py # for fp32 -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 interaction.py --bf16 # for bf16 +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 interaction.py # for fp32 +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 interaction.py --bf16 # for bf16 ``` ## Evaluate IPEX fused optimizer ``` -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 optimizer.py --optimizer sgd # for sgd -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 optimizer.py --optimizer lamb # for lamb -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 optimizer.py --optimizer adagrad # for adagrad -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 optimizer.py --optimizer adam # for adam +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 optimizer.py --optimizer sgd # for sgd +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 optimizer.py --optimizer lamb # for lamb +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 optimizer.py --optimizer adagrad # for adagrad +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 optimizer.py --optimizer adam # for adam ``` ## Evaluate IPEX [MergedEmbeddingBag](../../../../intel_extension_for_pytorch/nn/module/merged_embeddingbag.py) ``` export CORES=`lscpu | grep Core | awk '{print $4}'` export BATCHSIZE=$((128*CORES)) -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 merged_embeddingbag.py --inference --batch-size=${BATCHSIZE} -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 merged_embeddingbag.py --inference --with-cat --batch-size=${BATCHSIZE} +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 merged_embeddingbag.py --inference --batch-size=${BATCHSIZE} +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 merged_embeddingbag.py --inference --with-cat --batch-size=${BATCHSIZE} -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 merged_embeddingbag.py --batch-size=${BATCHSIZE} --optimizer=sgd -python -m intel_extension_for_pytorch.cpu.launch --node_id 0 merged_embeddingbag.py --batch-size=${BATCHSIZE} --optimizer=adagrad 
+python -m intel_extension_for_pytorch.cpu.launch --node-id 0 merged_embeddingbag.py --batch-size=${BATCHSIZE} --optimizer=sgd +python -m intel_extension_for_pytorch.cpu.launch --node-id 0 merged_embeddingbag.py --batch-size=${BATCHSIZE} --optimizer=adagrad ``` diff --git a/tests/cpu/run_distributed_test.sh b/tests/cpu/run_distributed_test.sh index c1a33814d..988f96c81 100644 --- a/tests/cpu/run_distributed_test.sh +++ b/tests/cpu/run_distributed_test.sh @@ -5,5 +5,5 @@ DISTRIBUTED_EMB=${DIR}/test_distributed_merged_emb.py for w_size in 2 4 8 16 do export W_SIZE=$w_size - python -m intel_extension_for_pytorch.cpu.launch --ccl_worker_count=1 --nproc_per_node=$W_SIZE --distributed --nnodes 1 $DISTRIBUTED_EMB + python -m intel_extension_for_pytorch.cpu.launch --ccl-worker-count=1 --nprocs-per-node=$W_SIZE --distributed --nnodes 1 $DISTRIBUTED_EMB done diff --git a/tests/cpu/test_ipex_optimize.py b/tests/cpu/test_ipex_optimize.py index fc2b2b02e..3c3cf7cea 100644 --- a/tests/cpu/test_ipex_optimize.py +++ b/tests/cpu/test_ipex_optimize.py @@ -929,8 +929,8 @@ def get_loss(line): loc = os.path.dirname(os.path.abspath(__file__)) loss = -1 with subprocess.Popen( - "python -m intel_extension_for_pytorch.cpu.launch --ccl_worker_count=1" - + f" --nproc_per_node=2 --distributed --nnodes 1 {loc}/ipex-optimize-ddp-static-graph.py --get-state-dict", + "python -m intel_extension_for_pytorch.cpu.launch --ccl-worker-count=1" + + f" --nprocs-per-node=2 --nnodes 1 {loc}/ipex-optimize-ddp-static-graph.py --get-state-dict", shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, @@ -944,8 +944,8 @@ def get_loss(line): num = 0 with subprocess.Popen( - "python -m intel_extension_for_pytorch.cpu.launch --ccl_worker_count=1" - + f" --nproc_per_node=2 --distributed --nnodes 1 {loc}/ipex-optimize-ddp-static-graph.py", + "python -m intel_extension_for_pytorch.cpu.launch --ccl-worker-count=1" + + f" --nprocs-per-node=2 --nnodes 1 {loc}/ipex-optimize-ddp-static-graph.py", shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, diff --git a/tests/cpu/test_launcher.py b/tests/cpu/test_launcher.py index b736e11d8..0d1485fec 100644 --- a/tests/cpu/test_launcher.py +++ b/tests/cpu/test_launcher.py @@ -64,6 +64,7 @@ def test_memory_allocator_setup(self): ":".join(launcher.ld_preload) if len(launcher.ld_preload) > 0 else "" ) jemalloc_enabled = "libjemalloc.so" in ld_preload + self.assertEqual(find_jemalloc, jemalloc_enabled) if jemalloc_enabled: self.assertTrue("MALLOC_CONF" in launcher.environ_set) @@ -172,6 +173,7 @@ def test_mpi_pin_domain_and_ccl_worker_affinity(self): ninstances=nprocs_per_node, ncores_per_instance=(8 + ccl_worker_count) * nprocs_per_node, use_logical_cores=True, + bind_numa_node=True, ) pin_domain_affinity = launcher.get_pin_domain_affinity( launcher.cpuinfo.pools_ondemand, ccl_worker_count @@ -210,7 +212,6 @@ def test_launcher_scripts(self): def verify_affinity(self, pools, ground_truth): self.assertEqual(len(pools), ground_truth["ninstances"]) - self.assertEqual(len(pools[0]), ground_truth["ncores_per_instance"]) self.assertEqual( len(set([c.cpu for p in pools for c in p])), ground_truth["num_cores_sum"] ) @@ -238,7 +239,6 @@ def test_core_affinity(self): cpuinfo = CPUPoolList(lscpu_txt=lscpu_txt) ground_truth = { "ninstances": 1, - "ncores_per_instance": 112, "num_cores_sum": 112, "num_nodes_sum": 2, "num_cores": [112], @@ -251,7 +251,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=2) ground_truth = { "ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 
56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -264,7 +263,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=4) ground_truth = { "ninstances": 4, - "ncores_per_instance": 14, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [14, 14, 14, 14], @@ -277,7 +275,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ncores_per_instance=28) ground_truth = { "ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -290,7 +287,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ncores_per_instance=14) ground_truth = { "ninstances": 4, - "ncores_per_instance": 14, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [14, 14, 14, 14], @@ -303,10 +299,9 @@ def test_core_affinity(self): cores_list_local = [] cores_list_local.extend(list(i for i in range(14, 28))) cores_list_local.extend(list(i for i in range(42, 56))) - cpuinfo.gen_pools_ondemand(cores_list=cores_list_local) + cpuinfo.gen_pools_ondemand(cores_list=cores_list_local, use_logical_cores=True) ground_truth = { "ninstances": 1, - "ncores_per_instance": 28, "num_cores_sum": 28, "num_nodes_sum": 1, "num_cores": [28], @@ -324,7 +319,6 @@ def test_core_affinity(self): cpuinfo = CPUPoolList(lscpu_txt=lscpu_txt) ground_truth = { "ninstances": 1, - "ncores_per_instance": 112, "num_cores_sum": 112, "num_nodes_sum": 4, "num_cores": [112], @@ -337,7 +331,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(nodes_list=[1, 2]) ground_truth = { "ninstances": 1, - "ncores_per_instance": 28, "num_cores_sum": 28, "num_nodes_sum": 2, "num_cores": [28], @@ -356,7 +349,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=2) ground_truth = { "ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -376,7 +368,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=1) ground_truth = { "ninstances": 1, - "ncores_per_instance": 56, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [56], @@ -389,7 +380,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=2) ground_truth = { "ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -402,7 +392,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=4) ground_truth = { "ninstances": 4, - "ncores_per_instance": 14, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [14, 14, 14, 14], @@ -415,7 +404,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ncores_per_instance=28) ground_truth = { "ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -428,7 +416,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ncores_per_instance=14) ground_truth = { "ninstances": 4, - "ncores_per_instance": 14, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [14, 14, 14, 14], @@ -444,7 +431,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=2, cores_list=cores_list_local) ground_truth = { "ninstances": 2, - "ncores_per_instance": 14, "num_cores_sum": 28, "num_nodes_sum": 2, "num_cores": [14, 14], @@ -463,7 +449,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=2, nodes_list=[1, 2]) ground_truth = { "ninstances": 2, - "ncores_per_instance": 14, "num_cores_sum": 28, "num_nodes_sum": 2, "num_cores": [14, 14], @@ -482,7 +467,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=2) ground_truth = { 
"ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -502,7 +486,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=2) ground_truth = { "ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -518,7 +501,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=4) ground_truth = { "ninstances": 4, - "ncores_per_instance": 14, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [14, 14, 14, 14], @@ -536,7 +518,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ncores_per_instance=28) ground_truth = { "ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -552,7 +533,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ncores_per_instance=14) ground_truth = { "ninstances": 4, - "ncores_per_instance": 14, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [14, 14, 14, 14], @@ -570,7 +550,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=3) ground_truth = { "ninstances": 3, - "ncores_per_instance": 18, "num_cores_sum": 54, "num_nodes_sum": 2, "num_cores": [18, 18, 18], @@ -587,10 +566,11 @@ def test_core_affinity(self): cores_list_local = [] cores_list_local.extend(list(i for i in range(14, 28))) cores_list_local.extend(list(i for i in range(98, 112))) - cpuinfo.gen_pools_ondemand(ninstances=2, cores_list=cores_list_local) + cpuinfo.gen_pools_ondemand( + ninstances=2, cores_list=cores_list_local, use_logical_cores=True + ) ground_truth = { "ninstances": 2, - "ncores_per_instance": 14, "num_cores_sum": 28, "num_nodes_sum": 2, "num_cores": [14, 14], @@ -609,7 +589,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(nodes_list=[1, 2]) ground_truth = { "ninstances": 1, - "ncores_per_instance": 28, "num_cores_sum": 28, "num_nodes_sum": 2, "num_cores": [28], @@ -630,7 +609,6 @@ def test_core_affinity(self): cpuinfo.gen_pools_ondemand(ninstances=2) ground_truth = { "ninstances": 2, - "ncores_per_instance": 28, "num_cores_sum": 56, "num_nodes_sum": 2, "num_cores": [28, 28], @@ -653,7 +631,6 @@ def test_core_affinity_with_logical_cores(self): cpuinfo.gen_pools_ondemand(ninstances=2, use_logical_cores=True) ground_truth = { "ninstances": 2, - "ncores_per_instance": 56, "num_cores_sum": 112, "num_nodes_sum": 2, "num_cores": [56, 56], @@ -663,27 +640,26 @@ def test_core_affinity_with_logical_cores(self): } self.verify_affinity(cpuinfo.pools_ondemand, ground_truth) - def test_core_affinity_with_skip_cross_node_cores(self): + def test_core_affinity_with_bind_numa_node(self): num_nodes = 2 n_phycores_per_node = 28 lscpu_txt = construct_numa_config( num_nodes, n_phycores_per_node, enable_ht=True, numa_mode=1 ) cpuinfo = CPUPoolList(lscpu_txt=lscpu_txt) - cpuinfo.gen_pools_ondemand(ninstances=3, skip_cross_node_cores=True) + cpuinfo.gen_pools_ondemand(ninstances=3, bind_numa_node=True) ground_truth = { "ninstances": 3, - "ncores_per_instance": 14, - "num_cores_sum": 42, + "num_cores_sum": 56, "num_nodes_sum": 2, - "num_cores": [14, 14, 14], + "num_cores": [28, 14, 14], "num_nodes": [1, 1, 1], - "pools_cores": ["0-13", "14-27", "28-41"], - "pools_nodes": ["0", "0", "1"], + "pools_cores": ["0-27", "28-41", "42-55"], + "pools_nodes": ["0", "1", "1"], } self.verify_affinity(cpuinfo.pools_ondemand, ground_truth) - def test_core_affinity_with_skip_cross_node_cores_and_use_logical_core(self): + def 
test_core_affinity_with_bind_numa_node_and_use_logical_core(self): num_nodes = 2 n_phycores_per_node = 28 lscpu_txt = construct_numa_config( @@ -691,29 +667,28 @@ def test_core_affinity_with_skip_cross_node_cores_and_use_logical_core(self): ) cpuinfo = CPUPoolList(lscpu_txt=lscpu_txt) cpuinfo.gen_pools_ondemand( - ninstances=7, use_logical_cores=True, skip_cross_node_cores=True + ninstances=7, use_logical_cores=True, bind_numa_node=True ) ground_truth = { "ninstances": 7, - "ncores_per_instance": 14, - "num_cores_sum": 98, + "num_cores_sum": 110, "num_nodes_sum": 2, - "num_cores": [14, 14, 14, 14, 14, 14, 14], + "num_cores": [18, 18, 18, 14, 14, 14, 14], "num_nodes": [1, 1, 1, 1, 1, 1, 1], "pools_cores": [ - "0-6,56-62", - "7-13,63-69", - "14-20,70-76", - "21-27,77-83", + "0-8,56-64", + "9-17,65-73", + "18-26,74-82", "28-34,84-90", "35-41,91-97", "42-48,98-104", + "49-55,105-111", ], - "pools_nodes": ["0", "0", "0", "0", "1", "1", "1"], + "pools_nodes": ["0", "0", "0", "1", "1", "1", "1"], } self.verify_affinity(cpuinfo.pools_ondemand, ground_truth) - def test_core_affinity_with_skip_cross_node_cores_and_node_id_use_logical_core( + def test_core_affinity_with_bind_numa_node_and_node_id_use_logical_core( self, ): num_nodes = 4 @@ -726,17 +701,16 @@ def test_core_affinity_with_skip_cross_node_cores_and_node_id_use_logical_core( ninstances=3, nodes_list=[1, 2], use_logical_cores=True, - skip_cross_node_cores=True, + bind_numa_node=True, ) ground_truth = { "ninstances": 3, - "ncores_per_instance": 14, - "num_cores_sum": 42, + "num_cores_sum": 56, "num_nodes_sum": 2, - "num_cores": [14, 14, 14], + "num_cores": [28, 14, 14], "num_nodes": [1, 1, 1], - "pools_cores": ["14-20,70-76", "21-27,77-83", "28-34,84-90"], - "pools_nodes": ["1", "1", "2"], + "pools_cores": ["14-27,70-83", "28-34,84-90", "35-41,91-97"], + "pools_nodes": ["1", "2", "2"], } self.verify_affinity(cpuinfo.pools_ondemand, ground_truth) From eda7a7c42df6f9a64e0de9c2b69304ee02f2c32a Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Mon, 17 Jun 2024 14:17:43 +0800 Subject: [PATCH 119/199] fix online doc search issue (#2979) Co-authored-by: ZhangJianyu --- docs/conf.py | 2 ++ docs/requirements.txt | 1 + scripts/build_doc.sh | 10 ++++++++++ 3 files changed, 13 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 6b76704d2..6d8b0b48f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -38,6 +38,8 @@ 'recommonmark', 'sphinx_markdown_tables', 'sphinx_md', + 'sphinx_rtd_theme', + 'sphinxcontrib.jquery', 'sphinxemoji.sphinxemoji' ] diff --git a/docs/requirements.txt b/docs/requirements.txt index e4e0e734f..8aa1b5cb2 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,7 @@ wheel sphinx sphinx_rtd_theme +sphinxcontrib.jquery sphinx-markdown-tables sphinx_md sphinxemoji diff --git a/scripts/build_doc.sh b/scripts/build_doc.sh index 8bc92cdba..35131c934 100644 --- a/scripts/build_doc.sh +++ b/scripts/build_doc.sh @@ -266,3 +266,13 @@ elif [[ ${DEVICE} == "gpu" ]]; then rm -rf xml mv tutorials/features/advanced_configuration.md.bk tutorials/features/advanced_configuration.md fi + +LN=$(grep "searchtools.js" -n _build/html/search.html | cut -d ":" -f 1) +sed -i "${LN}i \ \ \ \ " _build/html/search.html +sed -i "${LN}i \ \ \ \ " _build/html/search.html +sed -i "${LN}i \ \ \ \ " _build/html/search.html +sed -i "${LN}i \ \ \ \ " _build/html/search.html +sed -i "${LN}i \ \ \ \ " _build/html/search.html +sed -i "${LN}i \ \ \ \ " _build/html/search.html +sed -i "${LN}i \ \ \ \ <\!\-\-[if lt IE 9]><\![endif]\-\->" 
_build/html/search.html + From fc25949f6ecd18ec0586be0ad8f4fb8aa2b23fb7 Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 18 Jun 2024 12:41:03 +0800 Subject: [PATCH 120/199] Remove overhead of ChatGLM int8 (#2988) * apply ROPE for concat output and remove the split overheads * remove to before and after rmsnorm --- .../kernels/RotaryPositionEmbeddingKnl.cpp | 2 +- csrc/cpu/jit/passes/graph_rewrite.cpp | 19 ++- .../models/reference/modules/attentions.py | 109 ++++++++---------- 3 files changed, 69 insertions(+), 61 deletions(-) diff --git a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp index 0894ef23e..4112dd022 100644 --- a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp +++ b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp @@ -65,7 +65,7 @@ std::tuple ApplyROPEKernel( if (is_fused_qkv(t_in, N * H)) { TORCH_CHECK( - in_stride_s == HS, + t_in.dim() == 3, "The shape of input tensor of rotary_position_embedding should be in (batch, seq_len, qkv_hidden_size) when using fused qkv)"); N_KV = (HS - N * H) / (2 * H); } diff --git a/csrc/cpu/jit/passes/graph_rewrite.cpp b/csrc/cpu/jit/passes/graph_rewrite.cpp index e309877af..e5a4cccc8 100644 --- a/csrc/cpu/jit/passes/graph_rewrite.cpp +++ b/csrc/cpu/jit/passes/graph_rewrite.cpp @@ -209,8 +209,25 @@ void FuseRMSNorm(std::shared_ptr& graph) { graph(%hidden_states, %weight, %exponent:int, %dim:int[], %keepdim:bool, %dtype:NoneType, %eps:float, %alpha:int): %r = ipex::RMSNorm(%hidden_states, %weight, %eps) return (%r) )"; - SubgraphRewriter rewriter_aten; + std::string aten_RMSNorm_v2 = R"( + graph(%hidden_states, %weight, %exponent:int, %dim:int[], %keepdim:bool, %dtype:NoneType, %eps:float, %alpha:int, %idx, %idx2, %no): + %h2 = aten::to(%hidden_states, %idx, %no, %no, %dtype) + %s = aten::pow(%h2, %exponent) + %v = aten::mean(%s, %dim, %keepdim, %dtype) + %m = aten::add(%v, %eps, %alpha) + %n = aten::rsqrt(%m) + %l = aten::mul(%h2, %n) + %r1 = aten::mul(%weight, %l) + %r = aten::to(%r1, %idx2, %no, %no, %dtype) + return (%r) )"; + std::string fused_RMSNorm_v2 = R"( + graph(%hidden_states, %weight, %exponent:int, %dim:int[], %keepdim:bool, %dtype:NoneType, %eps:float, %alpha:int, %idx, %idx2, %no): + %r = ipex::RMSNorm(%hidden_states, %weight, %eps) + return (%r) )"; + SubgraphRewriter rewriter_aten, rewriter_aten_v2; + rewriter_aten_v2.RegisterRewritePattern(aten_RMSNorm_v2, fused_RMSNorm_v2); rewriter_aten.RegisterRewritePattern(aten_RMSNorm, fused_RMSNorm); + rewriter_aten_v2.runOnGraph(graph); rewriter_aten.runOnGraph(graph); } diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 85e0f47ae..38a053443 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -825,72 +825,63 @@ def _GLM2Attention_forward( # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer = self.query_key_value(hidden_states) mixed_x_layer = mixed_x_layer.transpose(0, 1) - - if self.multi_query_attention: - (query_layer, key_layer, value_layer) = mixed_x_layer.split( - [ - self.num_attention_heads_per_partition - * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition - * self.hidden_size_per_attention_head, - self.num_multi_query_groups_per_partition - * self.hidden_size_per_attention_head, - ], - dim=-1, - ) - 
query_layer = query_layer.view( - query_layer.size()[:-1] - + ( - self.num_attention_heads_per_partition, - self.hidden_size_per_attention_head, - ) - ) - key_layer = key_layer.view( - key_layer.size()[:-1] - + ( - self.num_multi_query_groups_per_partition, - self.hidden_size_per_attention_head, - ) - ) - value_layer = value_layer.view( - value_layer.size()[:-1] - + ( - self.num_multi_query_groups_per_partition, - self.hidden_size_per_attention_head, - ) - ) - else: - new_tensor_shape = mixed_x_layer.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) - - # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] - (query_layer, key_layer, value_layer) = self.split_tensor_along_last_dim( - mixed_x_layer, 3 - ) past_len = kv_cache[0].shape[-2] if kv_cache is not None else 0 # apply relative positional encoding (rotary embedding) if rotary_pos_emb is not None: - query_layer = query_layer.contiguous() - key_layer = key_layer.contiguous() - key_layer = self._IPEXROPE( - key_layer, - torch.tensor(past_len), - key_layer.size(-2), - key_layer.size(-1), - 1, - 64, - ) - query_layer = self._IPEXROPE( - query_layer, + query_layer, key_layer, value_layer = self._IPEXROPE( + mixed_x_layer, torch.tensor(past_len), - query_layer.size(-2), - query_layer.size(-1), + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, 1, 64, + num_concats=3, ) + else: + if self.multi_query_attention: + (query_layer, key_layer, value_layer) = mixed_x_layer.split( + [ + self.num_attention_heads_per_partition + * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition + * self.hidden_size_per_attention_head, + self.num_multi_query_groups_per_partition + * self.hidden_size_per_attention_head, + ], + dim=-1, + ) + query_layer = query_layer.view( + query_layer.size()[:-1] + + ( + self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head, + ) + ) + key_layer = key_layer.view( + key_layer.size()[:-1] + + ( + self.num_multi_query_groups_per_partition, + self.hidden_size_per_attention_head, + ) + ) + value_layer = value_layer.view( + value_layer.size()[:-1] + + ( + self.num_multi_query_groups_per_partition, + self.hidden_size_per_attention_head, + ) + ) + else: + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + (query_layer, key_layer, value_layer) = self.split_tensor_along_last_dim( + mixed_x_layer, 3 + ) if attention_mask is None: attention_mask = torch.ones( From 279505378e72274a3e43e6207e5ff24720a1c107 Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Tue, 18 Jun 2024 16:35:28 +0800 Subject: [PATCH 121/199] Enable optimized Qwen2 model (#2977) --- .../llm/single_instance/run_quantization.py | 6 +- .../python/llm/utils/model_class/qwen2.py | 13 ++ .../llm/functional/fusions.py | 4 +- .../transformers/generation/beam_sample.py | 1 + .../transformers/generation/beam_search.py | 1 + .../transformers/generation/greedy_search.py | 1 + .../transformers/generation/sample.py | 1 + .../models/cpu/modules/attentions.py | 1 + .../models/cpu/modules/decoder.py | 1 + .../models/reference/fusions/linear_fusion.py | 4 +- .../transformers/models/reference/models.py | 212 ++++++++++++++++++ .../models/reference/modules/attentions.py | 115 ++++++++++ .../models/reference/modules/decoder.py | 64 
++++++ .../transformers/optimize.py | 29 +++ tests/cpu/hf_configs/qwen2/config.json | 27 +++ ...test_ipex_optimize_transformers_nightly.py | 7 + 16 files changed, 482 insertions(+), 5 deletions(-) create mode 100644 examples/cpu/inference/python/llm/utils/model_class/qwen2.py create mode 100644 tests/cpu/hf_configs/qwen2/config.json diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 31f185b94..5938dab6f 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -33,6 +33,7 @@ from llm.utils.model_class.mpt import MPTConfig from llm.utils.model_class.stablelm import StableLMConfig from llm.utils.model_class.qwen import QwenConfig +from llm.utils.model_class.qwen2 import Qwen2Config from llm.utils.model_class.git import GitConfig from llm.utils.model_class.llava import LlavaConfig from llm.utils.model_class.phi import PhiConfig @@ -317,7 +318,10 @@ elif re.search("stablelm", config.architectures[0], re.IGNORECASE): model = StableLMConfig(args.model_id) elif re.search("qwen", config.architectures[0], re.IGNORECASE): - model = QwenConfig(args.model_id) + if re.search("qwen2", config.architectures[0], re.IGNORECASE): + model = Qwen2Config(args.model_id) + else: + model = QwenConfig(args.model_id) elif re.search("git", config.architectures[0], re.IGNORECASE): from PIL import Image import requests diff --git a/examples/cpu/inference/python/llm/utils/model_class/qwen2.py b/examples/cpu/inference/python/llm/utils/model_class/qwen2.py new file mode 100644 index 000000000..b90adf980 --- /dev/null +++ b/examples/cpu/inference/python/llm/utils/model_class/qwen2.py @@ -0,0 +1,13 @@ +from .llm import LLMConfig, EXAMPLE_INPUTS_MODE + + +class Qwen2Config(LLMConfig): + def __init__(self, model_id): + self.name = "qwen" + self.model_id = model_id + self.to_channels_last = False + self.example_inputs_mode = EXAMPLE_INPUTS_MODE.MASK_POS_KV + + # for smooth quant + self.use_global_past_key_value = True + self.use_ipex_autotune = True diff --git a/intel_extension_for_pytorch/llm/functional/fusions.py b/intel_extension_for_pytorch/llm/functional/fusions.py index 12aea9a7f..bb5bceee6 100644 --- a/intel_extension_for_pytorch/llm/functional/fusions.py +++ b/intel_extension_for_pytorch/llm/functional/fusions.py @@ -250,7 +250,7 @@ def varlen_attention( def silu_mul(x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None): r""" - Applies PyTorch silu on input x, and them mul input y: + Applies PyTorch silu on input x, and mul input y: out = silu(x)*y Args: @@ -267,7 +267,7 @@ def gelu_mul( x: torch.Tensor, y: torch.Tensor, out: torch.Tensor = None, approximate="none" ): r""" - Applies PyTorch gelu on input x, and them mul input y: + Applies PyTorch gelu on input x, and mul input y: out = gelu(x)*y Args: diff --git a/intel_extension_for_pytorch/transformers/generation/beam_sample.py b/intel_extension_for_pytorch/transformers/generation/beam_sample.py index 713294dd9..75e502d72 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_sample.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_sample.py @@ -192,6 +192,7 @@ def _beam_sample( "PhiForCausalLM", "Phi3ForCausalLM", "WhisperForConditionalGeneration", + "Qwen2ForCausalLM", ]: first_token = False if model_inputs["past_key_values"] is None: diff --git a/intel_extension_for_pytorch/transformers/generation/beam_search.py 
b/intel_extension_for_pytorch/transformers/generation/beam_search.py index e36e329a0..e3f8db713 100644 --- a/intel_extension_for_pytorch/transformers/generation/beam_search.py +++ b/intel_extension_for_pytorch/transformers/generation/beam_search.py @@ -194,6 +194,7 @@ def _beam_search( "PhiForCausalLM", "Phi3ForCausalLM", "WhisperForConditionalGeneration", + "Qwen2ForCausalLM", ]: first_token = False has_position_id = model_inputs.get("position_ids", None) is not None diff --git a/intel_extension_for_pytorch/transformers/generation/greedy_search.py b/intel_extension_for_pytorch/transformers/generation/greedy_search.py index b874668bb..08e4926e0 100644 --- a/intel_extension_for_pytorch/transformers/generation/greedy_search.py +++ b/intel_extension_for_pytorch/transformers/generation/greedy_search.py @@ -175,6 +175,7 @@ def _greedy_search( "PhiForCausalLM", "Phi3ForCausalLM", "WhisperForConditionalGeneration", + "Qwen2ForCausalLM", ]: first_token = False input_bs = input_ids.size()[0] diff --git a/intel_extension_for_pytorch/transformers/generation/sample.py b/intel_extension_for_pytorch/transformers/generation/sample.py index 22f6bc5d0..6d1591082 100644 --- a/intel_extension_for_pytorch/transformers/generation/sample.py +++ b/intel_extension_for_pytorch/transformers/generation/sample.py @@ -181,6 +181,7 @@ def _sample( "PhiForCausalLM", "Phi3ForCausalLM", "WhisperForConditionalGeneration", + "Qwen2ForCausalLM", ]: first_token = False input_bs = input_ids.size()[0] diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py index cb45433b9..bf770255c 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/attentions.py @@ -66,6 +66,7 @@ def __init__(self, module, config, tpp=False, woq=False): "MistralForCausalLM", "MixtralForCausalLM", "PhiForCausalLM", + "Qwen2ForCausalLM", ]: if hasattr(module, "concat_qkv"): self.concat_qkv = _IPEXConcatLinearCPU( diff --git a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py index 5ed1a85d2..a08de54d6 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/modules/decoder.py @@ -32,6 +32,7 @@ def __init__(self, module, config, tpp=False, woq=False): "BaichuanForCausalLM", "MistralForCausalLM", "QWenLMHeadModel", + "Qwen2ForCausalLM", "YuanForCausalLM", ]: if not self.distributed: diff --git a/intel_extension_for_pytorch/transformers/models/reference/fusions/linear_fusion.py b/intel_extension_for_pytorch/transformers/models/reference/fusions/linear_fusion.py index 71fd44d13..c1a720a43 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/fusions/linear_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/reference/fusions/linear_fusion.py @@ -97,10 +97,10 @@ def __init__(self, linear_list: list): if linear_list[i].bias is not None: bias_list.append(linear_list[i].bias) concat_weight = torch.concat(weights_list, 0) - use_bias = True if bias_list is None else False + use_bias = True if bias_list != [] else False concat_bias = torch.concat(bias_list, 0) if use_bias else None self.concat_linear = nn.Linear( - concat_weight.shape[1], concat_weight.shape[0], use_bias + concat_weight.shape[1], concat_weight.shape[0], bias=use_bias ) self.concat_linear.weight = 
nn.Parameter(concat_weight) self.concat_linear.bias = nn.Parameter(concat_bias) if use_bias else None diff --git a/intel_extension_for_pytorch/transformers/models/reference/models.py b/intel_extension_for_pytorch/transformers/models/reference/models.py index 330771634..cebe97410 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/models.py +++ b/intel_extension_for_pytorch/transformers/models/reference/models.py @@ -2219,6 +2219,150 @@ def custom_forward(*inputs): ) +def QWen2Model_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0).repeat(batch_size, 1) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if hasattr(self, "_prepare_decoder_attention_mask"): + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`...", + _type=WarningType.WrongArgument, + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def QWenLMHeadModel_forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -2276,6 +2420,74 @@ def QWenLMHeadModel_forward( return ((loss,) + output) if loss is not None else output +def Qwen2ForCausalLM_forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, +) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if ( + hasattr(self, "config") + and hasattr(self.config, "lm_head_generation") + and self.config.lm_head_generation + and hidden_states.size(1) != 1 + ): + hidden_states = hidden_states[:, -1:, :] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, 
:].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + def GitEncoder_forward( self, hidden_states: torch.Tensor, diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 38a053443..f8dd55f37 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -1512,6 +1512,110 @@ def _StableLMEpochAttention_forward( return attn_output, attn_weights, past_key_value +def _QWen2Attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +): + bsz, q_len, _ = hidden_states.size() + concat_qkv = None + if hasattr(self, "concat_qkv"): + concat_qkv = self.concat_qkv(hidden_states) + else: + query = self.q_proj(hidden_states) + key = self.k_proj(hidden_states) + value = self.v_proj(hidden_states) + + kv_seq_len = ( + q_len + past_key_value[0].size(-2) if past_key_value is not None else q_len + ) + + if concat_qkv is not None and type(concat_qkv) is not tuple: + query, key, value = self._IPEXROPE( + concat_qkv, + position_ids, + self.num_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + self.concat_qkv.num_concat, + ) + else: + if concat_qkv is not None: + query, key, value = concat_qkv + query = query.view(bsz, q_len, self.num_heads, self.head_dim) + key = key.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + value = value.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + key = self._IPEXROPE( + key, + position_ids, + self.num_key_value_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + query = self._IPEXROPE( + query, + position_ids, + self.num_heads, + self.head_dim, + self.head_dim // 2, + self.head_dim, + kv_seq_len, + ) + + if use_cache: + (attn_output, attn_weights, past_key_value) = self._IPEXScaleDotProduct( + query, + key, + value, + math.sqrt(self.head_dim), + past_key_value, + None, + attention_mask, + ) + else: + value_states = value.transpose(1, 2) + query_states = query.transpose(1, 2) + key_states = key.transpose(1, 2) + kv_seq_len = key_states.shape[-2] + + past_key_value = None + # repeat k/v heads if n_kv_heads < n_heads + key_states = _repeat_kv(key_states, self.num_key_value_groups) + value_states = _repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) + + if attention_mask is not None: + attn_weights = torch.tensor(attn_weights) + torch.tensor(attention_mask) + attn_weights = torch.max( + attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) + ) + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + 
attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _QWenAttention_forward( self, hidden_states: Optional[Tuple[torch.FloatTensor]], @@ -2306,6 +2410,7 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): "StableLmForCausalLM", "LlavaLlamaForCausalLM", "PhiForCausalLM", + "Qwen2ForCausalLM", ]: supported_linear_types = [ torch.nn.Linear, @@ -2526,6 +2631,16 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "Qwen2ForCausalLM": + return _QWen2Attention_forward( + self, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + ) elif self.model_backbone == "GPTNeoXForCausalLM": return _GPTNeoXAttention_forward( self, diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py index 81a208230..e14764f38 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py @@ -1019,6 +1019,59 @@ def QWenBlock_forward( return outputs +def Qwen2DecoderLayer_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, +) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + if not self.distributed: + hidden_states = self.mha_linear_add(hidden_states, residual) + else: + hidden_states = self.self_attn.o_proj(hidden_states) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + + mlp_gate = self.linear_silu_mul(hidden_states) + + if not self.distributed: + hidden_states = self.mlp_linear_add(mlp_gate, residual) + else: + hidden_states = self.mlp.down_proj(mlp_gate) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + def GitLayer_forward( self, hidden_states: torch.Tensor, @@ -1427,6 +1480,7 @@ def __init__(self, module, config, distributed=False): "LlamaForCausalLM", "BaichuanForCausalLM", "MistralForCausalLM", + "Qwen2ForCausalLM", ]: if not self.distributed: self.mha_linear_add = _IPEXlinearAddRef(module.self_attn.o_proj) @@ -1663,6 +1717,16 @@ def forward( output_attentions, use_cache, ) + elif self.model_backbone == "Qwen2ForCausalLM": + return Qwen2DecoderLayer_forward( + self, + hidden_states, + attention_mask, + position_ids, + past_key_value, + output_attentions, + use_cache, + ) elif self.model_backbone == "OPTForCausalLM": return OPTDecoderLayer_forward( self, diff --git a/intel_extension_for_pytorch/transformers/optimize.py 
b/intel_extension_for_pytorch/transformers/optimize.py index bbd8d4bd1..79793006c 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -176,6 +176,8 @@ def model_convert_reference(_model): StableLMEpochModel_forward, QWenLMHeadModel_forward, QWenModel_forward, + QWen2Model_forward, + Qwen2ForCausalLM_forward, GitForCausalLM_forward, GitEncoder_forward, GitVisionEncoder_forward, @@ -649,6 +651,28 @@ def model_convert_reference(_model): _model.config, distributed=distributed, ) + elif _model.config.architectures[0] == "Qwen2ForCausalLM": + convert_function(_model, "forward", Qwen2ForCausalLM_forward) + convert_function(_model.model, "forward", QWen2Model_forward) + convert_function( + _model, + "prepare_inputs_for_generation", + prepare_inputs_for_generation_llama, + ) + convert_class( + _model, + transformers.models.qwen2.modeling_qwen2.Qwen2SdpaAttention, + _IPEXAttentionRef, + _model.config, + distributed=distributed, + ) + convert_class( + _model, + transformers.models.qwen2.modeling_qwen2.Qwen2DecoderLayer, + _IPEXDecoderLayerRef, + _model.config, + distributed=distributed, + ) elif _model.config.architectures[0] == "GitForCausalLM": convert_function(_model, "forward", GitForCausalLM_forward) convert_function(_model.git.encoder, "forward", GitEncoder_forward) @@ -1165,6 +1189,10 @@ def model_convert_lowering( ) if _model.config.architectures[0] == "QWenLMHeadModel": supported_classes.append(type(_model.transformer.h[0].ln_1)) + if _model.config.architectures[0] == "Qwen2ForCausalLM": + supported_classes.append( + transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm + ) if hasattr(transformers.models, "mistral"): supported_classes.append( transformers.models.mistral.modeling_mistral.MistralRMSNorm @@ -1391,6 +1419,7 @@ def optimize( "MptForCausalLM", "StableLmForCausalLM", "QWenLMHeadModel", + "Qwen2ForCausalLM", "GitForCausalLM", "LlavaLlamaForCausalLM", "YuanForCausalLM", diff --git a/tests/cpu/hf_configs/qwen2/config.json b/tests/cpu/hf_configs/qwen2/config.json new file mode 100644 index 000000000..8bf0170ee --- /dev/null +++ b/tests/cpu/hf_configs/qwen2/config.json @@ -0,0 +1,27 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "model_type": "qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 1, + "num_key_value_heads": 4, + "rms_norm_eps": 1e-06, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": "4.41.2", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 152064 +} \ No newline at end of file diff --git a/tests/cpu/test_ipex_optimize_transformers_nightly.py b/tests/cpu/test_ipex_optimize_transformers_nightly.py index 0b34ba776..9d656a65c 100644 --- a/tests/cpu/test_ipex_optimize_transformers_nightly.py +++ b/tests/cpu/test_ipex_optimize_transformers_nightly.py @@ -137,6 +137,13 @@ lambda m: m.transformer.h[0].attn.__class__, lambda m: m.transformer.h[0].__class__, ), + model_info( + "qwen2", + transformers.models.qwen2.modeling_qwen2.Qwen2ForCausalLM, + True, + lambda m: m.model.layers[0].self_attn.__class__, + lambda m: m.model.layers[0].__class__, + ), model_info( "git", transformers.models.git.modeling_git.GitForCausalLM, From 
52f8c48d704ab4060a653996e4cb54e60dc8c234 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Tue, 18 Jun 2024 19:46:50 +0800 Subject: [PATCH 122/199] Cache weight for large batch inference for full bf16 and WOQ lowp-mode=bf16 (#2898) * Keep bf16 weight for WOQ first token * Keep first token weight for woq int4 and full bf16 * Revert unnecessary changes * fix clang-format issue * Fix UT failures * Fix concat linear * Fix UT failures * fix lint issue * Cache extra weight at runtime instead of ahead-of-time * fix lint --- csrc/cpu/aten/TPPGEMM.cpp | 30 ++- csrc/cpu/aten/TPPGEMM.h | 27 ++- csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp | 72 ++++++- csrc/cpu/aten/kernels/WoqTppKrnl.cpp | 200 +----------------- csrc/cpu/aten/utils/woq.h | 132 ++++++++++++ csrc/cpu/jit/cpu/kernels/ContextLinearWoq.h | 8 +- csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp | 161 +++++++++++++- csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h | 9 +- csrc/cpu/jit/cpu/kernels/OpContext.cpp | 10 +- csrc/cpu/jit/cpu/kernels/OpContext.h | 13 +- .../cpu/kernels/RegisterOpContextClass.cpp | 10 +- csrc/cpu/jit/passes/graph_rewrite.cpp | 18 +- csrc/cpu/tpp/xsmm_functors.h | 72 ++++--- .../run_accuracy_with_deepspeed.py | 10 + .../llm/distributed/run_generation_tp.py | 9 + .../run_generation_with_deepspeed.py | 8 + examples/cpu/inference/python/llm/run.py | 16 ++ .../llm/single_instance/run_accuracy.py | 10 + .../llm/single_instance/run_generation.py | 8 + .../llm/single_instance/run_quantization.py | 8 + .../cpu/tpp/utils/blocked_layout.py | 16 ++ .../nn/modules/weight_only_quantization.py | 41 +++- .../nn/utils/_weight_prepack.py | 30 ++- .../quantization/_qconfig.py | 38 +++- .../models/cpu/fusions/linear_fusion.py | 61 +++++- .../models/reference/modules/attentions.py | 12 +- .../transformers/optimize.py | 73 +++++++ .../utils/weight_only_quantization.py | 18 ++ tests/cpu/test_ipex_optimize_transformers.py | 80 ++++++- tests/cpu/test_quantization_default_recipe.py | 51 +++++ 30 files changed, 968 insertions(+), 283 deletions(-) diff --git a/csrc/cpu/aten/TPPGEMM.cpp b/csrc/cpu/aten/TPPGEMM.cpp index 98497abf7..7cc3f9d4f 100644 --- a/csrc/cpu/aten/TPPGEMM.cpp +++ b/csrc/cpu/aten/TPPGEMM.cpp @@ -15,6 +15,17 @@ IPEX_DEFINE_DISPATCH(tpp_linear_relu_kernel_stub); IPEX_DEFINE_DISPATCH(tpp_linear_add_kernel_stub); IPEX_DEFINE_DISPATCH(tpp_linear_mul_kernel_stub); IPEX_DEFINE_DISPATCH(tpp_linear_add_add_kernel_stub); +IPEX_DEFINE_DISPATCH(tpp_gelu_tanh_bf16_kernel_stub); + +void tpp_gelu_tanh_bf16_forward_cpu( + at::BFloat16* in, + at::BFloat16* out, + int M, + int N, + int ldi, + int ldo) { + tpp_gelu_tanh_bf16_kernel_stub(kCPU, in, out, M, N, ldi, ldo); +} at::Tensor tpp_linear_nobias_forward_cpu( const at::Tensor& t_in, @@ -36,7 +47,15 @@ at::Tensor tpp_linear_gelu_forward_cpu( const at::Tensor& t_wt, const at::Tensor& t_bias, c10::optional out_features) { - return tpp_linear_gelu_kernel_stub(kCPU, t_in, t_wt, t_bias); + return tpp_linear_gelu_kernel_stub(kCPU, t_in, t_wt, t_bias, "none"); +} + +at::Tensor tpp_linear_gelu_tanh_forward_cpu( + const at::Tensor& t_in, + const at::Tensor& t_wt, + const at::Tensor& t_bias, + c10::optional out_features) { + return tpp_linear_gelu_kernel_stub(kCPU, t_in, t_wt, t_bias, "tanh"); } at::Tensor tpp_fused_gate_up_proj_forward_cpu( @@ -129,6 +148,15 @@ TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { torch_ipex::cpu::tpp_linear_gelu_forward_cpu); } +TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { + m.def( + "tpp_linear_gelu_tanh(Tensor t_in, Tensor t_wt, Tensor t_bias, int? 
out_features=None)-> Tensor out"); + m.impl( + "tpp_linear_gelu_tanh", + c10::DispatchKey::CPU, + torch_ipex::cpu::tpp_linear_gelu_tanh_forward_cpu); +} + TORCH_LIBRARY_FRAGMENT(torch_ipex, m) { m.def( "tpp_fused_gate_up_proj(Tensor t_in, Tensor t_wt_gate, Tensor t_bias_gate, Tensor t_wt_up, Tensor t_bias_up,int? out_features=None)-> Tensor out"); diff --git a/csrc/cpu/aten/TPPGEMM.h b/csrc/cpu/aten/TPPGEMM.h index fd9d94749..d34603a28 100644 --- a/csrc/cpu/aten/TPPGEMM.h +++ b/csrc/cpu/aten/TPPGEMM.h @@ -23,6 +23,12 @@ at::Tensor tpp_linear_gelu_forward_cpu( const at::Tensor& t_bias, c10::optional out_features); +at::Tensor tpp_linear_gelu_tanh_forward_cpu( + const at::Tensor& t_in, + const at::Tensor& t_wt, + const at::Tensor& t_bias, + c10::optional out_features); + at::Tensor tpp_fused_gate_up_proj_forward_cpu( const at::Tensor& t_in, const at::Tensor& t_wt_gate, @@ -67,14 +73,25 @@ at::Tensor tpp_linear_add_add_forward_cpu( double scale, c10::optional out_features); +void tpp_gelu_tanh_bf16_forward_cpu( + at::BFloat16* in, + at::BFloat16* out, + int M, + int N, + int ldi, + int ldo); + using tpp_linear_nobias_impl_fn = at::Tensor (*)(const at::Tensor&, const at::Tensor&); using tpp_linear_bias_kernel_impl_fn = at::Tensor (*)(const at::Tensor&, const at::Tensor&, const at::Tensor&); -using tpp_linear_gelu_kernel_impl_fn = - at::Tensor (*)(const at::Tensor&, const at::Tensor&, const at::Tensor&); +using tpp_linear_gelu_kernel_impl_fn = at::Tensor (*)( + const at::Tensor&, + const at::Tensor&, + const at::Tensor&, + const c10::string_view&); using tpp_fused_gate_up_proj_kernel_impl_fn = at::Tensor (*)( const at::Tensor&, @@ -110,6 +127,9 @@ using tpp_linear_add_add_kernel_impl_fn = at::Tensor (*)( const at::Tensor&, double); +using tpp_gelu_tanh_bf16_kernel_impl_fn = + void (*)(at::BFloat16*, at::BFloat16*, int, int, int, int); + IPEX_DECLARE_DISPATCH(tpp_linear_nobias_impl_fn, tpp_linear_nobias_kernel_stub); IPEX_DECLARE_DISPATCH( tpp_linear_bias_kernel_impl_fn, @@ -135,6 +155,9 @@ IPEX_DECLARE_DISPATCH( IPEX_DECLARE_DISPATCH( tpp_linear_add_add_kernel_impl_fn, tpp_linear_add_add_kernel_stub); +IPEX_DECLARE_DISPATCH( + tpp_gelu_tanh_bf16_kernel_impl_fn, + tpp_gelu_tanh_bf16_kernel_stub); } // namespace cpu } // namespace torch_ipex diff --git a/csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp b/csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp index 988b605fa..9a5acc9c7 100644 --- a/csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp +++ b/csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp @@ -65,7 +65,13 @@ at::Tensor tpp_linear_nobias_kernel_impl( at::Tensor tpp_linear_gelu_kernel_impl( const at::Tensor& t_in, const at::Tensor& t_wt, - const at::Tensor& t_bias) { + const at::Tensor& t_bias, + const c10::string_view& algorithm) { + AT_ASSERT( + algorithm == "none" || algorithm == "tanh", + "tpp_linear_gelu: Invalid gelu algorithm %s\n", + algorithm); + auto sizes = t_in.sizes().vec(); auto wt_sizes = t_wt.sizes(); sizes[2] = wt_sizes[0] * wt_sizes[3]; @@ -74,9 +80,18 @@ at::Tensor tpp_linear_gelu_kernel_impl( auto dt = t_wt.dtype(); if (dt == at::kFloat) { - torch_ipex::tpp::tpp_linear_gelu(t_in, t_wt, t_bias, t_out); + if (algorithm == "none") { + torch_ipex::tpp::tpp_linear_gelu(t_in, t_wt, t_bias, t_out); + } else { // tanh + torch_ipex::tpp::tpp_linear_gelu_tanh(t_in, t_wt, t_bias, t_out); + } } else if (dt == at::kBFloat16) { - torch_ipex::tpp::tpp_linear_gelu(t_in, t_wt, t_bias, t_out); + if (algorithm == "none") { + torch_ipex::tpp::tpp_linear_gelu(t_in, t_wt, t_bias, t_out); + } else { // tanh + 
torch_ipex::tpp::tpp_linear_gelu_tanh( + t_in, t_wt, t_bias, t_out); + } } else { AT_ASSERT( 0, @@ -240,6 +255,54 @@ at::Tensor tpp_linear_mul_kernel_impl( return t_out; } +void tpp_gelu_tanh_bf16_kernel_impl( + at::BFloat16* in, + at::BFloat16* out, + int M, + int N, + int ldi, + int ldo) { +#ifdef CPU_CAPABILITY_AVX512 + const __m512 c1 = _mm512_set1_ps((float)0.7978846); + const __m512 c2 = _mm512_set1_ps((float)0.0356814); + const __m512 c_half = _mm512_set1_ps((float)0.5); + for (int j = 0; j < M; j++) { + int i; + for (i = 0; i < ALIGNDOWN(N, 16); i += 16) { + auto vin = torch_ipex::tpp::_mm512_loadu_ps_auto(&in[j * ldi + i]); + __m512 x_half = _mm512_mul_ps(vin, c_half); + __m512 x_sq = _mm512_mul_ps(vin, vin); + __m512 poly_x1 = _mm512_mul_ps(vin, _mm512_fmadd_ps(x_sq, c2, c1)); + __m512 tanh_poly_x = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(poly_x1); + __m512 vout = _mm512_fmadd_ps(tanh_poly_x, x_half, x_half); + torch_ipex::tpp::_mm512_storeu_ps_auto(&out[j * ldo + i], vout); + } + if (i < N) { + int rem = N - i; + __mmask16 mask = (1 << rem) - 1; + auto vin = + torch_ipex::tpp::_mm512_maskz_loadu_ps_auto(mask, &in[j * ldi + i]); + __m512 x_half = _mm512_mul_ps(vin, c_half); + __m512 x_sq = _mm512_mul_ps(vin, vin); + __m512 poly_x1 = _mm512_mul_ps(vin, _mm512_fmadd_ps(x_sq, c2, c1)); + __m512 tanh_poly_x = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(poly_x1); + __m512 vout = _mm512_fmadd_ps(tanh_poly_x, x_half, x_half); + torch_ipex::tpp::_mm512_mask_storeu_ps_auto( + &out[j * ldo + i], mask, vout); + } + } +#else + for (int j = 0; j < M; j++) { + for (int i = 0; i < N; i++) { + float x = in[j * ldi + i]; + out[j * ldo + i] = + ((tanh(sqrt(2 / M_PI) * (x + 0.044715 * std::pow(x, 3)))) + 1) * x * + 0.5; + } + } +#endif +} + } // namespace IPEX_REGISTER_DISPATCH( @@ -265,6 +328,9 @@ IPEX_REGISTER_DISPATCH(tpp_linear_add_kernel_stub, &tpp_linear_add_kernel_impl); IPEX_REGISTER_DISPATCH( tpp_linear_add_add_kernel_stub, &tpp_linear_add_add_kernel_impl); +IPEX_REGISTER_DISPATCH( + tpp_gelu_tanh_bf16_kernel_stub, + &tpp_gelu_tanh_bf16_kernel_impl); } // namespace cpu } // namespace torch_ipex #endif \ No newline at end of file diff --git a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp index 8ca6442ac..1e4fe0c94 100644 --- a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp @@ -5,6 +5,7 @@ #include #include #include +#include "aten/utils/woq.h" #include "csrc/cpu/tpp/kernels/TPPGEMMKrnl.h" #include "csrc/cpu/tpp/woq/tla.h" @@ -46,70 +47,6 @@ constexpr bool is_sym_quant(const int qw_type) { return qw_type == NF4; } -static constexpr std::array NF4_QUANT_TABLE = { - -1.0 - 1e-2, // 0b0000 - -0.8480964004993439, // 0b0001 - -0.6106329262256622, // 0b0010 - -0.4599952697753906, // 0b0011 - -0.33967943489551544, // 0b0100 - -0.23460740596055984, // 0b0101 - -0.13791173323988914, // 0b0110 - -0.045525018125772476, // 0b0111 - 0.03979014977812767, // 0b1000 - 0.1202552504837513, // 0b1001 - 0.2035212516784668, // 0b1010 - 0.2920137718319893, // 0b1011 - 0.3893125355243683, // 0b1100 - 0.5016634166240692, // 0b1101 - 0.6427869200706482, // 0b1110 - 0.8614784181118011, // 0b1111 -}; - -static constexpr std::array NF4_DEQUANT_TABLE = { - -1.0, - -0.6961928009986877, - -0.5250730514526367, - -0.39491748809814453, - -0.28444138169288635, - -0.18477343022823334, - -0.09105003625154495, - 0.0, - 0.07958029955625534, - 0.16093020141124725, - 0.24611230194568634, - 0.33791524171829224, - 0.44070982933044434, - 0.5626170039176941, - 
0.7229568362236023, - 1.0, -}; - -at::Tensor map_float_tensor_to_nf4(const at::Tensor& t) { - // Map [-1, 1] to nf4. Assume t in [-1, 1] - // Logic: - // for i in range(len(NF4_QUANT_TABLE)): - // out_uint8[t > NF4_QUANT_TABLE[i]] = i - using namespace at::indexing; - auto out_uint8 = at::empty(t.sizes(), t.options().dtype(at::kByte)); - for (size_t i = 0; i < NF4_QUANT_TABLE.size(); ++i) { - out_uint8.index_put_({t.greater(NF4_QUANT_TABLE[i])}, i); - } - return out_uint8; -} - -at::Tensor map_nf4_tensor_to_float(const at::Tensor& t) { - // Map nf4 to [-1, 1], t is already unpacked as uint8 - // Logic: - // for i in range(len(NF4_DEQUANT_TABLE)): - // out_dq[t == i] = NF4_DEQUANT_TABLE[i] - using namespace at::indexing; - auto out_dq = at::empty(t.sizes(), t.options().dtype(at::kFloat)); - for (size_t i = 0; i < NF4_DEQUANT_TABLE.size(); ++i) { - out_dq.index_put_({t.eq(i)}, NF4_DEQUANT_TABLE[i]); - } - return out_dq; -} - // We only build optimized kernels if AVX512_FP16 is supported and gcc>=12.3 // Otherwise we just return empty results // TODO(Weiwen) Merge WoqTppKrnl.cpp and WoqLinearKrnl.cpp and put the latter in @@ -4597,72 +4534,12 @@ at::Tensor qlinear_woq_affine( } at::Tensor scale, zp; scale = scales_list[fp32_idx].unsqueeze(-1); - if (qw_type != NF4) { + if (!sym_quant) { zp = zp_list[fp32_idx].unsqueeze(-1); } - auto w = - [&]() { - at::Tensor dqw; - if (qw_type == NF4) { - TLA_ASSERT( - sym_quant, "Weight must be symmetrically quantized for NF4"); - using namespace at::indexing; - auto w_int8 = - at::empty({N, qw.size(1) * 2}, qw.options().dtype(at::kByte)); - w_int8.index({Slice(), Slice(None, None, 2)}) - .copy_(qw.bitwise_and(0xf)); - w_int8.index({Slice(), Slice(1, None, 2)}) - .copy_(qw.bitwise_right_shift(4)); - auto w_ret = map_nf4_tensor_to_float(w_int8); - if (quant_w_mode == 0) { - dqw = w_ret * scale; - } else { - int64_t num_blocks = scale.size(-2); - auto w_int8_view = w_ret.view({N, num_blocks, -1}); - dqw = w_int8_view * scale; - dqw = dqw.view({N, -1}); - } - } else if (qw_type == QINT4) { - TLA_ASSERT( - !sym_quant, "Weight must be asymmetrically quantized for INT4"); - using namespace at::indexing; - auto w_int8 = - at::empty({N, qw.size(1) * 2}, qw.options().dtype(at::kByte)); - w_int8.index({Slice(), Slice(None, None, 2)}) - .copy_(qw.bitwise_and(0xf)); - w_int8.index({Slice(), Slice(1, None, 2)}) - .copy_(qw.bitwise_right_shift(4)); - if (quant_w_mode == 0) { - dqw = (w_int8.to(at::kFloat) - zp) * scale; - } else { - int64_t num_blocks = scale.size(-2); - auto w_int8_view = w_int8.view({N, num_blocks, -1}); - dqw = (w_int8_view.to(at::kFloat) - zp) * scale; - dqw = dqw.view({N, -1}); - } - } else { - TLA_ASSERT( - !sym_quant, "Weight must be asymmetrically quantized for INT8"); - if (quant_w_mode == 0) { - dqw = sym_quant ? qw.to(at::kFloat) * scale - : (qw.to(at::kFloat) - zp) * scale; - } else { - int64_t num_blocks = scale.size(-2); - auto w_int8_view = qw.view({N, num_blocks, -1}); - dqw = sym_quant ? 
w_int8_view.to(at::kFloat) * scale - : (w_int8_view.to(at::kFloat) - zp) * scale; - dqw = dqw.view({N, -1}); - } - } - if (K != qw.size(1) * 2) { - TORCH_CHECK( - K < qw.size(1) * 2, - 'WOQ Linear kernel: Unexpected weight shape'); - dqw = dqw.narrow(1, 0, K); - } - return dqw; - }() - .to(compute_dtype); + auto w = torch_ipex::cpu::dequantize_woq_weight( + qw, {N, K}, scale, zp, qw_type, quant_w_mode) + .to(compute_dtype); auto x_reshape = x.reshape({M, K}); auto x_fp = x_reshape.to(compute_dtype); // PyTorch does not support computing in half yet @@ -4739,71 +4616,12 @@ at::Tensor qlinear_woq_affine( } at::Tensor scale, zp; scale = scales_list[fp32_idx].unsqueeze(-1); - if (qw_type != NF4) { + if (!sym_quant) { zp = zp_list[fp32_idx].unsqueeze(-1); } - auto w = - [&]() { - at::Tensor dqw; - if (qw_type == NF4) { - TLA_ASSERT( - sym_quant, "Weight must be symmetrically quantized for NF4"); - using namespace at::indexing; - auto w_int8 = - at::empty({N, qw.size(1) * 2}, qw.options().dtype(at::kByte)); - w_int8.index({Slice(), Slice(None, None, 2)}) - .copy_(qw.bitwise_and(0xf)); - w_int8.index({Slice(), Slice(1, None, 2)}) - .copy_(qw.bitwise_right_shift(4)); - auto w_ret = map_nf4_tensor_to_float(w_int8); - if (quant_w_mode == 0) { - dqw = w_ret * scale; - } else { - int64_t num_blocks = scale.size(-2); - auto w_int8_view = w_ret.view({N, num_blocks, -1}); - dqw = w_int8_view * scale; - dqw = dqw.view({N, -1}); - } - } else if (qw_type == QINT4) { - TLA_ASSERT( - !sym_quant, "Weight must be asymmetrically quantized for INT4"); - using namespace at::indexing; - auto w_int8 = - at::empty({N, qw.size(1) * 2}, qw.options().dtype(at::kByte)); - w_int8.index({Slice(), Slice(None, None, 2)}) - .copy_(qw.bitwise_and(0xf)); - w_int8.index({Slice(), Slice(1, None, 2)}) - .copy_(qw.bitwise_right_shift(4)); - if (quant_w_mode == 0) { - dqw = (w_int8.to(at::kFloat) - zp) * scale; - } else { - int64_t num_blocks = scale.size(-2); - auto w_int8_view = w_int8.view({N, num_blocks, -1}); - dqw = (w_int8_view.to(at::kFloat) - zp) * scale; - dqw = dqw.view({N, -1}); - } - } else { - TLA_ASSERT( - !sym_quant, "Weight must be asymmetrically quantized for INT8"); - if (quant_w_mode == 0) { - dqw = sym_quant ? qw.to(at::kFloat) * scale - : (qw.to(at::kFloat) - zp) * scale; - } else { - int64_t num_blocks = scale.size(-2); - auto w_int8_view = qw.view({N, num_blocks, -1}); - dqw = sym_quant ? 
w_int8_view.to(at::kFloat) * scale - : (w_int8_view.to(at::kFloat) - zp) * scale; - dqw = dqw.view({N, -1}); - } - } - if (K != qw.size(1) * 2) { - TORCH_CHECK( - K < qw.size(1) * 2, 'WOQ Linear kernel: Unexpected weight shape'); - dqw = dqw.narrow(1, 0, K); - } - return dqw; - }() - .to(compute_dtype); + auto w = torch_ipex::cpu::dequantize_woq_weight( + qw, {N, K}, scale, zp, qw_type, quant_w_mode) + .to(compute_dtype); auto x_reshape = x.reshape({M, K}); auto x_fp = x_reshape.to(compute_dtype); // PyTorch does not support computing in half yet diff --git a/csrc/cpu/aten/utils/woq.h b/csrc/cpu/aten/utils/woq.h index 82e0d75c5..28ada4cef 100644 --- a/csrc/cpu/aten/utils/woq.h +++ b/csrc/cpu/aten/utils/woq.h @@ -126,6 +126,138 @@ DotMicroKernelRef create_or_get_dot_microkernel( } } } // namespace + +static constexpr std::array NF4_QUANT_TABLE = { + -1.0 - 1e-2, // 0b0000 + -0.8480964004993439, // 0b0001 + -0.6106329262256622, // 0b0010 + -0.4599952697753906, // 0b0011 + -0.33967943489551544, // 0b0100 + -0.23460740596055984, // 0b0101 + -0.13791173323988914, // 0b0110 + -0.045525018125772476, // 0b0111 + 0.03979014977812767, // 0b1000 + 0.1202552504837513, // 0b1001 + 0.2035212516784668, // 0b1010 + 0.2920137718319893, // 0b1011 + 0.3893125355243683, // 0b1100 + 0.5016634166240692, // 0b1101 + 0.6427869200706482, // 0b1110 + 0.8614784181118011, // 0b1111 +}; + +static constexpr std::array NF4_DEQUANT_TABLE = { + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, +}; + +static at::Tensor map_float_tensor_to_nf4(const at::Tensor& t) { + // Map [-1, 1] to nf4. 
Assume t in [-1, 1] + // Logic: + // for i in range(len(NF4_QUANT_TABLE)): + // out_uint8[t > NF4_QUANT_TABLE[i]] = i + using namespace at::indexing; + auto out_uint8 = at::empty(t.sizes(), t.options().dtype(at::kByte)); + for (size_t i = 0; i < NF4_QUANT_TABLE.size(); ++i) { + out_uint8.index_put_({t.greater(NF4_QUANT_TABLE[i])}, i); + } + return out_uint8; +} + +static at::Tensor map_nf4_tensor_to_float(const at::Tensor& t) { + // Map nf4 to [-1, 1], t is already unpacked as uint8 + // Logic: + // for i in range(len(NF4_DEQUANT_TABLE)): + // out_dq[t == i] = NF4_DEQUANT_TABLE[i] + using namespace at::indexing; + auto out_dq = at::empty(t.sizes(), t.options().dtype(at::kFloat)); + for (size_t i = 0; i < NF4_DEQUANT_TABLE.size(); ++i) { + out_dq.index_put_({t.eq(i)}, NF4_DEQUANT_TABLE[i]); + } + return out_dq; +} + +#define WOQ_DTYPE_INT8 1 +#define WOQ_DTYPE_INT4 2 +#define WOQ_DTYPE_NF4 3 + +static at::Tensor dequantize_woq_weight( + const at::Tensor& qw, + const std::vector& weight_shape, + const at::Tensor& scale, + const at::Tensor& zp, + int64_t qw_type, // weight dtype + int64_t quant_w_mode // weight quant mode +) { + bool sym_quant = qw_type == WOQ_DTYPE_NF4; + TORCH_CHECK(qw.dim() == 2, "Weight must 2D but got ", qw.dim(), "D"); + auto N = weight_shape[0]; + auto K = weight_shape[1]; + at::Tensor dqw; + if (qw_type == WOQ_DTYPE_NF4) { + TORCH_CHECK(sym_quant, "Weight must be symmetrically quantized for NF4"); + using namespace at::indexing; + auto w_int8 = at::empty({N, qw.size(1) * 2}, qw.options().dtype(at::kByte)); + w_int8.index({Slice(), Slice(None, None, 2)}).copy_(qw.bitwise_and(0xf)); + w_int8.index({Slice(), Slice(1, None, 2)}).copy_(qw.bitwise_right_shift(4)); + auto w_ret = map_nf4_tensor_to_float(w_int8); + if (quant_w_mode == 0) { + dqw = w_ret * scale; + } else { + int64_t num_blocks = scale.size(-2); + auto w_int8_view = w_ret.view({N, num_blocks, -1}); + dqw = w_int8_view * scale; + dqw = dqw.view({N, -1}); + } + } else if (qw_type == WOQ_DTYPE_INT4) { + TORCH_CHECK(!sym_quant, "Weight must be asymmetrically quantized for INT4"); + using namespace at::indexing; + auto w_int8 = at::empty({N, qw.size(1) * 2}, qw.options().dtype(at::kByte)); + w_int8.index({Slice(), Slice(None, None, 2)}).copy_(qw.bitwise_and(0xf)); + w_int8.index({Slice(), Slice(1, None, 2)}).copy_(qw.bitwise_right_shift(4)); + if (quant_w_mode == 0) { + dqw = (w_int8.to(at::kFloat) - zp) * scale; + } else { + int64_t num_blocks = scale.size(-2); + auto w_int8_view = w_int8.view({N, num_blocks, -1}); + dqw = (w_int8_view.to(at::kFloat) - zp) * scale; + dqw = dqw.view({N, -1}); + } + } else { + TORCH_CHECK(!sym_quant, "Weight must be asymmetrically quantized for INT8"); + if (quant_w_mode == 0) { + dqw = sym_quant ? qw.to(at::kFloat) * scale + : (qw.to(at::kFloat) - zp) * scale; + } else { + int64_t num_blocks = scale.size(-2); + auto w_int8_view = qw.view({N, num_blocks, -1}); + dqw = sym_quant ? 
w_int8_view.to(at::kFloat) * scale + : (w_int8_view.to(at::kFloat) - zp) * scale; + dqw = dqw.view({N, -1}); + } + } + if (K != qw.size(1) * 2) { + TORCH_CHECK( + K < qw.size(1) * 2, 'WOQ Linear kernel: Unexpected weight shape'); + dqw = dqw.narrow(1, 0, K); + } + return dqw; +} + } // namespace cpu } // namespace torch_ipex diff --git a/csrc/cpu/jit/cpu/kernels/ContextLinearWoq.h b/csrc/cpu/jit/cpu/kernels/ContextLinearWoq.h index 74d84ef5e..5df09bea6 100644 --- a/csrc/cpu/jit/cpu/kernels/ContextLinearWoq.h +++ b/csrc/cpu/jit/cpu/kernels/ContextLinearWoq.h @@ -25,6 +25,8 @@ struct ContextLinearWoq final { int64_t group_size_; int64_t lowp_mode_; int64_t act_quant_mode_; + bool cache_weight_for_large_batch_ = false; + c10::optional cached_weight_ = c10::nullopt; ContextLinearWoq() = delete; @@ -38,7 +40,8 @@ struct ContextLinearWoq final { c10::optional&& g_idx, int64_t group_size = -1, int64_t lowp_mode = 0, - int64_t act_quant_mode = 0) + int64_t act_quant_mode = 0, + bool cache_weight_for_large_batch = false) : at_weight_(std::move(at_weight)), weight_dtype_(weight_dtype), weight_shape_(std::move(weight_shape)), @@ -46,7 +49,8 @@ struct ContextLinearWoq final { g_idx_(std::move(g_idx)), group_size_(group_size), lowp_mode_(lowp_mode), - act_quant_mode_(act_quant_mode) { + act_quant_mode_(act_quant_mode), + cache_weight_for_large_batch_(cache_weight_for_large_batch) { is_4bit_ = (weight_dtype == WOQ_DTYPE_INT4 || weight_dtype == WOQ_DTYPE_NF4); // Make three dtype versions of scale, zp and bias diff --git a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp index 627c38fce..8a10c9085 100644 --- a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp +++ b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.cpp @@ -3,6 +3,9 @@ #include #include "aten/Linear.h" #include "aten/WeightPack.h" +#include "aten/utils/woq.h" +#include "csrc/cpu/aten/TPPGEMM.h" +#include "csrc/cpu/tpp/utils.h" #include "csrc/cpu/tpp/woq/tla.h" #include "ideep/IDeepConversions.h" @@ -11,6 +14,8 @@ namespace cpu { namespace detail { namespace woq_linear { +#define SMALL_BATCH_THRESHOLD 32 + c10::intrusive_ptr createWoqLinearPrePackOpContext( at::Tensor&& weight, int64_t weight_dtype, @@ -22,7 +27,8 @@ c10::intrusive_ptr createWoqLinearPrePackOpContext( c10::optional batch_size, int64_t group_size, int64_t lowp_mode, - int64_t act_quant_mode) { + int64_t act_quant_mode, + bool cache_weight_for_large_batch) { RECORD_FUNCTION( "ipex_prepack::createWoqLinearPrePackOpContext", c10::ArrayRef({})); @@ -38,7 +44,8 @@ c10::intrusive_ptr createWoqLinearPrePackOpContext( batch_size, group_size, lowp_mode, - act_quant_mode); + act_quant_mode, + cache_weight_for_large_batch); } c10::intrusive_ptr createWoqLinearPrePackOpContextInt4( @@ -50,7 +57,8 @@ c10::intrusive_ptr createWoqLinearPrePackOpContextInt4( c10::optional batch_size, int64_t group_size, // group_size along input channel int64_t lowp_mode, - int64_t act_quant_mode) { + int64_t act_quant_mode, + bool cache_weight_for_large_batch) { RECORD_FUNCTION( "ipex_prepack::createWoqLinearPrePackOpContextInt4", c10::ArrayRef({})); @@ -166,7 +174,8 @@ c10::intrusive_ptr createWoqLinearPrePackOpContextInt4( batch_size, group_size, lowp_mode, - act_quant_mode); + act_quant_mode, + cache_weight_for_large_batch); } at::Tensor woq_linear_run( @@ -189,7 +198,8 @@ ContextLinearWoq create( const c10::optional batch_size, int64_t group_size, int64_t lowp_mode, - int64_t act_quant_mode) { + int64_t act_quant_mode, + bool cache_weight_for_large_batch) { at::Tensor 
packed_weight; int64_t N = weight_shape[0]; int64_t K = weight_shape[1]; @@ -246,7 +256,8 @@ ContextLinearWoq create( std::move(g_idx), group_size, lowp_mode, - act_quant_mode); + act_quant_mode, + cache_weight_for_large_batch); } c10::optional zero_points_float = c10::nullopt; if (zero_points.has_value() && zero_points.value().defined()) { @@ -262,7 +273,8 @@ ContextLinearWoq create( std::move(g_idx), group_size, lowp_mode, - act_quant_mode); + act_quant_mode, + cache_weight_for_large_batch); } static at::Tensor _shuffle_input_channels_if_needed( @@ -281,7 +293,61 @@ static at::Tensor _shuffle_input_channels_if_needed( return input; } +// Unpack WOQ Linear weight to plain format, dequantize it, then repack it to +// blocked format for BF16 computation. +static at::Tensor _weight_unpack_dequantize_repack(ContextLinearWoq& context) { + // Requres lowp_mode=BF16, g_idx disabled, and N/K divisible by block size + auto N = context.weight_shape_[0]; + auto K = context.weight_shape_[1]; + bool supported = context.lowp_mode_ == 2 && !context.g_idx_.has_value() && + K % 64 == 0 && (N % 100 == 0 || N % 64 == 0); + if (!supported) + return at::Tensor(); + auto unpacked_weight = unpack(context, context.at_weight_); + auto block_weight = [&](const at::Tensor& weight, int64_t Nb, int64_t Kb) { + return weight.reshape({N / Nb, Nb, K / Kb, Kb / 2, 2}) + .permute({0, 2, 3, 1, 4}) + .contiguous() + .to(c10::kBFloat16); + }; + at::Tensor scale, zp; + scale = context.scales_list_[0].unsqueeze(-1); + if (context.weight_dtype_ != WOQ_DTYPE_NF4) { + zp = context.zero_points_list_[0].unsqueeze(-1); + } + int64_t quant_w_mode = context.group_size_ > 0 ? 1 : 0; + auto dequant_weight = torch_ipex::cpu::dequantize_woq_weight( + unpacked_weight, + context.weight_shape_, + scale, + zp, + context.weight_dtype_, + quant_w_mode); + if (N % 100 == 0) { + return block_weight(dequant_weight, 100, 64); + } + return block_weight(dequant_weight, 64, 64); +} + at::Tensor run(ContextLinearWoq& context, const at::Tensor& input) { + if (context.cache_weight_for_large_batch_ && + !context.cached_weight_.has_value()) { + auto dequant_weight = + _weight_unpack_dequantize_repack(context).to(c10::kBFloat16); + context.cached_weight_ = + c10::make_optional(std::move(dequant_weight)); + } + auto M = input.numel() > 0 ? input.numel() / input.size(-1) : 0; + if (M > SMALL_BATCH_THRESHOLD && context.cached_weight_.has_value() && + context.cached_weight_.value().defined()) { + auto input_reshaped = input.dim() == 2 ? input.unsqueeze(0) : input; + auto out = tpp_linear_bias_forward_cpu( + input_reshaped.to(c10::kBFloat16).contiguous(), + context.cached_weight_.value(), + context.bias_list_[2], + c10::nullopt); + return input.dim() == 2 ? out.squeeze(0) : out; + } // TPP kernel packs weight to 4d (Nc, Kc, block_k, block_n) auto w_k = context.weight_shape_[1]; TORCH_CHECK( @@ -318,6 +384,48 @@ at::Tensor run_unary( const c10::string_view& post_op, const torch::List>& scalars, const c10::optional& algorithm) { + if (context.cache_weight_for_large_batch_ && + !context.cached_weight_.has_value()) { + auto dequant_weight = + _weight_unpack_dequantize_repack(context).to(c10::kBFloat16); + context.cached_weight_ = c10::make_optional(dequant_weight); + } + auto M = input.numel() > 0 ? input.numel() / input.size(-1) : 0; + if (M > SMALL_BATCH_THRESHOLD && context.cached_weight_.has_value() && + context.cached_weight_.value().defined()) { + auto input_reshaped = input.dim() == 2 ? 
input.unsqueeze(0) : input; + if (post_op == "gelu") { + if (algorithm == "none") { + auto out = tpp_linear_gelu_forward_cpu( + input_reshaped.to(c10::kBFloat16).contiguous(), + context.cached_weight_.value(), + context.bias_list_[2], + c10::nullopt); + return input.dim() == 2 ? out.squeeze(0) : out; + } else if (algorithm == "tanh") { + auto out = tpp_linear_gelu_tanh_forward_cpu( + input_reshaped.to(c10::kBFloat16).contiguous(), + context.cached_weight_.value(), + context.bias_list_[2], + c10::nullopt); + return input.dim() == 2 ? out.squeeze(0) : out; + } + } else if (post_op == "silu") { + auto out = tpp_linear_silu_forward_cpu( + input_reshaped.to(c10::kBFloat16).contiguous(), + context.cached_weight_.value(), + context.bias_list_[2], + c10::nullopt); + return input.dim() == 2 ? out.squeeze(0) : out; + } else if (post_op == "relu") { + auto out = tpp_linear_relu_forward_cpu( + input_reshaped.to(c10::kBFloat16).contiguous(), + context.cached_weight_.value(), + context.bias_list_[2], + c10::nullopt); + return input.dim() == 2 ? out.squeeze(0) : out; + } + } // TPP kernel packs weight to 4d (Nc, Kc, block_k, block_n) auto w_k = context.weight_shape_[1]; TORCH_CHECK( @@ -351,6 +459,45 @@ at::Tensor run_binary( const at::Tensor& input, const c10::string_view& post_op, const std::vector& others) { + auto M = input.numel() > 0 ? input.numel() / input.size(-1) : 0; + if (context.cache_weight_for_large_batch_ && + !context.cached_weight_.has_value()) { + auto dequant_weight = + _weight_unpack_dequantize_repack(context).to(c10::kBFloat16); + context.cached_weight_ = c10::make_optional(dequant_weight); + } + if (M > SMALL_BATCH_THRESHOLD && context.cached_weight_.has_value() && + context.cached_weight_.value().defined()) { + auto input_reshaped = input.dim() == 2 ? input.unsqueeze(0) : input; + if (post_op == "add") { + auto out = tpp_linear_add_forward_cpu( + input_reshaped.to(c10::kBFloat16).contiguous(), + others[0], + context.cached_weight_.value(), + context.bias_list_[2], + 1.0, + c10::nullopt); + return input.dim() == 2 ? out.squeeze(0) : out; + } else if (post_op == "add_add") { + auto out = tpp_linear_add_add_forward_cpu( + input_reshaped.to(c10::kBFloat16), + others[0], + others[1], + context.cached_weight_.value(), + context.bias_list_[2], + 1.0, + c10::nullopt); + return input.dim() == 2 ? out.squeeze(0) : out; + } else if (post_op == "mul") { + auto out = tpp_linear_mul_forward_cpu( + input_reshaped.to(c10::kBFloat16), + others[0], + context.cached_weight_.value(), + context.bias_list_[2], + c10::nullopt); + return input.dim() == 2 ? 
out.squeeze(0) : out; + } + } // TPP kernel packs weight to 4d (Nc, Kc, block_k, block_n) auto w_k = context.weight_shape_[1]; TORCH_CHECK( diff --git a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h index f03ded598..731a1725f 100644 --- a/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h +++ b/csrc/cpu/jit/cpu/kernels/LinearWoqPacked.h @@ -21,7 +21,8 @@ c10::intrusive_ptr createWoqLinearPrePackOpContext( c10::optional batch_size, int64_t group_size, int64_t lowp_mode, - int64_t act_quant_mode); + int64_t act_quant_mode, + bool cache_weight_for_large_batch); c10::intrusive_ptr createWoqLinearPrePackOpContextInt4( at::Tensor&& weight, @@ -32,7 +33,8 @@ c10::intrusive_ptr createWoqLinearPrePackOpContextInt4( c10::optional batch_size, int64_t group_size, int64_t lowp_mode, - int64_t act_quant_mode); + int64_t act_quant_mode, + bool cache_weight_for_large_batch); at::Tensor woq_linear_run( const at::Tensor& input, @@ -49,7 +51,8 @@ ContextLinearWoq create( const c10::optional batch_size, int64_t group_size, int64_t lowp_mode, - int64_t act_quant_mode); + int64_t act_quant_mode, + bool cache_weight_for_large_batch); at::Tensor run(ContextLinearWoq& context, const at::Tensor& input); diff --git a/csrc/cpu/jit/cpu/kernels/OpContext.cpp b/csrc/cpu/jit/cpu/kernels/OpContext.cpp index 3dfeeee14..18539f9fe 100644 --- a/csrc/cpu/jit/cpu/kernels/OpContext.cpp +++ b/csrc/cpu/jit/cpu/kernels/OpContext.cpp @@ -372,7 +372,8 @@ c10::intrusive_ptr IpexWoqLinearOpContext::create_context( c10::optional batch_size, int64_t group_size, int64_t lowp_mode, - int64_t act_quant_mode) { + int64_t act_quant_mode, + bool cache_weight_for_large_batch) { auto op_context = torch_ipex::cpu::detail::woq_linear::create( weight, weight_dtype, @@ -384,7 +385,8 @@ c10::intrusive_ptr IpexWoqLinearOpContext::create_context( batch_size, group_size, lowp_mode, - act_quant_mode); + act_quant_mode, + cache_weight_for_large_batch); return c10::make_intrusive( batch_size, std::move(op_context)); } @@ -432,6 +434,10 @@ c10::optional IpexWoqLinearOpContext::get_g_idx() { return op_context_.g_idx_; } +c10::optional IpexWoqLinearOpContext::get_cached_weight() { + return op_context_.cached_weight_; +} + at::Tensor IpexWoqLinearOpContext::get_scales() { if (op_context_.group_size_ > 0 && op_context_.at_weight_.dim() == 4) { // [#block_n, #block_k, n_block_size] -> [#block_n, n_block_size, #block_k] diff --git a/csrc/cpu/jit/cpu/kernels/OpContext.h b/csrc/cpu/jit/cpu/kernels/OpContext.h index a5a141b9c..b0b10a0a9 100644 --- a/csrc/cpu/jit/cpu/kernels/OpContext.h +++ b/csrc/cpu/jit/cpu/kernels/OpContext.h @@ -371,7 +371,8 @@ using SerializationTypeWoqLinearPrePack = std::tuple< c10::optional, // batch size int64_t, // group size int64_t, // lowp_mode - int64_t>; // act_quant_mode + int64_t, // act_quant_mode + bool>; // cache_weight_for_large_batch class WoqLinearOpContext : public torch::jit::CustomClassHolder { protected: @@ -397,7 +398,8 @@ class WoqLinearOpContext : public torch::jit::CustomClassHolder { batch_size_, this->get_context().group_size_, this->get_context().lowp_mode_, - this->get_context().act_quant_mode_); + this->get_context().act_quant_mode_, + this->get_context().cache_weight_for_large_batch_); } virtual at::Tensor get_data_handle() = 0; @@ -429,6 +431,8 @@ class WoqLinearOpContext : public torch::jit::CustomClassHolder { virtual std::vector get_weight_shape() = 0; + virtual c10::optional get_cached_weight() = 0; + virtual at::Tensor pack(const at::Tensor& tensor) = 0; virtual 
detail::ContextLinearWoq& get_context() = 0; @@ -484,6 +488,8 @@ class IpexWoqLinearOpContext final : public WoqLinearOpContext { virtual std::vector get_weight_shape() override; + virtual c10::optional get_cached_weight() override; + virtual at::Tensor pack(const at::Tensor& tensor) override; virtual detail::ContextLinearWoq& get_context() override; @@ -499,7 +505,8 @@ class IpexWoqLinearOpContext final : public WoqLinearOpContext { c10::optional batch_size, int64_t group_size, int64_t lowp_mode, - int64_t act_quant_mode); + int64_t act_quant_mode, + bool cache_weight_for_large_batch); virtual void load_from_ctx( c10::intrusive_ptr other) override; diff --git a/csrc/cpu/jit/cpu/kernels/RegisterOpContextClass.cpp b/csrc/cpu/jit/cpu/kernels/RegisterOpContextClass.cpp index 2c19f75af..4e063bc0d 100644 --- a/csrc/cpu/jit/cpu/kernels/RegisterOpContextClass.cpp +++ b/csrc/cpu/jit/cpu/kernels/RegisterOpContextClass.cpp @@ -141,7 +141,8 @@ TORCH_LIBRARY(ipex_prepack, m) { std::move(std::get<7>(state)), // batch size std::move(std::get<8>(state)), // group size std::move(std::get<9>(state)), // lowp_mode - std::move(std::get<10>(state))); // act_quant_mode + std::move(std::get<10>(state)), // act_quant_mode + std::move(std::get<11>(state))); // cache_weight_for_large_batch }) .def( "get_weight", @@ -155,6 +156,9 @@ TORCH_LIBRARY(ipex_prepack, m) { "get_weight_shape", &torch_ipex::cpu::WoqLinearOpContext::get_weight_shape) .def("get_g_idx", &torch_ipex::cpu::WoqLinearOpContext::get_g_idx) + .def( + "get_cached_weight", + &torch_ipex::cpu::WoqLinearOpContext::get_cached_weight) .def("pack", &torch_ipex::cpu::WoqLinearOpContext::pack) .def("to_public", &torch_ipex::cpu::WoqLinearOpContext::to_public) .def( @@ -181,10 +185,10 @@ TORCH_LIBRARY(ipex_prepack, m) { "-> __torch__.torch.classes.ipex_prepack.ConvTransposeOpContext"); #ifdef USE_LIBXSMM m.def( - "weight_only_qlinear_prepack(Tensor W, int W_dtype, int[] W_shape, Tensor scales, Tensor? zero_points, Tensor? B, Tensor? g_idx, int? batch_size, int group_size, int lowp_mode, int act_quant_mode) " + "weight_only_qlinear_prepack(Tensor W, int W_dtype, int[] W_shape, Tensor scales, Tensor? zero_points, Tensor? B, Tensor? g_idx, int? batch_size, int group_size, int lowp_mode, int act_quant_mode, bool cache_weight_for_large_batch) " "-> __torch__.torch.classes.ipex_prepack.WoqLinearOpContext"); m.def( - "weight_only_qlinear_prepack_int4(Tensor W, Tensor scales, Tensor zero_points, Tensor? B, Tensor? g_idx, int? batch_size, int group_size, int lowp_mode, int act_quant_mode) " + "weight_only_qlinear_prepack_int4(Tensor W, Tensor scales, Tensor zero_points, Tensor? B, Tensor? g_idx, int? 
batch_size, int group_size, int lowp_mode, int act_quant_mode, bool cache_weight_for_large_batch) " "-> __torch__.torch.classes.ipex_prepack.WoqLinearOpContext"); #endif } diff --git a/csrc/cpu/jit/passes/graph_rewrite.cpp b/csrc/cpu/jit/passes/graph_rewrite.cpp index e5a4cccc8..c279daf6e 100644 --- a/csrc/cpu/jit/passes/graph_rewrite.cpp +++ b/csrc/cpu/jit/passes/graph_rewrite.cpp @@ -1337,24 +1337,26 @@ void replaceAtenMaxPool2dWithIpexMaxPool2d(std::shared_ptr& graph) { void simplifyAllReduce(std::shared_ptr& graph) { std::string all_reduce_v1 = R"( - graph(%a, %weight, %out_features1, %out_features2, %b, %fc_in_weight, %fc_in_bias, %fc_out_weight, %fc_out_bias, %alpha, %idx, %no, %dtype, %zero): + graph(%a, %weight, %out_features1, %none, %b, %fc_in_weight, %fc_in_bias, %fc_out_weight, %fc_out_bias, %alpha, %no, %dtype, %zero): %r1 = torch_ipex::tpp_linear(%a, %weight, %out_features1) %r2 = deepspeed_comm::all_reduce(%r1) - %r3 = torch_ipex::tpp_linear_gelu(%b, %fc_in_weight, %fc_in_bias, %out_features2) - %r4 = aten::to(%r3, %idx, %no, %no, %dtype) + %r3 = torch_ipex::tpp_linear_gelu(%b, %fc_in_weight, %fc_in_bias, %none) + %r4 = aten::to(%r3, %dtype, %no, %no, %none) %r5 = aten::contiguous(%r4, %zero) - %r6 = torch_ipex::tpp_linear(%r5, %fc_out_weight, %out_features1) + %w = torch_ipex::choose_tpp_linear_weight(%r5, %fc_out_weight, %none) + %r6 = torch_ipex::tpp_linear(%r5, %w, %out_features1) %r7 = deepspeed_comm::all_reduce(%r6) %r8 = aten::add_(%r7, %fc_out_bias, %alpha) %r = aten::add(%r2, %r8, %alpha) return (%r) )"; std::string all_reduce_repl_v1 = R"( - graph(%a, %weight, %out_features1, %out_features2, %b, %fc_in_weight, %fc_in_bias, %fc_out_weight, %fc_out_bias, %alpha, %idx, %no, %dtype, %zero): + graph(%a, %weight, %out_features1, %none, %b, %fc_in_weight, %fc_in_bias, %fc_out_weight, %fc_out_bias, %alpha, %no, %dtype, %zero): %r1 = torch_ipex::tpp_linear(%a, %weight, %out_features1) - %r2 = torch_ipex::tpp_linear_gelu(%b, %fc_in_weight, %fc_in_bias, %out_features2) - %r3 = aten::to(%r2, %idx, %no, %no, %dtype) + %r2 = torch_ipex::tpp_linear_gelu(%b, %fc_in_weight, %fc_in_bias, %none) + %r3 = aten::to(%r2, %dtype, %no, %no, %none) %r4 = aten::contiguous(%r3, %zero) - %r5 = torch_ipex::tpp_linear(%r4, %fc_out_weight, %out_features1) + %w = torch_ipex::choose_tpp_linear_weight(%r4, %fc_out_weight, %none) + %r5 = torch_ipex::tpp_linear(%r4, %w, %out_features1) %r6 = aten::add(%r1, %r5, %alpha) %r7 = deepspeed_comm::all_reduce(%r6) %r = aten::add_(%r7, %fc_out_bias, %alpha) diff --git a/csrc/cpu/tpp/xsmm_functors.h b/csrc/cpu/tpp/xsmm_functors.h index e1f1d1534..a863017b6 100644 --- a/csrc/cpu/tpp/xsmm_functors.h +++ b/csrc/cpu/tpp/xsmm_functors.h @@ -9,6 +9,7 @@ #include #include #include +#include "csrc/cpu/aten/TPPGEMM.h" namespace torch_ipex { namespace tpp { @@ -2203,43 +2204,50 @@ class GeluTanhFwdTPP { : M(M), N(N), ldi(ldi), ldo(ldo) {} void operator()(Tin* in, Tout* out) { + if constexpr ( + std::is_same() && std::is_same()) { + torch_ipex::cpu::tpp_gelu_tanh_bf16_forward_cpu(in, out, M, N, ldi, ldo); + } else { #ifdef __AVX512F__ - const __m512 c1 = _mm512_set1_ps((float)0.7978846); - const __m512 c2 = _mm512_set1_ps((float)0.0356814); - const __m512 c_half = _mm512_set1_ps((float)0.5); - for (int j = 0; j < M; j++) { - int i; - for (i = 0; i < ALIGNDOWN(N, 16); i += 16) { - auto vin = _mm512_loadu_ps_auto(&in[j * ldi + i]); - __m512 x_half = _mm512_mul_ps(vin, c_half); - __m512 x_sq = _mm512_mul_ps(vin, vin); - __m512 poly_x1 = _mm512_mul_ps(vin, 
_mm512_fmadd_ps(x_sq, c2, c1)); - __m512 tanh_poly_x = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(poly_x1); - __m512 vout = _mm512_fmadd_ps(tanh_poly_x, x_half, x_half); - _mm512_storeu_ps_auto(&out[j * ldo + i], vout); - } - if (i < N) { - int rem = N - i; - __mmask16 mask = (1 << rem) - 1; - auto vin = _mm512_maskz_loadu_ps_auto(mask, &in[j * ldi + i]); - __m512 x_half = _mm512_mul_ps(vin, c_half); - __m512 x_sq = _mm512_mul_ps(vin, vin); - __m512 poly_x1 = _mm512_mul_ps(vin, _mm512_fmadd_ps(x_sq, c2, c1)); - __m512 tanh_poly_x = LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(poly_x1); - __m512 vout = _mm512_fmadd_ps(tanh_poly_x, x_half, x_half); - _mm512_mask_storeu_ps_auto(&out[j * ldo + i], mask, vout); + const __m512 c1 = _mm512_set1_ps((float)0.7978846); + const __m512 c2 = _mm512_set1_ps((float)0.0356814); + const __m512 c_half = _mm512_set1_ps((float)0.5); + for (int j = 0; j < M; j++) { + int i; + for (i = 0; i < ALIGNDOWN(N, 16); i += 16) { + auto vin = _mm512_loadu_ps_auto(&in[j * ldi + i]); + __m512 x_half = _mm512_mul_ps(vin, c_half); + __m512 x_sq = _mm512_mul_ps(vin, vin); + __m512 poly_x1 = _mm512_mul_ps(vin, _mm512_fmadd_ps(x_sq, c2, c1)); + __m512 tanh_poly_x = + LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(poly_x1); + __m512 vout = _mm512_fmadd_ps(tanh_poly_x, x_half, x_half); + _mm512_storeu_ps_auto(&out[j * ldo + i], vout); + } + if (i < N) { + int rem = N - i; + __mmask16 mask = (1 << rem) - 1; + auto vin = _mm512_maskz_loadu_ps_auto(mask, &in[j * ldi + i]); + __m512 x_half = _mm512_mul_ps(vin, c_half); + __m512 x_sq = _mm512_mul_ps(vin, vin); + __m512 poly_x1 = _mm512_mul_ps(vin, _mm512_fmadd_ps(x_sq, c2, c1)); + __m512 tanh_poly_x = + LIBXSMM_INTRINSICS_MM512_TANH_PS_MINIMAX3(poly_x1); + __m512 vout = _mm512_fmadd_ps(tanh_poly_x, x_half, x_half); + _mm512_mask_storeu_ps_auto(&out[j * ldo + i], mask, vout); + } } - } #else - for (int j = 0; j < M; j++) { - for (int i = 0; i < N; i++) { - float x = in[j * ldi + i]; - out[j * ldo + i] = - ((tanh(sqrt(2 / M_PI) * (x + 0.044715 * std::pow(x, 3)))) + 1) * x * - 0.5; + for (int j = 0; j < M; j++) { + for (int i = 0; i < N; i++) { + float x = in[j * ldi + i]; + out[j * ldo + i] = + ((tanh(sqrt(2 / M_PI) * (x + 0.044715 * std::pow(x, 3)))) + 1) * + x * 0.5; + } } - } #endif + } } void ref(Tin* in, Tout* out) { diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index f608de80a..4df3c54cc 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -185,6 +185,13 @@ def decorator(func): "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. " "IC_BLOCK is determined by IC automatically.", ) +parser.add_argument( + "--cache-weight-for-large-batch", + action="store_true", + help="Cache an extra linear weight for large batch inference, such as the first token (prefill phase)." + " It brings better performance at the cost of higher memory usage. It is only valid for full bf16 path" + " and weight-only quantization with lowp-mode=BF16. 
Otherwise, it has no effect.", +) args = parser.parse_args() @@ -442,6 +449,7 @@ def write_checkpoints_json(): quantization_config=qconfig if ipex_woq_enabled else None, inplace=True, deployment_mode=False, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) self.base_model = self.model @@ -1211,6 +1219,7 @@ def write_checkpoints_json(): quantization_config=qconfig if ipex_woq_enabled else None, inplace=True, deployment_mode=False, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) self._base_model = self._model @@ -1856,6 +1865,7 @@ def write_checkpoints_json(): quantization_config=qconfig if ipex_woq_enabled else None, inplace=True, deployment_mode=False, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) self.base_model = self.model diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py index 5397b05de..c96110b2e 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py @@ -196,6 +196,13 @@ " HuggingFace Optimum format for backward compatibility. It must be used with" " --low-precision-checkpoint. Otherwise, it has no effect.", ) +parser.add_argument( + "--cache-weight-for-large-batch", + action="store_true", + help="Cache an extra linear weight for large batch inference, such as the first token (prefill phase)." + " It brings better performance at the cost of higher memory usage. It is only valid for full bf16 path" + " and weight-only quantization with lowp-mode=BF16. Otherwise, it has no effect.", +) args = parser.parse_args() print(args) @@ -322,6 +329,7 @@ def trace_handler(prof): dtype=amp_dtype, inplace=True, deployment_mode=args.deployment_mode, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) elif args.ipex_weight_only_quantization: from intel_extension_for_pytorch.quantization import WoqWeightDtype @@ -375,6 +383,7 @@ def trace_handler(prof): dtype=amp_dtype, quantization_config=qconfig, inplace=True, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) if args.torch_compile: diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index 58e428d72..e8eb026ed 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -197,6 +197,13 @@ "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. " "IC_BLOCK is determined by IC automatically.", ) +parser.add_argument( + "--cache-weight-for-large-batch", + action="store_true", + help="Cache an extra linear weight for large batch inference, such as the first token (prefill phase)." + " It brings better performance at the cost of higher memory usage. It is only valid for full bf16 path" + " and weight-only quantization with lowp-mode=BF16. 
Otherwise, it has no effect.", +) parser.add_argument( "--config-file", default=None, type=str, help="specific configuration file" ) @@ -512,6 +519,7 @@ def write_checkpoints_json(): quantization_config=qconfig if ipex_woq_enabled else None, inplace=True, deployment_mode=args.deployment_mode, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index e7fe93c85..662613f21 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -210,6 +210,13 @@ def main(args_in: Optional[List[str]] = None) -> None: " automatically, -1 for INT8 and 128 for INT4. If --low-precision-checkpoint is given, this parameter is " "overwritten by data in the checkpoint file.", ) + parser.add_argument( + "--cache-weight-for-large-batch", + action="store_true", + help="Cache an extra linear weight for large batch inference, such as the first token (prefill phase)." + " It brings better performance at the cost of higher memory usage. It is only valid for full bf16 path" + " and weight-only quantization with lowp-mode=BF16. Otherwise, it has no effect.", + ) # inference related arguments. parser.add_argument( @@ -298,6 +305,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--config-file", str(args.config_file)]) if args.image_url is not None: infer_cmd.extend(["--image-url", str(args.image_url)]) + if args.cache_weight_for_large_batch: + infer_cmd.extend(["--cache-weight-for-large-batch"]) if args.audio is not None: infer_cmd.extend(["--audio", str(args.audio)]) @@ -400,6 +409,9 @@ def main(args_in: Optional[List[str]] = None) -> None: if args.prompt is not None: infer_cmd.extend(["--prompt", str(args.prompt)]) + if args.cache_weight_for_large_batch: + infer_cmd.extend(["--cache-weight-for-large-batch"]) + print("LLM RUNTIME INFO: quantizing model ...") result = subprocess.run(infer_cmd) if result.returncode != 0: @@ -429,6 +441,8 @@ def main(args_in: Optional[List[str]] = None) -> None: quant_cmd.extend(["--greedy"]) if args.image_url is not None: quant_cmd.extend(["--image-url", str(args.image_url)]) + if args.cache_weight_for_large_batch: + quant_cmd.extend(["--cache-weight-for-large-batch"]) if args.audio is not None: quant_cmd.extend(["--audio", str(args.audio)]) if args.ipex_weight_only_quantization: @@ -686,6 +700,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--group-size", str(group_size)]) if args.quant_with_amp: infer_cmd.extend(["--quant-with-amp"]) + if args.cache_weight_for_large_batch: + infer_cmd.extend(["--cache-weight-for-large-batch"]) print("LLM RUNTIME INFO: running model geneartion with deepspeed (autotp)...") result = subprocess.run(infer_cmd) diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 2cdc8d563..404e18ebe 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -92,6 +92,13 @@ parser.add_argument( "--config-file", default=None, type=str, help="specific configuration file" ) +parser.add_argument( + "--cache-weight-for-large-batch", + action="store_true", + help="Cache an extra linear weight for large batch inference, such as the first token (prefill phase)." + " It brings better performance at the cost of higher memory usage. 
It is only valid for full bf16 path" + " and weight-only quantization with lowp-mode=BF16. Otherwise, it has no effect.", +) args = parser.parse_args() @@ -239,6 +246,7 @@ def __init__( dtype=infer_dtype, inplace=True, deployment_mode=False, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) if args.torch_compile: @@ -896,6 +904,7 @@ def __init__( dtype=infer_dtype, inplace=True, deployment_mode=False, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) if args.torch_compile: @@ -1424,6 +1433,7 @@ def __init__( dtype=infer_dtype, inplace=True, deployment_mode=False, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) if args.torch_compile: diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index a747adabe..7e9bda416 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -130,6 +130,13 @@ parser.add_argument( "--token-latency", action="store_true", help="get token latency breakdown" ) +parser.add_argument( + "--cache-weight-for-large-batch", + action="store_true", + help="Cache an extra linear weight for large batch inference, such as the first token (prefill phase)." + " It brings better performance at the cost of higher memory usage. It is only valid for dtype=bfloat16." + " Otherwise, it has no effect.", +) args = parser.parse_args() print(args) @@ -273,6 +280,7 @@ def trace_handler(prof): dtype=amp_dtype, inplace=True, deployment_mode=args.deployment_mode, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) if args.torch_compile: if args.deployment_mode: diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 5938dab6f..f5fc94c9f 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -245,6 +245,13 @@ " HuggingFace Optimum format for backward compatibility. It must be used with" " --low-precision-checkpoint. Otherwise, it has no effect.", ) +parser.add_argument( + "--cache-weight-for-large-batch", + action="store_true", + help="Cache an extra linear weight for large batch inference, such as the first token (prefill phase)." + " It brings better performance at the cost of higher memory usage. It is only valid for weight-only" + " quantization with lowp-mode=BF16. 
Otherwise, it has no effect.", +) args = parser.parse_args() @@ -1028,6 +1035,7 @@ def calib_func(prepared_model): inplace=True, low_precision_checkpoint=low_precision_checkpoint, deployment_mode=False, + cache_weight_for_large_batch=args.cache_weight_for_large_batch, ) example_inputs = get_example_inputs(model) with torch.no_grad(), torch.cpu.amp.autocast( diff --git a/intel_extension_for_pytorch/cpu/tpp/utils/blocked_layout.py b/intel_extension_for_pytorch/cpu/tpp/utils/blocked_layout.py index a132b9dd7..f7594c207 100644 --- a/intel_extension_for_pytorch/cpu/tpp/utils/blocked_layout.py +++ b/intel_extension_for_pytorch/cpu/tpp/utils/blocked_layout.py @@ -1,5 +1,6 @@ import torch import torch.utils._pytree as pytree +import copy # import math # from enum import Enum @@ -299,6 +300,21 @@ def __torch_dispatch__(cls, func, types, args, kwargs): args_data = pytree.tree_map_only(BlockedParameter, lambda x: x._data, args) return func(*args_data, **kwargs) + def __copy__(self): + new_param = BlockedParameter(self._data, requires_grad=self.requires_grad) + for k, v in self.__dict__.items(): + if k != "_data": + setattr(new_param, k, copy.copy(v)) + return new_param + + def __deepcopy__(self, memo): + new_param = BlockedParameter( + copy.deepcopy(self._data, memo), requires_grad=self.requires_grad + ) + for k, v in self.__dict__.items(): + setattr(new_param, k, copy.deepcopy(v, memo)) + return new_param + class BlockedModule(torch.nn.Module): def _save_to_state_dict(self, destination, prefix, keep_vars): diff --git a/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py b/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py index ce3b2eff2..f21ae1165 100644 --- a/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py +++ b/intel_extension_for_pytorch/nn/modules/weight_only_quantization.py @@ -11,6 +11,11 @@ quantize_per_block, WoqWeightDtype, ) +from intel_extension_for_pytorch.quantization._qconfig import ( + WOQ_LOWP_MODE_TO_STR, + WOQ_ACT_QUANT_MODE_TO_STR, + WOQ_DTYPE_TO_STR, +) from ...utils._logger import logger, WarningType @@ -37,6 +42,7 @@ def __init__( self._lowp_mode = 0 self._act_quant_mode = 0 self._group_size = -1 + self._cache_weight_for_large_batch = False def pre_ipex_gemm(self, input): return input @@ -56,12 +62,17 @@ def _get_name(self): def extra_repr(self): extra_repr_str = "in_features={}, out_features={}, dtype={}".format( - self.in_features, self.out_features, self.dtype + self.in_features, self.out_features, WOQ_DTYPE_TO_STR[self.dtype] ) extra_repr_str += ", bias={}".format(self.bias) - extra_repr_str += ", lowp_mode={}".format(self._lowp_mode) - extra_repr_str += ", act_quant_mode={}".format(self._act_quant_mode) + extra_repr_str += ", lowp_mode={}".format(WOQ_LOWP_MODE_TO_STR[self._lowp_mode]) + extra_repr_str += ", act_quant_mode={}".format( + WOQ_ACT_QUANT_MODE_TO_STR[self._act_quant_mode] + ) extra_repr_str += ", group_size={}".format(self._group_size) + extra_repr_str += ", cache_weight_for_large_batch={}".format( + self._cache_weight_for_large_batch + ) return extra_repr_str @classmethod @@ -119,6 +130,9 @@ def from_float(cls, mod, scales=None, zero_points=None): mod.in_features = mod.weight.size()[1] if not hasattr(mod, "out_features"): mod.out_features = mod.weight.size()[0] + cache_weight_for_large_batch = ( + qconfig.cache_weight_for_large_batch and lowp_mode == 2 + ) qlinear = cls._init_cls( mod, @@ -130,8 +144,10 @@ def from_float(cls, mod, scales=None, zero_points=None): group_size, lowp_mode, act_quant_mode, + 
cache_weight_for_large_batch, ) del qweight + mod.weight = torch.nn.Parameter() return qlinear @classmethod @@ -167,11 +183,16 @@ def from_float_and_int4_weight( lowp_mode = 0 act_quant_mode = 0 + cache_weight_for_large_batch = False if mod.qconfig is not None: if hasattr(mod.qconfig, "lowp_mode"): lowp_mode = mod.qconfig.lowp_mode if hasattr(mod.qconfig, "act_quant_mode"): act_quant_mode = mod.qconfig.act_quant_mode + if hasattr(mod.qconfig, "cache_weight_for_large_batch"): + cache_weight_for_large_batch = ( + mod.qconfig.cache_weight_for_large_batch and lowp_mode == 2 + ) w_dtype = qweight.dtype supported_qw_dtype = [ @@ -192,8 +213,10 @@ def from_float_and_int4_weight( mod.out_features = mod.weight.size()[0] qlinear = cls(mod.in_features, mod.out_features, dtype=WoqWeightDtype.INT4) - if bias is None: + if mod.bias is not None: bias = mod.bias + if bias is not None and torch.count_nonzero(bias) == 0: + bias = None qlinear._op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack_int4( qweight, scales, @@ -204,11 +227,14 @@ def from_float_and_int4_weight( group_size, int(lowp_mode), act_quant_mode, + cache_weight_for_large_batch, ) qlinear.weight = qlinear._op_context.get_weight() + qlinear.bias = bias is not None qlinear._lowp_mode = lowp_mode qlinear._act_quant_mode = act_quant_mode qlinear._group_size = group_size + qlinear._cache_weight_for_large_batch = cache_weight_for_large_batch del qweight return qlinear @@ -224,6 +250,7 @@ def _init_cls( group_size, lowp_mode, act_quant_mode, + cache_weight_for_large_batch, ): qlinear = cls( mod.in_features, mod.out_features, mod.bias is not None, dtype=dtype @@ -240,11 +267,14 @@ def _init_cls( group_size, int(lowp_mode), act_quant_mode, + cache_weight_for_large_batch, ) qlinear.weight = qlinear._op_context.get_weight() + qlinear.bias = mod.bias is not None qlinear._lowp_mode = lowp_mode qlinear._act_quant_mode = act_quant_mode qlinear._group_size = group_size + qlinear._cache_weight_for_large_batch = cache_weight_for_large_batch return qlinear @@ -287,6 +317,7 @@ def _init_cls( group_size, lowp_mode, act_quant_mode, + cache_weight_for_large_batch, ): qlinear = cls._init_from_mod(mod, dtype) @@ -302,11 +333,13 @@ def _init_cls( group_size, lowp_mode, act_quant_mode, + cache_weight_for_large_batch, ) qlinear.weight = qlinear._op_context.get_weight() qlinear._lowp_mode = lowp_mode qlinear._act_quant_mode = act_quant_mode qlinear._group_size = group_size + qlinear._cache_weight_for_large_batch = cache_weight_for_large_batch is not None return qlinear diff --git a/intel_extension_for_pytorch/nn/utils/_weight_prepack.py b/intel_extension_for_pytorch/nn/utils/_weight_prepack.py index 604232ecd..d1a344e11 100644 --- a/intel_extension_for_pytorch/nn/utils/_weight_prepack.py +++ b/intel_extension_for_pytorch/nn/utils/_weight_prepack.py @@ -25,6 +25,7 @@ def TPPLinear_weight_prepack(m, bk=None, bc=None, layer_dtype=torch.float32): [0, 2, 3, 1], ) ) + m.weight_for_large_batch = None layer_use_low_prec = layer_dtype != torch.float32 if layer_use_low_prec is True and USE_LOW_PREC_PARAMS: low_prec_vnni_blocking = get_vnni_blocking(layer_dtype) @@ -231,9 +232,26 @@ def __init__(self): super(_IPEXConv3d, self).__init__() +torch.library.define( + "torch_ipex::choose_tpp_linear_weight", + "(Tensor x, Tensor weight, Tensor? 
weight_for_large_batch) -> Tensor", +) + + +@torch.library.impl("torch_ipex::choose_tpp_linear_weight", "cpu") +def choose_tpp_linear_weight(x, weight, weight_for_large_batch): + M = x.numel() // x.size(-1) + return ( + weight_for_large_batch + if weight_for_large_batch is not None and M >= 256 + else weight + ) + + class _IPEXLinear(_IPEXPrepackModule): def __init__(self): super(_IPEXLinear, self).__init__() + self.weight_for_large_batch = None # for LLM large batch/first token inference def maybe_block_params(self): self.weight.block() @@ -262,13 +280,21 @@ def forward(self, x): output = torch.nn.functional.linear(x, self.weight, self.bias) else: x = x.to(self.weight.dtype).contiguous() + weight_for_large_batch = ( + self.weight_for_large_batch + if hasattr(self, "weight_for_large_batch") + else None + ) + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.weight, weight_for_large_batch + ) if self.bias is not None: output = torch.ops.torch_ipex.tpp_linear_bias( - x, self.weight.detach(), self.bias.detach(), self.out_features + x, w.detach(), self.bias.detach(), self.out_features ) else: output = torch.ops.torch_ipex.tpp_linear( - x, self.weight.detach(), self.out_features + x, w.detach(), self.out_features ) else: output = torch.ops.torch_ipex.ipex_MKLSGEMM( diff --git a/intel_extension_for_pytorch/quantization/_qconfig.py b/intel_extension_for_pytorch/quantization/_qconfig.py index ef6990b53..b3cdd8a5a 100644 --- a/intel_extension_for_pytorch/quantization/_qconfig.py +++ b/intel_extension_for_pytorch/quantization/_qconfig.py @@ -110,6 +110,14 @@ class WoqLowpMode(IntEnum): INT8 = 3 +WOQ_LOWP_MODE_TO_STR = { + WoqLowpMode.NONE: "none", + WoqLowpMode.FP16: "fp16", + WoqLowpMode.BF16: "bf16", + WoqLowpMode.INT8: "int8", +} + + class WoqActQuantMode(IntEnum): NONE = -1 PER_TENSOR = 0 @@ -122,6 +130,15 @@ class WoqActQuantMode(IntEnum): PER_BATCH_IC_BLOCK_SYM = 7 +WOQ_ACT_QUANT_MODE_TO_STR = { + WoqActQuantMode.NONE: "none", + WoqActQuantMode.PER_TENSOR: "per_tensor", + WoqActQuantMode.PER_IC_BLOCK: "per_ic_block", + WoqActQuantMode.PER_BATCH: "per_batch", + WoqActQuantMode.PER_BATCH_IC_BLOCK: "per_batch_ic_block", +} + + # Start from 1 to align with kernel class WoqWeightDtype(IntEnum): INT8 = 1 @@ -129,9 +146,23 @@ class WoqWeightDtype(IntEnum): NF4 = 3 +WOQ_DTYPE_TO_STR = { + WoqWeightDtype.INT8: "int8", + WoqWeightDtype.INT4: "int4", + WoqWeightDtype.NF4: "nf4", +} + + QConfigWoq = namedtuple( "QConfigWoq", - [*QConfig._fields, "lowp_mode", "act_quant_mode", "weight_dtype", "group_size"], + [ + *QConfig._fields, + "lowp_mode", + "act_quant_mode", + "weight_dtype", + "group_size", + "cache_weight_for_large_batch", + ], ) @@ -196,6 +227,10 @@ def get_weight_only_quant_qconfig_mapping( assert ( weight_dtype in valid_values ), f"Invalid weight data type for weight only quantization: {weight_dtype}" + + if lowp_mode != WoqLowpMode.INT8: + act_quant_mode = WoqActQuantMode.NONE + _weight_only_quant_qconfig = QConfigWoq( activation=PlaceholderObserver.with_args(dtype=torch.float, is_dynamic=False), weight=PerChannelMinMaxObserver(), @@ -203,6 +238,7 @@ def get_weight_only_quant_qconfig_mapping( act_quant_mode=act_quant_mode, weight_dtype=weight_dtype, group_size=group_size, + cache_weight_for_large_batch=False, ) weight_only_quant_qconfig_mapping = QConfigMapping().set_global( _weight_only_quant_qconfig diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py index 
542455735..f902298ae 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py @@ -31,9 +31,12 @@ def __init__(self, module, tpp=False, woq=False): def forward(self, x): if self.tpp and not self.linear.tpp_fallback: x = x.to(self.dtype).contiguous() + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear.weight, self.linear.weight_for_large_batch + ) return torch.ops.torch_ipex.tpp_linear_silu( x, - self.linear.weight.detach(), + w.detach(), ( self.linear.bias.detach() if self.linear.bias is not None @@ -62,9 +65,12 @@ def __init__(self, module, tpp=False, woq=False): def forward(self, x): if self.tpp and not self.linear.tpp_fallback: x = x.to(self.dtype).contiguous() + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear.weight, self.linear.weight_for_large_batch + ) return torch.ops.torch_ipex.tpp_linear_relu( x, - self.linear.weight.detach(), + w.detach(), ( self.linear.bias.detach() if self.linear.bias is not None @@ -94,10 +100,13 @@ def forward(self, x, y): if self.tpp and not self.linear.tpp_fallback: x = x.to(self.dtype).contiguous() y = y.to(self.dtype).contiguous() + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear.weight, self.linear.weight_for_large_batch + ) return torch.ops.torch_ipex.tpp_linear_mul( x, y, - self.linear.weight.detach(), + w.detach(), ( self.linear.bias.detach() if self.linear.bias is not None @@ -128,10 +137,13 @@ def forward(self, x, y): if self.tpp and not self.linear.tpp_fallback: x = x.to(self.dtype).contiguous() y = y.to(self.dtype).contiguous() + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear.weight, self.linear.weight_for_large_batch + ) return torch.ops.torch_ipex.tpp_linear_add( x, y, - self.linear.weight.detach(), + w.detach(), ( self.linear.bias.detach() if self.linear.bias is not None @@ -164,11 +176,14 @@ def forward(self, x, y, z): x = x.to(self.dtype).contiguous() y = y.to(self.dtype).contiguous() z = z.to(self.dtype).contiguous() + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear.weight, self.linear.weight_for_large_batch + ) return torch.ops.torch_ipex.tpp_linear_add_add( x, y, z, - self.linear.weight.detach(), + w.detach(), ( self.linear.bias.detach() if self.linear.bias is not None @@ -199,9 +214,12 @@ def __init__(self, module, tpp=False, woq=False): def forward(self, x): if self.tpp and not self.linear.tpp_fallback: x = x.to(self.dtype).contiguous() + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear.weight, self.linear.weight_for_large_batch + ) return torch.ops.torch_ipex.tpp_linear_gelu( x, - self.linear.weight.detach(), + w.detach(), ( self.linear.bias.detach() if self.linear.bias is not None @@ -241,9 +259,12 @@ def __init__(self, module, tpp=False, woq=False): def forward(self, x): if self.tpp and not self.linear.tpp_fallback: x = x.to(self.dtype).contiguous() + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear.weight, self.linear.weight_for_large_batch + ) return torch.ops.torch_ipex.tpp_linear_gelu( x, - self.linear.weight.detach(), + w.detach(), ( self.linear.bias.detach() if self.linear.bias is not None @@ -302,12 +323,23 @@ def __init__(self, module, tpp=False, woq=False): lowp_mode = self.linear_list[0]._lowp_mode act_quant_mode = self.linear_list[0]._act_quant_mode group_size = self.linear_list[0]._group_size + cache_weight_for_large_batch = self.linear_list[ + 0 + 
]._cache_weight_for_large_batch qconfig_mapping = get_weight_only_quant_qconfig_mapping( weight_dtype=w_dtype, lowp_mode=lowp_mode, act_quant_mode=act_quant_mode, group_size=group_size, ) + if cache_weight_for_large_batch: + from intel_extension_for_pytorch.utils.weight_only_quantization import ( + _woq_enable_weight_cache_for_large_batch, + ) + + qconfig_mapping = _woq_enable_weight_cache_for_large_batch( + qconfig_mapping + ) qconfig = qconfig_mapping.global_qconfig for i in range(self.num_concat): linear = self.linear_list[i] @@ -425,15 +457,21 @@ def forward(self, x): and not self.linear_m.tpp_fallback ): x = x.to(self.dtype).contiguous() + w_s = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear_s.weight, self.linear_s.weight_for_large_batch + ) + w_m = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear_m.weight, self.linear_m.weight_for_large_batch + ) return torch.ops.torch_ipex.tpp_fused_gate_up_proj( x, - self.linear_s.weight.detach(), + w_s.detach(), ( self.linear_s.bias.detach() if self.linear_s.bias is not None else x.new_empty(0) ), - self.linear_m.weight.detach(), + w_m.detach(), ( self.linear_m.bias.detach() if self.linear_m.bias is not None @@ -471,9 +509,12 @@ def __init__(self, module, tpp=False, woq=False): def forward(self, x, y): if self.tpp and not self.linear.tpp_fallback: x = x.to(self.dtype).contiguous() + w = torch.ops.torch_ipex.choose_tpp_linear_weight( + x, self.linear.weight, self.linear.weight_for_large_batch + ) x1 = torch.ops.torch_ipex.tpp_linear_silu( x, - self.linear.weight.detach(), + w.detach(), ( self.linear.bias.detach() if self.linear.bias is not None diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index f8dd55f37..9951374e7 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -2416,14 +2416,12 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): torch.nn.Linear, WeightOnlyQuantizedLinear, ] - try: - import deepspeed + from intel_extension_for_pytorch.nn.utils._weight_prepack import ( + may_import_deepspeed_modules, + ) - supported_linear_types.append( - deepspeed.module_inject.layers.LinearLayer - ) - except ImportError: - pass + ds_modules = may_import_deepspeed_modules() + supported_linear_types.extend(ds_modules) supported_linear_types = tuple(supported_linear_types) if ( hasattr(module, "q_proj") diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 79793006c..137f82a95 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -10,6 +10,7 @@ import intel_extension_for_pytorch as ipex from ..utils.weight_only_quantization import ( _is_woq_qconfig, + _woq_enable_weight_cache_for_large_batch, _convert_woq_with_low_precision_checkpoint, ) @@ -1121,6 +1122,60 @@ def ipex_quantization_flow( return convert_model +def attach_extra_weight_for_large_batch_inference(model): + # Traverse the entire model and attch extra bf16 weight to linear + assert _using_tpp() + from intel_extension_for_pytorch.nn.utils._weight_prepack import ( + _IPEXLinear, + ) + + def _pack_weight_for_large_batch(weight): + assert weight.dim() == 2, "Expected 2D weight to pack, but got {}D".format( + weight.dim() + ) + + def block_weight(weight, 
Nb, Kb): + N = weight.size(0) + K = weight.size(1) + return ( + weight.reshape((N // Nb, Nb, K // Kb, Kb // 2, 2)) + .permute((0, 2, 3, 1, 4)) + .contiguous() + .to(torch.bfloat16) + ) + + if weight.size(0) % 100 == 0 and weight.size(1) % 64 == 0: + return block_weight(weight, 100, 64) + elif weight.size(0) % 64 == 0 and weight.size(1) % 64 == 0: + return block_weight(weight, 64, 64) + else: + return None + + def _unpack_blocked_weight(weight): + assert weight.dim() == 5, "Expected 5D weight to unpack, but got {}D".format( + weight.dim() + ) + N = weight.size(0) * weight.size(3) + return weight.permute((0, 3, 1, 2, 4)).contiguous().reshape((N, -1)) + + def _convert(mod, attr_name): + if isinstance(mod, _IPEXLinear): + weight = mod.weight.data + unblocked_weight = _unpack_blocked_weight(weight) + extra_weight = _pack_weight_for_large_batch(unblocked_weight) + mod.weight_for_large_batch = extra_weight + return mod + + mod_new = mod + + for name, child in mod.named_children(): + attr = attr_name + "." + name if attr_name != "" else name + setattr(mod_new, name, _convert(child, attr)) + return mod_new + + return _convert(model, "") + + def model_convert_lowering( _model, device, @@ -1129,6 +1184,7 @@ def model_convert_lowering( deployment_mode, is_quantization=False, woq=False, + cache_weight_for_large_batch=False, ): from .models.reference.modules.attentions import _IPEXAttentionRef from .models.reference.modules.decoder import _IPEXDecoderLayerRef @@ -1167,6 +1223,8 @@ def model_convert_lowering( elif dtype is torch.bfloat16: _enable_tpp() _model = ipex.optimize(_model.eval(), dtype=dtype, inplace=True) + if cache_weight_for_large_batch: + _model = attach_extra_weight_for_large_batch_inference(_model) if not is_quantization or woq: import transformers @@ -1327,6 +1385,7 @@ def optimize( low_precision_checkpoint=None, sample_inputs=None, deployment_mode=True, + cache_weight_for_large_batch=False, ): r""" Apply optimizations at Python frontend to the given transformers model (nn.Module). @@ -1369,6 +1428,10 @@ def optimize( Default value is ``None``, and for well supported model, we provide this sample inputs automaticlly. deployment_mode (bool): Whether to apply the optimized model for deployment of model generation. It means there is no need to further apply optimization like torchscirpt. Default value is ``True``. + cache_weight_for_large_batch (bool): Whether to cache the dedicated weight for large batch to speed up + its inference (e.g., prefill phase) with extra memory usage. It is only valid for non-quantization cases + where dtype = bfloat16 and weight-only quantization cases where lowp-mode=BF16. In other cases, an error + will be raised. Default value is ``False``. Returns: Optimized model object for model.generate(), also workable with model.forward @@ -1482,6 +1545,15 @@ def optimize( if _is_woq_qconfig(quantization_config): is_woq = True + if cache_weight_for_large_batch: + assert ( + is_woq or dtype == torch.bfloat16 + ), "cache_weight_for_large_batch is only valid for WOQ or BF16 cases" + if is_woq: + quantization_config = _woq_enable_weight_cache_for_large_batch( + quantization_config + ) + # Load low precision checkpoint (generated by GPTQ, etc.) 
for WOQ before any conversion if device == "cpu" and is_woq and low_precision_checkpoint is not None: state_dict, config = None, None @@ -1605,6 +1677,7 @@ def optimize( deployment_mode, is_quantization, is_woq, + cache_weight_for_large_batch, ) # do not register output hook when doing calibration in static int8 if not (is_quantization and not is_woq and qconfig_summary_file is None): diff --git a/intel_extension_for_pytorch/utils/weight_only_quantization.py b/intel_extension_for_pytorch/utils/weight_only_quantization.py index c5c10f33c..5f45082cd 100644 --- a/intel_extension_for_pytorch/utils/weight_only_quantization.py +++ b/intel_extension_for_pytorch/utils/weight_only_quantization.py @@ -1,6 +1,7 @@ import copy import torch from intel_extension_for_pytorch.nn.modules import WeightOnlyQuantizedLinear +from intel_extension_for_pytorch.quantization import QConfigWoq, WoqLowpMode from torch.ao.quantization import PlaceholderObserver, QConfigMapping # The config describes how to load low precision checkpoint for weight only quantization. @@ -39,6 +40,23 @@ def _is_woq_qconfig(qconfig_mapping): ) +def _woq_enable_weight_cache_for_large_batch(qconfig_mapping): + qconfig = ( + qconfig_mapping.global_qconfig + if isinstance(qconfig_mapping, QConfigMapping) + else qconfig_mapping + ) + assert ( + qconfig.lowp_mode == WoqLowpMode.BF16 + ), "Weight cache is only supported for lowp-mode=BF16" + qconfig_dict = qconfig._asdict() + qconfig_dict["cache_weight_for_large_batch"] = True + if isinstance(qconfig_mapping, QConfigMapping): + qconfig_mapping.set_global(QConfigWoq(**qconfig_dict)) + return qconfig_mapping + return QConfigWoq(**qconfig_dict) + + def _default_lowp_checkpoint_config(): return DEFAULT_LOWP_CHECKPOINT_CONFIG diff --git a/tests/cpu/test_ipex_optimize_transformers.py b/tests/cpu/test_ipex_optimize_transformers.py index c11b7e93f..63ba64436 100644 --- a/tests/cpu/test_ipex_optimize_transformers.py +++ b/tests/cpu/test_ipex_optimize_transformers.py @@ -30,8 +30,8 @@ curpath = os.path.abspath(os.path.dirname(__file__)) -def _get_gptj_example_inputs(): - input_ids = torch.ones(8).to(torch.long) +def _get_gptj_example_inputs(batch_size=8): + input_ids = torch.ones(batch_size).to(torch.long) attention_mask = torch.ones(len(input_ids)) position_ids = torch.arange(len(input_ids)) past_key_values = tuple( @@ -249,6 +249,50 @@ def test_weight_only_quant_flow_for_llama(self): m = transformers.models.llama.modeling_llama.LlamaForCausalLM(config).eval() self._model_replacement_check_woq(m) + def test_weight_only_quant_cache_weight_for_large_batch(self): + config = AutoConfig.from_pretrained( + f"{curpath}/hf_configs/gptj", return_dict=False + ) + model = transformers.models.gptj.modeling_gptj.GPTJForCausalLM(config).eval() + + for weight_dtype in [ + ipex.quantization.WoqWeightDtype.INT8, + ipex.quantization.WoqWeightDtype.INT4, + ipex.quantization.WoqWeightDtype.NF4, + ]: + qconfig_mapping = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=ipex.quantization.WoqLowpMode.BF16, + ) + model_ref = ipex.llm.optimize( + copy.deepcopy(model), + dtype=torch.bfloat16, + quantization_config=qconfig_mapping, + deployment_mode=True, + cache_weight_for_large_batch=False, + ) + model = ipex.llm.optimize( + model, + dtype=torch.bfloat16, + quantization_config=qconfig_mapping, + deployment_mode=True, + cache_weight_for_large_batch=True, + ) + linear_list = [ + model.transformer.h[0].attn.concat_qkv.concat_linear, + model.transformer.h[0].attn.out_proj, + 
model.transformer.h[0].linear_add_add.linear, + model.transformer.h[0].linear_gelu.linear, + ] + with torch.no_grad(), torch.cpu.amp.autocast(enabled=True): + example_inputs = _get_gptj_example_inputs(batch_size=128) + y = model(*example_inputs) + y_ref = model_ref(*example_inputs) + assert all( + l._op_context.get_cached_weight() is not None for l in linear_list + ) + self.assertEqual(y[0], y_ref[0], prec=5e-2) + def test_static_quant_flow(self): config = AutoConfig.from_pretrained( f"{curpath}/hf_configs/gptj", return_dict=False @@ -451,6 +495,38 @@ def test_generate_functions(self): ref_res = ref_m.generate(input_ids, **generate_kwargs) self.assertEqual(ipex_res, ref_res) + def test_cache_weight_for_large_batch(self): + config = AutoConfig.from_pretrained( + f"{curpath}/hf_configs/gptj", return_dict=False + ) + model = transformers.models.gptj.modeling_gptj.GPTJForCausalLM(config).eval() + model_ref = ipex.llm.optimize( + copy.deepcopy(model), + dtype=torch.bfloat16, + deployment_mode=True, + cache_weight_for_large_batch=False, + ) + + model = ipex.llm.optimize( + model, + dtype=torch.bfloat16, + deployment_mode=True, + cache_weight_for_large_batch=True, + ) + linear_list = [ + model.transformer.h[0].attn.concat_qkv.concat_linear, + model.transformer.h[0].attn.out_proj, + model.transformer.h[0].linear_add_add.linear, + model.transformer.h[0].linear_gelu.linear, + ] + with torch.no_grad(), torch.cpu.amp.autocast(enabled=True): + example_inputs = _get_gptj_example_inputs(batch_size=512) + y = model(*example_inputs) + y_ref = model_ref(*example_inputs) + assert all(hasattr(l, "weight_for_large_batch") for l in linear_list) + assert all(l.weight_for_large_batch is not None for l in linear_list) + self.assertEqual(y[0], y_ref[0]) + if __name__ == "__main__": test = unittest.main() diff --git a/tests/cpu/test_quantization_default_recipe.py b/tests/cpu/test_quantization_default_recipe.py index 5ab33059b..da459d476 100644 --- a/tests/cpu/test_quantization_default_recipe.py +++ b/tests/cpu/test_quantization_default_recipe.py @@ -1793,6 +1793,57 @@ def forward(self, x): # Dequantized weights should be close torch.testing.assert_close(dqw, dqw_2) + def test_weight_only_quantization_weight_for_first_token(self): + class M(nn.Module): + def __init__(self, input_channel, output_channel, has_bias): + super(M, self).__init__() + self.linear = torch.nn.Linear(input_channel, output_channel, has_bias) + + def forward(self, x): + return self.linear(x) + + def test(feature, has_bias, w_dtype): + model = M(feature[1], feature[2], has_bias) + m = model.to(torch.bfloat16).eval() + data = torch.rand(feature[0], feature[1]) + + qconfig_ref = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=w_dtype, + lowp_mode=ipex.quantization.WoqLowpMode.BF16, + ) + prepared_model_ref = prepare( + m, qconfig_ref, example_inputs=data, inplace=False + ) + from intel_extension_for_pytorch.utils.weight_only_quantization import ( + _woq_enable_weight_cache_for_large_batch, + ) + + qconfig = _woq_enable_weight_cache_for_large_batch(qconfig_ref) + prepared_model = prepare(m, qconfig, example_inputs=data, inplace=False) + + with torch.no_grad(), torch.autocast( + device_type="cpu", enabled=True, dtype=torch.bfloat16 + ): + woq_model_ref = convert(prepared_model_ref) + woq_model_ref = torch.jit.trace(woq_model_ref, data) + woq_model_ref = torch.jit.freeze(woq_model_ref) + woq_model = convert(prepared_model) + woq_model = torch.jit.trace(woq_model, data) + woq_model = torch.jit.freeze(woq_model) + out_ref = 
woq_model_ref(data).bfloat16() + out = woq_model(data).bfloat16() + torch.testing.assert_close(out_ref, out, atol=1.5e-4, rtol=1.6e-2) + + shape_list = [ + [196, 4096, 4096], + [1024, 512, 512], + ] + use_bias_list = [True, False] + w_dtype_list = [WoqWeightDtype.INT8, WoqWeightDtype.INT4] + cases = itertools.product(shape_list, use_bias_list, w_dtype_list) + for shape, use_bias, w_dtype in cases: + test(shape, use_bias, w_dtype) + class QuantizedOpsTester(TestCase): def test_matmul_i8i8i32(self): From 620a9bfd9db42813931a857e78fa3f5d298be200 Mon Sep 17 00:00:00 2001 From: Xuan Liao Date: Wed, 19 Jun 2024 08:04:31 +0800 Subject: [PATCH 123/199] [flash attention] fix bugs for attention mask (#2987) --- csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp | 266 +++++++++++++------ tests/cpu/test_cpu_ops.py | 124 +++++++-- 2 files changed, 287 insertions(+), 103 deletions(-) diff --git a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp index d409c6667..d3252f77f 100644 --- a/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/FlashAttentionKrnl.cpp @@ -283,76 +283,52 @@ inline Vectorized exp_u20(Vectorized data) { #endif // out = val * a + b -template +// is_b_stride_zero: If the stride of b is 0 (mask broadcasting case), +// take b as a scalar pointer. +template inline void _scale_attn_mask_fusion_kernel( T1* a, T2* b, const int& size, T1* out, T1& val) { - auto vec_size = at::vec::Vectorized::size(); - auto vec_scale = at::vec::Vectorized(val); - for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { - auto tmp0 = at::vec::Vectorized::loadu(a + i); - auto tmp1 = at::vec::Vectorized::loadu(b + i); - auto tmp2 = at::vec::convert(tmp1); - auto tmp3 = tmp0 * vec_scale + tmp2; - _store(out + i, tmp3); - } - for (long i = vec_size * (size / vec_size); i < size; i++) { - auto tmp0 = a[i]; - auto tmp1 = (T1)b[i]; - out[i] = tmp0 * val + tmp1; - } -} - -// out = val * a + b -template -inline void _scale_attn_mask_fusion_kernel( - T1* a, - T1* b, - const int& size, - T1* out, - T1& val) { - auto vec_size = at::vec::Vectorized::size(); - auto vec_scale = at::vec::Vectorized(val); - for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { - auto tmp0 = at::vec::Vectorized::loadu(a + i); - auto tmp1 = at::vec::Vectorized::loadu(b + i); - auto tmp2 = tmp0 * vec_scale + tmp1; - _store(out + i, tmp2); - } - for (long i = vec_size * (size / vec_size); i < size; i++) { - auto tmp0 = a[i]; - auto tmp1 = b[i]; - out[i] = tmp0 * val + tmp1; - } -} - -// out = b ? val * a : -inf -template -inline void _scale_attn_mask_fusion_kernel( - T1* a, - bool* b, - const int& size, - T1* out, - T1& val) { - auto vec_size = at::vec::Vectorized::size(); - auto vec_scale = at::vec::Vectorized(val); - auto neg_inf = -std::numeric_limits::infinity(); - auto vec_neg_inf = at::vec::Vectorized(neg_inf); - for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { - auto tmp0 = at::vec::Vectorized::loadu(a + i); - auto tmp1 = at::vec::Vectorized::loadu(b + i); - auto tmp2 = at::vec::convert(tmp1); - auto tmp3 = - at::vec::Vectorized::blendv(vec_neg_inf, tmp0 * vec_scale, tmp2); - _store(out + i, tmp3); - } - for (long i = vec_size * (size / vec_size); i < size; i++) { - auto tmp0 = a[i]; - auto tmp1 = b[i]; - out[i] = tmp1 ? tmp0 * val : neg_inf; + const auto vec_size1 = at::vec::Vectorized::size(); + const auto vec_size2 = at::vec::Vectorized::size(); + constexpr int64_t T1_n = + (vec_size2 == vec_size1 * 2 && is_reduced_floating_point_v) ? 
2 : 1; + constexpr int64_t T2_n = 1; + auto vec_scale = at::vec::VectorizedN(val); + int64_t i = 0; + if (is_b_stride_zero) { + auto b_first_val = (T1)b[0]; + auto b_first_vec = at::vec::VectorizedN(b_first_val); + for (; i < size - (size % vec_size2); i += vec_size2) { + auto a_n = at::vec::VectorizedN::loadu(a + i); + auto b_n = b_first_vec; + at::vec::VectorizedN b_n_convert = + at::vec::convert(b_n); + auto res = a_n * vec_scale + b_n_convert; + res.store(out + i); + } + for (; i < size; i++) { + auto tmp0 = a[i]; + auto tmp1 = b_first_val; + out[i] = tmp0 * val + tmp1; + } + } else { + for (; i < size - (size % vec_size2); i += vec_size2) { + auto a_n = at::vec::VectorizedN::loadu(a + i); + auto b_n = at::vec::VectorizedN::loadu(b + i); + at::vec::VectorizedN b_n_convert = + at::vec::convert(b_n); + auto res = a_n * vec_scale + b_n_convert; + res.store(out + i); + } + for (; i < size; i++) { + auto tmp0 = a[i]; + auto tmp1 = (T1)b[i]; + out[i] = tmp0 * val + tmp1; + } } } @@ -425,6 +401,82 @@ inline void _mul_reduce_max_fusion_kernel( vec_tmp_max)); } +// This function is used to produce an attn_mask in a standard format +inline std::optional convert_boolean_attn_mask( + const std::optional& attn_mask, + caffe2::TypeMeta dtype) { + // Pass through + if (!attn_mask.has_value()) { + return c10::nullopt; + } + // Convert boolean mask to additive mask + if (attn_mask->dtype() == at::kBool) { + auto new_attn_mask = at::zeros_like(attn_mask.value(), dtype); + new_attn_mask.masked_fill_( + attn_mask->logical_not(), -std::numeric_limits::infinity()); + return new_attn_mask; + } + // Otherwise, attn_mask represents an additive attention tensor + return attn_mask; +} + +// Support mask shapes: +// 2d: ({Q_seq_len, 1} x {KV_seq_len, 1}) +// 4d: ({Batch, 1} x {Num_heads, 1} x {Q_seq_len, 1} x {KV_seq_len, 1}) +inline bool check_attn_mask_shape( + at::Tensor& attn_mask, + int64_t batchSize, + int64_t num_head, + int64_t qSize, + int64_t kvSize) { + if (attn_mask.size(-2) != qSize && attn_mask.size(-2) != 1) { + return false; + } + if (attn_mask.size(-1) != kvSize && attn_mask.size(-1) != 1) { + return false; + } + if (attn_mask.dim() == 2) { + return true; + } else if (attn_mask.dim() == 4) { + if ((attn_mask.size(0) == 1 || attn_mask.size(0) == batchSize) && + (attn_mask.size(1) == 1 || attn_mask.size(1) == num_head)) { + return true; + } + } + return false; +} + +// Reshape attention mask to 4d +inline void reshape_attn_mask_to_4d( + at::Tensor& attn_mask, + int64_t batchSize, + int64_t num_head, + int64_t qSize, + int64_t kvSize) { + TORCH_CHECK( + check_attn_mask_shape(attn_mask, batchSize, num_head, qSize, kvSize), + "IPEX flash_attention: Please use the following attn mask shapes: ", + "2d - ({Q_seq_len, 1} x {KV_seq_len, 1}); ", + "4d - ({Batch, 1} x {Num_heads, 1} x {Q_seq_len, 1} x {KV_seq_len, 1})"); + int64_t attn_mask_size_0 = 1; + int64_t attn_mask_size_1 = 1; + if (attn_mask.dim() == 4) { + if (attn_mask.size(0) == batchSize) { + attn_mask_size_0 = batchSize; + } + if (attn_mask.size(1) == num_head) { + attn_mask_size_1 = num_head; + } + } + attn_mask = attn_mask + .view( + {attn_mask_size_0, + attn_mask_size_1, + attn_mask.size(-2), + attn_mask.size(-1)}) + .expand({attn_mask_size_0, attn_mask_size_1, qSize, kvSize}); +} + /* *Caculate the flash attention SDPA. 
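 *The optional attention mask is expected in additive form (boolean masks are
 *converted to additive -inf masks by the caller); it is reshaped/broadcast to
 *4d {Batch or 1, Num_heads or 1, Q_seq_len, KV_seq_len} and applied row-wise
 *as qk * scaling_factor + mask before the softmax.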
*@template scalar_t: q/k/v data type @@ -480,6 +532,12 @@ cpu_flash_attention( int64_t num_head = query.size(2); int64_t headSize = query.size(3); + // reshape mask + if (attention_mask.has_value()) { + reshape_attn_mask_to_4d( + attention_mask.value(), batchSize, num_head, qSize, kvSize); + } + // Strides int64_t qStrideB = query.stride(0); int64_t qStrideM = query.stride(1); @@ -505,7 +563,13 @@ cpu_flash_attention( ? attention_mask.value().stride(1) : 0; int64_t mStrideM = - attention_mask.has_value() ? attention_mask.value().stride(2) : 0; + (attention_mask.has_value() && attention_mask.value().size(2) > 1) + ? attention_mask.value().stride(2) + : 0; + int64_t mStrideN = + (attention_mask.has_value() && attention_mask.value().size(3) > 1) + ? attention_mask.value().stride(3) + : 0; int64_t qSplitSize = q_split_size > qSize ? qSize : q_split_size; int64_t kvSplitSize = kv_split_size > kvSize ? kvSize : kv_split_size; @@ -596,15 +660,24 @@ cpu_flash_attention( // And apply scaling factor if (attention_mask.has_value()) { for (int64_t row = 0; row < qBlockSize; ++row) { - // qk <- attn_mask ? qk : -inf, if attn_mask is bool - // qk <- qk + attn_mask, else - _scale_attn_mask_fusion_kernel( - qk_data + row * kvBlockSize, - mask_data + i * mStrideB + j * mStrideH + - (m + row) * mStrideM + n, - kvBlockSize, - qk_data + row * kvBlockSize, - scaling_factor); + // qk <- qk * scaling_factor + attn_mask, else + if (mStrideN == 0) { + _scale_attn_mask_fusion_kernel( + qk_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM, + kvBlockSize, + qk_data + row * kvBlockSize, + scaling_factor); + } else { + _scale_attn_mask_fusion_kernel( + qk_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM + n, + kvBlockSize, + qk_data + row * kvBlockSize, + scaling_factor); + } } } // Update coefficients with Softmax @@ -737,6 +810,12 @@ cpu_flash_attention( int64_t num_head = query.size(2); int64_t headSize = query.size(3); + // reshape mask + if (attention_mask.has_value()) { + reshape_attn_mask_to_4d( + attention_mask.value(), batchSize, num_head, qSize, kvSize); + } + // Strides int64_t qStrideB = query.stride(0); int64_t qStrideM = query.stride(1); @@ -762,7 +841,13 @@ cpu_flash_attention( ? attention_mask.value().stride(1) : 0; int64_t mStrideM = - attention_mask.has_value() ? attention_mask.value().stride(2) : 0; + (attention_mask.has_value() && attention_mask.value().size(2) > 1) + ? attention_mask.value().stride(2) + : 0; + int64_t mStrideN = + (attention_mask.has_value() && attention_mask.value().size(3) > 1) + ? attention_mask.value().stride(3) + : 0; int64_t qSplitSize = q_split_size > qSize ? qSize : q_split_size; int64_t kvSplitSize = kv_split_size > kvSize ? kvSize : kv_split_size; @@ -1241,15 +1326,24 @@ cpu_flash_attention( // And apply scaling factor if (attention_mask.has_value()) { for (int64_t row = 0; row < qBlockSize; ++row) { - // qk <- attn_mask ? 
qk : -inf, if attn_mask is bool - // qk <- qk + attn_mask, else - _scale_attn_mask_fusion_kernel( - qk_data + row * kvBlockSize, - mask_data + i * mStrideB + j * mStrideH + - (m + row) * mStrideM + n, - kvBlockSize, - qk_data + row * kvBlockSize, - scaling_factor); + // qk <- qk * scaling_factor + attn_mask, else + if (mStrideN == 0) { + _scale_attn_mask_fusion_kernel( + qk_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM, + kvBlockSize, + qk_data + row * kvBlockSize, + scaling_factor); + } else { + _scale_attn_mask_fusion_kernel( + qk_data + row * kvBlockSize, + mask_data + i * mStrideB + j * mStrideH + + (m + row) * mStrideM + n, + kvBlockSize, + qk_data + row * kvBlockSize, + scaling_factor); + } } } // Update coefficients with Softmax @@ -1558,6 +1652,8 @@ std::tuple flash_attention_kernel( attention_mask.value().stride(-1) == 1), "IPEX flash_attention: Q/K/V/Mask should be continuous on the last dim"); + std::optional attn_mask = + convert_boolean_attn_mask(attention_mask, query.dtype()); at::Tensor output = at::empty({batchSize, qSize, num_head, headSize}, query.options()); const auto accumulate_dtype = at::toOpMathType(dtype); @@ -1572,7 +1668,7 @@ std::tuple flash_attention_kernel( value, dropout_p, is_causal, - attention_mask, + attn_mask, scale); output = output.transpose(1, 2); diff --git a/tests/cpu/test_cpu_ops.py b/tests/cpu/test_cpu_ops.py index c9fcd0f0f..f8491f00c 100644 --- a/tests/cpu/test_cpu_ops.py +++ b/tests/cpu/test_cpu_ops.py @@ -1402,14 +1402,10 @@ def test_cat(self): self.assertTrue(y7.size() == torch.Size([8, 2])) self.assertTrue(y7.dtype == datatype) - def test_flash_attention(self): + def test_flash_attention_without_mask(self): dtypes = [torch.float, torch.double, torch.bfloat16, torch.float16] for dtype in dtypes: - for causal, has_attention_mask in [ - [False, False], - [True, False], - [False, True], - ]: + for causal in [True, False]: for batch_size, seq_len, n_head, head_dim in itertools.product( [2, 12], [1, 129, 267, 533, 1030], [1, 3, 4], [7, 8, 16] ): @@ -1447,30 +1443,18 @@ def test_flash_attention(self): q2 = q2.view(batch_size, seq_len, n_head, head_dim).transpose(1, 2) v2 = v2.view(batch_size, seq_len, n_head, head_dim).transpose(1, 2) - mask = ( - torch.randn( - (batch_size, 1, seq_len, seq_len), - device="cpu", - dtype=dtype, - requires_grad=False, - ) - if has_attention_mask - else None - ) actual = torch.ops.torch_ipex.flash_attention( q, k, v, dropout_p=0.0, is_causal=causal, - attention_mask=mask, )[0] math_ref = ( torch._scaled_dot_product_attention_math( q2, k2, v2, - attn_mask=mask, dropout_p=0.0, is_causal=causal, ) @@ -1480,6 +1464,110 @@ def test_flash_attention(self): math_ref = math_ref.to(dtype) torch.testing.assert_close(actual, math_ref, atol=atol, rtol=rtol) + def test_flash_attention_with_mask(self): + dtypes = [torch.float, torch.double, torch.bfloat16, torch.float16] + for dtype in dtypes: + for mask_dim in [2, 4]: + batch_size, seq_len, n_head, head_dim = 2, 129, 4, 8 + atol = 1e-5 + rtol = 5e-6 + if dtype is torch.bfloat16: + atol = 2e-2 + rtol = 2e-2 + if dtype is torch.float16: + atol = 1e-2 + rtol = 1e-2 + attn_mask_dtypes = ( + [dtype, torch.bool, torch.float] + if dtype in [torch.bfloat16, torch.float16] + else [dtype, torch.bool] + ) + for attn_mask_dtype in attn_mask_dtypes: + for attn_mask_shape in ( + itertools.product([seq_len, 1], [seq_len, 1]) + if mask_dim == 2 + else itertools.product( + [batch_size, 1], [n_head, 1], [seq_len, 1], [seq_len, 1] + ) + ): + n_embd = 
n_head * head_dim + x = torch.randn( + (batch_size, seq_len, 3 * n_head * head_dim), + device="cpu", + dtype=dtype, + requires_grad=False, + ) + x2 = x.clone() + + q, k, v = x.split(n_embd, dim=2) + q2, k2, v2 = x2.split(n_embd, dim=2) + + if dtype in [torch.bfloat16, torch.float16]: + q2 = q2.float() + k2 = k2.float() + v2 = v2.float() + + # (B, nh, T, hs) + k = k.view(batch_size, seq_len, n_head, head_dim).transpose( + 1, 2 + ) + q = q.view(batch_size, seq_len, n_head, head_dim).transpose( + 1, 2 + ) + v = v.view(batch_size, seq_len, n_head, head_dim).transpose( + 1, 2 + ) + k2 = k2.view(batch_size, seq_len, n_head, head_dim).transpose( + 1, 2 + ) + q2 = q2.view(batch_size, seq_len, n_head, head_dim).transpose( + 1, 2 + ) + v2 = v2.view(batch_size, seq_len, n_head, head_dim).transpose( + 1, 2 + ) + + if attn_mask_dtype == torch.bool: + mask = torch.ones( + attn_mask_shape, + dtype=torch.bool, + device="cpu", + requires_grad=False, + ).tril(diagonal=0) + # _scaled_dot_product_attention_math does the type conversion outside + mask2 = torch.zeros_like(mask, dtype=dtype) + mask2[mask == False] = -float("inf") # noqa: E712 + else: + mask = torch.randn( + attn_mask_shape, + dtype=attn_mask_dtype, + device="cpu", + requires_grad=False, + ) + mask2 = mask + actual = torch.ops.torch_ipex.flash_attention( + q, + k, + v, + dropout_p=0.0, + attention_mask=mask, + )[0] + math_ref = ( + torch._scaled_dot_product_attention_math( + q2, + k2, + v2, + attn_mask=mask2, + dropout_p=0.0, + ) + )[0] + + if dtype in [torch.bfloat16, torch.float16]: + math_ref = math_ref.to(dtype) + torch.testing.assert_close( + actual, math_ref, atol=atol, rtol=rtol + ) + def test_flash_attention_stride0(self): input_shape = ( 1, From a6d3a14c49e6d018381a72ce59af239330c323ff Mon Sep 17 00:00:00 2001 From: shiyang-weng Date: Wed, 19 Jun 2024 15:52:07 +0800 Subject: [PATCH 124/199] Wengshiy/rotary embedding (#2962) * naive implementation * support f32 vectorized * support bf16 vectorized * improve code style * remove overhead for cos and sin loading for bfloat16 * improve code style; add comment --------- Co-authored-by: Zhang, Liangang Co-authored-by: WeizhuoZhang-intel --- .../kernels/RotaryPositionEmbeddingKnl.cpp | 296 ++++++++++++++++-- 1 file changed, 278 insertions(+), 18 deletions(-) diff --git a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp index 4112dd022..a416eeb11 100644 --- a/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp +++ b/csrc/cpu/aten/kernels/RotaryPositionEmbeddingKnl.cpp @@ -43,6 +43,274 @@ bool is_fused_qkv(at::Tensor& t_in, int64_t hidden_size) { * @param rotary_dim The rotary dimension. * @return A tuple containing the query, key, and value tensors. */ + +template +inline void apply_rotary_embedding( + const scalar_t* __restrict__ arr, + const float* __restrict__ cos_ptr, + const float* __restrict__ sin_ptr, + scalar_t* __restrict__ out, + int embed_dim) { + using Vec = Vectorized; + const int kVecSize = Vec::size(); + const int len = embed_dim - (embed_dim % kVecSize); + + // GPT-J style rotary embedding. + // format: {d, 2}, stride-2 access need permute to be vectorized. 
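+  // Scalar semantics of the rotation below: for each pair index d,
+  //   x = arr[2*d], y = arr[2*d + 1]
+  //   out[2*d]     = x * cos_ptr[d] - y * sin_ptr[d]
+  //   out[2*d + 1] = y * cos_ptr[d] + x * sin_ptr[d]
+  // The vectorized loop deinterleaves the {x, y} pairs, rotates them with the
+  // precomputed cos/sin values, then re-interleaves before storing.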
+ int d = 0;
+ for (; d < len; d += kVecSize) {
+ Vec x = Vec::loadu(arr + 2 * d + 0 * kVecSize);
+ Vec y = Vec::loadu(arr + 2 * d + 1 * kVecSize);
+ Vec cos = Vec::loadu(cos_ptr + d);
+ Vec sin = Vec::loadu(sin_ptr + d);
+ // x: {x0, y0, x1, y1, x2, y2, x3, y3}
+ // y: {x4, y4, x5, y5, x6, y6, x7, y7}
+ // x1: {x0, x1, x2, x3, x4, x5, x6, x7}
+ // y1: {y0, y1, y2, y3, y4, y5, y6, y7}
+ auto xy = deinterleave2(x, y);
+ Vec x1 = std::get<0>(xy);
+ Vec y1 = std::get<1>(xy);
+ Vec x2 = x1 * cos - y1 * sin;
+ Vec y2 = y1 * cos + x1 * sin;
+ // x2: {x0, x1, x2, x3, x4, x5, x6, x7}
+ // y2: {y0, y1, y2, y3, y4, y5, y6, y7}
+ // x_out: {x0, y0, x1, y1, x2, y2, x3, y3}
+ // y_out: {x4, y4, x5, y5, x6, y6, x7, y7}
+ xy = interleave2(x2, y2);
+ Vec x_out = std::get<0>(xy);
+ Vec y_out = std::get<1>(xy);
+ x_out.store(out + 2 * d + 0 * kVecSize);
+ y_out.store(out + 2 * d + 1 * kVecSize);
+ }
+ for (; d < embed_dim; d++) {
+ scalar_t x = arr[2 * d + 0];
+ scalar_t y = arr[2 * d + 1];
+ scalar_t x_out = x * cos_ptr[d] - y * sin_ptr[d];
+ scalar_t y_out = y * cos_ptr[d] + x * sin_ptr[d];
+ out[2 * d + 0] = x_out;
+ out[2 * d + 1] = y_out;
+ }
+}
+
+template <>
+inline void apply_rotary_embedding(
+ const at::BFloat16* __restrict__ arr,
+ const float* __restrict__ cos_ptr,
+ const float* __restrict__ sin_ptr,
+ at::BFloat16* __restrict__ out,
+ int embed_dim) {
+ using fVec = Vectorized<float>;
+ using bVec = Vectorized<at::BFloat16>;
+
+ const int kVecSize = bVec::size();
+ const int len = 2 * embed_dim - (2 * embed_dim % kVecSize);
+
+ // GPT-J style rotary embedding.
+ // format: {d, 2}, stride-2 access need permute to be vectorized.
+ int d = 0;
+ for (; d < len; d += kVecSize) {
+ bVec a = bVec::loadu(arr + d);
+ fVec x, y;
+ std::tie(x, y) = convert_bfloat16_float(a);
+ fVec cos = fVec::loadu(cos_ptr + d / 2);
+ fVec sin = fVec::loadu(sin_ptr + d / 2);
+ // x: {x0, y0, x1, y1, x2, y2, x3, y3}
+ // y: {x4, y4, x5, y5, x6, y6, x7, y7}
+ // x1: {x0, x1, x2, x3, x4, x5, x6, x7}
+ // y1: {y0, y1, y2, y3, y4, y5, y6, y7}
+ auto xy = deinterleave2(x, y);
+ fVec x1 = std::get<0>(xy);
+ fVec y1 = std::get<1>(xy);
+ fVec x2 = x1 * cos - y1 * sin;
+ fVec y2 = y1 * cos + x1 * sin;
+ // x2: {x0, x1, x2, x3, x4, x5, x6, x7}
+ // y2: {y0, y1, y2, y3, y4, y5, y6, y7}
+ // x_out: {x0, y0, x1, y1, x2, y2, x3, y3}
+ // y_out: {x4, y4, x5, y5, x6, y6, x7, y7}
+ xy = interleave2(x2, y2);
+ fVec x_out = std::get<0>(xy);
+ fVec y_out = std::get<1>(xy);
+ bVec a_out = convert_float_bfloat16(x_out, y_out);
+ a_out.store(out + d);
+ }
+ for (; d < embed_dim; d++) {
+ float x = static_cast<float>(arr[2 * d + 0]);
+ float y = static_cast<float>(arr[2 * d + 1]);
+ float x_out = x * cos_ptr[d] - y * sin_ptr[d];
+ float y_out = y * cos_ptr[d] + x * sin_ptr[d];
+ out[2 * d + 0] = static_cast<at::BFloat16>(x_out);
+ out[2 * d + 1] = static_cast<at::BFloat16>(y_out);
+ }
+}
+
+template <typename scalar_t>
+inline void RotateEveryTwo(
+ const scalar_t* in_query_ptr,
+ const scalar_t* in_key_ptr,
+ scalar_t* out_query_ptr,
+ scalar_t* out_key_ptr,
+ const float* sin_start,
+ const float* cos_start,
+ const int HR,
+ const int offset,
+ const bool calc_key) {
+ // TODO: remove overhead for loading sin and cos
+ int embed_dim = HR / 2;
+ apply_rotary_embedding(
+ in_query_ptr, cos_start, sin_start, out_query_ptr, embed_dim);
+
+ if (calc_key) {
+ apply_rotary_embedding(
+ in_key_ptr, cos_start, sin_start, out_key_ptr, embed_dim);
+ }
+}
+
+template <>
+inline void RotateEveryTwo(
+ const at::BFloat16* in_query_ptr,
+ const at::BFloat16* in_key_ptr,
+ at::BFloat16* out_query_ptr,
+ at::BFloat16* out_key_ptr,
+ const float* sin_ptr,
+ const float* cos_ptr,
+ const int HR,
+ const int offset,
+ const bool calc_key) {
+ int embed_dim = HR / 2;
+
+ using fVec = Vectorized<float>;
+ using bVec = Vectorized<at::BFloat16>;
+
+ const int kVecSize = bVec::size();
+ const int len = HR - (HR % kVecSize);
+
+ // GPT-J style rotary embedding.
+ // format: {d, 2}, stride-2 access need permute to be vectorized.
+ int d = 0;
+ for (; d < len; d += kVecSize) {
+ bVec in_query = bVec::loadu(in_query_ptr + d);
+ fVec x, y;
+ std::tie(x, y) = convert_bfloat16_float(in_query);
+ fVec cos = fVec::loadu(cos_ptr + d / 2);
+ fVec sin = fVec::loadu(sin_ptr + d / 2);
+ // x: {x0, y0, x1, y1, x2, y2, x3, y3}
+ // y: {x4, y4, x5, y5, x6, y6, x7, y7}
+ // x1: {x0, x1, x2, x3, x4, x5, x6, x7}
+ // y1: {y0, y1, y2, y3, y4, y5, y6, y7}
+ auto xy = deinterleave2(x, y);
+ fVec x1 = std::get<0>(xy);
+ fVec y1 = std::get<1>(xy);
+ fVec x2 = x1 * cos - y1 * sin;
+ fVec y2 = y1 * cos + x1 * sin;
+ // x2: {x0, x1, x2, x3, x4, x5, x6, x7}
+ // y2: {y0, y1, y2, y3, y4, y5, y6, y7}
+ // x_out: {x0, y0, x1, y1, x2, y2, x3, y3}
+ // y_out: {x4, y4, x5, y5, x6, y6, x7, y7}
+ xy = interleave2(x2, y2);
+ fVec x_out = std::get<0>(xy);
+ fVec y_out = std::get<1>(xy);
+ bVec a_out = convert_float_bfloat16(x_out, y_out);
+ a_out.store(out_query_ptr + d);
+ if (calc_key) {
+ bVec in_key = bVec::loadu(in_key_ptr + d);
+ fVec x, y;
+ std::tie(x, y) = convert_bfloat16_float(in_key);
+ // x: {x0, y0, x1, y1, x2, y2, x3, y3}
+ // y: {x4, y4, x5, y5, x6, y6, x7, y7}
+ // x1: {x0, x1, x2, x3, x4, x5, x6, x7}
+ // y1: {y0, y1, y2, y3, y4, y5, y6, y7}
+ auto xy = deinterleave2(x, y);
+ fVec x1 = std::get<0>(xy);
+ fVec y1 = std::get<1>(xy);
+ fVec x2 = x1 * cos - y1 * sin;
+ fVec y2 = y1 * cos + x1 * sin;
+ // x2: {x0, x1, x2, x3, x4, x5, x6, x7}
+ // y2: {y0, y1, y2, y3, y4, y5, y6, y7}
+ // x_out: {x0, y0, x1, y1, x2, y2, x3, y3}
+ // y_out: {x4, y4, x5, y5, x6, y6, x7, y7}
+ xy = interleave2(x2, y2);
+ fVec x_out = std::get<0>(xy);
+ fVec y_out = std::get<1>(xy);
+ bVec a_out = convert_float_bfloat16(x_out, y_out);
+ a_out.store(out_key_ptr + d);
+ }
+ }
+ for (; d < embed_dim; d++) {
+ float x = static_cast<float>(in_query_ptr[2 * d + 0]);
+ float y = static_cast<float>(in_query_ptr[2 * d + 1]);
+ float cos = cos_ptr[d];
+ float sin = sin_ptr[d];
+ float x_out = x * cos - y * sin;
+ float y_out = y * cos + x * sin;
+ out_query_ptr[2 * d + 0] = static_cast<at::BFloat16>(x_out);
+ out_query_ptr[2 * d + 1] = static_cast<at::BFloat16>(y_out);
+ if (calc_key) {
+ float x = static_cast<float>(in_key_ptr[2 * d + 0]);
+ float y = static_cast<float>(in_key_ptr[2 * d + 1]);
+ float x_out = x * cos - y * sin;
+ float y_out = y * cos + x * sin;
+ out_key_ptr[2 * d + 0] = static_cast<at::BFloat16>(x_out);
+ out_key_ptr[2 * d + 1] = static_cast<at::BFloat16>(y_out);
+ }
+ }
+}
+
+template <typename scalar_t>
+inline void RotateEveryTwoNaive(
+ const scalar_t* in_query_ptr,
+ const scalar_t* in_key_ptr,
+ scalar_t* out_query_ptr,
+ scalar_t* out_key_ptr,
+ const float* sin_start,
+ const float* cos_start,
+ const int HR,
+ const int offset,
+ const bool calc_key) {
+ int embed_dim = HR / 2;
+ for (int h = 0, h2 = 0; h < HR; h += 2, h2++) {
+ float sin = sin_start[h2];
+ float cos = cos_start[h2];
+ float in0 = in_query_ptr[h];
+ float in1 = in_query_ptr[h + offset];
+ float out0 = in0 * cos - in1 * sin;
+ float out1 = in1 * cos + in0 * sin;
+ out_query_ptr[h] = out0;
+ out_query_ptr[h + offset] = out1;
+ if (calc_key) {
+ in0 = in_key_ptr[h];
+ in1 = in_key_ptr[h + offset];
+ out0 = in0 * cos - in1 * sin;
+ out1 = in1 * cos + in0 * sin;
+ out_key_ptr[h] = out0;
+ out_key_ptr[h + offset] = out1;
+ } + } +} + +template <> +inline void RotateEveryTwo( + const at::Half* in_query_ptr, + const at::Half* in_key_ptr, + at::Half* out_query_ptr, + at::Half* out_key_ptr, + const float* sin_start, + const float* cos_start, + const int HR, + const int offset, + const bool calc_key) { + // TODO: vectorized + RotateEveryTwoNaive( + in_query_ptr, + in_key_ptr, + out_query_ptr, + out_key_ptr, + sin_start, + cos_start, + HR, + offset, + calc_key); +} + template std::tuple ApplyROPEKernel( at::Tensor& t_in, @@ -140,24 +408,16 @@ std::tuple ApplyROPEKernel( } } else { // used by GPT-J 6B & CodeGen & ChatGLM // logic is like to the rotate_every_two in python code - for (int h = 0, h2 = 0; h < HR; h += 2, h2++) { - float sin = sin_start[h2]; - float cos = cos_start[h2]; - float in0 = in_ptr[in_offset_q + h]; - float in1 = in_ptr[in_offset_q + h + offset]; - float out0 = in0 * cos - in1 * sin; - float out1 = in1 * cos + in0 * sin; - query_ptr[out_offset_q + h] = out0; - query_ptr[out_offset_q + h + offset] = out1; - if (concat_qkv && n < N_KV) { - in0 = in_ptr[in_offset_k + h]; - in1 = in_ptr[in_offset_k + h + offset]; - out0 = in0 * cos - in1 * sin; - out1 = in1 * cos + in0 * sin; - key_ptr[out_offset_k + h] = out0; - key_ptr[out_offset_k + h + offset] = out1; - } - } + RotateEveryTwo( + &in_ptr[in_offset_q], + &in_ptr[in_offset_k], + &query_ptr[out_offset_q], + &key_ptr[out_offset_k], + sin_start, + cos_start, + HR, + offset, + (concat_qkv && n < N_KV)); } // step 2) copy the rest of the input tensor to query/key (query_pass // & key_pass) From a493a17a396e23d44b0102a440b7372d8b9749b5 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Thu, 20 Jun 2024 09:43:56 +0800 Subject: [PATCH 125/199] Update dependency_version.yml 20240612 (#2994) * Update dependency_version.yml 20240612 * Fix compile_fx API change in Torch for PR #126822 * update DeepSpeed commit to master --- dependency_version.yml | 10 +++++----- intel_extension_for_pytorch/_inductor/compile_fx.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index e518c3829..87402495e 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -8,9 +8,9 @@ # branch: Branch name of the Github repository. For human understanding only. 
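# Illustrative sketch (not part of the upstream file) of how a fully
# specified entry such as deepspeed or torch-ccl below is laid out,
# following the field descriptions above:
#   <component>:
#     commit:  SHA, tag, or branch of the Git repository to use
#     repo:    upstream Git URL
#     version: matching package version string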
deepspeed: - commit: v0.14.2 + commit: master repo: https://github.com/microsoft/DeepSpeed.git - version: 0.14.2 + version: 0.14.4 gcc: max-version: null min-version: 12.3.0 @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240522+cpu + version: 2.4.0.dev20240612+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.2.0.dev20240522+cpu + version: 2.4.0.dev20240612+cpu torchvision: - version: 0.19.0.dev20240522+cpu + version: 0.19.0.dev20240612+cpu transformers: version: 4.38.1 diff --git a/intel_extension_for_pytorch/_inductor/compile_fx.py b/intel_extension_for_pytorch/_inductor/compile_fx.py index c531362d8..8e372f4f3 100644 --- a/intel_extension_for_pytorch/_inductor/compile_fx.py +++ b/intel_extension_for_pytorch/_inductor/compile_fx.py @@ -13,7 +13,7 @@ def ipex_compile_fx_inner( gm: torch.fx.GraphModule, example_inputs: List[torch.Tensor], cudagraphs=None, - num_fixed=0, + static_input_idxs=0, is_backward=False, graph_id=None, cpp_wrapper=False, @@ -28,7 +28,7 @@ def ipex_compile_fx_inner( gm, example_inputs, cudagraphs=cudagraphs, - num_fixed=num_fixed, + static_input_idxs=static_input_idxs, is_backward=is_backward, graph_id=graph_id, cpp_wrapper=cpp_wrapper, From b97d4c45c3cba0ee51cd8e313f0432edb08099da Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Thu, 20 Jun 2024 11:31:20 +0800 Subject: [PATCH 126/199] switch IPEX to 2.5 to align with the latest torch nightly (#2999) --- dependency_version.yml | 6 +++--- version.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 87402495e..1b0fe2f20 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.4.0.dev20240612+cpu + version: 2.5.0.dev20240619+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.4.0.dev20240612+cpu + version: 2.4.0.dev20240619+cpu torchvision: - version: 0.19.0.dev20240612+cpu + version: 0.20.0.dev20240619+cpu transformers: version: 4.38.1 diff --git a/version.txt b/version.txt index 88b953154..d76e99307 100644 --- a/version.txt +++ b/version.txt @@ -1,3 +1,3 @@ VERSION_MAJOR 2 -VERSION_MINOR 4 +VERSION_MINOR 5 VERSION_PATCH 0 From b74ff102b0b4e1e8979f23ee802bb7c059e45396 Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Thu, 20 Jun 2024 13:34:33 +0800 Subject: [PATCH 127/199] Fix deepspeed commit to 0c979d6779e3251fa0a65bd27e61f31a0883f022 (#3000) --- dependency_version.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependency_version.yml b/dependency_version.yml index 1b0fe2f20..93dd151ac 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -8,7 +8,7 @@ # branch: Branch name of the Github repository. For human understanding only. 
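# Pinning an exact upstream SHA here (instead of the floating "master"
# branch introduced two patches earlier) keeps builds reproducible while
# still tracking the DeepSpeed 0.14.4 line recorded in "version" below.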
deepspeed: - commit: master + commit: 0c979d6779e3251fa0a65bd27e61f31a0883f022 repo: https://github.com/microsoft/DeepSpeed.git version: 0.14.4 gcc: From 40e81bc6065e7e3e6f0c97249ec777abd90aa18c Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Thu, 20 Jun 2024 16:31:35 +0800 Subject: [PATCH 128/199] deepspeed bug fix in attentions.py (#3001) --- .../transformers/models/reference/modules/attentions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 9951374e7..f4c9b1b7e 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -2421,7 +2421,8 @@ def __init__(self, module, config, sdp_module_ref, distributed=False): ) ds_modules = may_import_deepspeed_modules() - supported_linear_types.extend(ds_modules) + if ds_modules is not None: + supported_linear_types.extend(ds_modules) supported_linear_types = tuple(supported_linear_types) if ( hasattr(module, "q_proj") From 8abae4acb21e452b78d8f5bb46b07100160b0d84 Mon Sep 17 00:00:00 2001 From: blzheng Date: Fri, 21 Jun 2024 09:23:30 +0800 Subject: [PATCH 129/199] Fix nightly ut failure (#3002) --- .../llm/distributed/run_accuracy_with_deepspeed.py | 6 ++---- .../python/llm/single_instance/run_accuracy.py | 6 ++---- .../python/llm/single_instance/run_quantization.py | 4 ++-- intel_extension_for_pytorch/transformers/optimize.py | 12 ++++-------- tests/cpu/hf_configs/qwen2/config.json | 2 +- tests/cpu/test_ipex_optimize_transformers_nightly.py | 2 ++ 6 files changed, 13 insertions(+), 19 deletions(-) diff --git a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py index 4df3c54cc..ef2ae50fe 100644 --- a/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_accuracy_with_deepspeed.py @@ -1251,10 +1251,8 @@ def write_checkpoints_json(): self.amp_dtype ) elif re.search("git", pretrained, re.IGNORECASE): - sample_inputs["input_ids"] = input_ids.repeat(self.batch_size, 1) - sample_inputs["attention_mask"] = attention_mask.repeat( - self.batch_size, 1 - ) + sample_inputs["input_ids"] = torch.ones(batch_size, 1).to(torch.long) + sample_inputs["attention_mask"] = torch.ones(batch_size, 1) sample_inputs["pixel_values"] = torch.zeros(batch_size, 3, 224, 224) num_head = self.model.git.encoder.layer[ 0 diff --git a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py index 404e18ebe..b8b27c1bd 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_accuracy.py +++ b/examples/cpu/inference/python/llm/single_instance/run_accuracy.py @@ -952,10 +952,8 @@ def __init__( self.amp_dtype ) elif re.search("git", pretrained, re.IGNORECASE): - sample_inputs["input_ids"] = input_ids.repeat(self.batch_size, 1) - sample_inputs["attention_mask"] = attention_mask.repeat( - self.batch_size, 1 - ) + sample_inputs["input_ids"] = torch.ones(batch_size, 1).to(torch.long) + sample_inputs["attention_mask"] = torch.ones(batch_size, 1) sample_inputs["pixel_values"] = torch.zeros(batch_size, 3, 224, 224) num_head = self.model.config.num_attention_heads head_dim = int(self.model.config.hidden_size / num_head) diff 
--git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index f5fc94c9f..37426edf0 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -544,8 +544,8 @@ def get_example_inputs(model): ] pixel_inputs = torch.ones(batch_size, 3, 224, 224) example_inputs = ( - input_ids.unsqueeze(0).repeat(batch_size, 1), - attention_mask.unsqueeze(0).repeat(batch_size, 1), + torch.ones(batch_size, 1).to(torch.long), + torch.ones(batch_size, 1), tuple(past_key_value), pixel_inputs, ) diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 137f82a95..5844a8ab8 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -1025,18 +1025,14 @@ def get_dummy_input(_model, return_dict=False): ] ) if return_dict: - sample_inputs["input_ids"] = sample_inputs["input_ids"].repeat( - batch_size, 1 - ) - sample_inputs["attention_mask"] = sample_inputs["attention_mask"].repeat( - batch_size, 1 - ) + sample_inputs["input_ids"] = torch.ones(batch_size, 1).to(torch.long) + sample_inputs["attention_mask"] = torch.ones(batch_size, 1) sample_inputs["pixel_values"] = torch.zeros(batch_size, 3, 224, 224) sample_inputs["past_key_values"] = past_key_values else: sample_inputs = ( - input_ids.repeat(_model.config.batch_size, 1), - attention_mask.repeat(_model.config.batch_size, 1), + torch.ones(batch_size, 1).to(torch.long), + torch.ones(batch_size, 1), past_key_values, torch.zeros(_model.config.batch_size, 3, 224, 224), ) diff --git a/tests/cpu/hf_configs/qwen2/config.json b/tests/cpu/hf_configs/qwen2/config.json index 8bf0170ee..041b0fc4c 100644 --- a/tests/cpu/hf_configs/qwen2/config.json +++ b/tests/cpu/hf_configs/qwen2/config.json @@ -24,4 +24,4 @@ "use_cache": true, "use_sliding_window": false, "vocab_size": 152064 - \ No newline at end of file +} diff --git a/tests/cpu/test_ipex_optimize_transformers_nightly.py b/tests/cpu/test_ipex_optimize_transformers_nightly.py index 9d656a65c..34014aaee 100644 --- a/tests/cpu/test_ipex_optimize_transformers_nightly.py +++ b/tests/cpu/test_ipex_optimize_transformers_nightly.py @@ -260,6 +260,8 @@ def model_replacement_check( if re.search("t5", model.config.architectures[0], re.IGNORECASE): input_dict["decoder_input_ids"] = decoder_input_ids.unsqueeze(0) if m.name == "git": + input_dict["input_ids"] = torch.ones(1, 1).to(torch.long) + input_dict["attention_mask"] = torch.ones(1, 1) input_dict["pixel_values"] = torch.zeros(1, 3, 224, 224) if m.name == "whisper": last_hidden_state = torch.rand([1, 32, 1280]) From 3226ffd74df0b3b0033f82e86c7be94e16eee885 Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Fri, 21 Jun 2024 12:56:36 +0800 Subject: [PATCH 130/199] Enable LLM models concat linear for static int8 (#3005) --- .../transformers/models/cpu/fusions/linear_fusion.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py index f902298ae..8d88732b9 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py @@ -417,11 +417,7 @@ def __init__(self, module, tpp=False, woq=False): 
self.concat_linear = WeightOnlyQuantizedLinear.from_float( mod, concat_scales, concat_zeros ) - elif ( - hasattr(module, "concat_linear") - and module.concat_linear is not None - and (self.tpp or module.concat_linear.weight.dtype == torch.half) - ): + elif hasattr(module, "concat_linear") and module.concat_linear is not None: self.concat_linear = module.concat_linear else: for i in range(self.num_concat): From c37bace4698a33335ca01ba6e9766b5d647cf75e Mon Sep 17 00:00:00 2001 From: Cao E Date: Fri, 21 Jun 2024 16:34:59 +0800 Subject: [PATCH 131/199] Switch FP16 LLM to TPP implementations (#2981) * add weight prepack support for fp16 deconv * switch fp16 llm to tpp implementations --------- Co-authored-by: Zhang, Liangang --- csrc/cpu/aten/ConvTranspose.cpp | 7 +- csrc/cpu/aten/Linear.cpp | 3 +- csrc/cpu/aten/TensorAdvancedIndexing.cpp | 3 +- csrc/cpu/aten/kernels/IndexSelectKrnl.cpp | 3 +- .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 21 ++- csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp | 15 ++ csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp | 54 ++++++++ .../jit/cpu/kernels/ConvTransposePacked.cpp | 34 ++++- csrc/cpu/tpp/kernels/TPPGEMMKrnl.h | 13 +- csrc/cpu/vec/general/rope.h | 34 +++-- .../cpu/tpp/utils/blocked_layout.py | 2 +- .../nn/utils/_parameter_wrapper.py | 16 +-- .../models/cpu/fusions/linear_fusion.py | 2 +- .../models/reference/fusions/mha_fusion.py | 2 +- .../models/reference/modules/attentions.py | 5 +- .../transformers/optimize.py | 20 +-- tests/cpu/test_ipex_llm_module.py | 59 ++++++-- tests/cpu/test_ipex_optimize_transformers.py | 88 ++++++------ ...test_ipex_optimize_transformers_nightly.py | 11 +- tests/cpu/test_linear_fuse_eltwise.py | 21 ++- tests/cpu/test_paged_attention.py | 5 + tests/cpu/test_tpp_linear.py | 128 +++++++++++++++--- tests/cpu/test_weight_prepack.py | 13 +- 23 files changed, 420 insertions(+), 139 deletions(-) diff --git a/csrc/cpu/aten/ConvTranspose.cpp b/csrc/cpu/aten/ConvTranspose.cpp index 858920f94..799d04f21 100644 --- a/csrc/cpu/aten/ConvTranspose.cpp +++ b/csrc/cpu/aten/ConvTranspose.cpp @@ -349,9 +349,14 @@ std::tuple conv_transpose_backward_weights( if (grad_output.scalar_type() == at::ScalarType::Float) { mkldnn_grad_weight.init( packed_weight_desc, grad_weight.template data_ptr()); - } else { + } else if (grad_output.scalar_type() == at::ScalarType::BFloat16) { mkldnn_grad_weight.init( packed_weight_desc, grad_weight.template data_ptr()); + } else if (grad_output.scalar_type() == at::ScalarType::Half) { + mkldnn_grad_weight.init( + packed_weight_desc, grad_weight.template data_ptr()); + } else { + TORCH_CHECK(false, "only fp32, bf16, and fp16 are supported"); } std::vector real_weight_size = { diff --git a/csrc/cpu/aten/Linear.cpp b/csrc/cpu/aten/Linear.cpp index 490acc3f7..8256b5f4e 100644 --- a/csrc/cpu/aten/Linear.cpp +++ b/csrc/cpu/aten/Linear.cpp @@ -708,8 +708,9 @@ at::Tensor ipex_linear_eltwise( auto target_type = get_autocast_dtype(); TORCH_CHECK( weight.scalar_type() == at::kBFloat16 || + weight.scalar_type() == at::kHalf || weight.scalar_type() == at::kFloat, - "ipex_linear_eltwise only support bfloat16 and float autocast dtype"); + "ipex_linear_eltwise only support bfloat16, float16 and float autocast dtype"); // should not autocast weight/bias here since we are using it from op_context, // The cast for weight/bias should be only handled in ipex.optimize return op.call( diff --git a/csrc/cpu/aten/TensorAdvancedIndexing.cpp b/csrc/cpu/aten/TensorAdvancedIndexing.cpp index 7935617d9..6b5e40921 100644 --- 
a/csrc/cpu/aten/TensorAdvancedIndexing.cpp +++ b/csrc/cpu/aten/TensorAdvancedIndexing.cpp @@ -68,7 +68,8 @@ at::Tensor& index_select_out_cpu_( const auto st = self.scalar_type(); if (result.is_contiguous() && - (st == at::kFloat || st == at::kDouble || st == at::kBFloat16)) { + (st == at::kFloat || st == at::kDouble || st == at::kBFloat16 || + st == at::kHalf)) { auto self_contig = self.contiguous(); index_select_contig_stub(kCPU, result, self_contig, dim, index_contig); return result; diff --git a/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp b/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp index 8245e8bf7..130b6b6e2 100644 --- a/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp +++ b/csrc/cpu/aten/kernels/IndexSelectKrnl.cpp @@ -262,8 +262,9 @@ void index_select_contig_kernel( const at::Tensor& self, int64_t dim, const at::Tensor& index) { - AT_DISPATCH_FLOATING_TYPES_AND( + AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::BFloat16, + at::ScalarType::Half, result.scalar_type(), "index_select_contig", [&result, &self, &dim, &index] { diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index 0bff47b20..5fa66be9e 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -77,7 +77,7 @@ void reduce_head( } qk_sum_vec = _mm512_fmadd_ps(q_vec_fp32, k_vec_fp32, qk_sum_vec); } - attn_w_pos[0] += (at::BFloat16)_mm512_reduce_add_ps(qk_sum_vec); + attn_w_pos[0] += _mm512_reduce_add_ps(qk_sum_vec); for (; hsi < head_size; hsi++) { k_cache_start[hsi] = k_ptr_start[hsi]; // cat the key into the key_cache. attn_w_pos[0] += q_ptr_start[hsi] * k_ptr_start[hsi]; @@ -85,6 +85,7 @@ void reduce_head( return; } +template <> void reduce_head( const at::Half* q_ptr_start, const at::Half* k_ptr_start, @@ -107,7 +108,7 @@ void reduce_head( } qk_sum_vec = _mm512_fmadd_ps(q_vec_fp32, k_vec_fp32, qk_sum_vec); } - attn_w_pos[0] += (at::Half)_mm512_reduce_add_ps(qk_sum_vec); + attn_w_pos[0] += _mm512_reduce_add_ps(qk_sum_vec); for (; hsi < head_size; hsi++) { k_cache_start[hsi] = k_ptr_start[hsi]; // cat the key into the key_cache. 
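// Dropping the intermediate bf16 cast above keeps the vectorized partial
// sum in fp32 before it is accumulated; the leftover elements handled in
// this remainder loop are likewise accumulated without extra rounding.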
attn_w_pos[0] += q_ptr_start[hsi] * k_ptr_start[hsi]; @@ -1575,6 +1576,22 @@ at::Tensor prepare_4d_causal_attention_mask_kernel_impl( past_key_value_length, length, diagonal); + } else if (dtype == at::kHalf) { + at::Half* attention_mask_ptr = attention_mask.data_ptr(); + at::Half* causal_4d_mask_ptr = causal_4d_mask.data_ptr(); + attention_mask_2d_to_4d( + attention_mask_ptr, + causal_4d_mask_ptr, + finfo_min, + batch_size, + seq_length, + src_length, + past_key_value_length, + length, + diagonal); + } else { + AT_ASSERT( + 0, "TPP does not support current dtype %s:%d\n", __FILE__, __LINE__); } return causal_4d_mask; diff --git a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp index f620b90f9..1873d518a 100644 --- a/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/PagedAttentionKrnl.cpp @@ -538,6 +538,18 @@ void single_query_cached_kv_attention_kernel_impl( block_size, max_context_len, alibi_slopes); + } else if (out.scalar_type() == at::ScalarType::Half) { + single_query_cached_kv_attention_kernel( + out, + query, + key_cache, + value_cache, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes); } else { TORCH_CHECK( false, "Unsupported data type for single_query_cached_kv_attention"); @@ -568,6 +580,9 @@ void reshape_and_cache_cpu_kernel_impl( } else if (key.scalar_type() == at::ScalarType::BFloat16) { reshape_and_cache_kernel( key, value, key_cache, value_cache, slot_mapping); + } else if (key.scalar_type() == at::ScalarType::Half) { + reshape_and_cache_kernel( + key, value, key_cache, value_cache, slot_mapping); } else { TORCH_CHECK(false, "Unsupported data type for ipex::reshape_and_cache"); } diff --git a/csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp b/csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp index 9a5acc9c7..bb08ccf63 100644 --- a/csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp +++ b/csrc/cpu/aten/kernels/TPPGEMMKrnl.cpp @@ -7,6 +7,7 @@ #include #include #include +#include "../../utils/isa_utils.h" namespace torch_ipex { namespace cpu { @@ -27,6 +28,11 @@ at::Tensor tpp_linear_bias_kernel_impl( torch_ipex::tpp::tpp_linear_bias(t_in, t_wt, t_bias, t_out); } else if (dt == at::kBFloat16) { torch_ipex::tpp::tpp_linear_bias(t_in, t_wt, t_bias, t_out); + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + torch_ipex::tpp::tpp_linear_bias(t_in, t_wt, t_bias, t_out); } else { AT_ASSERT( 0, @@ -52,6 +58,11 @@ at::Tensor tpp_linear_nobias_kernel_impl( torch_ipex::tpp::tpp_linear_no_bias(t_in, t_wt, t_out); } else if (dt == at::kBFloat16) { torch_ipex::tpp::tpp_linear_no_bias(t_in, t_wt, t_out); + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + torch_ipex::tpp::tpp_linear_no_bias(t_in, t_wt, t_out); } else { AT_ASSERT( 0, @@ -92,6 +103,16 @@ at::Tensor tpp_linear_gelu_kernel_impl( torch_ipex::tpp::tpp_linear_gelu_tanh( t_in, t_wt, t_bias, t_out); } + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + if (algorithm == "none") { + torch_ipex::tpp::tpp_linear_gelu(t_in, t_wt, t_bias, t_out); + } else { // tanh + torch_ipex::tpp::tpp_linear_gelu_tanh( + t_in, t_wt, t_bias, t_out); + } } else { AT_ASSERT( 0, @@ -124,6 +145,12 @@ at::Tensor tpp_fused_gate_up_proj_kernel_impl( 
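// As in the other TPP entry points in this file, fp32 and bf16 dispatch
// straight to the templated kernel, while the fp16 branch added below is
// guarded by isa_has_amx_fp16_support() and fails a TORCH_CHECK otherwise.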
} else if (dt == at::kBFloat16) { torch_ipex::tpp::tpp_fused_gate_up_proj( t_in, t_wt_gate, t_bias_gate, t_wt_up, t_bias_up, t_out); + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + torch_ipex::tpp::tpp_fused_gate_up_proj( + t_in, t_wt_gate, t_bias_gate, t_wt_up, t_bias_up, t_out); } else { AT_ASSERT( 0, @@ -149,6 +176,11 @@ at::Tensor tpp_linear_silu_kernel_impl( torch_ipex::tpp::tpp_linear_silu(t_in, t_wt, t_bias, t_out); } else if (dt == at::kBFloat16) { torch_ipex::tpp::tpp_linear_silu(t_in, t_wt, t_bias, t_out); + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + torch_ipex::tpp::tpp_linear_silu(t_in, t_wt, t_bias, t_out); } else { AT_ASSERT( 0, @@ -174,6 +206,11 @@ at::Tensor tpp_linear_relu_kernel_impl( torch_ipex::tpp::tpp_linear_relu(t_in, t_wt, t_bias, t_out); } else if (dt == at::kBFloat16) { torch_ipex::tpp::tpp_linear_relu(t_in, t_wt, t_bias, t_out); + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + torch_ipex::tpp::tpp_linear_relu(t_in, t_wt, t_bias, t_out); } else { AT_ASSERT( 0, @@ -199,6 +236,12 @@ at::Tensor tpp_linear_add_add_kernel_impl( } else if (dt == at::kBFloat16) { torch_ipex::tpp::tpp_linear_add_add( t_in, t_in1, t_in2, t_wt, t_bias, t_out, scale); + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + torch_ipex::tpp::tpp_linear_add_add( + t_in, t_in1, t_in2, t_wt, t_bias, t_out, scale); } else { AT_ASSERT( 0, @@ -223,6 +266,12 @@ at::Tensor tpp_linear_add_kernel_impl( } else if (dt == at::kBFloat16) { torch_ipex::tpp::tpp_linear_add( t_in, t_in1, t_wt, t_bias, t_out, scale); + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + torch_ipex::tpp::tpp_linear_add( + t_in, t_in1, t_wt, t_bias, t_out, scale); } else { AT_ASSERT( 0, @@ -245,6 +294,11 @@ at::Tensor tpp_linear_mul_kernel_impl( } else if (dt == at::kBFloat16) { torch_ipex::tpp::tpp_linear_mul( t_in, t_in1, t_wt, t_bias, t_out); + } else if (dt == at::kHalf) { + TORCH_CHECK( + torch_ipex::utils::isa_has_amx_fp16_support(), + "TPP does not support fp16 on platforms without amx_fp16 support"); + torch_ipex::tpp::tpp_linear_mul(t_in, t_in1, t_wt, t_bias, t_out); } else { AT_ASSERT( 0, diff --git a/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.cpp b/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.cpp index 95465ef98..0dc620c6e 100644 --- a/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.cpp +++ b/csrc/cpu/jit/cpu/kernels/ConvTransposePacked.cpp @@ -308,9 +308,14 @@ ContextConvTranspose create( ideep::tensor packed_weight; if (ideep::data_type::f32 == weight_dtype) { packed_weight.init(expected_desc, at_weight.template data_ptr()); - } else { + } else if (ideep::data_type::bf16 == weight_dtype) { packed_weight.init( expected_desc, at_weight.template data_ptr()); + } else if (ideep::data_type::f16 == weight_dtype) { + packed_weight.init( + expected_desc, at_weight.template data_ptr()); + } else { + TORCH_CHECK(false, "only fp32, bf16, and fp16 are supported"); } w.transpose_(0, 1); @@ -452,9 +457,14 @@ at::Tensor pack(ContextConvTranspose& context, 
const at::Tensor& tensor) { if (ideep::data_type::f32 == dtype) { packed_tensor.init( expected_desc, packed_at_tensor.template data_ptr()); - } else { + } else if (ideep::data_type::bf16 == dtype) { packed_tensor.init( expected_desc, packed_at_tensor.template data_ptr()); + } else if (ideep::data_type::f16 == dtype) { + packed_tensor.init( + expected_desc, packed_at_tensor.template data_ptr()); + } else { + TORCH_CHECK(false, "only fp32, bf16, and fp16 are supported"); } ideep_tensor.transpose_(0, 1); packed_tensor.feed_from(ideep_tensor, true); @@ -467,9 +477,14 @@ at::Tensor unpack(ContextConvTranspose& context, const at::Tensor& tensor) { ideep::tensor blocked_tensor; if (ideep::data_type::f32 == dtype) { blocked_tensor.init(expected_desc, tensor.template data_ptr()); - } else { + } else if (ideep::data_type::bf16 == dtype) { blocked_tensor.init( expected_desc, tensor.template data_ptr()); + } else if (ideep::data_type::f16 == dtype) { + blocked_tensor.init( + expected_desc, tensor.template data_ptr()); + } else { + TORCH_CHECK(false, "only fp32, bf16, and fp16 are supported"); } at::Tensor result = at::empty(context.origin_weight_dims_, tensor.options()); @@ -484,8 +499,12 @@ at::Tensor unpack(ContextConvTranspose& context, const at::Tensor& tensor) { auto pub_tensor_desc = context.original_desc_.to_type(dtype); if (ideep::data_type::f32 == dtype) { pub_tensor.init(pub_tensor_desc, result.template data_ptr()); - } else { + } else if (ideep::data_type::bf16 == dtype) { pub_tensor.init(pub_tensor_desc, result.template data_ptr()); + } else if (ideep::data_type::f16 == dtype) { + pub_tensor.init(pub_tensor_desc, result.template data_ptr()); + } else { + TORCH_CHECK(false, "only fp32, bf16, and fp16 are supported"); } pub_tensor.transpose_(0, 1); pub_tensor.feed_from(blocked_tensor, true); @@ -518,9 +537,14 @@ void repack_for( empty_aten_tensor_from_desc(packed_desc, context.at_weight_.options()); if (ideep::data_type::f32 == dtype) { packed_weight.init(packed_desc, new_at_weight.template data_ptr()); - } else { + } else if (ideep::data_type::bf16 == dtype) { packed_weight.init( packed_desc, new_at_weight.template data_ptr()); + } else if (ideep::data_type::f16 == dtype) { + packed_weight.init( + packed_desc, new_at_weight.template data_ptr()); + } else { + TORCH_CHECK(false, "only fp32, bf16, and fp16 are supported"); } packed_weight.feed_from(context.weight_packed_); context.at_weight_ = new_at_weight; diff --git a/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h b/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h index f5bb644fe..a9b0345e3 100644 --- a/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h +++ b/csrc/cpu/tpp/kernels/TPPGEMMKrnl.h @@ -10,6 +10,7 @@ #include "tpp/threaded_loops.h" #endif #include +#include "../../utils/isa_utils.h" #include "tpp/tensor_helper.h" #include "tpp/xsmm_functors.h" @@ -136,7 +137,9 @@ inline void tpp_linear_bias( auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); - constexpr int b_vnni = std::is_same(); + int b_vnni = std::is_same() || + (std::is_same() && + torch_ipex::utils::isa_has_amx_fp16_support()); auto brgemm_tpp = SCOPEITGEMM((BrgemmTPP( BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); auto brgemm_tpp_rem = SCOPEITGEMM((BrgemmTPP( @@ -485,7 +488,9 @@ inline void tpp_linear_gelu( auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, 
Hk, K), EW_ZERO); - constexpr int b_vnni = std::is_same(); + int b_vnni = std::is_same() || + (std::is_same() && + torch_ipex::utils::isa_has_amx_fp16_support()); auto brgemm_tpp = SCOPEITGEMM((BrgemmTPP( BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); auto brgemm_tpp_rem = SCOPEITGEMM((BrgemmTPP( @@ -577,7 +582,9 @@ inline void tpp_linear_gelu_tanh( auto copy_bias_tpp_rem = SCOPEIT(CpyBiasTPP(rem, Hk, K), BIAS); auto zero_tpp = SCOPEIT(SetZeroTPP(BSb, Hk, K), EW_ZERO); auto zero_tpp_rem = SCOPEIT(SetZeroTPP(rem, Hk, K), EW_ZERO); - constexpr int b_vnni = std::is_same(); + int b_vnni = std::is_same() || + (std::is_same() && + torch_ipex::utils::isa_has_amx_fp16_support()); auto brgemm_tpp = SCOPEITGEMM((BrgemmTPP( BSb, Hk, Hc, Hc, Hk * Hc, C, Hk, K, 1.0, 0, Ncb, b_vnni))); auto brgemm_tpp_rem = SCOPEITGEMM((BrgemmTPP( diff --git a/csrc/cpu/vec/general/rope.h b/csrc/cpu/vec/general/rope.h index 7a6c281a6..e011abf63 100644 --- a/csrc/cpu/vec/general/rope.h +++ b/csrc/cpu/vec/general/rope.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -11,7 +12,10 @@ namespace kernel { using namespace at::vec; template -inline void apply_rope_along_head_kernel( +inline typename std::enable_if_t< + !is_reduced_floating_point_v && !std::is_same_v, + void> +apply_rope_along_head_kernel( scalar_t* in_ptr_start, scalar_t* out_ptr_start, float* cos_start, @@ -31,10 +35,11 @@ inline void apply_rope_along_head_kernel( } } -template <> -inline void apply_rope_along_head_kernel( - float* in_ptr_start, - float* out_ptr_start, +template +inline typename std::enable_if_t, void> +apply_rope_along_head_kernel( + scalar_t* in_ptr_start, + scalar_t* out_ptr_start, float* cos_start, float* sin_start, int64_t rotary_ndims, @@ -64,16 +69,17 @@ inline void apply_rope_along_head_kernel( } } -template <> -inline void apply_rope_along_head_kernel( - at::BFloat16* in_ptr_start, - at::BFloat16* out_ptr_start, +template +inline typename std::enable_if_t, void> +apply_rope_along_head_kernel( + scalar_t* in_ptr_start, + scalar_t* out_ptr_start, float* cos_start, float* sin_start, int64_t rotary_ndims, int64_t offset) { auto h = 0; - using bVec = Vectorized; + using bVec = Vectorized; using fVec = Vectorized; const int fvec_size = fVec::size(); const int bvec_size = bVec::size(); @@ -81,8 +87,8 @@ inline void apply_rope_along_head_kernel( bVec x = bVec::loadu(in_ptr_start + h); bVec y = bVec::loadu(in_ptr_start + h + offset); fVec x0, x1, y0, y1; - std::tie(x0, x1) = convert_bfloat16_float(x); - std::tie(y0, y1) = convert_bfloat16_float(y); + std::tie(x0, x1) = convert_to_float(x); + std::tie(y0, y1) = convert_to_float(y); fVec c0 = fVec::loadu(cos_start + h); fVec s0 = fVec::loadu(sin_start + h); fVec c1 = fVec::loadu(cos_start + h + fvec_size); @@ -91,8 +97,8 @@ inline void apply_rope_along_head_kernel( fVec x_out1 = x1 * c1 - y1 * s1; fVec y_out0 = y0 * c0 + x0 * s0; fVec y_out1 = y1 * c1 + x1 * s1; - bVec x_out = convert_float_bfloat16(x_out0, x_out1); - bVec y_out = convert_float_bfloat16(y_out0, y_out1); + bVec x_out = convert_from_float(x_out0, x_out1); + bVec y_out = convert_from_float(y_out0, y_out1); x_out.store(out_ptr_start + h); y_out.store(out_ptr_start + h + offset); } diff --git a/intel_extension_for_pytorch/cpu/tpp/utils/blocked_layout.py b/intel_extension_for_pytorch/cpu/tpp/utils/blocked_layout.py index f7594c207..2ac9d43c6 100644 --- a/intel_extension_for_pytorch/cpu/tpp/utils/blocked_layout.py +++ b/intel_extension_for_pytorch/cpu/tpp/utils/blocked_layout.py @@ -19,7 +19,7 @@ def 
_prod(myList): def get_vnni_blocking(dtype): if dtype == torch.float32: return 1 - elif dtype == torch.bfloat16: + elif dtype in [torch.bfloat16, torch.float16]: return 2 elif dtype == torch.bfloat8: return 4 diff --git a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py index 3e3894d01..16a01ce77 100644 --- a/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py +++ b/intel_extension_for_pytorch/nn/utils/_parameter_wrapper.py @@ -9,6 +9,7 @@ _using_tpp, ) from intel_extension_for_pytorch import frontend +import intel_extension_for_pytorch._C as core from intel_extension_for_pytorch.nn.utils._weight_prepack import ( _IPEXLinear, _IPEXConv1d, @@ -166,11 +167,7 @@ def _should_prepack(module, is_training, is_xpu=False): # Conv1d backward is not implemented, will not prepack. if isinstance(module, torch.nn.Conv1d) and module.training: return False - if module.weight.dtype == torch.half and module.__class__ in ( - torch.nn.ConvTranspose2d, - torch.nn.ConvTranspose3d, - ): - return False + if module.weight.dtype not in ( torch.float, torch.float32, @@ -525,9 +522,7 @@ def conv_transpose_prepack(self, module): def linear_prepack(self, module, is_training): if module.__class__ in IPEX_GEMM_MODULE_CPU(): - if module.weight.dtype == torch.half: - use_dnnl = True - elif ( + if ( module.weight.dtype == torch.float32 and not is_training and frontend.get_fp32_math_mode(device="cpu") @@ -539,10 +534,13 @@ def linear_prepack(self, module, is_training): assert module.weight.dtype in [ torch.float32, torch.bfloat16, + torch.float16, ], "Only float, bf16 and fp16 are supported" use_dnnl = True - module.use_tpp = _using_tpp() + module.use_tpp = _using_tpp() and ( + module.weight.dtype != torch.float16 or core.isa_has_amx_fp16_support() + ) if not hasattr(module, "out_features"): setattr(module, "out_features", module.weight.shape[0]) # noqa: B010 diff --git a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py index 8d88732b9..779d77110 100644 --- a/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/cpu/fusions/linear_fusion.py @@ -16,7 +16,7 @@ def __init__(self, linear, tpp=False, woq=False): super().__init__() self.tpp = tpp self.woq = woq - self.dtype = linear.weight.dtype if self.tpp else None + self.dtype = None if woq else linear.weight.dtype def extra_repr(self): extra_repr_str = f"dtype = {self.dtype}, tpp = {self.tpp}, woq = {self.woq}" diff --git a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py index 1d1a4e113..e45b6aca4 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py +++ b/intel_extension_for_pytorch/transformers/models/reference/fusions/mha_fusion.py @@ -1000,7 +1000,7 @@ def forward(self, hidden_states): class _IPEXPagedAttentionRef: @classmethod def reshape_and_cache(cls, key, value, key_cache, value_cache, slot_mapping): - if key.dtype is torch.bfloat16: + if key.dtype in [torch.bfloat16, torch.float16]: x = 16 // torch.tensor([], dtype=key.dtype).element_size() else: x = 32 // torch.tensor([], dtype=key.dtype).element_size() diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py 
b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index f4c9b1b7e..67a00c699 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -445,7 +445,10 @@ def _FalconAttention_forward( ) attention_mask_float = ( (attention_mask * 1.0) - .masked_fill(attention_mask.to(torch.bool), float("-1e9")) + .masked_fill( + attention_mask.to(torch.bool), + float("-6e4") if query_layer.dtype == torch.half else float("-1e9"), + ) .to(query_layer.dtype) ) diff --git a/intel_extension_for_pytorch/transformers/optimize.py b/intel_extension_for_pytorch/transformers/optimize.py index 5844a8ab8..e24d51710 100644 --- a/intel_extension_for_pytorch/transformers/optimize.py +++ b/intel_extension_for_pytorch/transformers/optimize.py @@ -8,6 +8,7 @@ _using_tpp, ) import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch._C as core from ..utils.weight_only_quantization import ( _is_woq_qconfig, _woq_enable_weight_cache_for_large_batch, @@ -1210,12 +1211,9 @@ def model_convert_lowering( auto_kernel_selection=True, ) elif dtype is torch.half: - _model = ipex.optimize( - _model.eval(), - dtype=dtype, - inplace=True, - auto_kernel_selection=True, - ) + if core.isa_has_amx_fp16_support(): + _enable_tpp() + _model = ipex.optimize(_model.eval(), dtype=dtype, inplace=True) elif dtype is torch.bfloat16: _enable_tpp() _model = ipex.optimize(_model.eval(), dtype=dtype, inplace=True) @@ -1486,7 +1484,6 @@ def optimize( "Phi3ForCausalLM", "WhisperForConditionalGeneration", ] - if well_supported_model: check_transformers_for_llm_support() else: @@ -1509,15 +1506,8 @@ def optimize( else False ), ) - elif dtype is torch.bfloat16: + elif dtype in [torch.bfloat16, torch.half]: _model = ipex.optimize(model.eval(), dtype=dtype, inplace=inplace) - elif dtype is torch.half: - _model = ipex.optimize( - model.eval(), - dtype=dtype, - auto_kernel_selection=True, - inplace=inplace, - ) return _model diff --git a/tests/cpu/test_ipex_llm_module.py b/tests/cpu/test_ipex_llm_module.py index 1674b2b01..bfd176cb9 100644 --- a/tests/cpu/test_ipex_llm_module.py +++ b/tests/cpu/test_ipex_llm_module.py @@ -2,6 +2,7 @@ import torch import math import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch._C as core from torch.testing._internal.common_utils import TestCase import copy from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( @@ -209,16 +210,19 @@ def test_linearfusion_args0(self): ipex.llm.modules.LinearRelu, ipex.llm.modules.Linear2SiluMul, ] + dtypes = [torch.float32, torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) with torch.no_grad(): for i in range(len(ref_scope)): - for dtype in [torch.float32, torch.bfloat16]: + for dtype in dtypes: for use_ipex_optimize in [True, False]: for use_tpp in [True, False]: model = ref_scope[i]().eval().to(dtype) ref_out = model(x1.to(dtype)) if use_ipex_optimize: if use_tpp: - if dtype == torch.bfloat16: + if dtype in [torch.bfloat16, torch.float16]: _enable_tpp() else: continue @@ -231,7 +235,15 @@ def test_linearfusion_args0(self): else: model = ipex_scope[i](model.linear_1, model.linear_2) out = model(x2.to(dtype)) - self.assertEqual(out, ref_out) + atol = None + rtol = None + if dtype is torch.float16: + atol = 1e-3 + rtol = 1e-3 + elif dtype is torch.bfloat16: + atol = 1e-3 + rtol = 0.016 + self.assertEqual(out, ref_out, atol=atol, rtol=rtol) _disable_tpp() def 
test_linearfusion_args1(self): @@ -243,16 +255,19 @@ def test_linearfusion_args1(self): ipex.llm.modules.LinearAdd, ipex.llm.modules.LinearSiluMul, ] + dtypes = [torch.float32, torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) with torch.no_grad(): for i in range(len(ref_scope)): - for dtype in [torch.float32, torch.bfloat16]: + for dtype in dtypes: for use_ipex_optimize in [True, False]: for use_tpp in [True, False]: model = ref_scope[i]().eval().to(dtype) ref_out = model(x1.to(dtype), x1.to(dtype)) if use_ipex_optimize: if use_tpp: - if dtype == torch.bfloat16: + if dtype in [torch.bfloat16, torch.float16]: _enable_tpp() else: continue @@ -264,7 +279,12 @@ def test_linearfusion_args1(self): model = ipex_scope[i](model.linear) out = model(x2.to(dtype), x2.to(dtype)) - self.assertEqual(out, ref_out) + atol = None + rtol = None + if dtype is torch.float16: + atol = 1e-3 + rtol = 1e-3 + self.assertEqual(out, ref_out, atol=atol, rtol=rtol) _disable_tpp() def test_linearfusion_args2(self): @@ -272,16 +292,19 @@ def test_linearfusion_args2(self): x2 = copy.deepcopy(x1) ref_scope = [Linear_add_add] ipex_scope = [ipex.llm.modules.LinearAddAdd] + dtypes = [torch.float32, torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) with torch.no_grad(): for i in range(len(ref_scope)): - for dtype in [torch.float32, torch.bfloat16]: + for dtype in dtypes: for use_ipex_optimize in [True, False]: for use_tpp in [True, False]: model = ref_scope[i]().eval().to(dtype) ref_out = model(x1.to(dtype), x1.to(dtype), x1.to(dtype)) if use_ipex_optimize: if use_tpp: - if dtype == torch.bfloat16: + if dtype in [torch.bfloat16, torch.float16]: _enable_tpp() else: continue @@ -293,7 +316,12 @@ def test_linearfusion_args2(self): model = ipex_scope[i](model.linear) out = model(x2.to(dtype), x2.to(dtype), x2.to(dtype)) - self.assertEqual(out, ref_out) + atol = None + rtol = None + if dtype is torch.float16: + atol = 1e-4 + rtol = 1e-3 + self.assertEqual(out, ref_out, atol=atol, rtol=rtol) _disable_tpp() def test_rmsnorm(self): @@ -301,7 +329,10 @@ def test_rmsnorm(self): x2 = copy.deepcopy(x1) ref_m = LlamaRMSNorm(4096) target_m = ipex.llm.modules.RMSNorm(4096) - for dtype in [torch.float32, torch.bfloat16]: + dtypes = [torch.float32, torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: ref_m = LlamaRMSNorm(4096).eval().to(dtype) target_m = ipex.llm.modules.RMSNorm(4096).to(dtype) ref_out = ref_m(x1.to(dtype)) @@ -359,7 +390,7 @@ def test_rotary_embedding_tgi(self): def test_add_layernorm(self): for add_back in [True, False]: - for dtype in [torch.float, torch.bfloat16]: + for dtype in [torch.float, torch.bfloat16, torch.float16]: for residual_is_none in [True, False]: weight = torch.nn.Parameter(torch.randn(4096)).to(dtype) eps = 1e-6 @@ -387,7 +418,7 @@ def test_add_layernorm(self): def test_add_rmsnorm(self): for add_back in [True, False]: - for dtype in [torch.float, torch.bfloat16]: + for dtype in [torch.float, torch.bfloat16, torch.float16]: for residual_is_none in [True, False]: weight = torch.nn.Parameter(torch.randn(4096)).to(dtype) eps = 1e-6 @@ -414,7 +445,7 @@ def test_add_rmsnorm(self): self.assertEqual(ref_out, ipex_out) def test_gelu_mul(self): - for dtype in [torch.float, torch.bfloat16]: + for dtype in [torch.float, torch.bfloat16, torch.float16]: for approximate in ["tanh", "none"]: x = torch.rand(1, 32, 4096).to(dtype) x_ = copy.deepcopy(x) @@ -423,7 +454,7 @@ def test_gelu_mul(self): 
self.assertEqual(ref_out, ipex_out) def test_silu_mul(self): - for dtype in [torch.float, torch.bfloat16]: + for dtype in [torch.float, torch.bfloat16, torch.float16]: x = torch.rand(1, 32, 4096).to(dtype) x_ = copy.deepcopy(x) ref_out = silu_mul(x_, x_) diff --git a/tests/cpu/test_ipex_optimize_transformers.py b/tests/cpu/test_ipex_optimize_transformers.py index 63ba64436..c740330fb 100644 --- a/tests/cpu/test_ipex_optimize_transformers.py +++ b/tests/cpu/test_ipex_optimize_transformers.py @@ -172,8 +172,6 @@ def test_model_replacement(self): ): if torchcompile and deployment_mode: continue - if dtype == torch.float16: - _disable_tpp() self.model_replacement_check(m, dtype, jit, torchcompile, return_dict) _disable_tpp() @@ -454,46 +452,52 @@ def test_generate_functions(self): config = AutoConfig.from_pretrained( f"{curpath}/hf_configs/gptj", return_dict=False ) - m = transformers.models.gptj.modeling_gptj.GPTJForCausalLM(config).eval() - ref_m = copy.deepcopy(m) - ipex_m = ipex.llm.optimize( - m, dtype=torch.bfloat16, deployment_mode=True, inplace=True - ) - input_ids = torch.ones(8).unsqueeze(0).to(torch.long) - # beam_search, beam=4 - generate_kwargs_beam = dict( - do_sample=False, - temperature=0.9, - num_beams=4, - max_new_tokens=2, - min_new_tokens=2, - ) - # greedy_search - generate_kwargs_greedy = dict( - do_sample=False, temperature=0.9, max_new_tokens=2, min_new_tokens=2 - ) - # sample, use a temperature of 0.01 to constrain text generation diversity in UT. - generate_kwargs_sample = dict( - do_sample=True, temperature=0.01, max_new_tokens=2, min_new_tokens=2 - ) - # beam_sample, use a temperature of 0.01 to constrain text generation diversity in UT. - generate_kwargs_sample = dict( - do_sample=True, - temperature=0.01, - num_beams=4, - max_new_tokens=2, - min_new_tokens=2, - ) - for generate_kwargs in [ - generate_kwargs_beam, - generate_kwargs_greedy, - generate_kwargs_sample, - generate_kwargs_sample, - ]: - with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast(): - ipex_res = ipex_m.generate(input_ids, **generate_kwargs) - ref_res = ref_m.generate(input_ids, **generate_kwargs) - self.assertEqual(ipex_res, ref_res) + dtypes = [torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: + m = transformers.models.gptj.modeling_gptj.GPTJForCausalLM(config).eval() + ref_m = copy.deepcopy(m) + ipex_m = ipex.llm.optimize( + m, dtype=dtype, deployment_mode=True, inplace=True + ) + input_ids = torch.ones(8).unsqueeze(0).to(torch.long) + # beam_search, beam=4 + generate_kwargs_beam = dict( + do_sample=False, + temperature=0.9, + num_beams=4, + max_new_tokens=2, + min_new_tokens=2, + ) + # greedy_search + generate_kwargs_greedy = dict( + do_sample=False, temperature=0.9, max_new_tokens=2, min_new_tokens=2 + ) + # sample, use a temperature of 0.01 to constrain text generation diversity in UT. + generate_kwargs_sample = dict( + do_sample=True, temperature=0.01, max_new_tokens=2, min_new_tokens=2 + ) + # beam_sample, use a temperature of 0.01 to constrain text generation diversity in UT. 
+ generate_kwargs_sample = dict( + do_sample=True, + temperature=0.01, + num_beams=4, + max_new_tokens=2, + min_new_tokens=2, + ) + for generate_kwargs in [ + generate_kwargs_beam, + generate_kwargs_greedy, + generate_kwargs_sample, + generate_kwargs_sample, + ]: + with torch.inference_mode(), torch.no_grad(), torch.cpu.amp.autocast( + enabled=True, dtype=dtype + ): + ipex_res = ipex_m.generate(input_ids, **generate_kwargs) + ref_res = ref_m.generate(input_ids, **generate_kwargs) + self.assertEqual(ipex_res, ref_res) def test_cache_weight_for_large_batch(self): config = AutoConfig.from_pretrained( diff --git a/tests/cpu/test_ipex_optimize_transformers_nightly.py b/tests/cpu/test_ipex_optimize_transformers_nightly.py index 34014aaee..81f668d5b 100644 --- a/tests/cpu/test_ipex_optimize_transformers_nightly.py +++ b/tests/cpu/test_ipex_optimize_transformers_nightly.py @@ -1,6 +1,7 @@ import unittest import torch import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch._C as core import sys import subprocess import os @@ -270,10 +271,14 @@ def model_replacement_check( "encoder_outputs": (last_hidden_state,), } - with torch.no_grad(): + with torch.no_grad(), torch.cpu.amp.autocast( + enabled=True if dtype in [torch.bfloat16, torch.float16] else False, + dtype=dtype, + ): key_hf = ref_m(**input_dict) with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.float16] else False, + dtype=dtype, ): key_ipex = ipex_m(**input_dict) error_message = f"model={m.name}, deployment_mode={deployment_mode}, torchcompile={torchcompile}, return_dict={return_dict}" @@ -288,6 +293,8 @@ def model_replacement_check( def test_model_replacement(self): dtypes = [torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) enable_torchcompile = [False, True] deployment_mode = [True, False] return_dict = [False, True] diff --git a/tests/cpu/test_linear_fuse_eltwise.py b/tests/cpu/test_linear_fuse_eltwise.py index f72276834..bc59e1618 100644 --- a/tests/cpu/test_linear_fuse_eltwise.py +++ b/tests/cpu/test_linear_fuse_eltwise.py @@ -1,6 +1,7 @@ import unittest import torch import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch._C as core from torch.testing._internal.common_utils import TestCase import copy @@ -24,13 +25,18 @@ class TestLinearFuseEltwise(TestCase): def test_linear_fuse_eltwise(self): x1 = torch.rand(5, 10).requires_grad_() x2 = copy.deepcopy(x1) - for dtype in [torch.float, torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = MLP() opt = torch.optim.SGD(model.parameters(), lr=0.01) model, opt = ipex.optimize( model, optimizer=opt, dtype=dtype, auto_kernel_selection=True ) - with torch.cpu.amp.autocast(enabled=(dtype == torch.bfloat16)): + with torch.cpu.amp.autocast( + enabled=(dtype in [torch.bfloat16, torch.float16]), dtype=dtype + ): ref_out = model(x1).sum() ref_out.backward() @@ -43,11 +49,18 @@ def test_linear_fuse_eltwise(self): fused_model.mlp[2], "sigmoid" ) fused_model.mlp[3] = torch.nn.Identity() - with torch.cpu.amp.autocast(enabled=(dtype == torch.bfloat16)): + with torch.cpu.amp.autocast( + enabled=(dtype in [torch.bfloat16, torch.float16]), dtype=dtype + ): out = fused_model(x2).sum() out.backward() + atol = None + rtol = None + if dtype == torch.float16: + atol = 1e-4 + rtol = 1e-3 self.assertEqual(out, ref_out) - 
self.assertEqual(x1.grad, x2.grad) + self.assertEqual(x1.grad, x2.grad, atol=atol, rtol=rtol) if __name__ == "__main__": diff --git a/tests/cpu/test_paged_attention.py b/tests/cpu/test_paged_attention.py index 25b3a8b19..e0924dd69 100644 --- a/tests/cpu/test_paged_attention.py +++ b/tests/cpu/test_paged_attention.py @@ -4,6 +4,7 @@ import random from typing import List, Optional, Tuple from itertools import product +import intel_extension_for_pytorch._C as core class PagedAttentionTest(TestCase): @@ -210,6 +211,8 @@ def _test_paged_attention_func( def test_paged_attention(self): num_blocks = 128 dtypes = [torch.bfloat16, torch.float] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) num_gen_seqs = [7] # Arbitrary values for testing num_heads = [(40, 40), (64, 16)] # Arbitrary values for testing head_sizes = [64, 80, 128, 96, 112, 128, 256] @@ -301,6 +304,8 @@ def test_reshape_and_cache(self): head_sizes = [64, 80, 128, 96, 112, 128, 256] block_sizes = [16, 32] dtypes = [torch.bfloat16, torch.float] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) seeds = [0] for ( num_token, diff --git a/tests/cpu/test_tpp_linear.py b/tests/cpu/test_tpp_linear.py index a121be8e7..e26de5c09 100644 --- a/tests/cpu/test_tpp_linear.py +++ b/tests/cpu/test_tpp_linear.py @@ -2,6 +2,7 @@ import itertools import torch import intel_extension_for_pytorch as ipex +import intel_extension_for_pytorch._C as core from torch.testing._internal.common_utils import TestCase import copy from intel_extension_for_pytorch.cpu._auto_kernel_selection import ( @@ -105,17 +106,22 @@ class TestTPPlinear(TestCase): def test_tpp_linear_fallback_flag(self): x1 = torch.rand(1, 1, 4097) x2 = copy.deepcopy(x1) - for dtype in [torch.float, torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_tpp_fallback_dnnl().eval() with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.float16] else False, + dtype=dtype, ): ref_out = model(x1) model = ipex.optimize(model, dtype=dtype) with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.float16] else False, + dtype=dtype, ): model = torch.jit.script(model) model = torch.jit.freeze(model) @@ -125,18 +131,23 @@ def test_tpp_linear_fallback_flag(self): def test_tpp_linear_fallback(self): x1 = torch.rand(1, 1, 4097) x2 = copy.deepcopy(x1) - for dtype in [torch.float, torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.onednn_has_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_tpp_fallback_dnnl().eval() with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.float16] else False, + dtype=dtype, ): ref_out = model(x1) _enable_tpp() model = ipex.optimize(model, dtype=dtype) with torch.no_grad(), torch.cpu.amp.autocast( - enabled=True if dtype is torch.bfloat16 else False + enabled=True if dtype in [torch.bfloat16, torch.float16] else False, + dtype=dtype, ): out = model(x2) self.assertEqual(out, ref_out) @@ -145,7 +156,10 @@ def test_tpp_linear_fallback(self): def test_tpp_linear(self): x1 = torch.rand(1, 1, 4096) x2 = copy.deepcopy(x1) - for dtype in [torch.float, torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if 
core.onednn_has_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_with_bias().eval() model_nb = Linear_without_bias().eval() if dtype is torch.bfloat16: @@ -153,6 +167,11 @@ def test_tpp_linear(self): x2 = x2.to(torch.bfloat16) model = model.to(torch.bfloat16) model_nb = model_nb.to(torch.bfloat16) + elif dtype == torch.float16: + x1 = x1.to(torch.float16) + x2 = x2.to(torch.float16) + model = model.to(torch.float16) + model_nb = model_nb.to(torch.float16) ref_out = model(x1) ref_out_nb = model_nb(x1) @@ -171,10 +190,14 @@ def test_tpp_fused_gate_up_proj(self): x = torch.randn(1, 4, in_feature) x_tpp = copy.deepcopy(x) - + dtypes = [torch.float, torch.bfloat16] + if core.isa_has_amx_fp16_support(): + dtypes.append(torch.float16) with torch.no_grad(): for dtype, bias_gate, bias_up in itertools.product( - [torch.float, torch.bfloat16], [False, True], [False, True] + dtypes, + [False, True], + [False, True], ): model = Linear_Gate_Up( in_feature, out_feature, bias_gate, bias_up @@ -183,6 +206,11 @@ def test_tpp_fused_gate_up_proj(self): x = x.to(torch.bfloat16) x_tpp = x_tpp.to(torch.bfloat16) model = model.to(torch.bfloat16) + elif dtype == torch.float16: + x = x.to(torch.float16) + x_tpp = x_tpp.to(torch.float16) + model = model.to(torch.float16) + ref_out = model(x) _enable_tpp() @@ -201,40 +229,64 @@ def test_tpp_fused_gate_up_proj(self): out_tpp_ref = torch.ops.torch_ipex.tpp_linear_mul( x_tpp, out_linear_silu, model.up_proj.weight, model.up_proj.bias ) + atol = None + rtol = None + if dtype is torch.float16: + atol = 1e-3 + rtol = 1e-3 self.assertEqual(out, out_tpp_ref) - self.assertEqual(out, ref_out) + self.assertEqual(out, ref_out, atol=atol, rtol=rtol) _disable_tpp() def test_tpp_linear_gelu(self): x1 = torch.rand(1, 4, 4096) x2 = copy.deepcopy(x1) with torch.no_grad(): - for dtype in [torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.isa_has_amx_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_gelu().eval() if dtype is torch.bfloat16: x1 = x1.to(torch.bfloat16) x2 = x2.to(torch.bfloat16) model = model.to(torch.bfloat16) + elif dtype is torch.float16: + x1 = x1.to(torch.float16) + x2 = x2.to(torch.float16) + model = model.to(torch.float16) ref_out = model(x1) _enable_tpp() + atol = None + rtol = None + if dtype == torch.float: + atol = 3e-5 + rtol = 1.3e-6 model = ipex.optimize(model, dtype=dtype) out = torch.ops.torch_ipex.tpp_linear_gelu( x2, model.mlp.weight, model.mlp.bias ) - self.assertEqual(out, ref_out) + self.assertEqual(out, ref_out, atol=atol, rtol=rtol) _disable_tpp() def test_tpp_linear_silu(self): x1 = torch.rand(1, 4, 4096) x2 = copy.deepcopy(x1) with torch.no_grad(): - for dtype in [torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.isa_has_amx_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_silu().eval() if dtype is torch.bfloat16: x1 = x1.to(torch.bfloat16) x2 = x2.to(torch.bfloat16) model = model.to(torch.bfloat16) + elif dtype is torch.float16: + x1 = x1.to(torch.float16) + x2 = x2.to(torch.float16) + model = model.to(torch.float16) ref_out = model(x1) _enable_tpp() @@ -242,19 +294,31 @@ def test_tpp_linear_silu(self): out = torch.ops.torch_ipex.tpp_linear_silu( x2, model.mlp.weight, x2.new_empty(0) ) - self.assertEqual(out, ref_out) + atol = None + rtol = None + if dtype == torch.float: + atol = 2e-5 + rtol = 1.3e-6 + self.assertEqual(out, ref_out, atol=atol, rtol=rtol) _disable_tpp() def test_tpp_linear_relu(self): x1 = 
torch.rand(1, 4, 4096) x2 = copy.deepcopy(x1) with torch.no_grad(): - for dtype in [torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.isa_has_amx_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_relu().eval() if dtype is torch.bfloat16: x1 = x1.to(torch.bfloat16) x2 = x2.to(torch.bfloat16) model = model.to(torch.bfloat16) + elif dtype is torch.float16: + x1 = x1.to(torch.float16) + x2 = x2.to(torch.float16) + model = model.to(torch.float16) ref_out = model(x1) _enable_tpp() @@ -269,12 +333,19 @@ def test_tpp_linear_mul(self): x1 = torch.rand(1, 4, 4096) x2 = copy.deepcopy(x1) with torch.no_grad(): - for dtype in [torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.isa_has_amx_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_mul().eval() if dtype is torch.bfloat16: x1 = x1.to(torch.bfloat16) x2 = x2.to(torch.bfloat16) model = model.to(torch.bfloat16) + elif dtype is torch.float16: + x1 = x1.to(torch.float16) + x2 = x2.to(torch.float16) + model = model.to(torch.float16) ref_out = model(x1) _enable_tpp() @@ -289,12 +360,19 @@ def test_tpp_linear_add(self): x1 = torch.rand(1, 4, 4096) x2 = copy.deepcopy(x1) with torch.no_grad(): - for dtype in [torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.isa_has_amx_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_add().eval() if dtype is torch.bfloat16: x1 = x1.to(torch.bfloat16) x2 = x2.to(torch.bfloat16) model = model.to(torch.bfloat16) + elif dtype is torch.float16: + x1 = x1.to(torch.float16) + x2 = x2.to(torch.float16) + model = model.to(torch.float16) ref_out = model(x1) _enable_tpp() @@ -302,19 +380,31 @@ def test_tpp_linear_add(self): out = torch.ops.torch_ipex.tpp_linear_add( x2, x2, model.mlp.weight, x2.new_empty(0), 1.0 ) - self.assertEqual(out, ref_out) + atol = None + rtol = None + if dtype is torch.float16: + atol = 1e-3 + rtol = 1e-3 + self.assertEqual(out, ref_out, atol=atol, rtol=rtol) _disable_tpp() def test_tpp_linear_add2(self): x1 = torch.rand(1, 4, 4096) x2 = copy.deepcopy(x1) with torch.no_grad(): - for dtype in [torch.bfloat16]: + dtypes = [torch.float, torch.bfloat16] + if core.isa_has_amx_fp16_support(): + dtypes.append(torch.float16) + for dtype in dtypes: model = Linear_add_add().eval() if dtype is torch.bfloat16: x1 = x1.to(torch.bfloat16) x2 = x2.to(torch.bfloat16) model = model.to(torch.bfloat16) + elif dtype is torch.float16: + x1 = x1.to(torch.float16) + x2 = x2.to(torch.float16) + model = model.to(torch.float16) ref_out = model(x1) _enable_tpp() diff --git a/tests/cpu/test_weight_prepack.py b/tests/cpu/test_weight_prepack.py index 91e69d885..dc0f4674c 100644 --- a/tests/cpu/test_weight_prepack.py +++ b/tests/cpu/test_weight_prepack.py @@ -1405,6 +1405,8 @@ def forward(self, x): test_dtypes = [torch.float] if core.onednn_has_bf16_support(): test_dtypes.append(torch.bfloat16) + if core.onednn_has_fp16_support(): + test_dtypes.append(torch.float16) for dtype, feed_sample_input in itertools.product( test_dtypes, [True, False] ): @@ -1455,6 +1457,8 @@ def forward(self, x): y_origin, y_ipex.float(), rtol=1e-2, atol=1e-03 ) else: + if dtype == torch.float16: + scaler = torch.cpu.amp.GradScaler(init_scale=1) model.train() origin_model = copy.deepcopy(model).train() origin_optimizer = SGD( @@ -1513,8 +1517,13 @@ def forward(self, x): y2 = ipex_model(x2) loss2 = y2.sum() ipex_optimizer.zero_grad() - loss2.backward() - ipex_optimizer.step() + if dtype == torch.float16: + 
scaler.scale(loss2).backward() + scaler.step(ipex_optimizer) + scaler.update(new_scale=1.0) + else: + loss2.backward() + ipex_optimizer.step() self.assertEqual(y1, y2.float(), rtol=1e-2, atol=1e-3) self.assertEqual(x1.grad, x2.grad, rtol=1e-2, atol=1e-3) if bias: From 88ffb6c771ea995accfb105fe53b860c0b594054 Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Fri, 21 Jun 2024 18:35:12 +0800 Subject: [PATCH 132/199] Remove kv length in ROPE forward (#3007) --- .../transformers/models/reference/modules/attentions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py index 67a00c699..f8d066176 100644 --- a/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py +++ b/intel_extension_for_pytorch/transformers/models/reference/modules/attentions.py @@ -152,7 +152,7 @@ def _LlamaAttention_forward( self.head_dim, self.head_dim // 2, self.head_dim, - kv_seq_len, + None, self.concat_qkv.num_concat, ) else: @@ -168,7 +168,7 @@ def _LlamaAttention_forward( self.head_dim, self.head_dim // 2, self.head_dim, - kv_seq_len, + None, ) query = self._IPEXROPE( query, @@ -177,7 +177,7 @@ def _LlamaAttention_forward( self.head_dim, self.head_dim // 2, self.head_dim, - kv_seq_len, + None, ) if use_cache: From 5b8196c465df4b7e26af711a7430ec77be409316 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Fri, 21 Jun 2024 20:48:39 +0800 Subject: [PATCH 133/199] update oneDNN to dc2701ae41 on rls-v3.5 (#3004) * update oneDNN to dc2701ae41 on rls-v3.5 * update UT pattern due to backend change --- tests/cpu/test_ao_jit_llga_quantization_fuser.py | 3 ++- third_party/ideep | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/cpu/test_ao_jit_llga_quantization_fuser.py b/tests/cpu/test_ao_jit_llga_quantization_fuser.py index 646ba9d91..6d36cb654 100644 --- a/tests/cpu/test_ao_jit_llga_quantization_fuser.py +++ b/tests/cpu/test_ao_jit_llga_quantization_fuser.py @@ -2171,6 +2171,7 @@ def forward(self, x): return x5 patterns = [ + ["aten::layer_norm", "aten::quantize_per_tensor"], [ "aten::dequantize", "aten::linear", @@ -2182,7 +2183,7 @@ def forward(self, x): m = FFN_Residual(1024, 4096).eval() x = torch.rand(128, 1024) graph = self.checkQuantizeTrace(m, [x], atol=2e-1) - self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2) + self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3) self.assertFused(graph, ["aten::linear", "aten::gelu"]) self.assertFused(graph, ["aten::linear", "aten::add"]) self.checkPatterns(graph, patterns) diff --git a/third_party/ideep b/third_party/ideep index 3cd211cb6..f088291d3 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 3cd211cb6a78392a8d3a6509d23d7d2bb03cf0d2 +Subproject commit f088291d374940ee3495372f9670d6a8778e364e From 2cabe757da4831afefc95134fe08b9f975510630 Mon Sep 17 00:00:00 2001 From: Ryan Tao <65508217+RanTao123@users.noreply.github.com> Date: Tue, 25 Jun 2024 09:26:26 +0800 Subject: [PATCH 134/199] fix WOQ INT4 performance regression (#3008) * fix regression * add constexpr * add constexpr * add constexpr function * add constexpr function --- csrc/cpu/aten/kernels/WoqTppKrnl.cpp | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp index 1e4fe0c94..9544920d8 100644 --- 
a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp @@ -73,6 +73,10 @@ constexpr long LOOP_K_UNROLL = 4; // TODO(jgong5): do not hard-code #define QUANT_W_PER_CHANNEL 0 #define QUANT_W_PER_K_BLOCK 1 +constexpr bool is_asymmetric_quant_a(const int quant_a_mode) { + return quant_a_mode <= QUANT_A_PER_M_K_BLOCK; +} + // negate elements in a according to b's sign static inline __m512i _mm512_sign_epi8(__m512i a, __m512i b) { __m512i zero = _mm512_setzero_si512(); @@ -749,7 +753,7 @@ struct GemmMicroKernel< vscales[i] = _mm512_loadu_ps(scales + i * 16); // TODO(jgong5): should we use 512 or two 256 here? vzps[i] = combine_m256i(load_zps_4vnni(zps + i * 16)); - if (zp_a) { + if constexpr (is_asymmetric_quant_a(quant_a_mode)) { vcompensate[i] = _mm512_setzero_epi32(); } }); @@ -772,7 +776,7 @@ struct GemmMicroKernel< if constexpr (row == 0) { vb[col] = combine_m256i(load_int4_as_int8(pqB[k / 4][col * 16])); vb[col] = _mm512_sub_epi8(vb[col], vzps[col]); - if (zp_a) { + if constexpr (is_asymmetric_quant_a(quant_a_mode)) { vcompensate[col] = _mm512_dpbusd_epi32(vcompensate[col], ones, vb[col]); } @@ -780,7 +784,7 @@ struct GemmMicroKernel< _mm_prefetch(pqB[(k + PREFETCH_K_DIST) / 4][col * 16], _MM_HINT_T0); } } - if (zp_a) { + if constexpr (is_asymmetric_quant_a(quant_a_mode)) { vc[i] = _mm512_dpbusd_epi32(vc[i], va, vb[col]); } else { auto vsb = _mm512_sign_epi8(vb[col], va); @@ -814,7 +818,9 @@ struct GemmMicroKernel< quant_a_mode == QUANT_A_PER_K_BLOCK || quant_a_mode == QUANT_A_PER_TENSOR_SYM || quant_a_mode == QUANT_A_PER_K_BLOCK_SYM) { - if (zp_a) { + if constexpr ( + quant_a_mode == QUANT_A_PER_TENSOR || + quant_a_mode == QUANT_A_PER_K_BLOCK) { vc[i] = _mm512_sub_epi32( vc[i], _mm512_mullo_epi32(vcompensate[col], _mm512_set1_epi32(*zp_a))); @@ -823,7 +829,7 @@ struct GemmMicroKernel< vc_float = _mm512_mul_ps(vc_float, _mm512_set1_ps(*scale_a)); } else if constexpr ( quant_a_mode == QUANT_A_PER_M || quant_a_mode == QUANT_A_PER_M_SYM) { - if (zp_a) { + if constexpr (quant_a_mode == QUANT_A_PER_M) { vc[i] = _mm512_sub_epi32( vc[i], _mm512_mullo_epi32( @@ -832,7 +838,7 @@ struct GemmMicroKernel< vc_float = _mm512_cvtepi32_ps(vc[i]); vc_float = _mm512_mul_ps(vc_float, _mm512_set1_ps(*(scale_a + row))); } else { - if (zp_a) { + if constexpr (is_asymmetric_quant_a(quant_a_mode)) { vc[i] = _mm512_sub_epi32( vc[i], _mm512_mullo_epi32( @@ -1625,17 +1631,19 @@ class DequantGemmTPP< quant_a_mode == QUANT_A_PER_M_SYM || quant_a_mode == QUANT_A_PER_M_K_BLOCK_SYM) { scale_a_m = scale_a + m * k_groups; - if (zp_a) { + if constexpr ( + quant_a_mode == QUANT_A_PER_M || + quant_a_mode == QUANT_A_PER_M_K_BLOCK) { zp_a_m = zp_a + m * k_groups; } } else { scale_a_m = scale_a; - if (zp_a) { + if constexpr (is_asymmetric_quant_a(quant_a_mode)) { zp_a_m = zp_a; } } float c = 0; - if (zp_a_m) { + if constexpr (is_asymmetric_quant_a(quant_a_mode)) { c = (qC[m][n] - compensation[n] * (*zp_a_m)) * (*scale_a_m) * scales[n]; } else { From afe897d7a1acde7ceb4f99bc162426afd762cf3d Mon Sep 17 00:00:00 2001 From: blzheng Date: Tue, 25 Jun 2024 17:47:34 +0800 Subject: [PATCH 135/199] remove dead code (#3010) --- tests/cpu/hf_configs/qwen/modeling_qwen.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/cpu/hf_configs/qwen/modeling_qwen.py b/tests/cpu/hf_configs/qwen/modeling_qwen.py index e05797fd3..aaa04dc25 100644 --- a/tests/cpu/hf_configs/qwen/modeling_qwen.py +++ b/tests/cpu/hf_configs/qwen/modeling_qwen.py @@ -1297,16 +1297,15 @@ def chat_stream( 
stop_words_ids.extend( get_stop_words_ids(generation_config.chat_format, tokenizer) ) - if stop_words_ids is not None: - assert hasattr(generation_config, "eos_token_id") - stop_words_logits_processor = StopWordsLogitsProcessor( - stop_words_ids=stop_words_ids, - eos_token_id=generation_config.eos_token_id, - ) - if logits_processor is None: - logits_processor = LogitsProcessorList([stop_words_logits_processor]) - else: - logits_processor.append(stop_words_logits_processor) + assert hasattr(generation_config, "eos_token_id") + stop_words_logits_processor = StopWordsLogitsProcessor( + stop_words_ids=stop_words_ids, + eos_token_id=generation_config.eos_token_id, + ) + if logits_processor is None: + logits_processor = LogitsProcessorList([stop_words_logits_processor]) + else: + logits_processor.append(stop_words_logits_processor) input_ids = torch.tensor([context_tokens]).to(self.device) from transformers_stream_generator.main import ( From 3f844716da0a49c6aae0e6b20a2f26d0972aab10 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Wed, 26 Jun 2024 20:11:13 +0800 Subject: [PATCH 136/199] WOQ use block_k=128 for weight dtype=INT4, lowp_mode=INT8 (#3015) --- csrc/cpu/aten/Linear.cpp | 3 ++- examples/cpu/inference/python/llm/run.py | 29 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/csrc/cpu/aten/Linear.cpp b/csrc/cpu/aten/Linear.cpp index 8256b5f4e..b0e9f74b4 100644 --- a/csrc/cpu/aten/Linear.cpp +++ b/csrc/cpu/aten/Linear.cpp @@ -370,7 +370,8 @@ at::Tensor woq_linear_pack_weight( // For TPP kernel, we only consider even K if (K % 2 == 0) { size_t block_n = 32; - size_t block_k = group_size > 0 ? std::min(group_size, (int64_t)64) : 64; + size_t default_block_k = (weight_dtype == WOQ_DTYPE_INT4 && lowp_mode == 3) ? 128 : 64; + size_t block_k = group_size > 0 ? std::min((size_t)group_size, default_block_k) : default_block_k; while (K % block_k != 0) { block_k /= 2; } diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index 662613f21..c1121b8b4 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -178,6 +178,33 @@ def main(args_in: Optional[List[str]] = None) -> None: " data type or lowp-mode. If `--gptq` is given, weight" " data type is always INT4 and this argument is not needed.", ) + parser.add_argument( + "--act-quant-mode", + choices=[ + "PER_TENSOR", + "PER_IC_BLOCK", + "PER_BATCH", + "PER_BATCH_IC_BLOCK", + "PER_TENSOR_SYM", + "PER_IC_BLOCK_SYM", + "PER_BATCH_SYM", + "PER_BATCH_IC_BLOCK_SYM", + ], + default="PER_IC_BLOCK", + type=str, + help="Quantization mode for activation with different granularity. " + "For lowp-mode=INT8 only. For other cases, it has no effect. " + "Assume the activation tensor has shape batch_size x input_channel. " + "PER_TENSOR(0): quantize per tensor; " + "PER_IC_BLOCK(1): quantize per group along IC with group size = IC_BLOCK; " + "PER_BATCH(2): quantize per batch; " + "PER_BATCH_IC_BLOCK(3): quantize per block of size 1 x IC_BLOCK. " + "PER_TENSOR_SYM(4): symmetrically quantize per tensor; " + "PER_IC_BLOCK_SYM(5): symmetrically quantize per group along IC with group size = IC_BLOCK; " + "PER_BATCH_SYM(6): symmetrically quantize per batch; " + "PER_BATCH_IC_BLOCK_SYM(7): symmetrically quantize per block of size 1 x IC_BLOCK. 
" + "IC_BLOCK is determined by IC automatically.", + ) parser.add_argument( "--low-precision-checkpoint", default="", @@ -330,6 +357,7 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--ipex-weight-only-quantization"]) infer_cmd.extend(["--weight-dtype", str(args.weight_dtype)]) infer_cmd.extend(["--lowp-mode", str(args.lowp_mode)]) + infer_cmd.extend(["--act-quant-mode", str(args.act_quant_mode)]) if args.gptq: print( "LLM RUNTIME INFO: Weight dtype set to INT4 since `--gptq` is sepcified" @@ -449,6 +477,7 @@ def main(args_in: Optional[List[str]] = None) -> None: quant_cmd.extend(["--ipex-weight-only-quantization"]) quant_cmd.extend(["--weight-dtype", str(args.weight_dtype)]) quant_cmd.extend(["--lowp-mode", str(args.lowp_mode)]) + quant_cmd.extend(["--act-quant-mode", str(args.act_quant_mode)]) if args.gptq: print( "LLM RUNTIME INFO: Weight dtype set to INT4 since `--gptq` is sepcified" From 8756dc252a1beb98743d85c7baa115624162fd6f Mon Sep 17 00:00:00 2001 From: jianan-gu Date: Fri, 28 Jun 2024 09:14:05 +0800 Subject: [PATCH 137/199] Fix avx2 SHM allreduce (#3016) * fix avx2 shm allreduce * fix init args --- .../aten/kernels/TPPSHMAllreduceAddKrnl.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/csrc/cpu/aten/kernels/TPPSHMAllreduceAddKrnl.cpp b/csrc/cpu/aten/kernels/TPPSHMAllreduceAddKrnl.cpp index 3a7f98f42..a48f10d73 100644 --- a/csrc/cpu/aten/kernels/TPPSHMAllreduceAddKrnl.cpp +++ b/csrc/cpu/aten/kernels/TPPSHMAllreduceAddKrnl.cpp @@ -11,7 +11,7 @@ namespace torch_ipex { namespace cpu { namespace { - +#if defined(CPU_CAPABILITY_AVX512) #define BS 512 static const long master_port = torch_ipex::tpp::env2int("MASTER_PORT", 0); @@ -27,23 +27,22 @@ struct TppOps { torch_ipex::tpp::AddTPP(BS); }; -static TppOps ops_f; -static TppOps ops_bf; -static TppOps ops_hf; - template static TppOps getOps() {} template <> TppOps getOps() { + TppOps ops_f; return ops_f; } template <> TppOps getOps() { + TppOps ops_bf; return ops_bf; } template <> TppOps getOps() { + TppOps ops_hf; return ops_hf; } } // namespace shm_tpp @@ -309,10 +308,18 @@ void tpp_allreduce_impl( } } #undef BS +#else +void tpp_allreduce_impl( + at::Tensor t_in, + c10::intrusive_ptr process_group) { + std::vector temp_vec = {t_in}; + process_group->allreduce(temp_vec)->wait(); +} +#endif } // namespace IPEX_REGISTER_DISPATCH(tpp_allreduce_kernel_stub, &tpp_allreduce_impl); } // namespace cpu -} // namespace torch_ipex \ No newline at end of file +} // namespace torch_ipex From a6fd7194f4db0b5ccd0f3c1d2d67d479554c7f76 Mon Sep 17 00:00:00 2001 From: Cao E Date: Fri, 28 Jun 2024 15:26:19 +0800 Subject: [PATCH 138/199] fallback fp16 first_token_masked_mha to ref impl on SRF (#3011) --- csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index 5fa66be9e..b077ad0b5 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -5,6 +5,7 @@ #include #include #include "vec/vec.h" +#include "../../utils/isa_utils.h" namespace torch_ipex { namespace cpu { @@ -1345,7 +1346,7 @@ first_token_masked_mha( auto attn_outputs = at::Tensor(); auto attn_weights = at::Tensor(); if ((key.scalar_type() == at::kFloat || key.scalar_type() == at::kBFloat16 || - key.scalar_type() == at::kHalf) && + (key.scalar_type() == at::kHalf && 
utils::isa_has_avx512_fp16_support())) && attention_mask.stride(-1) == 1) { query = query.transpose(1, 2); key = key.transpose(1, 2); From d6599b55dad1b27ca401e48c9be205e9ec395950 Mon Sep 17 00:00:00 2001 From: "Zhang, Liangang" Date: Fri, 28 Jun 2024 18:00:19 +0800 Subject: [PATCH 139/199] Enable multiple query for the next tokens (#3018) * Enable multiple query for the next tokens --- .../kernels/MaskedMultiHeadAttentionKrnl.cpp | 49 ++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp index b077ad0b5..6e6a21b69 100644 --- a/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp +++ b/csrc/cpu/aten/kernels/MaskedMultiHeadAttentionKrnl.cpp @@ -1473,17 +1473,44 @@ masked_multihead_self_attention_kernel_impl( value_cache = new_value_cache; beam_idx = new_beam_idx; } - if (offset > 0) { - return zero_copy_kv_cache_masked_multihead_self_attention_kernel_impl( - query, - key, - value, - key_cache, - value_cache, - beam_idx, - offset, - scale_attn, - attention_mask_v); + if (offset != 0) { + auto cur_len = query.size(1); + if (cur_len == 1) + return zero_copy_kv_cache_masked_multihead_self_attention_kernel_impl( + query, + key, + value, + key_cache, + value_cache, + beam_idx, + offset, + scale_attn, + attention_mask_v); + // just a funcationality path,need to optimize + auto tokens_outs = std::vector(cur_len); + for (auto i = 0; i < cur_len; i++) { + auto query_i = query.select(1, i).unsqueeze(1); + ; + auto key_i = key.select(1, i).unsqueeze(1); + ; + auto value_i = value.select(1, i).unsqueeze(1); + ; + auto next_outs = + zero_copy_kv_cache_masked_multihead_self_attention_kernel_impl( + query_i, + key_i, + value_i, + key_cache, + value_cache, + beam_idx, + offset, + scale_attn, + attention_mask_v); + tokens_outs[i] = std::get<0>(next_outs); + } + auto attn_outs = at::cat(tokens_outs, 2); + return std::make_tuple( + attn_outs, at::Tensor(), key_cache, value_cache, beam_idx); } else { return first_token_masked_mha( query, From 1d02cee9a8a4b8cf4406d85181d8d5a5a0e7803c Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Fri, 28 Jun 2024 21:06:06 +0800 Subject: [PATCH 140/199] WOQ: Change small batch threshold from 32 to 16 (#3017) --- csrc/cpu/aten/kernels/WoqTppKrnl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp index 9544920d8..f0e362c3b 100644 --- a/csrc/cpu/aten/kernels/WoqTppKrnl.cpp +++ b/csrc/cpu/aten/kernels/WoqTppKrnl.cpp @@ -54,7 +54,7 @@ constexpr bool is_sym_quant(const int qw_type) { #if defined(CPU_CAPABILITY_AVX512_FP16) && defined(COMPILER_PREREQ_MET) #define QUANT_A_THRESHOLD 30720 -#define SMALL_BATCH_THRESHOLD 32 +#define SMALL_BATCH_THRESHOLD 16 #define DEQUANT_UPFRONT_THRESHOLD 1024 #define PARALLEL_M_THRESHOLD 128 constexpr long PREFETCH_K_DIST = 64; // TODO(jgong5): do not hard-code From c8eb8aa3a30e365324fea4525b9adce1380f4887 Mon Sep 17 00:00:00 2001 From: jiayisunx Date: Mon, 1 Jul 2024 16:06:46 +0800 Subject: [PATCH 141/199] add the meta registration for choose_tpp_linear_weight (#3020) --- intel_extension_for_pytorch/_meta_registrations.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/intel_extension_for_pytorch/_meta_registrations.py b/intel_extension_for_pytorch/_meta_registrations.py index ed11f5913..945f60407 100644 --- a/intel_extension_for_pytorch/_meta_registrations.py +++ 
b/intel_extension_for_pytorch/_meta_registrations.py @@ -521,6 +521,16 @@ def meta_tpp_linear_bias( return input.new_empty((*input.shape[:-1], out_features)) +@register_meta("choose_tpp_linear_weight") +def meta_choose_tpp_linear_weight(x, weight, weight_for_large_batch): + M = x.numel() // x.size(-1) + return ( + weight_for_large_batch + if weight_for_large_batch is not None and M >= 256 + else weight + ) + + @register_meta("tpp_linear_gelu") def meta_tpp_linear_gelu( input, From f20a79ef34c620b40417f4697b824eeae1262feb Mon Sep 17 00:00:00 2001 From: WeizhuoZhang-intel Date: Tue, 2 Jul 2024 09:53:12 +0800 Subject: [PATCH 142/199] Update dependency_version.yml 20240702 (#3021) --- dependency_version.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependency_version.yml b/dependency_version.yml index 93dd151ac..096f69976 100644 --- a/dependency_version.yml +++ b/dependency_version.yml @@ -8,9 +8,9 @@ # branch: Branch name of the Github repository. For human understanding only. deepspeed: - commit: 0c979d6779e3251fa0a65bd27e61f31a0883f022 + commit: df58a784c873e6da2fffe8f80c26131c08c85c5c repo: https://github.com/microsoft/DeepSpeed.git - version: 0.14.4 + version: 0.14.5+df58a78 gcc: max-version: null min-version: 12.3.0 @@ -28,14 +28,14 @@ oneCCL: protobuf: version: 3.20.3 pytorch: - version: 2.5.0.dev20240619+cpu + version: 2.5.0.dev20240701+cpu torch-ccl: commit: ccl_torch_dev_0131 repo: https://github.com/intel/torch-ccl.git version: 2.3.0+cpu torchaudio: - version: 2.4.0.dev20240619+cpu + version: 2.4.0.dev20240629+cpu torchvision: - version: 0.20.0.dev20240619+cpu + version: 0.20.0.dev20240629+cpu transformers: version: 4.38.1 From c17381044ea78efcbeecf1edecf148e558197188 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Tue, 2 Jul 2024 14:20:26 +0800 Subject: [PATCH 143/199] LLM BKC: Add lm-head-generation for quantization; report latency in ms (#3024) --- .../llm/distributed/run_generation_tp.py | 20 ++++++------- .../run_generation_with_deepspeed.py | 20 ++++++------- examples/cpu/inference/python/llm/run.py | 18 +++++++++++ .../llm/single_instance/run_generation.py | 20 ++++++------- .../llm/single_instance/run_quantization.py | 30 ++++++++++++------- 5 files changed, 68 insertions(+), 40 deletions(-) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py index c96110b2e..1c83997be 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_tp.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_tp.py @@ -537,20 +537,20 @@ def trace_handler(prof): total_list.append(output[1]) print("\n", "-" * 10, "Summary:", "-" * 10) - latency = total_time / (num_iter - num_warmup) - print("Inference latency: %.3f sec." % latency) + latency = total_time / (num_iter - num_warmup) * 1000 + print("Inference latency: %.2f ms." % latency) if args.token_latency: import numpy as np from itertools import chain - first_latency = np.mean([x[0] for x in total_list]) + first_latency = np.mean([x[0] for x in total_list]) * 1000 average_2n = list(chain(*[x[1:] for x in total_list])) average_2n.sort() - average_2n_latency = np.mean(average_2n) - p90_latency = average_2n[int(len(average_2n) * 0.9)] - p99_latency = average_2n[int(len(average_2n) * 0.99)] - print("First token average latency: %.3f sec." % first_latency) - print("Average 2... latency: %.3f sec." % average_2n_latency) - print("P90 2... latency: %.3f sec." % p90_latency) - print("P99 2... 
latency: %.3f sec." % p99_latency) + average_2n_latency = np.mean(average_2n) * 1000 + p90_latency = average_2n[int(len(average_2n) * 0.9)] * 1000 + p99_latency = average_2n[int(len(average_2n) * 0.99)] * 1000 + print("First token average latency: %.2f ms." % first_latency) + print("Average 2... latency: %.2f ms." % average_2n_latency) + print("P90 2... latency: %.2f ms." % p90_latency) + print("P99 2... latency: %.2f ms." % p99_latency) diff --git a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py index e8eb026ed..ca264461f 100644 --- a/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py +++ b/examples/cpu/inference/python/llm/distributed/run_generation_with_deepspeed.py @@ -759,20 +759,20 @@ def trace_handler(prof): # Wait for all ranks to finish before move on deepspeed.comm.barrier() - latency = total_time / (cycles - warmup) + latency = total_time / (cycles - warmup) * 1000 print_rank0("\n", "-" * 10, "Summary:", "-" * 10) - print_rank0("Inference latency: %.3f sec." % latency) + print_rank0("Inference latency: %.2f ms." % latency) if args.token_latency: import numpy as np from itertools import chain - first_latency = np.mean([x[0] for x in total_list]) + first_latency = np.mean([x[0] for x in total_list]) * 1000 average_2n = list(chain(*[x[1:] for x in total_list])) average_2n.sort() - average_2n_latency = np.mean(average_2n) - p90_latency = average_2n[int(len(average_2n) * 0.9)] - p99_latency = average_2n[int(len(average_2n) * 0.99)] - print_rank0("First token average latency: %.3f sec." % first_latency) - print_rank0("Average 2... latency: %.3f sec." % average_2n_latency) - print_rank0("P90 2... latency: %.3f sec." % p90_latency) - print_rank0("P99 2... latency: %.3f sec." % p99_latency) + average_2n_latency = np.mean(average_2n) * 1000 + p90_latency = average_2n[int(len(average_2n) * 0.9)] * 1000 + p99_latency = average_2n[int(len(average_2n) * 0.99)] * 1000 + print_rank0("First token average latency: %.2f ms." % first_latency) + print_rank0("Average 2... latency: %.2f ms." % average_2n_latency) + print_rank0("P90 2... latency: %.2f ms." % p90_latency) + print_rank0("P99 2... latency: %.2f ms." % p99_latency) diff --git a/examples/cpu/inference/python/llm/run.py b/examples/cpu/inference/python/llm/run.py index c1121b8b4..5562d2f42 100644 --- a/examples/cpu/inference/python/llm/run.py +++ b/examples/cpu/inference/python/llm/run.py @@ -279,6 +279,16 @@ def main(args_in: Optional[List[str]] = None) -> None: parser.add_argument( "--local_rank", required=False, type=int, help="used by dist launchers" ) + parser.add_argument( + "--lm-head-generation", + action="store_true", + help="Compute lm-head only for the last token in the sequence to speed up first token inference." + " This argument is only needed for non-TP quantization cases. And note that in such cases," + " this feature is not compatible with lambada_openai accuracy test. If you want to run" + " lambada_openai accuracy test with the quantized model afterwards, don't turn this feature on." 
+ " In other cases, this feature is always turned on regardless of this argument and it does not" + " conflict with the accuracy test.", + ) args = parser.parse_args(args_in) parent_path = Path(__file__).parent.absolute() @@ -433,6 +443,8 @@ def main(args_in: Optional[List[str]] = None) -> None: infer_cmd.extend(["--benchmark"]) if args.token_latency: infer_cmd.extend(["--token-latency"]) + if args.lm_head_generation: + infer_cmd.extend(["--lm-head-generation"]) if args.prompt is not None: infer_cmd.extend(["--prompt", str(args.prompt)]) @@ -473,6 +485,12 @@ def main(args_in: Optional[List[str]] = None) -> None: quant_cmd.extend(["--cache-weight-for-large-batch"]) if args.audio is not None: quant_cmd.extend(["--audio", str(args.audio)]) + if args.lm_head_generation: + print( + "LLM RUNTIME WARNING: `--lm-head-generation` is set. You cannot use the " + "quantized model for lamababa_openai accuracy test" + ) + quant_cmd.extend(["--lm-head-generation"]) if args.ipex_weight_only_quantization: quant_cmd.extend(["--ipex-weight-only-quantization"]) quant_cmd.extend(["--weight-dtype", str(args.weight_dtype)]) diff --git a/examples/cpu/inference/python/llm/single_instance/run_generation.py b/examples/cpu/inference/python/llm/single_instance/run_generation.py index 7e9bda416..09cbc6da7 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_generation.py +++ b/examples/cpu/inference/python/llm/single_instance/run_generation.py @@ -450,20 +450,20 @@ def trace_handler(prof): prof.step() print("\n", "-" * 10, "Summary:", "-" * 10) - latency = total_time / (num_iter - num_warmup) - print("Inference latency: %.3f sec." % latency) + latency = total_time / (num_iter - num_warmup) * 1000 + print("Inference latency: %.2f ms." % latency) if args.token_latency: import numpy as np from itertools import chain - first_latency = np.mean([x[0] for x in total_list]) + first_latency = np.mean([x[0] for x in total_list]) * 1000 average_2n = list(chain(*[x[1:] for x in total_list])) average_2n.sort() - average_2n_latency = np.mean(average_2n) - p90_latency = average_2n[int(len(average_2n) * 0.9)] - p99_latency = average_2n[int(len(average_2n) * 0.99)] - print("First token average latency: %.3f sec." % first_latency) - print("Average 2... latency: %.3f sec." % average_2n_latency) - print("P90 2... latency: %.3f sec." % p90_latency) - print("P99 2... latency: %.3f sec." % p99_latency) + average_2n_latency = np.mean(average_2n) * 1000 + p90_latency = average_2n[int(len(average_2n) * 0.9)] * 1000 + p99_latency = average_2n[int(len(average_2n) * 0.99)] * 1000 + print("First token average latency: %.2f ms." % first_latency) + print("Average 2... latency: %.2f ms." % average_2n_latency) + print("P90 2... latency: %.2f ms." % p90_latency) + print("P99 2... latency: %.2f ms." % p99_latency) diff --git a/examples/cpu/inference/python/llm/single_instance/run_quantization.py b/examples/cpu/inference/python/llm/single_instance/run_quantization.py index 37426edf0..c8fcde58a 100644 --- a/examples/cpu/inference/python/llm/single_instance/run_quantization.py +++ b/examples/cpu/inference/python/llm/single_instance/run_quantization.py @@ -252,6 +252,13 @@ " It brings better performance at the cost of higher memory usage. It is only valid for weight-only" " quantization with lowp-mode=BF16. Otherwise, it has no effect.", ) +parser.add_argument( + "--lm-head-generation", + action="store_true", + help="Compute lm-head only for the last token in the sequence to speed up first token inference." 
+ " This feature is not compatible with lambada_openai accuracy test. If you want to run" + " lambada_openai accuracy test with the quantized model afterwards, don't turn this feature on.", +) args = parser.parse_args() @@ -397,6 +404,9 @@ def load_image(image_file): if model.name == "whisper": config.text_max_length = config.max_source_positions + config.max_target_positions +if args.lm_head_generation and not hasattr(config, "lm_head_generation"): + config.lm_head_generation = True + user_model = model.get_user_model(config, args.benchmark) tokenizer = model.get_tokenizer() @@ -1257,19 +1267,19 @@ def trace_handler(prof): prof.step() print("\n", "-" * 10, "Summary:", "-" * 10) - latency = total_time / (num_iter - num_warmup) - print("Inference latency: %.3f sec." % latency) + latency = total_time / (num_iter - num_warmup) * 1000 + print("Inference latency: %.2f ms." % latency) if args.token_latency: import numpy as np from itertools import chain - first_latency = np.mean([x[0] for x in total_list]) + first_latency = np.mean([x[0] for x in total_list]) * 1000 average_2n = list(chain(*[x[1:] for x in total_list])) average_2n.sort() - average_2n_latency = np.mean(average_2n) - p90_latency = average_2n[int(len(average_2n) * 0.9)] - p99_latency = average_2n[int(len(average_2n) * 0.99)] - print("First token average latency: %.3f sec." % first_latency) - print("Average 2... latency: %.3f sec." % average_2n_latency) - print("P90 2... latency: %.3f sec." % p90_latency) - print("P99 2... latency: %.3f sec." % p99_latency) + average_2n_latency = np.mean(average_2n) * 1000 + p90_latency = average_2n[int(len(average_2n) * 0.9)] * 1000 + p99_latency = average_2n[int(len(average_2n) * 0.99)] * 1000 + print("First token average latency: %.2f ms." % first_latency) + print("Average 2... latency: %.2f ms." % average_2n_latency) + print("P90 2... latency: %.2f ms." % p90_latency) + print("P99 2... latency: %.2f ms." 
% p99_latency) From e044e3e04544fcc798f3bbf8222caa4f7969c41b Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Tue, 2 Jul 2024 17:07:21 +0800 Subject: [PATCH 144/199] Use woq_linear_silu and woq_linear_mul in MOE kernel (#3022) --- csrc/cpu/aten/Linear.h | 39 +++++++++++++++++++++++++++---- csrc/cpu/aten/kernels/MoEKrnl.cpp | 4 ++-- 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/csrc/cpu/aten/Linear.h b/csrc/cpu/aten/Linear.h index c037516f5..0b5181f02 100644 --- a/csrc/cpu/aten/Linear.h +++ b/csrc/cpu/aten/Linear.h @@ -11,10 +11,6 @@ namespace torch_ipex { namespace cpu { -at::Tensor woq_linear_forward( - const at::Tensor& input, - const at::Tensor& op_context); - void linear_kernel_output( const at::Tensor& self, const ideep::tensor& mkldnn_weight, @@ -84,6 +80,41 @@ at::Tensor ipex_linear_eltwise( #ifdef USE_LIBXSMM // WOQ linear ops +at::Tensor woq_linear_forward( + const at::Tensor& input, + const at::Tensor& op_context); + +at::Tensor woq_linear_gelu_forward( + const at::Tensor& input, + const at::Tensor& op_context); + +at::Tensor woq_linear_new_gelu_forward( + const at::Tensor& input, + const at::Tensor& op_context); + +at::Tensor woq_linear_relu_forward( + const at::Tensor& input, + const at::Tensor& op_context); + +at::Tensor woq_linear_silu_forward( + const at::Tensor& input, + const at::Tensor& op_context); + +at::Tensor woq_linear_add_forward( + const at::Tensor& input, + const at::Tensor& op_context, + const std::vector& others); + +at::Tensor woq_linear_add_add_forward( + const at::Tensor& input, + const at::Tensor& op_context, + const std::vector& others); + +at::Tensor woq_linear_mul_forward( + const at::Tensor& input, + const at::Tensor& op_context, + const std::vector& others); + at::Tensor woq_linear_pack_weight( const at::Tensor& weight, int64_t weight_dtype, diff --git a/csrc/cpu/aten/kernels/MoEKrnl.cpp b/csrc/cpu/aten/kernels/MoEKrnl.cpp index 80d3ae2fa..a01ead79e 100644 --- a/csrc/cpu/aten/kernels/MoEKrnl.cpp +++ b/csrc/cpu/aten/kernels/MoEKrnl.cpp @@ -121,8 +121,8 @@ at::Tensor mixtral_moe_woq_kernl_impl( auto curr_state = hidden_states.index({top_x}).unsqueeze(0); auto routing_w = routing_weights.index({top_x, idx}).unsqueeze(-1); curr_state = woq_linear_forward( - at::silu(woq_linear_forward(curr_state, gate_wei)) * - woq_linear_forward(curr_state, up_wei), + woq_linear_mul_forward( + curr_state, up_wei, {woq_linear_silu_forward(curr_state, gate_wei)}), down_wei); if (is_distributed) { From efb9d6bc7ad9c66146476cdb757e6d6f26ed6724 Mon Sep 17 00:00:00 2001 From: Chunyuan WU Date: Tue, 2 Jul 2024 21:43:09 +0800 Subject: [PATCH 145/199] update oneDNN to 52e76b97c5 on rls-v3.5 (#3026) --- third_party/ideep | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/ideep b/third_party/ideep index f088291d3..341be1f78 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit f088291d374940ee3495372f9670d6a8778e364e +Subproject commit 341be1f78a52b513f156d3a9f094c85a21c999c9 From 20c2e77eb9d36868b6562fef1899fa2ed47a18b3 Mon Sep 17 00:00:00 2001 From: Zaili Wang <109502517+ZailiWang@users.noreply.github.com> Date: Tue, 2 Jul 2024 21:44:02 +0800 Subject: [PATCH 146/199] r23100 backport to main (#3023) * r23100 backport to main * update path after example codes reconstruct --------- Co-authored-by: Chunyuan WU --- README.md | 6 ++- docker/Dockerfile.prebuilt | 2 +- docs/_static/htmls/tbl_deepspeed.html | 42 +++++++++++++++++-- docs/_static/htmls/tbl_single.html | 36 ++++++++++++++++ 
 .../features/sq_recipe_tuning_api.md          |  2 +-
 docs/tutorials/installation.md                |  2 +-
 docs/tutorials/introduction.rst               |  2 +-
 docs/tutorials/llm/llm_optimize.md            |  4 +-
 docs/tutorials/releases.md                    | 16 +++++++
 examples/cpu/inference/cpp/README.md          |  4 +-
 examples/cpu/inference/python/llm/README.md   | 12 +++++-
 .../inference/python/llm/tools/env_setup.sh   |  1 +
 .../inference/python/python-scripts/README.md |  2 +-
 13 files changed, 117 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index f2f90b7d6..0dec44ed7 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Intel® Extension for PyTorch\*
-**CPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.3.0%2Bcpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
+**CPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=cpu&version=v2.3.100%2Bcpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/inference/python/llm)
 **GPU** [💻main branch](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main)   |   [🌱Quick Start](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/tutorials/getting_started.html)   |   [📖Documentations](https://intel.github.io/intel-extension-for-pytorch/xpu/latest/)   |   [🏃Installation](https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu)   |   [💻LLM Example](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/inference/python/llm)
 
 Intel® Extension for PyTorch\* extends PyTorch\* with up-to-date features optimizations for an extra performance boost on Intel hardware. Optimizations take advantage of Intel® Advanced Vector Extensions 512 (Intel® AVX-512) Vector Neural Network Instructions (VNNI) and Intel® Advanced Matrix Extensions (Intel® AMX) on Intel CPUs as well as Intel Xe Matrix Extensions (XMX) AI engines on Intel discrete GPUs. Moreover, Intel® Extension for PyTorch* provides easy GPU acceleration for Intel discrete GPUs through the PyTorch* xpu device.
@@ -48,6 +48,10 @@ In the current technological landscape, Generative AI (GenAI) workloads and mode
 |GIT| microsoft/git-base | 🟩 | 🟩 | | 🟩 | |
 |Yuan| IEITYuan/Yuan2-102B-hf | 🟩 | 🟩 | | 🟨 | |
 |Phi| microsoft/phi-2 | 🟩 | 🟩 | 🟩 | 🟩 | 🟨 |
+|Phi| microsoft/Phi-3-mini-4k-instruct | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 |
+|Phi| microsoft/Phi-3-mini-128k-instruct | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 |
+|Phi| microsoft/Phi-3-medium-4k-instruct | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 |
+|Phi| microsoft/Phi-3-medium-128k-instruct | 🟩 | 🟩 | 🟨 | 🟩 | 🟨 |
 
 - 🟩 signifies that the model can perform well and with good accuracy (<1% difference as compared with FP32).
 
diff --git a/docker/Dockerfile.prebuilt b/docker/Dockerfile.prebuilt
index e6561cc50..5d2275355 100644
--- a/docker/Dockerfile.prebuilt
+++ b/docker/Dockerfile.prebuilt
@@ -28,7 +28,7 @@ RUN ${PYTHON} -m pip --no-cache-dir install --upgrade \
 # Some TF tools expect a "python" binary
 RUN ln -s $(which ${PYTHON}) /usr/local/bin/python
 
-ARG IPEX_VERSION=2.3.0
+ARG IPEX_VERSION=2.3.100
 ARG PYTORCH_VERSION=2.3.0
 ARG TORCHAUDIO_VERSION=2.3.0
 ARG TORCHVISION_VERSION=0.18.0
diff --git a/docs/_static/htmls/tbl_deepspeed.html b/docs/_static/htmls/tbl_deepspeed.html
index 20b94d8bd..0d0a3977f 100644
--- a/docs/_static/htmls/tbl_deepspeed.html
+++ b/docs/_static/htmls/tbl_deepspeed.html
@@ -123,29 +123,65 @@
       <td><p>🟩</p></td>
       </tr>
+      <tr class="row-even"><td><p>Mistral</p></td>
+      <td><p>mistralai/Mixtral-8x7B-v0.1</p></td>
+      <td><p>🟩</p></td>
+      <td><p>🟩</p></td>
+      </tr>
       <tr class="row-odd"><td><p>MPT</p></td>
       <td><p>mosaicml/mpt-7b</p></td>
       <td><p>🟩</p></td>
       <td><p>🟩</p></td>
       </tr>
       <tr class="row-even"><td><p>Stablelm</p></td>
       <td><p>stabilityai/stablelm-2-1_6b</p></td>
       <td><p>🟩</p></td>
       <td><p>🟩</p></td>
       </tr>
       <tr class="row-odd"><td><p>Qwen</p></td>
       <td><p>Qwen/Qwen-7B-Chat</p></td>
       <td><p>🟩</p></td>
       <td><p>🟩</p></td>
       </tr>
       <tr class="row-even"><td><p>GIT</p></td>
       <td><p>microsoft/git-base</p></td>
       <td><p>🟩</p></td>
       <td><p>🟩</p></td>
       </tr>
+      <tr class="row-odd"><td><p>Phi</p></td>
+      <td><p>microsoft/phi-2</p></td>
+      <td><p>🟩</p></td>
+      <td><p>🟩</p></td>
+      </tr>
+      <tr class="row-even"><td><p>Phi</p></td>
+      <td><p>microsoft/Phi-3-mini-4k-instruct</p></td>
+      <td><p>🟩</p></td>
+      <td><p>🟩</p></td>
+      </tr>
+      <tr class="row-odd"><td><p>Phi</p></td>
+      <td><p>microsoft/Phi-3-mini-128k-instruct</p></td>
+      <td><p>🟩</p></td>
+      <td><p>🟩</p></td>
+      </tr>
+      <tr class="row-even"><td><p>Phi</p></td>
+      <td><p>microsoft/Phi-3-medium-4k-instruct</p></td>
+      <td><p>🟩</p></td>
+      <td><p>🟩</p></td>
+      </tr>
+      <tr class="row-odd"><td><p>Phi</p></td>
+      <td><p>microsoft/Phi-3-medium-128k-instruct</p></td>
+      <td><p>🟩</p></td>
+      <td><p>🟩</p></td>