diff --git a/examples/large_models/ipex_llm_int8/README.md b/examples/large_models/ipex_llm_int8/README.md
new file mode 100644
index 0000000000..bb9e4bc4c0
--- /dev/null
+++ b/examples/large_models/ipex_llm_int8/README.md
@@ -0,0 +1,53 @@
+# Serving IPEX Optimized Models
+This example demonstrates serving IPEX-optimized LLMs, e.g. ```meta-llama/Llama-2-7b-hf``` from Hugging Face. To set up the Python environment for this example, please refer to: https://github.com/intel/intel-extension-for-pytorch/blob/main/examples/cpu/inference/python/llm/README.md#3-environment-setup
+
+
+1. Run the model archiver
+```
+torch-model-archiver --model-name llama2-7b --version 1.0 --handler llm_handler.py --config-file model-config-llama2-7b-int8-woq.yaml --archive-format no-archive
+```
+
+2. Move the model inside model_store
+```
+mkdir model_store
+mv llama2-7b ./model_store
+```
+
+3. Start TorchServe
+```
+torchserve --ncs --start --model-store model_store --models llama2-7b --ts-config config.properties
+```
+
+4. Check the model status
+```
+curl http://localhost:8081/models/llama2-7b
+```
+
+5. Send an inference request
+```
+curl http://localhost:8080/predictions/llama2-7b -T ./sample_text_0.txt
+```
+## Model Config
+In addition to the usual TorchServe configuration, you need to set the IPEX-specific optimization arguments described below.
+
+To enable IPEX, set ```ipex_enable=true``` in the ```config.properties``` file. If it is not enabled, the handler runs with default PyTorch, using ```auto_mixed_precision``` if that flag is set. To enable ```auto_mixed_precision```, set ```auto_mixed_precision: true``` in the model-config file.
+
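+For reference, the ```config.properties``` shipped with this example enables IPEX and the CPU launcher (pinned to NUMA node 0):
+
+```
+ipex_enable=true
+cpu_launcher_enable=true
+cpu_launcher_args=--node_id 0
+```
+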
+You can choose either weight-only quantization (WoQ) or SmoothQuant to quantize the model to ```INT8```. If the ```quant_with_amp``` flag is set to ```true```, a mix of ```INT8``` and ```bfloat16``` precisions is used; otherwise the computation uses ```INT8``` and ```FP32```. If neither quantization approach is enabled, the model runs in ```bfloat16``` precision as long as ```quant_with_amp``` or ```auto_mixed_precision``` is set to ```true```, and in ```FP32``` otherwise.
+
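+The precision selection boils down to the following logic (a simplified sketch of what ```llm_handler.py``` does; the helper name is only for illustration):
+
+```
+import torch
+
+
+def select_autocast_dtype(quant_with_amp: bool, auto_mixed_precision: bool):
+    # mirrors the dtype selection in llm_handler.py
+    if quant_with_amp or auto_mixed_precision:
+        return True, torch.bfloat16  # INT8 + bf16 mix, or plain bf16 without quantization
+    return False, torch.float32  # INT8 + FP32 mix, or plain FP32
+```
+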
+There are three example model-config files: ```model-config-llama2-7b-int8-sq.yaml``` for quantizing with SmoothQuant, ```model-config-llama2-7b-int8-woq.yaml``` for quantizing with weight-only quantization, and ```model-config-llama2-7b-bf16.yaml``` for running text generation in bfloat16 precision.
+
+### IPEX Weight Only Quantization
+
+ - woq_dtype: weight data type for weight-only quantization. Options: INT8 or INT4.
+
+ - lowp_mode: low precision mode for weight-only quantization. It selects the data type used for computation. Options: FP32, FP16, BF16, or INT8. A simplified sketch of how the handler applies these settings is shown below.
+
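+The following sketch is adapted from ```llm_handler.py```; the ```woq_optimize``` wrapper is hypothetical and uses the defaults from the example config:
+
+```
+import intel_extension_for_pytorch as ipex
+import torch
+
+
+def woq_optimize(model):
+    # weight dtype: torch.qint8 for woq_dtype "INT8", torch.quint4x2 for "INT4"
+    qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
+        weight_dtype=torch.qint8,
+        lowp_mode=ipex.quantization.WoqLowpMode.BF16,  # lowp_mode: "BF16"
+        act_quant_mode=ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,  # act_quant_mode
+        group_size=-1,  # group_size
+    )
+    # quantize the weights and prepare the model for CPU text generation
+    return ipex.llm.optimize(
+        model.eval(),
+        dtype=torch.bfloat16,  # quant_with_amp: true
+        quantization_config=qconfig,
+        inplace=True,
+        deployment_mode=False,
+    )
+```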
+
+### IPEX Smooth Quantization
+
+
+ - calibration_dataset and calibration_split: dataset and split used to calibrate the model for quantization
+
+ - num_calibration_iters: number of calibration iterations
+
+ - alpha: a floating point number between 0.0 and 1.0. For more advanced SmoothQuant configurations, explore the IPEX quantization recipes (https://github.com/intel/intel-extension-for-pytorch/blob/main/examples/cpu/inference/python/llm/single_instance/run_quantization.py). A simplified sketch of the calibration flow is shown below.
+
+
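+The SmoothQuant path follows the usual IPEX calibrate-then-convert flow (a simplified sketch of ```llm_handler.py```; ```smooth_quantize``` is a hypothetical wrapper around the calls the handler makes):
+
+```
+import intel_extension_for_pytorch as ipex
+import torch
+from intel_extension_for_pytorch.quantization import convert, prepare
+
+
+def smooth_quantize(model, example_inputs, calib_dataloader, num_calibration_iters=32, alpha=0.9):
+    qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=alpha)
+    model = ipex.llm.optimize(
+        model.eval(),
+        dtype=torch.bfloat16,  # quant_with_amp: true
+        quantization_config=qconfig,
+        inplace=True,
+        deployment_mode=False,
+    )
+    # insert observers, run a few calibration batches, then convert to INT8
+    prepared = prepare(model.eval(), qconfig, example_inputs=example_inputs, inplace=True)
+    for i, (model_inputs, _last_ind) in enumerate(calib_dataloader):
+        if i == num_calibration_iters:
+            break
+        prepared(*model_inputs)
+    return convert(prepared.eval(), inplace=True).eval()
+```
+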
+Set ```greedy``` to ```true``` if you want greedy search decoding. If set to ```false```, beam search with 4 beams is performed by default.
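+
+Internally, ```greedy``` only changes the number of beams passed to ```generate``` (illustrative sketch; ```build_generate_kwargs``` is not part of the handler):
+
+```
+def build_generate_kwargs(greedy: bool, max_new_tokens: int = 128):
+    # mirrors llm_handler.py: greedy search uses 1 beam, otherwise beam search with 4 beams
+    num_beams = 1 if greedy else 4
+    return dict(
+        do_sample=False,
+        temperature=0.9,
+        num_beams=num_beams,
+        max_new_tokens=max_new_tokens,
+    )
+```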
diff --git a/examples/large_models/ipex_llm_int8/config.properties b/examples/large_models/ipex_llm_int8/config.properties
new file mode 100644
index 0000000000..9460c6729b
--- /dev/null
+++ b/examples/large_models/ipex_llm_int8/config.properties
@@ -0,0 +1,3 @@
+ipex_enable=true
+cpu_launcher_enable=true
+cpu_launcher_args=--node_id 0
diff --git a/examples/large_models/ipex_llm_int8/llm_handler.py b/examples/large_models/ipex_llm_int8/llm_handler.py
new file mode 100644
index 0000000000..90fbc93546
--- /dev/null
+++ b/examples/large_models/ipex_llm_int8/llm_handler.py
@@ -0,0 +1,659 @@
+import logging
+import os
+import re
+
+import torch
+import transformers
+from datasets import load_dataset
+from torch.utils.data import DataLoader
+from transformers import (
+ AutoConfig,
+ AutoModelForCausalLM,
+ AutoTokenizer,
+ T5ForConditionalGeneration,
+)
+
+from ts.context import Context
+from ts.torch_handler.base_handler import BaseHandler
+
+logger = logging.getLogger(__name__)
+logger.info("PyTorch version %s", torch.__version__)
+logger.info("Transformers version %s", transformers.__version__)
+
+IPEX_ENABLE = False
+if os.environ.get("TS_IPEX_ENABLE", "false") == "true":
+ try:
+ import intel_extension_for_pytorch as ipex
+
+ try:
+ ipex._C.disable_jit_linear_repack()
+ torch._C._jit_set_texpr_fuser_enabled(False)
+ except Exception:
+ pass
+ IPEX_ENABLE = True
+ logger.info("IPEX optimization is enabled")
+ logger.info("IPEX version %s", ipex.__version__)
+
+ except ImportError as error:
+ logger.warning(
+ "IPEX is enabled but intel-extension-for-pytorch cannot be imported. Proceeding without IPEX"
+ )
+ IPEX_ENABLE = False
+else:
+ logger.warning(
+ "IPEX is not enabled, consider enabling it for best performance on Intel hardware"
+ )
+
+
+class IpexLLMHandler(BaseHandler):
+ def __init__(self):
+ super(IpexLLMHandler, self).__init__()
+
+ # for streaming the generated texts back to client
+ self.output_streamer = None
+
+ def initialize(self, ctx: Context):
+ model_name = ctx.model_yaml_config["handler"]["model_name"]
+ # path to quantized model, if we are quantizing on the fly, we'll use this path to save the model
+ self.clear_cache_dir = ctx.model_yaml_config["handler"].get(
+ "clear_cache_dir", False
+ )
+ self.quantized_model_path = ctx.model_yaml_config["handler"].get(
+ "quantized_model_path", "best_model.pt"
+ )
+ self.example_inputs_mode = ctx.model_yaml_config["handler"].get(
+ "example_inputs_mode", "MASK_KV_POS"
+ )
+ self.to_channels_last = ctx.model_yaml_config["handler"].get(
+ "to_channels_last", False
+ )
+
+ # generation params
+ self.batch_size = int(ctx.model_yaml_config["handler"].get("batch_size", "1"))
+ self.input_tokens = int(
+ ctx.model_yaml_config["handler"].get("input_tokens", "1024")
+ )
+ self.max_new_tokens = int(
+ ctx.model_yaml_config["handler"].get("max_new_tokens", "128")
+ )
+
+ # enable auto mix precision
+ self.auto_mixed_precision = ctx.model_yaml_config["handler"].get(
+ "auto_mixed_precision", True
+ )
+
+ # use int8 bf16 mix
+ self.quant_with_amp = ctx.model_yaml_config["handler"].get(
+ "quant_with_amp", True
+ )
+
+ # WoQ related optimization params
+ self.ipex_weight_only_quantization = ctx.model_yaml_config["handler"].get(
+ "ipex_weight_only_quantization", False
+ )
+ if self.ipex_weight_only_quantization:
+ self.woq_dtype = ctx.model_yaml_config["handler"].get("woq_dtype", "INT8")
+ self.lowp_mode = ctx.model_yaml_config["handler"].get("lowp_mode", "BF16")
+ self.act_quant_mode = ctx.model_yaml_config["handler"].get(
+ "act_quant_mode", "PER_IC_BLOCK"
+ ) # This is only relevant for INT4x2 quantization
+ self.group_size = int(
+ ctx.model_yaml_config["handler"].get("group_size", "-1")
+ )
+
+ # SQ related optimization params
+ self.ipex_smooth_quantization = ctx.model_yaml_config["handler"].get(
+ "ipex_smooth_quantization", False
+ )
+ if self.ipex_smooth_quantization:
+ self.num_calib_iters = int(
+ ctx.model_yaml_config["handler"].get("num_calibration_iters", 32)
+ )
+ self.alpha = float(ctx.model_yaml_config["handler"].get("alpha", 0.9))
+
+ # kept outside the SQ branch because the calibration dataset is also used for tracing
+ self.calib_dataset = ctx.model_yaml_config["handler"].get(
+ "calibration_dataset", "NeelNanda/pile-10k"
+ )
+ self.calib_split = ctx.model_yaml_config["handler"].get(
+ "calibration_split", "train"
+ )
+
+ # decoding parameters
+ self.greedy = ctx.model_yaml_config["handler"].get("greedy", False)
+
+ # amp datatype
+ if self.quant_with_amp or self.auto_mixed_precision:
+ self.amp_enabled = True
+ self.amp_dtype = torch.bfloat16
+ else:
+ self.amp_enabled = False
+ self.amp_dtype = torch.float32
+
+ # generation kwargs: greedy search or beam search with 4 beams, controlled by the `greedy` flag
+ self.num_beams = 1 if self.greedy else 4
+ # do not use a minimum number of tokens in demo mode, only in benchmark mode
+ self.generate_kwargs = dict(
+ do_sample=False,
+ temperature=0.9,
+ num_beams=self.num_beams,
+ max_new_tokens=self.max_new_tokens,
+ )
+
+ # device
+ device = torch.device("cpu")
+
+ # model config
+ config = AutoConfig.from_pretrained(
+ model_name, torchscript=True, trust_remote_code=True
+ )
+
+ # set up max context
+ if not hasattr(config, "text_max_length"):
+ config.text_max_length = int(self.input_tokens) + int(self.max_new_tokens)
+ if "mpt" in model_name and not hasattr(config, "max_seq_len"):
+ config.max_seq_len = int(self.input_tokens) + int(self.max_new_tokens)
+
+ # load model and tokenizer
+ # T5 needs special handling because it's a seq2seq model and cannot be loaded with AutoModelForCausalLM
+ if re.search("t5", config.architectures[0], re.IGNORECASE):
+ self.user_model = T5ForConditionalGeneration.from_pretrained(
+ model_name,
+ config=config,
+ low_cpu_mem_usage=True,
+ torch_dtype=torch.float,
+ )
+ input_ids = torch.ones(32).to(torch.long).unsqueeze(0)
+ attention_mask = torch.ones_like(input_ids)
+ dummy_inputs = self.user_model.prepare_inputs_for_generation(
+ input_ids, attention_mask=attention_mask
+ )
+ if dummy_inputs.get("position_ids", None) is not None:
+ self.example_inputs_mode = "MASK_KV_POS"
+
+ # we also need to update generation kwargs
+ self.generate_kwargs["max_length"] = self.generate_kwargs["max_new_tokens"]
+ self.generate_kwargs.pop("max_new_tokens")
+
+ else:
+ self.user_model = AutoModelForCausalLM.from_pretrained(
+ model_name,
+ config=config,
+ trust_remote_code=True,
+ low_cpu_mem_usage=True,
+ torch_dtype=torch.float,
+ )
+
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ model_name, trust_remote_code=True
+ )
+
+ logger.info("Data type of the model: %s", self.user_model.dtype)
+
+ if self.to_channels_last:
+ self.user_model = self.user_model.to(memory_format=torch.channels_last)
+ self.user_model.eval()
+
+ # dummy past key value
+ self.beam_idx_tmp = torch.zeros(
+ (2048, int(self.batch_size * self.num_beams)), dtype=torch.long
+ ).contiguous()
+
+ def _get_target_nums(names):
+ for n in names:
+ if hasattr(self.user_model.config, n):
+ return getattr(self.user_model.config, n)
+ logger.error(f"Not found target {names[0]}")
+ exit(1)
+
+ num_heads_names = ["num_attention_heads", "n_head", "num_heads", "n_heads"]
+ num_layers_names = ["num_hidden_layers", "n_layer", "num_layers", "n_layers"]
+ hidden_size_names = ["hidden_size", "n_embd", "d_model"]
+ n_heads = _get_target_nums(num_heads_names)
+ n_layers = _get_target_nums(num_layers_names)
+ hidden_size = _get_target_nums(hidden_size_names)
+ head_dim = int(hidden_size / n_heads)
+ self.global_past_key_value = [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
+ torch.zeros([1, n_heads, 1, head_dim]).contiguous(),
+ torch.zeros([1, n_heads, 1, head_dim]).contiguous(),
+ self.beam_idx_tmp,
+ )
+ for i in range(n_layers)
+ ]
+
+ logger.info(
+ f"num_attention_heads: {n_heads}, num_hidden_layers: {n_layers}, hidden size: {hidden_size}, head_dim: {head_dim}"
+ )
+
+ logger.info("Preparing the dataset for calibration and tracing")
+
+ class Evaluator:
+ def __init__(
+ self,
+ user_model,
+ example_inputs_mode,
+ global_past_key_value,
+ dataset,
+ tokenizer,
+ batch_size=1,
+ num_beams=1,
+ pad_val=1,
+ pad_max=512,
+ ):
+ self.user_model = user_model
+ self.example_inputs_mode = example_inputs_mode
+ self.global_past_key_value = global_past_key_value
+ self.dataset = dataset
+ self.tokenizer = tokenizer
+ self.batch_size = batch_size
+ self.num_beams = num_beams
+
+ self.pad_val = pad_val
+ self.pad_max = pad_max
+ self.dataset = self.dataset.map(self.tokenize_function, batched=True)
+ self.dataset.set_format(type="torch", columns=["input_ids"])
+
+ @torch.no_grad()
+ def tokenize_function(self, examples):
+ if "text" in examples:
+ example = self.tokenizer(examples["text"])
+ elif "prompt" in examples:
+ example = self.tokenizer(examples["prompt"])
+ elif "code" in examples:
+ example = self.tokenizer(examples["code"])
+ return example
+
+ @torch.no_grad()
+ def collate_batch(self, batch):
+ position_ids_padded = []
+ input_ids_padded = []
+ last_ind = []
+ attention_mask_padded = []
+
+ for text in batch:
+ input_ids = text["input_ids"]
+ last_ind.append(input_ids.shape[0] - 1)
+ attention_mask = torch.ones(len(input_ids))
+ position_ids = torch.arange(len(input_ids))
+
+ input_ids_padded.append(input_ids)
+ attention_mask_padded.append(attention_mask)
+ position_ids_padded.append(position_ids)
+
+ if self.example_inputs_mode == "MASK_POS_KV":
+ model_inputs = (
+ torch.vstack(input_ids_padded),
+ torch.vstack(attention_mask_padded),
+ torch.vstack(position_ids_padded),
+ tuple(self.global_past_key_value),
+ )
+ elif self.example_inputs_mode == "MASK_KV_POS":
+ model_inputs = (
+ torch.vstack(input_ids_padded),
+ torch.vstack(attention_mask_padded),
+ tuple(self.global_past_key_value),
+ torch.vstack(position_ids_padded),
+ )
+ elif self.example_inputs_mode == "KV_MASK":
+ model_inputs = (
+ torch.vstack(input_ids_padded),
+ tuple(self.global_past_key_value),
+ torch.vstack(attention_mask_padded),
+ )
+ elif self.example_inputs_mode == "MASK_KV":
+ model_inputs = (
+ torch.vstack(input_ids_padded),
+ torch.vstack(attention_mask_padded),
+ tuple(self.global_past_key_value),
+ )
+ elif self.example_inputs_mode == "MASK_KV_ENC":
+ model_kwargs = {
+ "attention_mask": torch.vstack(attention_mask_padded),
+ }
+ model_kwargs = (
+ self.user_model._prepare_encoder_decoder_kwargs_for_generation(
+ torch.vstack(input_ids_padded), model_kwargs, "input_ids"
+ )
+ )
+ (
+ input_ids,
+ example_inputs,
+ ) = self.user_model._expand_inputs_for_generation(
+ input_ids=torch.vstack(input_ids_padded),
+ expand_size=self.num_beams,
+ is_encoder_decoder=True,
+ **model_kwargs,
+ )
+
+ # need to recompute these
+ def _get_target_nums(names):
+ for n in names:
+ if hasattr(self.user_model.config, n):
+ return getattr(self.user_model.config, n)
+ logger.error(f"Not found target {names[0]}")
+ exit(1)
+
+ num_heads_names = [
+ "num_attention_heads",
+ "n_head",
+ "num_heads",
+ "n_heads",
+ ]
+ num_layers_names = [
+ "num_hidden_layers",
+ "n_layer",
+ "num_layers",
+ "n_layers",
+ ]
+ hidden_size_names = ["hidden_size", "n_embd"]
+ n_heads = _get_target_nums(num_heads_names)
+ n_layers = _get_target_nums(num_layers_names)
+ hidden_size = _get_target_nums(hidden_size_names)
+ head_dim = int(hidden_size / n_heads)
+
+ # lets get the inputs
+ beam_idx_tmp = torch.zeros(
+ (2048, int(self.batch_size * self.num_beams)), dtype=torch.long
+ ).contiguous()
+ input_bs = int(self.batch_size * self.num_beams)
+ last_hidden_state = example_inputs["encoder_outputs"][
+ "last_hidden_state"
+ ]
+ global_past_key_value = tuple(
+ [
+ (
+ torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
+ torch.zeros([1, n_heads, 1, head_dim]).contiguous(),
+ torch.zeros([1, n_heads, 1, head_dim]).contiguous(),
+ beam_idx_tmp,
+ torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
+ self.user_model.decoder.block[i]
+ .layer[1]
+ .EncDecAttention.k(last_hidden_state)
+ .view(input_bs, -1, n_heads, head_dim)
+ .transpose(0, 1),
+ self.user_model.decoder.block[i]
+ .layer[1]
+ .EncDecAttention.v(last_hidden_state)
+ .view(input_bs, -1, n_heads, head_dim)
+ .transpose(0, 1),
+ beam_idx_tmp,
+ )
+ for i in range(n_layers)
+ ]
+ )
+
+ decoder_input_ids = (
+ torch.zeros(input_bs).to(torch.long).unsqueeze(1)
+ )
+ model_inputs = (
+ decoder_input_ids,
+ torch.vstack(attention_mask_padded),
+ tuple(global_past_key_value),
+ (last_hidden_state,),
+ )
+ else:
+ raise RuntimeError(
+ "Your model does not match existing example inputs used in ipex smooth quant, exiting..."
+ )
+
+ # Some models require extra inputs
+ if re.search(
+ "chatglm", self.user_model.config.architectures[0], re.IGNORECASE
+ ):
+ extra_inputs = (torch.tensor(True),)
+ model_inputs = model_inputs + extra_inputs
+
+ return (model_inputs, last_ind)
+
+ calib_dataset = load_dataset(self.calib_dataset, split=self.calib_split)
+ logger.info(f"Dataset loaded: {calib_dataset}")
+ calib_evaluator = Evaluator(
+ self.user_model,
+ self.example_inputs_mode,
+ self.global_past_key_value,
+ calib_dataset,
+ self.tokenizer,
+ batch_size=self.batch_size,
+ num_beams=self.num_beams,
+ pad_max=int(self.input_tokens)
+ if re.search("t5", config.architectures[0], re.IGNORECASE)
+ else 512,
+ )
+ logger.info(f"Evaluator built: {calib_evaluator}")
+
+ self.calib_dataloader = DataLoader(
+ calib_evaluator.dataset,
+ batch_size=1,
+ shuffle=False,
+ collate_fn=calib_evaluator.collate_batch,
+ )
+ logger.info("Dataloader is built successfully!")
+
+ if IPEX_ENABLE:
+ """
+ IPEX is enabled, so we use
+ (1) weight-only quantization if ipex_weight_only_quantization is enabled
+ (2) IPEX smooth quantization if ipex_smooth_quantization is enabled
+ (3) IPEX bfloat16 optimization if neither quantization is enabled
+ (4) an error is raised if both (1) and (2) are enabled
+ """
+
+ def trace_and_export(model):
+ example_inputs = self.get_example_inputs()
+
+ with torch.no_grad(), torch.cpu.amp.autocast(
+ enabled=self.amp_enabled, dtype=self.amp_dtype
+ ):
+ self_jit = torch.jit.trace(
+ model.eval(), example_inputs, strict=False, check_trace=False
+ )
+ self_jit = torch.jit.freeze(self_jit.eval())
+
+ self_jit.save(self.quantized_model_path)
+
+ if self.ipex_smooth_quantization and self.ipex_weight_only_quantization:
+ logger.error("Can't enable both SQ and WoQ, enable only one of them")
+ exit(1)
+
+ # Clear the cache dir if needed
+ if self.clear_cache_dir and os.path.exists(self.quantized_model_path):
+ os.remove(self.quantized_model_path)
+
+ if os.path.exists(self.quantized_model_path):
+ # this skips all the optimizations and goes to end where we load the model
+ logger.info(
+ "Found a previously quantized model and loading it. To re-quantize the model, enable clear_cache_dir in the model-config file"
+ )
+
+ # weight-only quantization path
+ elif self.ipex_weight_only_quantization:
+ weight_dtype = (
+ torch.quint4x2 if self.woq_dtype == "INT4" else torch.qint8
+ )
+
+ if self.lowp_mode == "INT8":
+ lowp_mode = ipex.quantization.WoqLowpMode.INT8
+ elif self.lowp_mode == "FP32":
+ lowp_mode = ipex.quantization.WoqLowpMode.NONE
+ elif self.lowp_mode == "FP16":
+ lowp_mode = ipex.quantization.WoqLowpMode.FP16
+ elif self.lowp_mode == "BF16":
+ lowp_mode = ipex.quantization.WoqLowpMode.BF16
+ else:
+ lowp_mode = ipex.quantization.WoqLowpMode.BF16
+
+ act_quant_mode_dict = {
+ "PER_TENSOR": ipex.quantization.WoqActQuantMode.PER_TENSOR,
+ "PER_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_IC_BLOCK,
+ "PER_BATCH": ipex.quantization.WoqActQuantMode.PER_BATCH,
+ "PER_BATCH_IC_BLOCK": ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK,
+ }
+
+ qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping(
+ weight_dtype=weight_dtype,
+ lowp_mode=lowp_mode,
+ act_quant_mode=act_quant_mode_dict[self.act_quant_mode],
+ group_size=self.group_size,
+ )
+
+ # a low-precision checkpoint could be loaded here, but we assume there isn't one
+ low_precision_checkpoint = None
+ self.user_model = ipex.llm.optimize(
+ self.user_model.eval(),
+ dtype=self.amp_dtype,
+ quantization_config=qconfig,
+ inplace=True,
+ low_precision_checkpoint=low_precision_checkpoint,
+ deployment_mode=False,
+ )
+ logger.info(
+ "The model conversion completed, now tracing the quantized model"
+ )
+
+ trace_and_export(self.user_model)
+
+ logger.info(
+ "The IPEX Weight only quantization has been completed successfully"
+ )
+
+ elif self.ipex_smooth_quantization:
+ from intel_extension_for_pytorch.quantization import convert, prepare
+
+ example_inputs = self.get_example_inputs()
+ qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(
+ alpha=self.alpha
+ )
+ user_model = ipex.llm.optimize(
+ self.user_model.eval(),
+ dtype=self.amp_dtype,
+ quantization_config=qconfig,
+ inplace=True,
+ deployment_mode=False,
+ )
+
+ prepared_model = prepare(
+ user_model.eval(),
+ qconfig,
+ example_inputs=example_inputs,
+ inplace=True,
+ )
+ logger.info("Model prepared for quantization, observers inserted")
+
+ for i, (model_inputs, last_ind) in enumerate(self.calib_dataloader):
+ if i == self.num_calib_iters:
+ break
+ prepared_model(*model_inputs)
+ logger.info("Model calibration completed")
+
+ converted_model = convert(prepared_model.eval(), inplace=True).eval()
+ logger.info("Model converted successfully, exporting the trace")
+
+ trace_and_export(converted_model)
+
+ logger.info("IPEX Smooth Quantization has completed successfully")
+
+ else:
+ # run bf16 model
+ example_inputs = self.get_example_inputs()
+ self.user_model = ipex.llm.optimize(
+ self.user_model.eval(),
+ dtype=self.amp_dtype,
+ inplace=True,
+ deployment_mode=False,
+ )
+
+ trace_and_export(self.user_model)
+ logger.info("IPEX bf16 optimization is applied successfully")
+
+ logger.info("Loading the IPEX quantized model")
+ try:
+ self_jit = torch.jit.load(self.quantized_model_path)
+ self_jit = torch.jit.freeze(self_jit.eval())
+ except Exception as e:
+ logger.error("Error: loading the quantized model failed. %s", e)
+ exit(1)
+
+ setattr(self.user_model, "trace_graph", self_jit)
+ logger.info(
+ f"Successfully loaded the model {model_name} with Intel® Extension for PyTorch*"
+ )
+
+ else:
+ # No IPEX optimization is applied; PyTorch autocast (if enabled) will still be used during generation
+ logger.warning(
+ "No IPEX optimization is applied, Pytorch default autocast will be applied if enabled"
+ )
+
+ # set PAD token
+ if self.tokenizer.pad_token is None:
+ if re.search(
+ "qwen", self.user_model.config.architectures[0], re.IGNORECASE
+ ):
+ self.tokenizer.pad_token = "<|endoftext|>"
+ else:
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+
+ # we use the data collator built above to generate example inputs
+ def get_example_inputs(self):
+ (model_inputs, last_ind) = next(iter(self.calib_dataloader))
+ return model_inputs
+
+ def preprocess(self, requests):
+ input_ids_batch = None
+ attention_mask_batch = None
+ for idx, data in enumerate(requests):
+ input_text = data.get("data")
+ if input_text is None:
+ input_text = data.get("body")
+ if isinstance(input_text, (bytes, bytearray)):
+ input_text = input_text.decode("utf-8")
+
+ with torch.inference_mode(), torch.no_grad(), torch.autocast(
+ device_type="cpu", enabled=self.amp_enabled, dtype=self.amp_dtype
+ ):
+ inputs = self.tokenizer(
+ input_text,
+ pad_to_max_length=True,
+ add_special_tokens=True,
+ return_tensors="pt",
+ # max_length=int(self.max_length),
+ )
+
+ input_ids = inputs["input_ids"]
+ attention_mask = inputs["attention_mask"]
+ # making a batch out of the received requests
+ if input_ids.shape is not None:
+ if input_ids_batch is None:
+ input_ids_batch = input_ids
+ attention_mask_batch = attention_mask
+ else:
+ input_ids_batch = torch.cat((input_ids_batch, input_ids), 0)
+ attention_mask_batch = torch.cat(
+ (attention_mask_batch, attention_mask), 0
+ )
+ return (input_ids_batch, attention_mask_batch)
+
+ def inference(self, input_batch):
+ input_ids_batch, attention_mask_batch = input_batch
+ inferences = []
+ # total_list = []
+
+ with torch.inference_mode(), torch.no_grad(), torch.autocast(
+ device_type="cpu", enabled=self.amp_enabled, dtype=self.amp_dtype
+ ):
+ outputs = self.user_model.generate(
+ input_ids_batch,
+ attention_mask=attention_mask_batch,
+ **self.generate_kwargs,
+ )
+ inferences = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
+ # for i, x in enumerate(outputs):
+ # inferences.append(self.tokenizer.decode(outputs[i], skip_special_tokens=True))
+
+ return inferences
+
+ def postprocess(self, inference_output):
+ return inference_output
diff --git a/examples/large_models/ipex_llm_int8/model-config-llama2-7b-bf16.yaml b/examples/large_models/ipex_llm_int8/model-config-llama2-7b-bf16.yaml
new file mode 100644
index 0000000000..f42bc644e5
--- /dev/null
+++ b/examples/large_models/ipex_llm_int8/model-config-llama2-7b-bf16.yaml
@@ -0,0 +1,24 @@
+minWorkers: 1
+maxWorkers: 1
+responseTimeout: 1500
+batchSize: 4
+maxBatchDelay: 100
+
+handler:
+ model_name: "meta-llama/Llama-2-7b-hf"
+ clear_cache_dir: true # removes the quantized model if it already exists
+ quantized_model_path: "best_model.pt"
+ example_inputs_mode: "MASK_KV_POS"
+ to_channels_last: false
+
+ # generation params
+ batch_size: 1
+ input_tokens: 1024
+ max_new_tokens: 128
+
+ # Use INT8+bf16 mix
+ quant_with_amp: true
+
+ # decoding technique
+ greedy: true
+
diff --git a/examples/large_models/ipex_llm_int8/model-config-llama2-7b-int8-sq.yaml b/examples/large_models/ipex_llm_int8/model-config-llama2-7b-int8-sq.yaml
new file mode 100644
index 0000000000..a39b8e0345
--- /dev/null
+++ b/examples/large_models/ipex_llm_int8/model-config-llama2-7b-int8-sq.yaml
@@ -0,0 +1,31 @@
+minWorkers: 1
+maxWorkers: 1
+responseTimeout: 1500
+batchSize: 4
+maxBatchDelay: 100
+
+handler:
+ model_name: "meta-llama/Llama-2-7b-hf"
+ clear_cache_dir: true
+ quantized_model_path: "best_model.pt"
+ example_inputs_mode: "MASK_KV_POS"
+ to_channels_last: false
+
+ # generation params
+ batch_size: 1
+ input_tokens: 1024
+ max_new_tokens: 128
+
+ # use bf16-int8 mix
+ quant_with_amp: true
+
+ # SQ quantization params
+ ipex_smooth_quantization: true
+ calibration_dataset: "NeelNanda/pile-10k"
+ calibration_split: "train"
+ num_calibration_iters: 32
+ alpha: 0.9
+
+ # decoding technique
+ greedy: true
+
diff --git a/examples/large_models/ipex_llm_int8/model-config-llama2-7b-int8-woq.yaml b/examples/large_models/ipex_llm_int8/model-config-llama2-7b-int8-woq.yaml
new file mode 100644
index 0000000000..04f43fcfdd
--- /dev/null
+++ b/examples/large_models/ipex_llm_int8/model-config-llama2-7b-int8-woq.yaml
@@ -0,0 +1,31 @@
+minWorkers: 1
+maxWorkers: 1
+responseTimeout: 1500
+batchSize: 4
+maxBatchDelay: 100
+
+handler:
+ model_name: "meta-llama/Llama-2-7b-hf"
+ clear_cache_dir: true
+ quantized_model_path: "best_model.pt"
+ example_inputs_mode: "MASK_KV_POS"
+ to_channels_last: false
+
+ # generation params
+ batch_size: 1
+ input_tokens: 1024
+ max_new_tokens: 128
+
+ # Use INT8 bf16 mix
+ quant_with_amp: true
+
+ # Woq params
+ ipex_weight_only_quantization: true
+ woq_dtype: "INT8"
+ lowp_mode: "BF16"
+ act_quant_mode: "PER_IC_BLOCK"
+ group_size: -1
+
+ # decoding technique
+ greedy: true
+
diff --git a/examples/large_models/ipex_llm_int8/requirements.txt b/examples/large_models/ipex_llm_int8/requirements.txt
new file mode 100644
index 0000000000..864af8dc8d
--- /dev/null
+++ b/examples/large_models/ipex_llm_int8/requirements.txt
@@ -0,0 +1,6 @@
+datasets==2.18.0
+intel_extension_for_pytorch==2.4.0+gite2d4be3
+torch==2.4.0.dev20240328+cpu
+torchserve==0.10.0
+transformers==4.38.2
+accelerate
diff --git a/examples/large_models/ipex_llm_int8/sample_text_0.txt b/examples/large_models/ipex_llm_int8/sample_text_0.txt
new file mode 100644
index 0000000000..12faa9ebf1
--- /dev/null
+++ b/examples/large_models/ipex_llm_int8/sample_text_0.txt
@@ -0,0 +1 @@
+Why did Voldemort hate Harry Potter?
diff --git a/requirements/developer.txt b/requirements/developer.txt
index 5387bf1de0..4aa2450d1b 100644
--- a/requirements/developer.txt
+++ b/requirements/developer.txt
@@ -15,8 +15,9 @@ pre-commit==3.3.2
twine==4.0.2
mypy==1.3.0
torchpippy==0.1.1
-intel_extension_for_pytorch==2.2.0; sys_platform != 'win32' and sys_platform != 'darwin' and platform_machine != 'aarch64'
+intel_extension_for_pytorch==2.3.0; sys_platform != 'win32' and sys_platform != 'darwin' and platform_machine != 'aarch64'
onnxruntime==1.17.1
googleapis-common-protos
onnx==1.16.0
orjson
+datasets
diff --git a/test/pytest/test_ipex_serving.py b/test/pytest/test_ipex_serving.py
new file mode 100644
index 0000000000..405e05c614
--- /dev/null
+++ b/test/pytest/test_ipex_serving.py
@@ -0,0 +1,384 @@
+import json
+import logging
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+import requests
+import test_utils
+from model_archiver.model_archiver_config import ModelArchiverConfig
+
+ACCELERATE_UNAVAILABLE = False
+try:
+ import accelerate # nopycln: import
+except ImportError:
+ ACCELERATE_UNAVAILABLE = True
+
+REPO_ROOT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../")
+snapshot_file_ipex = os.path.join(REPO_ROOT, "test/config_ipex.properties")
+default_ts_config = os.path.join(REPO_ROOT, "test/config_ts.properties")
+prompt_file = os.path.join(
+ REPO_ROOT, "examples/large_models/ipex_llm_int8/sample_text_0.txt"
+)
+
+# CURR_FILE_PATH = Path(__file__).parent
+HANDLER_PATH = os.path.join(REPO_ROOT, "examples/large_models/ipex_llm_int8/")
+sys.path.append(HANDLER_PATH)
+
+
+logger = logging.getLogger(__name__)
+
+PROMPTS = [
+ "The capital of France is ",
+]
+
+MANAGEMENT_API = "http://localhost:8081"
+INFERENCE_API = "http://localhost:8080"
+
+
+xeon_run_cpu_available = False
+
+cmd = ["python", "-m", "torch.backends.xeon.run_cpu", "--no_python", "pwd"]
+r = subprocess.run(cmd)
+if r.returncode == 0:
+ xeon_run_cpu_available = True
+
+ipex_available = False
+cmd = ["python", "-c", "import intel_extension_for_pytorch as ipex"]
+r = subprocess.run(cmd)
+if r.returncode == 0:
+ ipex_available = True
+
+ipex_xeon_run_available = xeon_run_cpu_available and ipex_available
+
+
+@pytest.fixture(scope="module")
+def model_name():
+ yield "llama2"
+
+
+@pytest.fixture(scope="module")
+def work_dir(tmp_path_factory, model_name):
+ return Path(tmp_path_factory.mktemp(model_name))
+
+
+# @pytest.fixture(scope="module", name="mar_file_path")
+def create_mar_file(work_dir, model_archiver, model_name, model_config_yaml_file):
+ mar_file_path = work_dir.joinpath(model_name + ".mar")
+
+ handler_file = os.path.join(HANDLER_PATH, "llm_handler.py")
+ assert Path(handler_file).exists()
+
+ config = ModelArchiverConfig(
+ model_name=model_name,
+ version="1.0",
+ serialized_file=None,
+ model_file=None,
+ handler=handler_file,
+ extra_files=None,
+ export_path=work_dir,
+ requirements_file=None,
+ runtime="python",
+ force=False,
+ archive_format="default",
+ config_file=model_config_yaml_file.as_posix(),
+ )
+
+ with patch("archiver.ArgParser.export_model_args_parser", return_value=config):
+ model_archiver.generate_model_archive()
+
+ assert mar_file_path.exists()
+
+ return mar_file_path.as_posix()
+
+
+def run_inference_with_prompt(prompt_file, model_name):
+ model_url = f"{INFERENCE_API}/predictions/{model_name}"
+ response = run_inference_using_url_with_data(model_url, prompt_file)
+ return response
+
+
+def start_torchserve(ts_config_file):
+ # start the torchserve
+ test_utils.start_torchserve(
+ model_store=test_utils.MODEL_STORE, snapshot_file=ts_config_file, gen_mar=False
+ )
+
+
+DEFAULT_CONFIG = f"""
+ minWorkers: 1
+ maxWorkers: 1
+ responseTimeout: 1500
+ batchSize: 4
+ maxBatchDelay: 100
+
+ handler:
+ model_name: "baichuan-inc/Baichuan2-7B-Chat"
+ clear_cache_dir: true
+ quantized_model_path: "best_model.pt"
+ example_inputs_mode: "MASK_KV_POS"
+ to_channels_last: false
+
+ # generation params
+ batch_size: 1 # this batch size is mostly used for calibration, you can leave it as 1
+ input_tokens: 1024
+ max_new_tokens: 128
+
+ # Use INT8 bf16 mix
+ quant_with_amp: true
+
+ # decoding technique
+ greedy: true
+
+ """
+
+CONFIG_WOQ = f"""
+ minWorkers: 1
+ maxWorkers: 1
+ responseTimeout: 1500
+ batchSize: 4
+ maxBatchDelay: 100
+
+ handler:
+ model_name: "baichuan-inc/Baichuan2-7B-Chat"
+ clear_cache_dir: true
+ quantized_model_path: "best_model.pt"
+ example_inputs_mode: "MASK_KV_POS"
+ to_channels_last: false
+
+ # generation params
+ batch_size: 1
+ input_tokens: 1024
+ max_new_tokens: 128
+
+ # Use INT8 bf16 mix
+ quant_with_amp: true
+
+ # Woq params
+ ipex_weight_only_quantization: true
+ woq_dtype: "INT8"
+ lowp_mode: "BF16"
+ act_quant_mode: "PER_IC_BLOCK"
+ group_size: -1
+
+ # decoding technique
+ greedy: true
+ """
+
+CONFIG_SQ = f"""
+ minWorkers: 1
+ maxWorkers: 1
+ responseTimeout: 1500
+ batchSize: 4
+ maxBatchDelay: 100
+
+ handler:
+ model_name: "baichuan-inc/Baichuan2-7B-Chat"
+ clear_cache_dir: true
+ quantized_model_path: "best_model.pt"
+ example_inputs_mode: "MASK_KV_POS"
+ to_channels_last: false
+
+ # generation params
+ batch_size: 1
+ input_tokens: 1024
+ max_new_tokens: 128
+
+ # use bf16-int8 mix
+ quant_with_amp: true
+
+ # SQ quantization params
+ ipex_smooth_quantization: true
+ calibration_dataset: "NeelNanda/pile-10k"
+ calibration_split: "train"
+ num_calibration_iters: 32
+ alpha: 0.9
+
+ # decoding technique
+ greedy: true
+
+ """
+
+
+@pytest.mark.skipif(
+ ACCELERATE_UNAVAILABLE, reason="HF accelerate library not available"
+)
+def test_handler_default_pytorch(work_dir, model_archiver):
+ test_utils.torchserve_cleanup()
+ # create_mar_file(work_dir, model_archiver, model_name, model_config_yaml_file):
+ model_config_yaml = work_dir / "model-config.yaml"
+ model_config_yaml.write_text(DEFAULT_CONFIG)
+
+ # Create mar file
+ model_name = "llama2_no_ipex"
+ mar_file_path = create_mar_file(
+ work_dir, model_archiver, model_name, model_config_yaml
+ )
+ os.makedirs(os.path.dirname(test_utils.MODEL_STORE), exist_ok=True)
+ shutil.move(mar_file_path, test_utils.MODEL_STORE)
+
+ # start torchserve server
+ start_torchserve(default_ts_config)
+
+ # load the model
+ model_url = f"{MANAGEMENT_API}/models?url={model_name}.mar"
+ requests.post(model_url)
+
+ # query model info
+ model_url = f"{MANAGEMENT_API}/models/{model_name}"
+ response = requests.get(model_url)
+ assert response.status_code == 200, "The default PyTorch Model failed to load"
+
+ # send prompts to the model
+ model_url = f"{INFERENCE_API}/predictions/{model_name}"
+ response = requests.post(
+ url=model_url,
+ data=json.dumps(
+ PROMPTS[0],
+ ),
+ )
+
+ assert response.status_code == 200, "The model failed to generate text from prompt!"
+ assert "Paris" in response.text, "The response doesn't seem to be correct!"
+
+ test_utils.torchserve_cleanup()
+
+
+@pytest.mark.skipif(
+ ACCELERATE_UNAVAILABLE, reason="HF accelerate library not available"
+)
+def test_handler_ipex_bf16(work_dir, model_archiver):
+ test_utils.torchserve_cleanup()
+ # create_mar_file(work_dir, model_archiver, model_name, model_config_yaml_file):
+ model_config_yaml = work_dir / "model-config.yaml"
+ model_config_yaml.write_text(DEFAULT_CONFIG)
+
+ # Create mar file
+ model_name = "llama2_ipex_bf16"
+ mar_file_path = create_mar_file(
+ work_dir, model_archiver, model_name, model_config_yaml
+ )
+ os.makedirs(os.path.dirname(test_utils.MODEL_STORE), exist_ok=True)
+ shutil.move(mar_file_path, test_utils.MODEL_STORE)
+
+ # start torchserve server
+ start_torchserve(snapshot_file_ipex)
+
+ # load the model
+ model_url = f"{MANAGEMENT_API}/models?url={model_name}.mar"
+ requests.post(model_url)
+
+ # query model info
+ model_url = f"{MANAGEMENT_API}/models/{model_name}"
+ response = requests.get(model_url)
+ assert response.status_code == 200, "The IPEX bfloat16 model failed to initialize"
+
+ # send prompts to the model
+ model_url = f"{INFERENCE_API}/predictions/{model_name}"
+ response = requests.post(
+ url=model_url,
+ data=json.dumps(
+ PROMPTS[0],
+ ),
+ )
+
+ assert response.status_code == 200, "The model failed to generate text from prompt!"
+ assert "Paris" in response.text, "The response doesn't seem to be correct!"
+
+ test_utils.torchserve_cleanup()
+
+
+@pytest.mark.skipif(
+ ACCELERATE_UNAVAILABLE, reason="HF accelerate library not available"
+)
+def test_handler_ipex_int8_woq(work_dir, model_archiver):
+ test_utils.torchserve_cleanup()
+ # create_mar_file(work_dir, model_archiver, model_name, model_config_yaml_file):
+ model_config_yaml = work_dir / "model-config.yaml"
+ model_config_yaml.write_text(CONFIG_WOQ)
+
+ # Create mar file
+ model_name = "llama2_ipex_int8_woq"
+ mar_file_path = create_mar_file(
+ work_dir, model_archiver, model_name, model_config_yaml
+ )
+ os.makedirs(os.path.dirname(test_utils.MODEL_STORE), exist_ok=True)
+ shutil.move(mar_file_path, test_utils.MODEL_STORE)
+
+ # start torchserve server
+ start_torchserve(snapshot_file_ipex)
+
+ # load the model
+ model_url = f"{MANAGEMENT_API}/models?url={model_name}.mar"
+ requests.post(model_url)
+
+ # query model info
+ model_url = f"{MANAGEMENT_API}/models/{model_name}"
+ response = requests.get(model_url)
+ assert (
+ response.status_code == 200
+ ), "The IPEX weight-only quantization Model failed to initialize"
+
+ # send prompts to the model
+ model_url = f"{INFERENCE_API}/predictions/{model_name}"
+ response = requests.post(
+ url=model_url,
+ data=json.dumps(
+ PROMPTS[0],
+ ),
+ )
+
+ assert response.status_code == 200, "The model failed to generate text from prompt!"
+ assert "Paris" in response.text, "The response doesn't seem to be correct!"
+
+ test_utils.torchserve_cleanup()
+
+
+@pytest.mark.skipif(
+ ACCELERATE_UNAVAILABLE, reason="HF accelerate library not available"
+)
+def test_handler_ipex_int8_sq(work_dir, model_archiver):
+ test_utils.torchserve_cleanup()
+ # create_mar_file(work_dir, model_archiver, model_name, model_config_yaml_file):
+ model_config_yaml = work_dir / "model-config.yaml"
+ model_config_yaml.write_text(CONFIG_SQ)
+
+ # Create mar file
+ model_name = "llama2_ipex_int8_sq"
+ mar_file_path = create_mar_file(
+ work_dir, model_archiver, model_name, model_config_yaml
+ )
+ os.makedirs(os.path.dirname(test_utils.MODEL_STORE), exist_ok=True)
+ shutil.move(mar_file_path, test_utils.MODEL_STORE)
+
+ # start torchserve server
+ start_torchserve(snapshot_file_ipex)
+
+ # load the model
+ model_url = f"{MANAGEMENT_API}/models?url={model_name}.mar"
+ requests.post(model_url)
+
+ # query model info
+ model_url = f"{MANAGEMENT_API}/models/{model_name}"
+ response = requests.get(model_url)
+ assert (
+ response.status_code == 200
+ ), "The IPEX smoothquant quantized Model failed to load"
+
+ # send prompts to the model
+ model_url = f"{INFERENCE_API}/predictions/{model_name}"
+ response = requests.post(
+ url=model_url,
+ data=json.dumps(
+ PROMPTS[0],
+ ),
+ )
+
+ assert response.status_code == 200, "The model failed to generate text from prompt!"
+ assert "Paris" in response.text, "The response doesn't seem to be correct!"
+
+ test_utils.torchserve_cleanup()
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index 142909f201..3f6a8d5d8a 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1239,3 +1239,11 @@ vllm
sql
TimeUnit
Aopen
+Smoothquant
+iters
+lowp
+precisions
+quant
+quantizing
+smoothquant
+woq