@@ -10,7 +10,7 @@ import time
 import gradio as gr
 
 import torch
-from transformers import LlamaTokenizer
+from transformers import AutoTokenizer
 
 from llama_recipes.inference.safety_utils import get_safety_checker, AgentType
 from llama_recipes.inference.model_utils import load_model, load_peft_model
@@ -69,17 +69,17 @@ def main(
     else:
         torch.cuda.manual_seed(seed)
     torch.manual_seed(seed)
-
+
     model = load_model(model_name, quantization, use_fast_kernels)
     if peft_model:
         model = load_peft_model(model, peft_model)
 
     model.eval()
-
-    tokenizer = LlamaTokenizer.from_pretrained(model_name)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     tokenizer.pad_token = tokenizer.eos_token
-
+
     batch = tokenizer(user_prompt, padding='max_length', truncation=True, max_length=max_padding_length, return_tensors="pt")
     if is_xpu_available():
         batch = {k: v.to("xpu") for k, v in batch.items()}
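
For reference, a minimal sketch of what the AutoTokenizer swap buys (the checkpoint name below is an assumption for illustration, not part of this diff): AutoTokenizer resolves the concrete tokenizer class from the checkpoint's own config instead of hard-coding LlamaTokenizer, so the script keeps working for checkpoints that ship a fast or otherwise different tokenizer.

# Sketch only, not part of the diff above.
from transformers import AutoTokenizer

model_name = "meta-llama/Llama-2-7b-hf"  # hypothetical example checkpoint

# AutoTokenizer reads the checkpoint's tokenizer_config.json and instantiates
# the matching class (for this checkpoint, a fast Llama tokenizer), so the
# call does not break when a checkpoint ships something other than the
# hard-coded LlamaTokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Llama checkpoints define no pad token; reuse EOS, as the script above does
# before tokenizing with padding='max_length'.
tokenizer.pad_token = tokenizer.eos_token

The rest of the script is unchanged; only the class lookup moves from a hard-coded import to config-driven dispatch.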