@@ -52,6 +52,11 @@ def main(
     torch.manual_seed(seed)

     model = load_model(model_name, quantization)
+    if peft_model:
+        model = load_peft_model(model, peft_model)
+
+    model.eval()
+
     if use_fast_kernels:
         """
         Setting 'use_fast_kernels' will enable
@@ -92,11 +97,6 @@ def main(
         print("Skipping the inferece as the prompt is not safe.")
         sys.exit(1)  # Exit the program with an error status

-    if peft_model:
-        model = load_peft_model(model, peft_model)
-
-    model.eval()
-
     batch = tokenizer(user_prompt, return_tensors="pt")
     batch = {k: v.to("cuda") for k, v in batch.items()}
     start = time.perf_counter()
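
Net effect of this change: the PEFT adapter is attached and the model is switched to eval mode immediately after the base model loads, before the 'use_fast_kernels' conversion and the prompt safety check, rather than after them. A minimal sketch of the resulting setup order, assuming the repo's load_model/load_peft_model helpers as they appear in the diff (variable values are placeholders):

    import torch

    torch.manual_seed(seed)
    model = load_model(model_name, quantization)    # base model, optionally quantized
    if peft_model:
        model = load_peft_model(model, peft_model)  # attach the adapter first
    model.eval()                                    # inference mode from here on
    if use_fast_kernels:
        # the fast-kernel conversion (when enabled) now sees the
        # adapter-equipped model rather than the bare base model
        ...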