
removing legacy code for sdpa

Hamid Shojanazeri 11 months ago
parent
commit
85ea8691b4
1 changed file with 0 additions and 11 deletions

examples/chat_completion/chat_completion.py  +0 −11

@@ -64,17 +64,6 @@ def main(
     model = load_model(model_name, quantization, use_fast_kernels)
     if peft_model:
         model = load_peft_model(model, peft_model)
-    if use_fast_kernels:
-        """
-        Setting 'use_fast_kernels' will enable
-        using of Flash Attention or Xformer memory-efficient kernels 
-        based on the hardware being used. This would speed up inference when used for batched inputs.
-        """
-        try:
-            from optimum.bettertransformer import BetterTransformer
-            model = BetterTransformer.transform(model)   
-        except ImportError:
-            print("Module 'optimum' not found. Please install 'optimum' it before proceeding.")
 
     tokenizer = LlamaTokenizer.from_pretrained(model_name)
     tokenizer.add_special_tokens(
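
For context on why this path is legacy: recent transformers releases expose scaled dot-product attention (SDPA) natively, so Flash Attention / memory-efficient kernels no longer require the BetterTransformer wrapper from optimum. The sketch below shows the native route via the attn_implementation argument of from_pretrained; it is a minimal illustration under that assumption, and the model id and generation settings are placeholders, not the repository's load_model helper.

# Minimal sketch (assumes transformers >= 4.36): enable SDPA natively,
# which is what makes the BetterTransformer wrapper removed above legacy.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Llama-2-7b-chat-hf"  # illustrative model id, not taken from this repo

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    attn_implementation="sdpa",  # use PyTorch scaled dot-product attention kernels
)
model.to("cuda" if torch.cuda.is_available() else "cpu")

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Flash Attention 2 can be requested the same way with attn_implementation="flash_attention_2" when the flash-attn package is installed.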