1 vuosi sitten · 54d2859566
--- a/recipes/benchmarks/fmbench/README.md
+++ b/recipes/benchmarks/fmbench/README.md
--- a/recipes/benchmarks/fmbench/config.yml
+++ b/recipes/benchmarks/fmbench/config.yml
@@ -0,0 +1,259 @@
 
				+general:
			
 
				+  name: "llama2-7b-v1"      
			
 
				+  model_name: "Llama2-7b"
			
 
				+  
			
 
				+# AWS and SageMaker settings
			
 
				+aws:
			
 
				+  # AWS region, this parameter is templatized, no need to change
			
 
				+  region: {region}
			
 
				+  # SageMaker execution role used to run FMBench, this parameter is templatized, no need to change
			
 
				+  sagemaker_execution_role: {role_arn}
			
 
				+  # S3 bucket to which metrics, plots and reports would be written to
			
 
				+  bucket: {write_bucket} ## add the name of your desired bucket
			
 
				+
			
 
				+# directory paths in the write bucket, no need to change these
			
 
				+dir_paths:
			
 
				+  data_prefix: data
			
 
				+  prompts_prefix: prompts
			
 
				+  all_prompts_file: all_prompts.csv
			
 
				+  metrics_dir: metrics
			
 
				+  models_dir: models
			
 
				+  metadata_dir: metadata
			
 
				+
			
 
				+# S3 information for reading datasets, scripts and tokenizer
			
 
				+s3_read_data:
			
 
				+  # read bucket name, templatized, if left unchanged will default to sagemaker-fmbench-read-{region}-{account_id}
			
 
				+  read_bucket: {read_bucket}
			
 
				+    
			
 
				+  # S3 prefix in the read bucket where deployment and inference scripts should be placed
			
 
				+  scripts_prefix: scripts
			
 
				+    
			
 
				+  # deployment and inference script files to be downloaded are placed in this list
			
 
				+  # only needed if you are creating a new deployment script or inference script
			
 
				+  # your HuggingFace token does need to be in this list and should be called "hf_token.txt"
			
 
				+  script_files:
			
 
				+  - hf_token.txt
			
 
				+
			
 
				+  # configuration files (like this one) are placed in this prefix
			
 
				+  configs_prefix: configs
			
 
				+
			
 
				+  # list of configuration files to download, for now only pricing.yml needs to be downloaded
			
 
				+  config_files:
			
 
				+  - pricing.yml
			
 
				+
			
 
				+  # S3 prefix for the dataset files
			
 
				+  source_data_prefix: source_data
			
 
				+  # list of dataset files, the list below is from the LongBench dataset https://huggingface.co/datasets/THUDM/LongBench
			
 
				+  source_data_files:
			
 
				+  - 2wikimqa_e.jsonl
			
 
				+  - 2wikimqa.jsonl
			
 
				+  - hotpotqa_e.jsonl
			
 
				+  - hotpotqa.jsonl
			
 
				+  - narrativeqa.jsonl
			
 
				+  - triviaqa_e.jsonl
			
 
				+  - triviaqa.jsonl
			
 
				+
			
 
				+  # S3 prefix for the tokenizer to be used with the models
			
 
				+  # NOTE 1: the same tokenizer is used with all the models being tested through a config file
			
 
				+  # NOTE 2: place your model specific tokenizers in a prefix named as <model_name>_tokenizer
			
 
				+  #         so the mistral tokenizer goes in mistral_tokenizer, Llama2 tokenizer goes in llama2_tokenizer
			
 
				+  tokenizer_prefix: tokenizer
			
 
				+
			
 
				+  # S3 prefix for prompt templates
			
 
				+  prompt_template_dir: prompt_template
			
 
				+
			
 
				+  # prompt template to use, NOTE: same prompt template gets used for all models being tested through a config file
			
 
				+  # the FMBench repo already contains a bunch of prompt templates so review those first before creating a new one
			
 
				+  prompt_template_file: prompt_template_llama2.txt
			
 
				+
			
 
				+# steps to run, usually all of these would be
			
 
				+# set to yes so nothing needs to change here
			
 
				+# you could, however, bypass some steps for example
			
 
				+# set the 2_deploy_model.ipynb to no if you are re-running
			
 
				+# the same config file and the model is already deployed
			
 
				+run_steps:
			
 
				+  0_setup.ipynb: yes
			
 
				+  1_generate_data.ipynb: yes
			
 
				+  2_deploy_model.ipynb: yes
			
 
				+  3_run_inference.ipynb: yes
			
 
				+  4_model_metric_analysis.ipynb: yes
			
 
				+  5_cleanup.ipynb: yes
			
 
				+
			
 
				+# dataset related configuration
			
 
				+datasets:
			
 
				+  # Refer to the 1_generate_data.ipynb notebook
			
 
				+  # the dataset you use is expected to have the 
			
 
				+  # columns you put in prompt_template_keys list
			
 
				+  # and your prompt template also needs to have
			
 
				+  # the same placeholders (refer to the prompt template folder)
			
 
				+  prompt_template_keys:
			
 
				+  - input
			
 
				+  - context
			
 
				+
			
 
				+  # if your dataset has multiple languages and it has a language
			
 
				+  # field then you could filter it for a language. Similarly,
			
 
				+  # you can filter your dataset to only keep prompts between
			
 
				+  # a certain token length limit (the token length is determined
			
 
				+  # using the tokenizer you provide in the tokenizer_prefix prefix in the
			
 
				+  # read S3 bucket). Each of the array entries below creates a payload file
			
 
				+  # containing prompts matching the language and token length criteria.
			
 
				+  filters:
			
 
				+  - language: en    
			
 
				+    min_length_in_tokens: 1
			
 
				+    max_length_in_tokens: 500
			
 
				+    payload_file: payload_en_1-500.jsonl
			
 
				+  - language: en
			
 
				+    min_length_in_tokens: 500
			
 
				+    max_length_in_tokens: 1000
			
 
				+    payload_file: payload_en_500-1000.jsonl
			
 
				+  - language: en
			
 
				+    min_length_in_tokens: 1000
			
 
				+    max_length_in_tokens: 2000
			
 
				+    payload_file: payload_en_1000-2000.jsonl
			
 
				+  - language: en
			
 
				+    min_length_in_tokens: 2000
			
 
				+    max_length_in_tokens: 3000
			
 
				+    payload_file: payload_en_2000-3000.jsonl
			
 
				+  - language: en
			
 
				+    min_length_in_tokens: 3000
			
 
				+    max_length_in_tokens: 3840
			
 
				+    payload_file: payload_en_3000-3840.jsonl
			
 
				+
			
 
				+# While the tests would run on all the datasets
			
 
				+# configured in the experiment entries below,
			
 
				+# the price:performance analysis is only done for 1
			
 
				+# dataset which is listed below as the dataset_of_interest
			
 
				+metrics:
			
 
				+  dataset_of_interest: en_2000-3000
			
 
				+  
			
 
				+# all pricing information is in the pricing.yml file
			
 
				+# this file is provided in the repo. You can add entries
			
 
				+# to this file for new instance types and new Bedrock models
			
 
				+pricing: pricing.yml
			
 
				+
			
 
				+# inference parameters, these are added to the payload
			
 
				+# for each inference request. The list here is not static
			
 
				+# any parameter supported by the inference container can be
			
 
				+# added to the list. Put the sagemaker parameters in the sagemaker
			
 
				+# section, bedrock parameters in the bedrock section (not shown here).
			
 
				+# Use the section name (sagemaker in this example) in the inference_spec.parameter_set
			
 
				+# section under experiments.
			
 
				+inference_parameters:
			
 
				+  sagemaker:
			
 
				+    do_sample: yes
			
 
				+    temperature: 0.1
			
 
				+    top_p: 0.92
			
 
				+    top_k: 120  
			
 
				+    max_new_tokens: 100
			
 
				+    return_full_text: False
			
 
				+
			
 
				+# Configuration for experiments to be run. The experiments section is an array
			
 
				+# so more than one experiment can be added, these could belong to the same model
			
 
				+# but different instance types, or different models, or even different hosting
			
 
				+# options (such as one experiment is SageMaker and the other is Bedrock).
			
 
				+experiments:
			
 
				+  - name: llama2-7b-g5.xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
			
 
				+    # model_id is interpreted in conjunction with the deployment_script, so if you
			
 
				+    # use a JumpStart model id then set the deployment_script to jumpstart.py.
			
 
				+    # if deploying directly from HuggingFace this would be a HuggingFace model id
			
 
				+    # see the DJL serving deployment script in the code repo for reference.    
			
 
				+    model_id: meta-textgeneration-llama-2-7b-f
			
 
				+    model_version: "3.*"
			
 
				+    model_name: llama2-7b-f
			
 
				+    ep_name: llama-2-7b-g5xlarge
			
 
				+    instance_type: "ml.g5.xlarge"
			
 
				+    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
			
 
				+    deploy: yes    
			
 
				+    instance_count: 1
			
 
				+    # FMBench comes packaged with multiple deployment scripts, such as scripts for JumpStart,
			
 
				+    # scripts for deploying using DJL DeepSpeed, tensorRT etc. You can also add your own.
			
 
				+    # See repo for details
			
 
				+    deployment_script: jumpstart.py
			
 
				+    # FMBench comes packaged with multiple inference scripts, such as scripts for SageMaker
			
 
				+    # and Bedrock. You can also add your own. See repo for details
			
 
				+    inference_script: sagemaker_predictor.py
			
 
				+    inference_spec:
			
 
				+      # this should match one of the sections in the inference_parameters section above
			
 
				+      parameter_set: sagemaker
			
 
				+    # runs are done for each combination of payload file and concurrency level
			
 
				+    payload_files:
			
 
				+    - payload_en_1-500.jsonl
			
 
				+    - payload_en_500-1000.jsonl
			
 
				+    - payload_en_1000-2000.jsonl
			
 
				+    - payload_en_2000-3000.jsonl
			
 
				+    # concurrency level refers to number of requests sent in parallel to an endpoint
			
 
				+    # the next set of requests is sent once responses for all concurrent requests have
			
 
				+    # been received.
			
 
				+    concurrency_levels:
			
 
				+    - 1
			
 
				+    - 2
			
 
				+    - 4
			
 
				+    # Added for models that require accepting a EULA
			
 
				+    accept_eula: true
			
 
				+    # Environment variables to be passed to the container
			
 
				+    # this is not a fixed list, you can add more parameters as applicable.
			
 
				+    env:
			
 
				+      SAGEMAKER_PROGRAM: "inference.py"
			
 
				+      ENDPOINT_SERVER_TIMEOUT: "3600"
			
 
				+      MODEL_CACHE_ROOT: "/opt/ml/model"
			
 
				+      SAGEMAKER_ENV: "1"
			
 
				+      HF_MODEL_ID: "/opt/ml/model"
			
 
				+      MAX_INPUT_LENGTH: "4095"
			
 
				+      MAX_TOTAL_TOKENS: "4096"
			
 
				+      SM_NUM_GPUS: "1"
			
 
				+      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
			
 
				+
			
 
				+  - name: llama2-7b-g5.2xlarge-huggingface-pytorch-tgi-inference-2.0.1-tgi1.1.0
			
 
				+    model_id: meta-textgeneration-llama-2-7b-f
			
 
				+    model_version: "3.*"
			
 
				+    model_name: llama2-7b-f
			
 
				+    ep_name: llama-2-7b-g5-2xlarge
			
 
				+    instance_type: "ml.g5.2xlarge"
			
 
				+    image_uri: '763104351884.dkr.ecr.{region}.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04'
			
 
				+    deploy: yes
			
 
				+    instance_count: 1
			
 
				+    deployment_script: jumpstart.py
			
 
				+    inference_script: sagemaker_predictor.py
			
 
				+    inference_spec:
			
 
				+      parameter_set: sagemaker
			
 
				+    payload_files:
			
 
				+    - payload_en_1-500.jsonl
			
 
				+    - payload_en_500-1000.jsonl
			
 
				+    - payload_en_1000-2000.jsonl
			
 
				+    - payload_en_2000-3000.jsonl
			
 
				+
			
 
				+    concurrency_levels:
			
 
				+    - 1
			
 
				+    - 2
			
 
				+    - 4
			
 
				+
			
 
				+    accept_eula: true
			
 
				+    env:
			
 
				+      SAGEMAKER_PROGRAM: "inference.py"
			
 
				+      ENDPOINT_SERVER_TIMEOUT: "3600"
			
 
				+      MODEL_CACHE_ROOT: "/opt/ml/model"
			
 
				+      SAGEMAKER_ENV: "1"
			
 
				+      HF_MODEL_ID: "/opt/ml/model"
			
 
				+      MAX_INPUT_LENGTH: "4095"
			
 
				+      MAX_TOTAL_TOKENS: "4096"
			
 
				+      SM_NUM_GPUS: "1"
			
 
				+      SAGEMAKER_MODEL_SERVER_WORKERS: "1"
			
 
				+
			
 
				+# parameters related to how the final report is generated
			
 
				+report:
			
 
				+  # constraints for latency, cost and error rate
			
 
				+  # an experiment is considered successful or eligible for
			
 
				+  # selection for a use-case if it satisfies all of the following
			
 
				+  # constraints. Experiments are scored as per these criteria
			
 
				+  # higher score is better (see 4_model_metric_analysis.ipynb score_run function)
			
 
				+  latency_budget: 2
			
 
				+  cost_per_10k_txn_budget: 20
			
 
				+  error_rate_budget: 0
			
 
				+
			
 
				+  # other misc reporting parameters, see 4_model_metric_analysis.ipynb
			
 
				+  # for more information
			
 
				+  per_inference_request_file: per_inference_request_results.csv
			
 
				+  all_metrics_file: all_metrics.csv
			
 
				+  txn_count_for_showing_cost: 10000
			
 
				+  v_shift_w_single_instance: 0.025
			
 
				+  v_shift_w_gt_one_instance: 0.025
			
--- a/recipes/benchmarks/fmbench/img/CFT.png
+++ b/recipes/benchmarks/fmbench/img/CFT.png
--- a/recipes/benchmarks/fmbench/img/instances.png
+++ b/recipes/benchmarks/fmbench/img/instances.png
--- a/recipes/benchmarks/fmbench/img/latency_vs_tokens.png
+++ b/recipes/benchmarks/fmbench/img/latency_vs_tokens.png
--- a/recipes/inference/model_servers/llama-on-prem.md
+++ b/recipes/inference/model_servers/llama-on-prem.md
--- a/scripts/spellcheck_conf/wordlist.txt
+++ b/scripts/spellcheck_conf/wordlist.txt
@@ -1295,5 +1295,18 @@ eot
 
				 multiturn
			
 
				 tiktoken
			
 
				 eos
			
 
				+CFT
			
 
				+CloudFormation
			
 
				+DIY
			
 
				+FMBT
			
 
				+FMBench
			
 
				+LMSys
			
 
				+LongBench
			
 
				+QMSum
			
 
				+SagMaker
			
 
				+fmbench
			
 
				+ipykernel
			
 
				+leaderboards
			
 
				+txn
			
 
				 ollama
			
 
				 tavily