# chat_vllm_benchmark.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.

import csv
import json
import time
import random
import threading

import numpy as np
import requests
import transformers
import torch

# Imports for Azure content safety
from azure.ai.contentsafety import ContentSafetyClient
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import HttpResponseError
from azure.ai.contentsafety.models import AnalyzeTextOptions

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple

with open('input.jsonl') as input_file:
    prompt_data = json.load(input_file)

# Prompt data is stored in a JSON file. Choose a prompt by number of tokens: 5, 25, 50, 100, 500, 1k, or 2k.
PROMPT = prompt_data["1k"]

with open('parameters.json') as parameters:
    params = json.load(parameters)

MAX_NEW_TOKEN = params["MAX_NEW_TOKEN"]
CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
# Replace with your own deployment
MODEL_PATH = params["MODEL_PATH"]
MODEL_HEADERS = params["MODEL_HEADERS"]
SAFE_CHECK = params["SAFE_CHECK"]
# Threshold for tokens per second below which we deem the query to be slow
THRESHOLD_TPS = params["THRESHOLD_TPS"]
# Replace with your own tokenizer
TOKENIZER_PATH = params["TOKENIZER_PATH"]
RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
TEMPERATURE = params["TEMPERATURE"]
TOP_P = params["TOP_P"]
# Add your model endpoints here, specifying the port number.
# Group of model endpoints - requests are balanced across them to maximize batching on each server.
MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
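
# Illustrative parameters.json (a sketch only; every value below is an example, not part of this
# repo, and should be replaced with your own deployment settings):
# {
#     "MAX_NEW_TOKEN": 256,
#     "CONCURRENT_LEVELS": [1, 2, 4, 8, 16, 32, 64],
#     "MODEL_PATH": "meta-llama/Llama-2-7b-chat-hf",
#     "MODEL_HEADERS": {"Content-Type": "application/json"},
#     "SAFE_CHECK": false,
#     "THRESHOLD_TPS": 7,
#     "TOKENIZER_PATH": "meta-llama/Llama-2-7b-chat-hf",
#     "RANDOM_PROMPT_LENGTH": 25,
#     "TEMPERATURE": 0.6,
#     "TOP_P": 0.9,
#     "MODEL_ENDPOINTS": ["http://localhost:8000/v1/chat/completions",
#                         "http://localhost:8001/v1/chat/completions"]
# }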

# Get the number of GPUs on this instance
if torch.cuda.is_available():
    NUM_GPU = torch.cuda.device_count()
else:
    print("No available GPUs")
    # Fall back to 1 so the per-GPU metrics computed below remain defined
    NUM_GPU = 1

# This tokenizer is downloaded from the Azure model catalog for each specific model. Its main purpose is to decode responses for token counting.
tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)

num_token_input_prompt = len(tokenizer.encode(PROMPT))
print(f"Number of tokens in input prompt: {num_token_input_prompt}")

# Azure content safety analysis
def analyze_prompt(text):
    start_time = time.time()

    # Obtain credentials
    key = ""  # Add your AZURE_CONTENT_SAFETY_KEY
    endpoint = ""  # Add your AZURE_CONTENT_SAFETY_ENDPOINT

    # Create a content safety client
    client = ContentSafetyClient(endpoint, AzureKeyCredential(key))

    # Create the request
    request = AnalyzeTextOptions(text=text)

    # Analyze the prompt
    try:
        response = client.analyze_text(request)
    except HttpResponseError as e:
        print("Prompt failed due to content safety filtering.")
        if e.error:
            print(f"Error code: {e.error.code}")
            print(f"Error message: {e.error.message}")
            raise
        print(e)
        raise

    analyze_end_time = time.time()
    # The round-trip latency of the Azure content safety check, in ms
    analyze_latency = (analyze_end_time - start_time) * 1000
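
# Note: analyze_latency is measured above for reference but not returned; the benchmark simply lets
# the content-safety round trip count toward each request's end-to-end latency in generate_text().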

# Simple round-robin dispatch of requests across the different model containers
executor_id = 0
lock = threading.Lock()


def generate_text() -> Tuple[float, int]:
    headers = MODEL_HEADERS
    payload = {
        "model": MODEL_PATH,
        "messages": [
            {
                "role": "user",
                "content": PROMPT
            }
        ],
        "stream": False,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "max_tokens": MAX_NEW_TOKEN
    }

    start_time = time.time()

    if SAFE_CHECK:
        analyze_prompt(PROMPT)
        # Or simulate the round-trip delay of a real-world deployment instead:
        # time.sleep(random.uniform(0.3, 0.4))

    # Acquire the lock to pick the next endpoint in round-robin order
    global executor_id
    with lock:
        if executor_id != len(MODEL_ENDPOINTS) - 1:
            executor_id += 1
        else:
            executor_id = 0
        endpoint_id = executor_id

    # Send the request
    response = requests.post(MODEL_ENDPOINTS[endpoint_id], headers=headers, json=payload)

    if SAFE_CHECK:
        analyze_prompt(PROMPT)
        # Or simulate the round-trip delay of a real-world deployment instead:
        # time.sleep(random.uniform(0.3, 0.4))

    end_time = time.time()
    # Convert to ms
    latency = (end_time - start_time) * 1000

    if response.status_code != 200:
        raise ValueError(f"Error: {response.content}")
    output = json.loads(response.content)["choices"][0]["message"]["content"]
    token_count = len(tokenizer.encode(output))
    return latency, token_count
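
# Optional sanity check (hypothetical usage, not part of the benchmark run): issue a single request
# before the full sweep to confirm the endpoint, headers, and tokenizer are wired up correctly, e.g.:
#   single_latency_ms, single_tokens = generate_text()
#   print(f"Single request: {single_latency_ms:.1f} ms, {single_tokens} output tokens")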


def evaluate_performance(concurrent_requests: int) -> Tuple[float, float, float, float, float, float, float, float, int]:
    latencies = []
    total_output_tokens = 0
    output_tokens_per_second_each_request = []
    start_time = time.time()

    # Run the requests concurrently in a thread pool
    with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
        future_to_req = {executor.submit(generate_text): i for i in range(concurrent_requests)}
        for future in as_completed(future_to_req):
            latency, token_count = future.result()
            latencies.append(latency)
            total_output_tokens += token_count
            # Tokens per second for this individual request
            tokens_per_sec = token_count / (latency / 1000)
            output_tokens_per_second_each_request.append(tokens_per_sec)

    end_time = time.time()
    total_time = end_time - start_time
    # RPS (requests per second)
    rps = concurrent_requests / total_time
    # Overall tokens per second
    output_tokens_per_second_overall = total_output_tokens / total_time
    input_tokens_per_second_overall = (num_token_input_prompt * concurrent_requests) / total_time
    output_tokens_per_second_per_gpu = output_tokens_per_second_overall / NUM_GPU
    input_tokens_per_second_per_gpu = input_tokens_per_second_overall / NUM_GPU
    p50_latency = np.percentile(latencies, 50)
    p99_latency = np.percentile(latencies, 99)

    # Count the number of requests below the tokens-per-second threshold
    below_threshold_count = sum(1 for tps in output_tokens_per_second_each_request if tps < THRESHOLD_TPS)
    output_tokens_per_second_per_request = sum(output_tokens_per_second_each_request) / len(output_tokens_per_second_each_request)

    return p50_latency, p99_latency, rps, output_tokens_per_second_overall, output_tokens_per_second_per_gpu, input_tokens_per_second_overall, input_tokens_per_second_per_gpu, output_tokens_per_second_per_request, below_threshold_count
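
# Note on the per-GPU numbers: they divide by the GPU count detected on this machine, which assumes
# every locally visible GPU is serving the endpoints in MODEL_ENDPOINTS; when benchmarking remote
# servers, interpret the per-GPU metrics accordingly.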


# Print a markdown summary table
print("| Number of Concurrent Requests | P50 Latency (ms) | P99 Latency (ms) | RPS | Output Tokens per Second | Output Tokens per Second per GPU | Input Tokens per Second | Input Tokens per Second per GPU | Average Output Tokens per Second per Request | Number of Requests Below Threshold |")
print("|-------------------------------|------------------|------------------|-----|--------------------------|----------------------------------|-------------------------|---------------------------------|----------------------------------------------|------------------------------------|")

# Save the same metrics to a CSV file
csv_file = "performance_metrics.csv"
with open(csv_file, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Number of Concurrent Requests", "P50 Latency (ms)", "P99 Latency (ms)", "RPS", "Output Tokens per Second", "Output Tokens per Second per GPU", "Input Tokens per Second", "Input Tokens per Second per GPU", "Average Output Tokens per Second per Request", "Number of Requests Below Threshold"])

    for level in CONCURRENT_LEVELS:
        p50_latency, p99_latency, rps, output_tokens_per_second_overall, output_tokens_per_second_per_gpu, input_tokens_per_second_overall, input_tokens_per_second_per_gpu, output_tokens_per_second_per_request, below_threshold_count = evaluate_performance(level)
        print(f"| {level} | {p50_latency:.2f} | {p99_latency:.2f} | {rps:.2f} | {output_tokens_per_second_overall:.2f} | {output_tokens_per_second_per_gpu:.2f} | {input_tokens_per_second_overall:.2f} | {input_tokens_per_second_per_gpu:.2f} | {output_tokens_per_second_per_request:.2f} | {below_threshold_count} |")
        writer.writerow([level, round(p50_latency, 2), round(p99_latency, 2), round(rps, 2), round(output_tokens_per_second_overall, 2), round(output_tokens_per_second_per_gpu, 2), round(input_tokens_per_second_overall, 2), round(input_tokens_per_second_per_gpu, 2), round(output_tokens_per_second_per_request, 2), below_threshold_count])
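
# Example invocation (illustrative; the model name, ports, and flags are assumptions to adapt to your setup):
#   1. Start one vLLM OpenAI-compatible server per entry in MODEL_ENDPOINTS, e.g.:
#        python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --port 8000
#   2. Run the benchmark; it prints the markdown table to stdout and writes performance_metrics.csv:
#        python chat_vllm_benchmark.py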