pretrained_azure_api_benchmark.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
import csv
import json
import time
import random
import urllib.request
import urllib.error
import numpy as np
import transformers
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Tuple, List
# Predefined inputs
with open('input.jsonl') as input:
    prompt_data = json.load(input)

with open('parameters.json') as parameters:
    params = json.load(parameters)
MAX_NEW_TOKEN = params["MAX_NEW_TOKEN"]
CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
# Threshold for tokens per second below which we deem the query to be slow
THRESHOLD_TPS = params["THRESHOLD_TPS"]
# Default Llama 2 tokenizer, replace with your own tokenizer
TOKENIZER_PATH = params["TOKENIZER_PATH"]
RANDOM_PROMPT_LENGTH = params["RANDOM_PROMPT_LENGTH"]
TEMPERATURE = params["TEMPERATURE"]
TOP_P = params["TOP_P"]
# Model endpoint provided by the API provider
MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
API_KEY = params["API_KEY"]
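
# The exact schema of parameters.json is not documented in this file; the sketch below is
# an illustrative example only -- every value (token counts, concurrency levels, endpoint URL,
# key placeholder, tokenizer path) is an assumption and must be replaced with your own settings.
#
# {
#     "MAX_NEW_TOKEN": 256,
#     "CONCURRENT_LEVELS": [1, 2, 4, 8, 16, 32],
#     "THRESHOLD_TPS": 7,
#     "TOKENIZER_PATH": "./tokenizer",
#     "RANDOM_PROMPT_LENGTH": 1000,
#     "TEMPERATURE": 0.6,
#     "TOP_P": 0.9,
#     "MODEL_ENDPOINTS": "https://your-endpoint.inference.ai.azure.com/v1/completions",
#     "API_KEY": "<your-api-key>"
# }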
# This tokenizer is downloaded from the Azure model catalog for each specific model. Its main purpose is to decode the responses for token counting.
tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Select vocabulary tokens that are longer than 2 characters (closer to real words) and ASCII-only (roughly English; not foolproof)
vocab = [token for token in tokenizer.get_vocab().keys() if len(token) > 2 and all(ord(c) < 128 for c in token)]
def generate_random_prompt(num_tokens):
    generated_tokens_count = 0
    selected_tokens = ""
    while generated_tokens_count < num_tokens:
        selected_tokens += random.choice(vocab)
        selected_tokens += " "
        generated_tokens_count = len(tokenizer.encode(selected_tokens))

    return selected_tokens
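
# Quick sanity check (illustrative only, not part of the original script): because a token is
# appended before the prompt is re-encoded, the returned prompt encodes to at least num_tokens.
#   sample = generate_random_prompt(16)
#   assert len(tokenizer.encode(sample)) >= 16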
PROMPT = generate_random_prompt(RANDOM_PROMPT_LENGTH)
num_token_input_prompt = len(tokenizer.encode(PROMPT))
print(f"Number of tokens for input prompt: {num_token_input_prompt}")
def generate_text() -> Tuple[float, int]:
    # Configure the payload data sent to the API endpoint
    payload = {
        "prompt": PROMPT,
        "max_tokens": MAX_NEW_TOKEN,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
    }
    body = str.encode(json.dumps(payload))
    url = MODEL_ENDPOINTS
    api_key = API_KEY
    if not api_key:
        raise Exception("API Key is missing")

    headers = {'Content-Type': 'application/json', 'Authorization': (api_key)}
    req = urllib.request.Request(url, body, headers)
    token_count = 0
    output = ""
    start_time = time.time()
    # Send request
    try:
        response = urllib.request.urlopen(req)
        result = response.read()
        output = json.loads(result)["choices"][0]["text"]
    except urllib.error.HTTPError as error:
        print("The request failed with status code: " + str(error.code))
        # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
        print(error.info())
        print(error.read().decode("utf8", 'ignore'))

    end_time = time.time()
    # Convert to ms
    latency = (end_time - start_time) * 1000
    token_count = len(tokenizer.encode(output))

    return latency, token_count
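
# Note: the parsing above assumes an OpenAI-style text-completions response body
# (illustrative shape only; the exact fields depend on the deployed endpoint):
#   {"choices": [{"text": "<generated text>"}], ...}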
def evaluate_performance(concurrent_requests: int) -> Tuple[float, float, float, float, float, float, int]:
    latencies = []
    total_output_tokens = 0
    output_tokens_per_second_each_request = []
    start_time = time.time()

    # Init multi-thread execution
    with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
        future_to_req = {executor.submit(generate_text): i for i in range(concurrent_requests)}
        for future in as_completed(future_to_req):
            latency, token_count = future.result()
            latencies.append(latency)
            total_output_tokens += token_count
            # Calculate tokens per second for this request
            tokens_per_sec = token_count / (latency / 1000)
            output_tokens_per_second_each_request.append(tokens_per_sec)

    end_time = time.time()
    total_time = end_time - start_time
    # RPS (requests per second)
    rps = concurrent_requests / total_time
    # Overall tokens per second
    output_tokens_per_second_overall = total_output_tokens / total_time
    input_tokens_per_second_overall = (num_token_input_prompt * concurrent_requests) / total_time
    p50_latency = np.percentile(latencies, 50)
    p99_latency = np.percentile(latencies, 99)

    # Count the number of requests below the token-per-second threshold
    below_threshold_count = sum(1 for tps in output_tokens_per_second_each_request if tps < THRESHOLD_TPS)
    output_tokens_per_second_per_request = sum(output_tokens_per_second_each_request) / len(output_tokens_per_second_each_request)

    return p50_latency, p99_latency, rps, output_tokens_per_second_overall, input_tokens_per_second_overall, output_tokens_per_second_per_request, below_threshold_count
# Print markdown
print("| Number of Concurrent Requests | P50 Latency (ms) | P99 Latency (ms) | RPS | Output Tokens per Second | Input Tokens per Second | Average Output Tokens per Second per Request | Number of Requests Below Threshold |")
print("|-------------------------------|------------------|------------------|-----|--------------------------|-------------------------|----------------------------------------------|------------------------------------|")

# Save to file
csv_file = "performance_metrics.csv"
with open(csv_file, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Number of Concurrent Requests", "P50 Latency (ms)", "P99 Latency (ms)", "RPS", "Output Tokens per Second", "Input Tokens per Second", "Average Output Tokens per Second per Request"])

    for level in CONCURRENT_LEVELS:
        p50_latency, p99_latency, rps, output_tokens_per_second_overall, input_tokens_per_second_overall, output_tokens_per_second_per_request, below_threshold_count = evaluate_performance(level)
        print(f"| {level} | {p50_latency:.2f} | {p99_latency:.2f} | {rps:.2f} | {output_tokens_per_second_overall:.2f} | {input_tokens_per_second_overall:.2f} | {output_tokens_per_second_per_request:.2f} | {below_threshold_count} |")
        writer.writerow([level, round(p50_latency, 2), round(p99_latency, 2), round(rps, 2), round(output_tokens_per_second_overall, 2), round(input_tokens_per_second_overall, 2), round(output_tokens_per_second_per_request, 2)])
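
# How to run (assumed workflow; the file names come from the script above, the rest is a suggestion):
#   1. Place parameters.json (see the illustrative schema near the top) and input.jsonl next to this script.
#   2. Download the model's tokenizer from the Azure model catalog to TOKENIZER_PATH.
#   3. Run:  python pretrained_azure_api_benchmark.py
# The script prints a markdown summary table to stdout and writes performance_metrics.csv.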