chat_azure_api_benchmark.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
import csv
import json
import time
import urllib.error
import urllib.request
import numpy as np
import transformers
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Tuple
with open('input.jsonl') as input_file:
    prompt_data = json.load(input_file)

# Prompt data is stored in a JSON file. Choose a prompt by its token count: 5, 25, 50, 100, 500, 1k, or 2k.
PROMPT = prompt_data["25"]
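
# A minimal sketch of the layout input.jsonl is assumed to have, based on the keys used
# above (the prompt text is an illustrative placeholder, not the shipped file):
# {
#     "5": "<~5-token prompt>",
#     "25": "<~25-token prompt>",
#     ...
#     "2k": "<~2k-token prompt>"
# }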

with open('parameters.json') as parameters:
    params = json.load(parameters)

MAX_NEW_TOKEN = params["MAX_NEW_TOKEN"]
CONCURRENT_LEVELS = params["CONCURRENT_LEVELS"]
# Threshold for tokens per second below which we deem the query to be slow
THRESHOLD_TPS = params["THRESHOLD_TPS"]
# Default Llama 2 tokenizer; replace with your own tokenizer
TOKENIZER_PATH = params["TOKENIZER_PATH"]
TEMPERATURE = params["TEMPERATURE"]
TOP_P = params["TOP_P"]
# Model endpoint provided by the API provider
MODEL_ENDPOINTS = params["MODEL_ENDPOINTS"]
API_KEY = params["API_KEY"]
SYS_PROMPT = params["SYS_PROMPT"]
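
# A minimal sketch of a parameters.json that satisfies the keys read above
# (all values are illustrative assumptions, not recommended settings):
# {
#     "MAX_NEW_TOKEN": 256,
#     "CONCURRENT_LEVELS": [1, 2, 4, 8, 16, 32],
#     "THRESHOLD_TPS": 7,
#     "TOKENIZER_PATH": "meta-llama/Llama-2-7b-hf",
#     "TEMPERATURE": 0.6,
#     "TOP_P": 0.9,
#     "MODEL_ENDPOINTS": "https://<your-endpoint>/v1/chat/completions",
#     "API_KEY": "<your-api-key>",
#     "SYS_PROMPT": "You are a helpful assistant."
# }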

# This tokenizer is downloaded from the Azure model catalog for each specific model. Its main purpose is to decode the responses for token counting.
tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH)

num_token_input_prompt = len(tokenizer.encode(PROMPT))
print(f"Number of tokens in input prompt: {num_token_input_prompt}")

def generate_text() -> Tuple[float, int]:
    # Configure the payload sent to the API endpoint
    payload = {
        "messages": [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": PROMPT}
        ],
        "max_tokens": MAX_NEW_TOKEN,
        "temperature": TEMPERATURE,
        "top_p": TOP_P,
        "stream": False
    }
    body = str.encode(json.dumps(payload))
    url = MODEL_ENDPOINTS
    api_key = API_KEY
    if not api_key:
        raise Exception("API Key is missing")

    headers = {'Content-Type': 'application/json', 'Authorization': api_key}
    req = urllib.request.Request(url, body, headers)
    token_count = 0
    output = ""
    start_time = time.time()
    # Send request
    try:
        response = urllib.request.urlopen(req)
        result = response.read()
        output = json.loads(result)["choices"][0]["message"]["content"]
    except urllib.error.HTTPError as error:
        print("The request failed with status code: " + str(error.code))
        # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
        print(error.info())
        print(error.read().decode("utf8", 'ignore'))
    end_time = time.time()
    # Convert to ms
    latency = (end_time - start_time) * 1000
    token_count = len(tokenizer.encode(output))
    return latency, token_count
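
# The parsing above assumes an OpenAI-style chat-completions response body; a minimal
# sketch of just the fields actually read (other fields omitted, values illustrative):
# {
#     "choices": [
#         {"message": {"role": "assistant", "content": "<generated text>"}}
#     ]
# }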

def evaluate_performance(concurrent_requests: int) -> Tuple[float, float, float, float, float, float, int]:
    latencies = []
    total_output_tokens = 0
    output_tokens_per_second_each_request = []
    start_time = time.time()

    # Init multi-thread execution
    with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
        future_to_req = {executor.submit(generate_text): i for i in range(concurrent_requests)}
        for future in as_completed(future_to_req):
            latency, token_count = future.result()
            latencies.append(latency)
            total_output_tokens += token_count
            # Calculate tokens per second for this request
            tokens_per_sec = token_count / (latency / 1000)
            output_tokens_per_second_each_request.append(tokens_per_sec)

    end_time = time.time()
    total_time = end_time - start_time
    # RPS (requests per second)
    rps = concurrent_requests / total_time
    # Overall tokens per second
    output_tokens_per_second_overall = total_output_tokens / total_time
    input_tokens_per_second_overall = (num_token_input_prompt * concurrent_requests) / total_time
    p50_latency = np.percentile(latencies, 50)
    p99_latency = np.percentile(latencies, 99)

    # Count the number of requests below the tokens-per-second threshold
    below_threshold_count = sum(1 for tps in output_tokens_per_second_each_request if tps < THRESHOLD_TPS)
    output_tokens_per_second_per_request = sum(output_tokens_per_second_each_request) / len(output_tokens_per_second_each_request)

    return p50_latency, p99_latency, rps, output_tokens_per_second_overall, input_tokens_per_second_overall, output_tokens_per_second_per_request, below_threshold_count

# Print markdown table header
print("| Number of Concurrent Requests | P50 Latency (ms) | P99 Latency (ms) | RPS | Output Tokens per Second | Input Tokens per Second | Average Output Tokens per Second per Request | Number of Requests Below Threshold |")
print("|-------------------------------|------------------|------------------|-----|--------------------------|-------------------------|----------------------------------------------|------------------------------------|")

# Save to file
csv_file = "performance_metrics.csv"
with open(csv_file, "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["Number of Concurrent Requests", "P50 Latency (ms)", "P99 Latency (ms)", "RPS", "Output Tokens per Second", "Input Tokens per Second", "Average Output Tokens per Second per Request", "Number of Requests Below Threshold"])

    for level in CONCURRENT_LEVELS:
        p50_latency, p99_latency, rps, output_tokens_per_second_overall, input_tokens_per_second_overall, output_tokens_per_second_per_request, below_threshold_count = evaluate_performance(level)
        print(f"| {level} | {p50_latency:.2f} | {p99_latency:.2f} | {rps:.2f} | {output_tokens_per_second_overall:.2f} | {input_tokens_per_second_overall:.2f} | {output_tokens_per_second_per_request:.2f} | {below_threshold_count} |")
        writer.writerow([level, round(p50_latency, 2), round(p99_latency, 2), round(rps, 2), round(output_tokens_per_second_overall, 2), round(input_tokens_per_second_overall, 2), round(output_tokens_per_second_per_request, 2), below_threshold_count])
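
# Example invocation (an assumption about the intended workflow, not documented in this file):
# place input.jsonl and parameters.json next to this script, then run
#     python chat_azure_api_benchmark.py
# The markdown table is printed to stdout and the same metrics are written to performance_metrics.csv.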